# RiveScript-Python
#
# This code is released under the MIT License.
# See the "LICENSE" file for more information.
#
# https://www.rivescript.com/
from .regexp import RE
import re
# Highest RiveScript language version this parser supports. ``Parser.parse``
# compares the value of a ``! version`` line against this number.
rs_version = 2.0
class Parser(object):
    """The RiveScript language parser.

    Parameters:
        master (RiveScript): A reference to the parent RiveScript bot instance,
            mostly useful for its debug methods like ``warn()``.
        strict (bool): Strict syntax checking (true by default).
        utf8 (bool): Enable UTF-8 mode (false by default).
    """

    # Concatenation mode characters: maps the ``! local concat`` option name
    # to the string inserted between a line and its ^Continue extension.
    concat_modes = dict(
        none="",
        space=" ",
        newline="\n",
    )

    # (opening char, closing char, name used in the error message), in the
    # order in which mismatches are reported by ``check_syntax``.
    _bracket_pairs = (
        ('(', ')', "parenthesis"),
        ('[', ']', "square"),
        ('{', '}', "curly"),
        ('<', '>', "angle"),
    )

    def __init__(self, master, strict=True, utf8=False):
        self.master = master
        self.strict = strict
        self.utf8 = utf8

    # Proxy functions

    def say(self, *args, **kwargs):
        """Forward a debug message to the master's ``_say()`` method."""
        self.master._say(*args, **kwargs)

    def warn(self, *args, **kwargs):
        """Forward a warning to the master's ``_warn()`` method."""
        self.master._warn(*args, **kwargs)

    def parse(self, filename, code):
        """Read and parse a RiveScript document.

        Returns a data structure that represents all of the useful contents of
        the document, in this format::

            {
                "begin": { # "begin" data
                    "global": {}, # map of !global vars
                    "var": {},    # bot !var's
                    "sub": {},    # !sub substitutions
                    "person": {}, # !person substitutions
                    "array": {},  # !array lists
                },
                "topics": { # main reply data
                    "random": { # (topic name)
                        "includes": {}, # map of included topics (values=1)
                        "inherits": {}, # map of inherited topics
                        "triggers": [   # array of triggers
                            {
                                "trigger": "hello bot",
                                "reply": [],      # array of replies
                                "condition": [],  # array of conditions
                                "redirect": None, # redirect command
                                "previous": None, # 'previous' reply
                            },
                            # ...
                        ]
                    }
                },
                "objects": [ # parsed object macros
                    {
                        "name": "",     # object name
                        "language": "", # programming language
                        "code": [],     # array of lines of code
                    }
                ]
            }

        Args:
            filename (str): The name of the file that the code came from, for
                syntax error reporting purposes.
            code (str[]): The source code to parse, one element per line.

        Returns:
            dict: The aforementioned data structure, or ``None`` when parsing
                was aborted (a syntax error while not in strict mode, or a
                ``! version`` newer than ``rs_version``).

        Raises:
            Exception: On the first syntax error when ``self.strict`` is true.
        """
        # Eventual returned structure ("abstract syntax tree" but not really)
        ast = {
            "begin": {
                "global": {},
                "var": {},
                "sub": {},
                "person": {},
                "array": {},
            },
            "topics": {},
            "objects": [],
        }

        # Track temporary variables.
        topic = 'random'  # Default topic=random
        comment = False   # In a multi-line comment
        inobj = False     # In an object
        objname = ''      # The name of the object we're in
        objlang = ''      # The programming language of the object
        objbuf = []       # Object contents buffer
        curtrig = None    # Pointer to the current trigger in ast.topics
        is_that = None    # Is a %Previous trigger

        # Local (file scoped) parser options.
        local_options = dict(
            concat="none",  # Concat mode for ^Continue command
        )

        # Read each line.
        for lp, line in enumerate(code):
            lineno = lp + 1  # Line numbers for syntax tracking (1-based)

            self.say("Line: " + line + " (topic: " + topic + ") incomment: " + str(inobj))
            if len(line.strip()) == 0:  # Skip blank lines
                continue

            # In an object? Its lines are collected verbatim (unstripped) so
            # that indentation-sensitive code survives.
            if inobj:
                if re.match(RE.objend, line):
                    # End the object.
                    if len(objname):
                        ast["objects"].append({
                            "name": objname,
                            "language": objlang,
                            "code": objbuf,
                        })
                    objname = ''
                    objlang = ''
                    objbuf = []
                    inobj = False
                else:
                    objbuf.append(line)
                continue

            # Trim excess space. We do it down here so we don't mess up
            # python objects!
            line = line.strip()

            # Look for comments.
            if line[:2] == '//':  # A single-line comment.
                continue
            elif line[0] == '#':
                # Only warns; the line still falls through to the command
                # handling below.
                self.warn("Using the # symbol for comments is deprecated", filename, lineno)
            elif line[:2] == '/*':  # Start of a multi-line comment.
                if '*/' not in line:  # Cancel if the end is here too.
                    comment = True
                continue
            elif '*/' in line:
                comment = False
                continue
            if comment:
                continue

            # Separate the command from the data.
            if len(line) < 2:
                self.warn("Weird single-character line '" + line + "' found.", filename, lineno)
                continue
            cmd = line[0]
            line = line[1:].strip()

            # Ignore inline comments if there's a space before the // symbols.
            if " //" in line:
                line = line.split(" //")[0].strip()

            # Run a syntax check on this line.
            syntax_error = self.check_syntax(cmd, line)
            if syntax_error:
                # There was a syntax error! Are we enforcing strict mode?
                syntax_error = "Syntax error in " + filename + " line " + str(lineno) + ": " \
                    + syntax_error + " (near: " + cmd + " " + line + ")"
                if self.strict:
                    raise Exception(syntax_error)
                self.warn(syntax_error)
                return None  # Don't try to continue

            # Reset the %Previous state if this is a new +Trigger.
            if cmd == '+':
                is_that = None

            # Do a lookahead for ^Continue and %Previous commands.
            for i in range(lp + 1, len(code)):
                lookahead = code[i].strip()
                if len(lookahead) < 2:
                    continue
                look_cmd = lookahead[0]
                lookahead = lookahead[1:].strip()

                # Only continue if the lookahead line has any data.
                if len(lookahead) != 0:
                    # The lookahead command has to be either a % or a ^.
                    if look_cmd != '^' and look_cmd != '%':
                        break

                    # If the current command is a +, see if the following is
                    # a %.
                    if cmd == '+':
                        if look_cmd == '%':
                            is_that = lookahead
                            break
                        else:
                            is_that = None

                    # If the current command is a ! and the next command(s) are
                    # ^, we'll tack each extension on as a line break (which is
                    # useful information for arrays).
                    if cmd == '!':
                        if look_cmd == '^':
                            line += "<crlf>" + lookahead
                        continue

                    # If the current command is not a ^ and the line after is
                    # not a %, but the line after IS a ^, then tack it on to the
                    # end of the current line.
                    if cmd != '^' and look_cmd != '%':
                        if look_cmd == '^':
                            line += self.concat_modes.get(
                                local_options["concat"], ""
                            ) + lookahead
                        else:
                            break

            self.say("Command: " + cmd + "; line: " + line)

            # Handle the types of RiveScript commands.
            if cmd == '!':
                # ! DEFINE
                # Split on the FIRST "=" only, so that values may themselves
                # contain "=" characters.
                halves = re.split(RE.equals, line, maxsplit=1)
                left = re.split(RE.ws, halves[0].strip(), maxsplit=2)
                value, kind, var = '', '', ''
                if len(halves) == 2:
                    value = halves[1].strip()
                if len(left) >= 1:
                    kind = left[0].strip()
                    if len(left) >= 2:
                        var = ' '.join(left[1:]).strip()

                # Remove 'fake' line breaks unless this is an array.
                if kind != 'array':
                    value = re.sub(RE.crlf, '', value)

                # Handle version numbers.
                if kind == 'version':
                    # Verify we support it.
                    try:
                        version = float(value)
                    except ValueError:
                        self.warn("Error parsing RiveScript version number: not a number", filename, lineno)
                        continue
                    if version > rs_version:
                        self.warn("Unsupported RiveScript version. We only support " + str(rs_version), filename, lineno)
                        return None
                    continue

                # All other types of defines require a variable and value name.
                if len(var) == 0:
                    self.warn("Undefined variable name", filename, lineno)
                    continue
                elif len(value) == 0:
                    self.warn("Undefined variable value", filename, lineno)
                    continue

                # Handle the rest of the types.
                if kind == 'local':
                    # Local file-scoped parser options.
                    self.say("\tSet parser option " + var + " = " + value)
                    local_options[var] = value
                elif kind == 'global':
                    # 'Global' variables
                    self.say("\tSet global " + var + " = " + value)

                    if value == '<undef>':
                        self._undefine(ast["begin"]["global"], var,
                                       "Failed to delete missing global variable", filename, lineno)
                    else:
                        ast["begin"]["global"][var] = value

                    # The AST always stores the raw string; here we only
                    # validate that 'depth' looks like an integer.
                    if var == 'depth':
                        try:
                            int(value)
                        except ValueError:
                            self.warn("Failed to set 'depth' because the value isn't a number!", filename, lineno)
                elif kind == 'var':
                    # Bot variables
                    self.say("\tSet bot variable " + var + " = " + value)

                    if value == '<undef>':
                        self._undefine(ast["begin"]["var"], var,
                                       "Failed to delete missing bot variable", filename, lineno)
                    else:
                        ast["begin"]["var"][var] = value
                elif kind == 'array':
                    # Arrays
                    self.say("\tArray " + var + " = " + value)

                    if value == '<undef>':
                        self._undefine(ast["begin"]["array"], var,
                                       "Failed to delete missing array", filename, lineno)
                        continue

                    # Did this have multiple parts?
                    parts = value.split("<crlf>")

                    # Process each line of array data: pipe-separated when a
                    # pipe is present, whitespace-separated otherwise.
                    fields = []
                    for val in parts:
                        if '|' in val:
                            fields.extend(val.split('|'))
                        else:
                            fields.extend(re.split(RE.ws, val))

                    # Convert any remaining '\s' escape codes into spaces.
                    fields = [field.replace('\\s', ' ') for field in fields]

                    ast["begin"]["array"][var] = fields
                elif kind == 'sub':
                    # Substitutions
                    self.say("\tSubstitution " + var + " => " + value)

                    if value == '<undef>':
                        self._undefine(ast["begin"]["sub"], var,
                                       "Failed to delete missing substitution", filename, lineno)
                    else:
                        ast["begin"]["sub"][var] = value
                elif kind == 'person':
                    # Person Substitutions
                    self.say("\tPerson Substitution " + var + " => " + value)

                    if value == '<undef>':
                        self._undefine(ast["begin"]["person"], var,
                                       "Failed to delete missing person substitution", filename, lineno)
                    else:
                        ast["begin"]["person"][var] = value
                else:
                    self.warn("Unknown definition type '" + kind + "'", filename, lineno)
            elif cmd == '>':
                # > LABEL
                temp = re.split(RE.ws, line)
                label = temp[0]
                name = ''
                fields = []
                if len(temp) >= 2:
                    name = temp[1]
                if len(temp) >= 3:
                    fields = temp[2:]

                # Handle the label types.
                if label == 'begin':
                    # The BEGIN block is stored as a topic named __begin__.
                    self.say("\tFound the BEGIN block.")
                    label = 'topic'
                    name = '__begin__'
                if label == 'topic':
                    # Starting a new topic.
                    self.say("\tSet topic to " + name)
                    curtrig = None
                    topic = name

                    # Initialize the topic tree.
                    self._init_topic(ast["topics"], topic)

                    # Does this topic include or inherit another one?
                    mode = ''  # or 'inherits' or 'includes'
                    if len(fields) >= 2:
                        for field in fields:
                            if field == 'includes':
                                mode = 'includes'
                            elif field == 'inherits':
                                mode = 'inherits'
                            elif mode != '':
                                # This topic is either inherited or included.
                                if mode == 'includes':
                                    ast["topics"][name]["includes"][field] = 1
                                else:
                                    ast["topics"][name]["inherits"][field] = 1
                elif label == 'object':
                    # If a field was provided, it should be the programming
                    # language.
                    lang = None
                    if len(fields) > 0:
                        lang = fields[0].lower()

                    # Only try to parse a language we support.
                    curtrig = None
                    if lang is None:
                        self.warn("Trying to parse unknown programming language", filename, lineno)
                        lang = 'python'  # Assume it's Python.

                    # We have a handler, so start loading the code.
                    objname = name
                    objlang = lang
                    objbuf = []
                    inobj = True
                else:
                    self.warn("Unknown label type '" + label + "'", filename, lineno)
            elif cmd == '<':
                # < LABEL
                label = line

                if label == 'begin' or label == 'topic':
                    self.say("\tEnd topic label.")
                    topic = 'random'
                elif label == 'object':
                    self.say("\tEnd object label.")
                    inobj = False
            elif cmd == '+':
                # + TRIGGER
                self.say("\tTrigger pattern: " + line)

                # Initialize the topic tree.
                self._init_topic(ast["topics"], topic)
                curtrig = {
                    "trigger": line,
                    "reply": [],
                    "condition": [],
                    "redirect": None,
                    "previous": is_that,
                }
                ast["topics"][topic]["triggers"].append(curtrig)
            elif cmd == '-':
                # - REPLY
                if curtrig is None:
                    self.warn("Response found before trigger", filename, lineno)
                    continue

                self.say("\tResponse: " + line)
                curtrig["reply"].append(line.strip())
            elif cmd == '%':
                # % PREVIOUS
                pass  # This was handled by the lookahead above.
            elif cmd == '^':
                # ^ CONTINUE
                pass  # This was handled by the lookahead above.
            elif cmd == '@':
                # @ REDIRECT
                if curtrig is None:
                    self.warn("Redirect found before trigger", filename, lineno)
                    continue

                self.say("\tRedirect: " + line)
                curtrig["redirect"] = line.strip()
            elif cmd == '*':
                # * CONDITION
                if curtrig is None:
                    self.warn("Condition found before trigger", filename, lineno)
                    continue

                self.say("\tAdding condition: " + line)
                curtrig["condition"].append(line.strip())
            else:
                self.warn("Unrecognized command \"" + cmd + "\"", filename, lineno)
                continue

        return ast

    def check_syntax(self, cmd, line):
        """Syntax check a line of RiveScript code.

        Args:
            str cmd: The command symbol for the line of code, such as one
                of ``+``, ``-``, ``*``, ``>``, etc.
            str line: The remainder of the line of code, such as the text of
                a trigger or reply.

        Return:
            str: A string syntax error message or ``None`` if no errors.
        """
        # NOTE(review): the checks below use re.match, which only tests from
        # the start of ``line``; confirm the RE.* patterns (defined outside
        # this class) are written with that in mind.

        # Run syntax checks based on the type of command.
        if cmd == '!':
            # ! Definition
            #   - Must be formatted like this:
            #     ! type name = value
            #     OR
            #     ! type = value
            if not re.match(RE.def_syntax, line):
                return "Invalid format for !Definition line: must be '! type name = value' OR '! type = value'"
        elif cmd == '>':
            # > Label
            #   - The "begin" label must have only one argument ("begin")
            #   - "topic" labels must be lowercased but can inherit other topics (a-z0-9_\s)
            #   - "object" labels must follow the same rules as "topic", but don't need to be lowercase
            parts = line.split(" ", 2)
            if parts[0] == "begin" and len(parts) > 1:
                return "The 'begin' label takes no additional arguments, should be verbatim '> begin'"
            elif parts[0] == "topic":
                if re.match(RE.name_syntax, line):
                    return "Topics should be lowercased and contain only numbers and letters"
            elif parts[0] == "object":
                if re.match(RE.name_syntax, line):
                    return "Objects can only contain numbers and letters"
        elif cmd in ('+', '%', '@'):
            # + Trigger, % Previous, @ Redirect
            # This one is strict. The triggers are to be run through the regexp engine,
            # therefore it should be acceptable for the regexp engine.
            #   - Entirely lowercase
            #   - No symbols except: ( | ) [ ] * _ # @ { } < > =
            #   - All brackets should be matched

            # Only the totals are compared: every opener needs a closer
            # somewhere on the line, but their order is not verified.
            for opener, closer, label in self._bracket_pairs:
                if line.count(opener) != line.count(closer):
                    return "Unmatched " + label + " brackets"

            # In UTF-8 mode, most symbols are allowed.
            if self.utf8:
                if re.match(RE.utf8_trig, line):
                    return "Triggers can't contain uppercase letters, backslashes or dots in UTF-8 mode."
            else:
                if re.match(RE.trig_syntax, line):
                    return "Triggers may only contain lowercase letters, numbers, and these symbols: ( | ) [ ] * _ # @ { } < > ="
        elif cmd in ('-', '^', '/'):
            # - Reply, ^ Continue, / Comment
            # These commands take verbatim arguments, so their syntax is loose.
            pass
        elif cmd == '*':
            # * Condition
            #   Syntax for a conditional is as follows:
            #   * value symbol value => response
            if not re.match(RE.cond_syntax, line):
                return "Invalid format for !Condition: should be like '* value symbol value => response'"

        return None

    def _undefine(self, table, var, message, filename, lineno):
        """Remove a ``<undef>``-ed definition from one of the "begin" maps.

        Args:
            table (dict): The map to delete from (e.g. ``ast["begin"]["sub"]``).
            var (str): The key to delete.
            message (str): Warning text to emit when ``var`` is not in ``table``.
            filename (str): File name passed through to ``warn()``.
            lineno (int): Line number passed through to ``warn()``.
        """
        try:
            del table[var]
        except KeyError:
            self.warn(message, filename, lineno)

    def _init_topic(self, topics, name):
        """Initialize a Topic Tree data structure.

        Sets up the topic under ``ast.topics`` with all its relevant keys
        and sub-keys, etc. A topic that already exists is left untouched.

        Args:
            topics (dict): A reference to the ``ast.topics``
            name (str): The name of the topic to initialize.

        Returns:
            None
        """
        if name not in topics:
            topics[name] = {
                "includes": {},
                "inherits": {},
                "triggers": [],
            }