diff options
Diffstat (limited to 'util/newconfig')
-rw-r--r-- | util/newconfig/LICENSE | 18 | ||||
-rw-r--r-- | util/newconfig/Makefile | 29 | ||||
-rw-r--r-- | util/newconfig/NOTES | 46 | ||||
-rw-r--r-- | util/newconfig/config.g | 912 | ||||
-rw-r--r-- | util/newconfig/parsedesc.g | 196 | ||||
-rw-r--r-- | util/newconfig/test.config | 6 | ||||
-rw-r--r-- | util/newconfig/yapps2.py | 779 | ||||
-rw-r--r-- | util/newconfig/yapps2.tex | 1225 | ||||
-rw-r--r-- | util/newconfig/yappsrt.py | 172 |
9 files changed, 3383 insertions, 0 deletions
diff --git a/util/newconfig/LICENSE b/util/newconfig/LICENSE new file mode 100644 index 0000000000..64f38b89f2 --- /dev/null +++ b/util/newconfig/LICENSE @@ -0,0 +1,18 @@ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/util/newconfig/Makefile b/util/newconfig/Makefile new file mode 100644 index 0000000000..42e7573931 --- /dev/null +++ b/util/newconfig/Makefile @@ -0,0 +1,29 @@ +ALL: $(shell echo *.g examples/*.g | sed s/.g/.py/g ) + +%.py: %.g yapps2.py yappsrt.py Makefile + python yapps2.py $< + +DOC: yapps2.ps yapps2.pdf manual/index.html + +yapps2.ps: yapps2.dvi + dvips -q yapps2.dvi -o yapps2.ps + +yapps2.pdf: yapps2.ps + ps2pdf yapps2.ps + +yapps2.dvi: yapps2.tex + latex yapps2.tex + +manual/index.html: yapps2.aux yapps2.tex + rm manual/yapps2.css + latex2html -dir 'manual' -mkdir -lcase_tags -font_size 12pt -split 4 -toc_depth 4 -html_version 4.0,unicode,table -t 'Yapps 2.0 Manual' -address 'Amit J Patel, amitp@cs.stanford.edu' -info 0 -show_section_numbers -up_title 'Yapps Page' -up_url 'http://theory.stanford.edu/~amitp/yapps/' -strict -image_type png yapps2.tex + echo '@import url("http://www-cs-students.stanford.edu/~amitp/amitp.css");' > manual/yapps2-new.css + echo 'hr { display:none; }' >> manual/yapps2-new.css + echo 'h1 br, h2 br { display:none; }' >>manual/yapps2-new.css + cat manual/yapps2.css >> manual/yapps2-new.css + rm manual/yapps2.css + mv manual/yapps2-new.css manual/yapps2.css + +DISTRIB: + cd ..; zip -u yapps2.zip yapps2/{LICENSE,yapps2.py,yappsrt.py,parsedesc.g,examples/*.g,NOTES,yapps2.tex,Makefile,manual/*.html,manual/*.css,manual/*.png} + diff --git a/util/newconfig/NOTES b/util/newconfig/NOTES new file mode 100644 index 0000000000..325e76a479 --- /dev/null +++ b/util/newconfig/NOTES @@ -0,0 +1,46 @@ +April 14, 2002: + +I haven't worked on Yapps for a while, mainly because I spent all my energy +on trying to graduate. Now that I've finished school, I have several projects +I want to start working on again, including Yapps. + +Notes for myself: + +Add a debugging mode that helps you understand how the grammar + is constructed and how things are being parsed +Look into an English output mode that would use natural language + to describe a grammar +Optimize unused variables +Add a convenience to automatically gather up the values returned + from subpatterns, put them into a list, and return them +Improve the documentation +Write some larger examples +Get rid of old-style regex support +Use SRE's lex support to speed up lexing (this may be hard given that + yapps allows for context-sensitive lexers) +Look over Dan Connoly's experience with Yapps (bugs, frustrations, etc.) + and see what improvements could be made +Add something to pretty-print the grammar (without the actions) +Maybe conditionals? Follow this rule only if <condition> holds. + But this would be useful mainly when multiple rules match, and we + want the first matching rule. The conditional would mean we skip to + the next rule. Maybe this is part of the attribute grammar system, + where rule X<0> can be specified separately from X<N>. +Convenience functions that could build return values for all rules + without specifying the code for each rule individually +Patterns (abstractions over rules) -- for example, comma separated values + have a certain rule pattern that gets replicated all over the place +"Gather" mode that simply outputs the return values for certain nodes. + For example, if you just want all expressions, you could ask yapps + to gather the results of the 'expr' rule into a list. This would + ignore all the higher level structure. +Look at everyone's Yapps grammars, and come up with larger examples + http://www.w3.org/2000/10/swap/SemEnglish.g + http://www.w3.org/2000/10/swap/kifExpr.g + http://www.w3.org/2000/10/swap/rdfn3.g +It would be nice if you could feed text into Yapps (push model) instead + of Yapps reading text out of a string (pull model). However, I think + that would make the resulting parser code mostly unreadable + (like yacc, etc.). Coroutines/stacklesspython may be the answer. + + diff --git a/util/newconfig/config.g b/util/newconfig/config.g new file mode 100644 index 0000000000..4222f57630 --- /dev/null +++ b/util/newconfig/config.g @@ -0,0 +1,912 @@ +import sys +import os +import re +import string + +debug = 3 + +arch = '' +makebase = '' +ldscriptbase = '' +payloadfile = '' + +makeoptions = {} +makeexpressions = [] + +# Key is the rule name. Value is a mkrule object. +makebaserules = {} + +# List of targets in the order defined by makerule commands. +makerule_targets = {} + +treetop = '' +target_dir = '' + +sources = {} +objectrules = {} +# make these a hash so they will be unique. +driverrules = {} +ldscripts = [] +userdefines = [] +curpart = 0 + +globalvars = {} # We will globals here +parts = {} + +options = {} +# options in order defined. These will be unique due to use of hash +# for options. +options_by_order = [] +crt0includes = [] +partinstance = 0 +curdir = '' +dirstack = [] +config_file_list = [] + +local_path = re.compile(r'^\.') + +# ----------------------------------------------------------------------------- +# Error Handling +# ----------------------------------------------------------------------------- + +class location: + class place: + def __init__(self, file, line, command): + self.file = file + self.line = line + self.command = command + def next_line(self, command): + self.line = self.line + 1 + self.command = command + def at(self): + return "%s:%d" % (self.file, self.line) + + def __init__ (self): + self.stack = [] + + def file(self): + return self.stack[-1].file + def line(self): + return self.stack[-1].line + def command(self): + return self.stack[-1].command + + def push_file(self, file): + self.stack.append(self.place(file, 0, "")) + def pop_file(self): + self.stack.pop() + def next_line(self, command): + self.stack[-1].next_line(command) + def at(self): + return self.stack[-1].at() +loc = location() + +class makerule: + def __init__ (self, target): + self.target = target + self.dependency = [] + self.actions = [] + + def addaction(self, action): + self.actions.append(action) + + def adddependency(self, dependency): + self.dependency.append(dependency) + + def gtarget(self): + return self.target + + def gdependency(self): + return self.dependency + + def gaction(self): + return self.actions + +class option: + def __init__ (self, name): + self.name = name + self.loc = 0 + self.value = 0 + self.set = 0 + self.default = 0 + + def setvalue(self, value, loc): + if (self.set): + print "Option %s: \n" % self.name + print "Attempt to set %s at %s\n" % (value, loc.at()) + print "Already set to %s at %s\n" % \ + (self.value, self.loc.at()) + sys.exit(1) + self.set = 1 + self.value = value + self.loc = loc + + + def getvalue(self): + self.set = 1 + return self.value + + def setdefault(self, value, loc): + if (self.default): + print "%s: " % self.name + print "Attempt to default %s at %s\n" % (value, loc) + print "Already defaulted to %s at %s\n" % \ + (self.value, self.loc.at()) + print "Warning only\n" + if (self.set): + print "%s: " % self.name + print "Attempt to default %s at %s\n" % (value, loc) + print "Already set to %s at %s\n" % \ + (self.value, self.loc.at()) + print "Warning only\n" + return + self.default = 1 + self.value = value + self.loc = loc + def where(self): + return self.loc + + +class partobj: + def __init__ (self, dir, parent, type): + global partinstance + print "partobj dir %s parent %s type %s" %(dir,parent,type) + self.children = [] + self.initcode = [] + self.registercode = [] + self.siblings = 0 + self.type = type + self.objects = [] + self.dir = dir + self.irq = 0 + self.instance = partinstance + 1 + partinstance = partinstance + 1 + self.devfn = 0 + self.private = 0 + if (parent): + print "add to parent" + self.parent = parent + parent.children.append(self) + else: + self.parent = self + + def dumpme(self, lvl): + print "%d: type %s" % (lvl, self.type) + print "%d: dir %s" % (lvl,self.dir) + print "%d: parent %s" % (lvl,self.parent.type) + print "%d: parent dir %s" % (lvl,self.parent.dir) + print "%d: initcode " % lvl + for i in self.initcode: + print " %s\n" % i + print "%d: registercode " % lvl + for i in self.registercode: + print " %s\n" % i + + def gencode(self): + print "struct cdev dev%d = {" % self.instance + print "/* %s %s */" % (self.type, self.dir) + print " .devfn = %d" % self.devfn + if (self.siblings): + print " .next = &dev%d" % self.sibling.instance + if (self.children): + print " .children = &dev%d" % \ + self.children[0].instance + if (self.private): + print " .private = private%d" % self.instance + print "};" + + + def irq(self, irq): + self.irq = irq + + def addinit(self, code): + self.initcode.append(code) + + def addregister(self, code): + self.registercode.append(code) + + +class partsstack: + def __init__ (self): + self.stack = [] + + def push(self, part): + self.stack.append(part) + + def pop(self): + return self.stack.pop() + + def tos(self): + return self.stack[-1] +pstack = partsstack() + +def fatal(string): + global loc + size = len(loc.stack) + i = 0 + while(i < size -1): + print loc.stack[i].at() + i = i + 1 + print "%s: %s"% (loc.at(), string) + sys.exit(1) + +def warning(string): + global loc + print "===> Warning:" + size = len(loc.stack) + i = 0 + while(i < size -1): + print loc.stack[i].at() + i = i + 1 + print "%s: %s"% (loc.at(), string) + + +# ----------------------------------------------------------------------------- +# statements +# ----------------------------------------------------------------------------- + +def getvalue(dict, name): + if name not in dict.keys(): print 'Undefined:', name + v = dict.get(name, 0) + if (debug > 1): + print "getvalue %s returning %s\n" % (name, v) + return v + +def setvalue(dict, name, value): + print "name %s value %s" % (name, value) + if name in dict.keys(): print "Warning, %s previously defined" % name + if (debug > 1): + print "setvalue sets %s to %s\n" % (name, value) + dict[name] = value + +# options. +# to create an option, it has to no exist. +# When an option value is fetched, the fact that it was used is +# remembered. +# Legal things to do: +# set a default value, then set a real value before the option is used. +# set a value, try to set a default, default silently fails. +# Illegal: +# use the value, then try to set the value + +def getoption(name): + print "getoption %s" % name + o = getvalue(options, name) + if (o == 0): + fatal( "Error. Option %s Undefined. Fix me.\n" % name) + if (debug > 2): + print "returns %s" % o + print "%s" % o.where() + return o.getvalue() + +# stupid code, refactor later. +def setoption(name, value): + o = getvalue(options, name) + if (o): + o.setvalue(value, loc) + return + o = option(name) + o.setvalue(value, loc) + setvalue(options, name, o) + options_by_order.append(name) + +def setdefault(name, value): + o = getvalue(options, name) + if (o): + o.setdefault(value, loc) + return + print "setdefault: %s not here, setting to %s" % \ + (name, value) + o = option(name) + o.setdefault(value, loc) + setvalue(options, name, o) + options_by_order.append(name) + +# we do the crt0include as a dictionary, so that if needed we +# can trace who added what when. Also it makes the keys +# nice and unique. +def addcrt0include(path): + global crt0includes + #fullpath = os.path.join(curdir, path) + #fullpath = path + #setvalue(crt0includes, fullpath, loc) + # oh shoot. Order matters. All right, we'll worry about this + # later. + fullpath = path + if (debug > 2): + print "ADDCRT0: %s\n" % fullpath + crt0includes.append(fullpath) + +def addldscript(path): + global ldscripts + if (path[0] == '/'): + fullpath = treetop + '/src/' + path + else: + fullpath = curdir + '/' + path + #fullpath = os.path.join(curdir, path) + #setvalue(ldscripts, fullpath, loc) + if (debug): + print "fullpath :%s: curdir :%s: path :%s:\n" % (fullpath, curdir, path) + ldscripts.append(fullpath) + +def payload(path): + global payloadfile + adduserdefine("PAYLOAD:=%s"%path) +# addrule('payload') +# adddep('payload', path) +# addaction('payload', 'cp $< $@') + +# this is called with an an object name. +# the easiest thing to do is add this object to the current +# component. +# such kludgery. If the name starts with '.' then make the +# dependency be on ./thing.x gag me. +def addobjectdriver(dict, object_name): + print "add object object_name %s" % object_name + suffix = object_name[-2:] + if (suffix == '.o'): + suffix = '.c' + base = object_name[:-2] + if (object_name[0] == '.'): + source = base + suffix + else: + source = os.path.join(curdir, base + suffix) + object = base + '.o' + print "addobject %s source %s\n" % (object, source) + setvalue(dict, base, [object, source]) + +def addobject(object_name): + addobjectdriver(objectrules, object_name) + +def adddriver(driver_name): + addobjectdriver(driverrules, driver_name) + +def target(targ_name): + global target_dir + global curpart + print "TARGET loc.file is %s\n" % loc.file() + target_dir = os.path.join(os.path.dirname(loc.file()), targ_name) + print "TARGET dir.loc.file is %s\n" % os.path.dirname(loc.file()) + if not os.path.isdir(target_dir): + print 'Creating directory', target_dir + os.makedirs(target_dir) + print 'Will place Makefile, crt0.S, etc. in ', target_dir + print '%s\n' % loc.file() + board = partobj(target_dir, 0, 'board') + curpart = board + + + +def part(name, path, file): + global curpart,curdir + if (debug): + print "%s " % name + dirstack.append(curdir) + curdir = os.path.join(treetop, 'src', name, path) + newpart = partobj(curdir, curpart, name) + print "PUSH part " , curpart.dir + pstack.push(curpart) + curpart = newpart + # option(parts, name, path) + # open the file, and parse it. +# curpart.option('MAINBOARD_PART_NUMBER', +# os.path.basename(lookup(parts, 'mainboard'))) +# option(options, 'MAINBOARD_VENDOR', os.path.dirname(getvalue(parts, 'mainboard'))) + + doconfigfile(curdir, file) + +def partpop(): + global curpart,curdir + curpart = pstack.pop() + curdir = dirstack.pop() + print "POP PART %s\n" % curpart.dir + +# dodir is like part but there is no new part +def dodir(path, file): + global curdir, treetop + # if the first char is '/', it is relative to treetop, + # else relative to curdir + # os.path.join screws up if the name starts with '/', sigh. + if (path[0] == '/'): + fullpath = treetop + '/src/' + path + else: + fullpath = os.path.join(curdir, path) + if (debug): + print "DODIR: path %s, fullpath %s\n" % (path, fullpath) + print "DODIR: curdis %s treetop %s\n" % (curdir, treetop) + dirstack.append(curdir) + curdir = fullpath + file = os.path.join(fullpath, file) + config_file_list.append(file) + doconfigfile(fullpath, file) + curdir = dirstack.pop() + +def ternary(expr, yes, no): + print "ternary %s" % expr + a = tohex(expr) # expr # eval(expr) + print "expr %s a %d yes %d no %d\n"% (expr, a, yes, no) + if (a == 0): + print "Ternary returns %d\n" % yes + return yes + else: + print "Ternary returns %d\n" % no + return no +# atoi is in the python library, but not strtol? Weird! +def tohex(name): + return eval('int(%s)' % name) + +def IsInt( str ): + """ Is the given string an integer?""" + try: + num = int(str) + return 1 + except ValueError: + return 0 + +def addrule(id): + o = makerule(id) + setvalue(makebaserules, id, o) + +def adduserdefine(str): + userdefines.append(str) + +def dequote(str): + a = re.sub("^\"", "", str) + a = re.sub("\"$", "", a) + # highly un-intuitive, need four \! + a = re.sub("\\\\\"", "\"", a) + return a + +def addaction(id, str): + o = getvalue(makebaserules, id) + a = dequote(str) + o.addaction(a) + +def adddep(id, str): + o = getvalue(makebaserules, id) + a = dequote(str) + o.adddependency(a) + +# If the first part of <path> matches treetop, replace that part with "$(TOP)" +def topify(path): + global treetop + if path[0:len(treetop)] == treetop: + path = path[len(treetop):len(path)] + if (path[0:1] == "/"): + path = path[1:len(path)] + path = "$(TOP)/" + path + return path + +# arch is 'different' ... darn it. +def set_arch(my_arch): + global arch, makebase + global curdir + arch = my_arch + setdefault('ARCH', my_arch) + part('arch', my_arch, 'config/make.base.lb') + + +def mainboard(path): + global mainboard_dir + mainboard_dir = path + full_mainboard_dir = os.path.join(treetop, 'src/mainboard', path) + setdefault('MAINBOARD', full_mainboard_dir) + vendor = re.sub("/.*", "", path) + mainboard_part_number = re.sub("[^/]/", "", path) + setdefault('MAINBOARD_VENDOR', vendor) + setdefault('MAINBOARD_PART_NUMBER', mainboard_part_number) + part('mainboard', path, 'Config.lb') +#============================================================================= +# FILE OUTPUT +#============================================================================= +def writemakefileheader(file, fname): + file.write("# File: %s\n" % fname) + file.write("# This file was generated by '%s %s %s'\n\n" + % (sys.argv[0], sys.argv[1], sys.argv[2])) + + +def writemakefilesettings(path): + global treetop, arch, mainboard_dir, target_dir, options + # Write Makefile.settings to seperate the settings + # from the actual makefile creation + # In practice you need to rerun NLBConfig.py to change + # these but in theory you shouldn't need to. + + filename = os.path.join(path, "Makefile.settings") + print "Creating", filename + file = open(filename, 'w+') + writemakefileheader(file, filename) + file.write("TOP:=%s\n" % (treetop)) + #file.write("ARCH:=%s\n" % (arch)) + #file.write("MAINBOARD:=%s\n" % (mainboard_dir)) + file.write("TARGET_DIR:=%s\n" % (target_dir)) + for i in options_by_order: + o = options[i] + # get the number in hex + v = o.getvalue() + if IsInt(v): + vi = int(v) + s = "export %s:=0x%x\n"% (i, vi) + file.write(s) + else: + file.write("export %s:=%s\n" % (i, v)) + file.write("export VARIABLES := "); + for i in options.keys(): + file.write("%s " % i) + file.write("\n"); +# file.write("CPUFLAGS := "); +# for i in options.keys(): +# o = options[i] +# file.write("-D%s=%s " % (i, o.getvalue())) +# file.write("\n"); + file.close(); + + +# write the makefile +# let's try the Makefile +# first, dump all the -D stuff + +def writemakefile(path): + makefilepath = os.path.join(path, "Makefile") + print "Creating", makefilepath + file = open(makefilepath, 'w+') + writemakefileheader(file, makefilepath) + + # file.write("include Makefile.settings\n") + # file.write("include cpuflags\n") + # Putting "include cpuflags" in the Makefile has the problem that the + # cpuflags file would be generated _after_ we want to include it. + # Instead, let make do the work of computing CPUFLAGS: + file.write("""\ +# Get the value of TOP, VARIABLES, and several other variables. +include Makefile.settings + +# Function to create an item like -Di586 or -DMAX_CPUS='1' or -Ui686 +D_item = $(if $(subst undefined,,$(origin $1)),-D$1$(if $($1),='$($1)',),-U$1) + +# Compute the value of CPUFLAGS here during make's first pass. +CPUFLAGS := $(foreach _var_,$(VARIABLES),$(call D_item,$(_var_))) +""") + + for i in userdefines: + file.write("%s\n" %i) + file.write("\n"); + + # print out all the object dependencies + file.write("\n# object dependencies (objectrules:)\n") + file.write("OBJECTS :=\n") + file.write("DRIVERS :=\n") + for objrule in objectrules.keys(): + obj = objectrules[objrule] + obj_name = obj[0] + obj_source = obj[1] + file.write("OBJECTS-1 += %s\n" % (obj_name)) + + for driverrule in driverrules.keys(): + driver = driverrules[driverrule] + obj_name = driver[0] + obj_source = driver[1] + file.write("DRIVER += %s\n" % (obj_name)) + + # Print out all ldscript.ld dependencies. + file.write("\n# ldscript.ld dependencies:\n") + file.write("LDSUBSCRIPTS-1 := \n" ) + for script in ldscripts: + file.write("LDSUBSCRIPTS-1 += %s\n" % topify(script)) + + # Print out the dependencies for crt0_includes.h + file.write("\n# Dependencies for crt0_includes.h\n") + file.write("CRT0_INCLUDES:=\n") + for i in crt0includes: + if (local_path.match(i)): + file.write("CRT0_INCLUDES += %s\n" % i) + else: + file.write("CRT0_INCLUDES += $(TOP)/src/%s\n" % i) + + + + + file.write("\nSOURCES=\n") + #for source in sources: + #file.write("SOURCES += %s\n" % source) + + # Print out the user defines. + file.write("\n# userdefines:\n") + #for udef in userdefines: + #file.write("%s\n" % udef) + + # Print out the base rules. + # Need to have a rule that counts on 'all'. + file.write("\n# mainrulelist:") + #file.write("\nmainrule: %s\n" % mainrulelist) + + # Print out any user rules. + file.write("\n# From makerule or docipl commands:\n") + # Old way (hash order): for target in makebaserules.keys(): + # New way (config file order): + #for target in makerule_targets: + #makebaserules[target].write(file) + + file.write("\n# objectrules:\n") + for objrule in objectrules.keys(): + obj = objectrules[objrule] + source = topify(obj[1]) + file.write("%s: %s\n" % (obj[0], source)) + file.write("\t$(CC) -c $(CFLAGS) -o $@ $<\n") + #file.write("%s\n" % objrule[2]) + + for driverrule in driverrules.keys(): + driver = driverrules[driverrule] + source = topify(driver[1]) + file.write("%s: %s\n" % (driver[0], source)) + #file.write("%s\n" % objrule[2]) + + # Print out the rules that will make cause the files + # generated by NLBConfig.py to be remade if any dependencies change. + + file.write("\n# Remember the automatically generated files\n") + file.write("GENERATED:=\n") + for genfile in [ 'Makefile', + 'Makefile.settings', + 'crt0_includes.h', + 'nsuperio.c', + 'chip.c', + 'LinuxBIOSDoc.config' ]: + file.write("GENERATED += %s\n" % genfile) + + file.write("\n# Remake Makefile (and the other files generated by\n") + file.write("# NLBConfig.py) if any config dependencies change.\n") + + for cfile in config_file_list: + file.write("$(GENERATED): %s\n" % topify(cfile)) + + for depfile in [ '%s' % top_config_file, # This a duplicate, remove? + '$(TOP)/util/config/NLBConfig.py', + '$(TOP)/src/arch/$(ARCH)/config/make.base' ]: + file.write("$(GENERATED): %s\n" % depfile) + + file.write("$(GENERATED):\n") + file.write("\tpython $(TOP)/util/config/NLBConfig.py %s $(TOP)\n" + % top_config_file) + + keys = makeoptions.keys() + keys.sort() + file.write("\necho:\n") + for key in keys: + file.write("\t@echo %s='$(%s)'\n"% (key,key)) + + + for i in makebaserules.keys(): + m = makebaserules[i] + file.write("%s: " %i) + for i in m.dependency: + file.write("%s " % i) + file.write("\n") + for i in m.actions: + file.write("\t%s\n" % i) + file.close(); + +# Write out crt0_includes.h (top-level assembly language) include file. +def writecrt0_includes(path): + crt0filepath = os.path.join(path, "crt0_includes.h") + print "Creating", crt0filepath + file = open(crt0filepath, 'w+') + + for i in crt0includes: + file.write("#include <%s>\n" % i) + + file.close(); + + +%% +parser Config: + ignore: r'\s+' + ignore: "#.*?\r?\n" + token NUM: r'[0-9]+' + token XNUM: r'0x[0-9a-fA-F]+' +# Why is path separate? Because paths to resources have to at least +# have a slash, we thinks + token PATH: r'[a-zA-Z0-9_.][a-zA-Z0-9/_.]+[a-zA-Z0-9_.]+' +# Dir's on the other hand are abitrary +# this may all be stupid. + token DIRPATH: r'[a-zA-Z0-9_$()./]+' + token ID: r'[a-zA-Z_.]+[a-zA-Z0-9_.]*' + token STR: r'"([^\\"]+|\\.)*"' + token RAWTEXT: r'.*' + token OPTION: 'option' + token MAINBOARD: 'mainboard' + token MAINBOARDINIT: 'mainboardinit' + token EQ: '=' + token END: '$|end' + token TARGET: 'target' + token OBJECT: 'object' + token DRIVER: 'driver' + token NORTHBRIDGE: 'northbridge' + token SOUTHBRIDGE: 'southbridge' + token SUPERIO: 'superio' + token IF: 'if' + token MAKERULE: 'makerule' + token DEP: 'dep' + token ACT: 'act' + token MAKEDEFINE: 'makedefine' + token ADDACTION: 'addaction' + token DEFAULT: 'default' + token INIT: 'init' + token REGISTER: 'register' + token CPU: 'cpu' + token ARCH: 'arch' + token DIR: 'dir' + token LDSCRIPT: 'ldscript' + token PAYLOAD: 'payload' + + # An expression is the sum and difference of factors + rule expr<<V>>: factor<<V>> {{ n = factor }} + ( "[+]" factor<<V>> {{ n = n+factor }} + | "-" factor<<V>> {{ n = n-factor }} + )* {{ return n }} + + # A factor is the product and division of terms + rule factor<<V>>: term<<V>> {{ v = term }} + ( "[*]" term<<V>> {{ v = v*term }} + | "/" term<<V>> {{ v = v/term }} + | "<<" term<<V>> {{ v = v << term }} + | ">=" term<<V>> {{ v = (v < term)}} + )* {{ return v }} + + rule unop<<V>>: "!" ID {{ return ternary(getoption(ID), 1, 0)}} + # A term is a number, variable, or an expression surrounded by parentheses + rule term<<V>>: + NUM {{ return atoi(NUM) }} + | XNUM {{ return tohex(XNUM) }} + | ID {{ return tohex(getoption(ID)) }} + | unop<<V>> {{ return unop }} + | "\\(" expr<<V>> "\\)" {{ return expr }} + + rule option<<C>>: OPTION ID EQ + ( + STR {{ if (C): setoption(ID, dequote(STR)) }} + | term<<[]>> {{ if (C): setoption(ID, term) }} + ) + rule default<<C>>: DEFAULT ID EQ RAWTEXT {{ if (C): setdefault(ID, RAWTEXT) }} + + rule partend<<C>>: partstmts<<C>> END + rule mainboard: MAINBOARD PATH {{mainboard(PATH) }} + partend<<1>> + + rule northbridge<<C>>: NORTHBRIDGE PATH + {{if (C): part('northbridge', PATH, 'Config.lb')}} partend<<C>> + rule superio<<C>>: SUPERIO PATH + {{if (C): part('superio', PATH, 'Config.lb')}} partend<<C>> + rule cpu<<C>>: CPU ID + {{if (C): part('cpu', ID, 'Config.lb')}} partend<<C>> + rule arch<<C>>: ARCH ID + {{if (C): set_arch(ID) }} partend<<C>> + rule southbridge<<C>>: SOUTHBRIDGE PATH + {{if (C): part('southbridge', PATH, 'Config.lb')}} partend<<C>> + + rule mainboardinit<<C>>: + MAINBOARDINIT DIRPATH {{ if (C): addcrt0include(DIRPATH)}} + + rule object<<C>>: OBJECT DIRPATH {{if (C): addobject(DIRPATH)}} + rule driver<<C>>: DRIVER DIRPATH {{if (C): adddriver(DIRPATH)}} + rule dir<<C>>: DIR DIRPATH {{if (C): dodir(DIRPATH, 'Config.lb') }} + + rule ldscript<<C>>: LDSCRIPT DIRPATH {{if (C): addldscript(DIRPATH) }} + rule payload<<C>>: PAYLOAD DIRPATH {{if (C): payload(DIRPATH) }} +# if is a bad id .... + rule iif<<C>>: +# needs to be C and ID, but nested if's are, we hope, not going to +# happen. IF so, possibly ID && C could be used. + IF ID {{ c = tohex(getoption(ID)); print "IF %d\n" % c }} (stmt<<c>>)* END + + rule depsacts<<ID, C>>: ( + DEP STR {{ if (C): adddep(ID, STR) }} + | ACT STR {{ if (C): addaction(ID, STR) }} + )* + rule makerule<<C>>: MAKERULE DIRPATH {{ if (C): addrule(DIRPATH) }} + depsacts<<DIRPATH, C>> + rule makedefine<<C>>: MAKEDEFINE RAWTEXT + {{ if (C): adduserdefine(RAWTEXT) }} + rule addaction<<C>>: ADDACTION ID STR + {{ if (C): addaction(ID, STR) }} + rule init<<C>>: INIT STR {{ if (C): curpart.addinit(STR) }} + rule register<<C>>: REGISTER STR {{ if (C): curpart.addregister(STR) }} + +# to make if work without 2 passses, we use an old hack from SIMD, the +# context bit. If the bit is 1, then ops get done, otherwise +# ops don't get done. From the top level, context is always +# 1. In an if, context depends on eval of the if condition + rule stmt<<C>>: + option<<C>> {{ return option }} + | default<<C>> {{ return default }} + | cpu<<C>> {{ return cpu}} + | arch<<C>> {{ return arch}} + | northbridge<<C>> {{ return northbridge }} + | southbridge<<C>> {{ return southbridge }} + | superio<<C>> {{ return superio }} + | object<<C>> {{ return object }} + | driver<<C>> {{ return driver }} + | mainboardinit<<C>> {{ return mainboardinit }} + | makerule<<C>> {{ return makerule }} + | makedefine<<C>> {{ return makedefine }} + | addaction<<C>> {{ return addaction }} + | init<<C>> {{ return init }} + | register<<C>> {{ return register}} + | iif<<C>> {{ return iif }} + | dir<<C>> {{ return dir}} + | ldscript<<C>> {{ return ldscript}} + | payload<<C>> {{ return payload}} + + rule stmts<<C>>: (stmt<<C>>)* {{ }} + + rule partstmts<<C>>: (stmt<<C>>)* {{ partpop()}} +# need this to get from python to the rules, I think. + + rule pstmts: stmts<<1>> {{ }} + rule target: TARGET DIRPATH {{target(DIRPATH)}} + rule board: target mainboard {{ }} +%% + +def dumptree(part, lvl): + print "DUMPTREE ME is" + part.dumpme(lvl) + # dump the siblings -- actually are there any? not sure + # dump the kids + print "DUMPTREE KIDS are" + for kid in part.children: + dumptree(kid, lvl+1) + print "DONE DUMPTREE" + +def gencode(part): + print "GENCODE ME is" + part.gencode() + # dump the siblings -- actually are there any? not sure + # dump the kids + print "GENCODE KIDS are" + for kid in part.children: + gencode(kid) + print "DONE GENCODE" + + +def doconfigfile(path, file): + if (debug): + print "DOCONFIGFILE", path, " ", file + filename = os.path.join(path, file) + loc.push_file(filename); + parse('pstmts', open(filename, 'r').read()) + +if __name__=='__main__': + from sys import argv + if (len(argv) < 3): + print 'Args: <file> <path to linuxbios>' + + top_config_file = os.path.abspath(sys.argv[1]) + + treetop = os.path.abspath(sys.argv[2]) + + # Set the default locations for config files. + makebase = os.path.join(treetop, "util/config/make.base") + crt0base = os.path.join(treetop, "arch/i386/config/crt0.base") + doxyscriptbase = os.path.join(treetop, "src/config/doxyscript.base") + + # Now read in the customizing script... + loc.push_file(argv[1]); + parse('board', open(argv[1], 'r').read()) + dumptree(curpart, 0) + gencode(curpart) + for i in options.keys(): + o = options[i] + print "key %s @%s: val %s" % (i, o.where(), o.getvalue()) + # crt0 includes + for i in crt0includes: + print "crt0include file %s \n" % (i) + for i in driverrules.keys(): + print "driver file %s \n" % (i) + for i in ldscripts: + print "ldscript file %s\n" % (i) + for i in makebaserules.keys(): + m = makebaserules[i] + print " makerule %s dep %s act %s\n" % (i, m.dependency, m.actions) + writemakefilesettings(target_dir) + writecrt0_includes(target_dir) + writemakefile(target_dir) diff --git a/util/newconfig/parsedesc.g b/util/newconfig/parsedesc.g new file mode 100644 index 0000000000..7113c6d6f3 --- /dev/null +++ b/util/newconfig/parsedesc.g @@ -0,0 +1,196 @@ +###################################################################### +# The remainder of this file is from parsedesc.{g,py} + +def append(lst, x): + "Imperative append" + lst.append(x) + return lst + +def add_inline_token(tokens, str): + tokens.insert( 0, (str, eval(str, {}, {})) ) + return Terminal(str) + +def cleanup_choice(lst): + if len(lst) == 0: return Sequence([]) + if len(lst) == 1: return lst[0] + return apply(Choice, tuple(lst)) + +def cleanup_sequence(lst): + if len(lst) == 1: return lst[0] + return apply(Sequence, tuple(lst)) + +def cleanup_rep(node, rep): + if rep == 'star': return Star(node) + elif rep == 'plus': return Plus(node) + else: return node + +def resolve_name(tokens, id, args): + if id in map(lambda x: x[0], tokens): + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return Terminal(id) + else: + # It's a name, so assume it's a nonterminal + return NonTerminal(id, args) + +%% +parser ParserDescription: + option: "context-insensitive-scanner" + + ignore: "[ \t\r\n]+" + ignore: "#.*?\r?\n" + token END: "$" + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + token LP: '\\(' + token RP: '\\)' + token LB: '\\[' + token RB: '\\]' + token OR: '[|]' + token STAR: '[*]' + token PLUS: '[+]' + token QUEST: '[?]' + token COLON: ':' + + rule Parser: "parser" ID ":" + Options + Tokens + Rules<<Tokens>> + END + {{ return Generator(ID,Options,Tokens,Rules) }} + + rule Options: {{ opt = {} }} + ( "option" ":" Str {{ opt[Str] = 1 }} )* + {{ return opt }} + + rule Tokens: {{ tok = [] }} + ( + "token" ID ":" Str {{ tok.append( (ID,Str) ) }} + | "ignore" ":" Str {{ tok.append( ('#ignore',Str) ) }} + )* + {{ return tok }} + + rule Rules<<tokens>>: + {{ rul = [] }} + ( + "rule" ID OptParam ":" ClauseA<<tokens>> + {{ rul.append( (ID,OptParam,ClauseA) ) }} + )* + {{ return rul }} + + rule ClauseA<<tokens>>: + ClauseB<<tokens>> + {{ v = [ClauseB] }} + ( OR ClauseB<<tokens>> {{ v.append(ClauseB) }} )* + {{ return cleanup_choice(v) }} + + rule ClauseB<<tokens>>: + {{ v = [] }} + ( ClauseC<<tokens>> {{ v.append(ClauseC) }} )* + {{ return cleanup_sequence(v) }} + + rule ClauseC<<tokens>>: + ClauseD<<tokens>> + ( PLUS {{ return Plus(ClauseD) }} + | STAR {{ return Star(ClauseD) }} + | {{ return ClauseD }} ) + + rule ClauseD<<tokens>>: + STR {{ t = (STR, eval(STR,{},{})) }} + {{ if t not in tokens: tokens.insert( 0, t ) }} + {{ return Terminal(STR) }} + | ID OptParam {{ return resolve_name(tokens, ID, OptParam) }} + | LP ClauseA<<tokens>> RP {{ return ClauseA }} + | LB ClauseA<<tokens>> RB {{ return Option(ClauseA) }} + | STMT {{ return Eval(STMT[2:-2]) }} + + rule OptParam: [ ATTR {{ return ATTR[2:-2] }} ] {{ return '' }} + rule Str: STR {{ return eval(STR,{},{}) }} +%% + +# This replaces the default main routine + +yapps_options = [ + ('context-insensitive-scanner', 'context-insensitive-scanner', + 'Scan all tokens (see docs)') + ] + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename[-2:]=='.g': outputfilename = inputfilename[:-2]+'.py' + else: raise "Invalid Filename", outputfilename + + print 'Input Grammar:', inputfilename + print 'Output File:', outputfilename + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = find(s, DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = find(s, DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner + p = ParserDescription(ParserDescriptionScanner(s)) + if not p: return + + # Now parse the file + t = wrap_error_reporter(p, 'Parser') + if not t: return # Error + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + +if __name__=='__main__': + import sys, getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump']) + if not args or len(args) > 2: + print 'Usage:' + print ' python', sys.argv[0], '[flags] input.g [output.py]' + print 'Flags:' + print (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + for flag, _, doc in yapps_options: + print (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + else: + print 'Warning - unrecognized option: ', opt[0], opt[1] + + apply(generate, tuple(args), flags) diff --git a/util/newconfig/test.config b/util/newconfig/test.config new file mode 100644 index 0000000000..c492f200fa --- /dev/null +++ b/util/newconfig/test.config @@ -0,0 +1,6 @@ +target x +mainboard amd/solo +# option X=1 +# makerule x y "z" + payload /dev/null +end diff --git a/util/newconfig/yapps2.py b/util/newconfig/yapps2.py new file mode 100644 index 0000000000..71bfa05ca6 --- /dev/null +++ b/util/newconfig/yapps2.py @@ -0,0 +1,779 @@ +# Yapps 2.0 - yet another python parser system +# Amit J Patel, January 1999 +# See http://theory.stanford.edu/~amitp/Yapps/ for documentation and updates + +# v2.0.1 changes (October 2001): +# * The exceptions inherit the standard Exception class (thanks Rich Salz) +# * The scanner can use either a different set of regular expressions +# per instance, or allows the subclass to define class fields with +# the patterns. This improves performance when many Scanner objects +# are being created, because the regular expressions don't have to +# be recompiled each time. (thanks Amaury Forgeot d'Arc) +# v2.0.2 changes (April 2002) +# * Fixed a bug in generating the 'else' clause when the comment was too +# long. v2.0.1 was missing a newline. (thanks Steven Engelhardt) +# v2.0.3 changes (August 2002) +# * Fixed a bug with inline tokens using the r"" syntax. + +from string import * +from yappsrt import * +import re + +INDENT = " "*4 + +class Generator: + def __init__(self, name, options, tokens, rules): + self.change_count = 0 + self.name = name + self.options = options + self.preparser = '' + self.postparser = None + + self.tokens = {} # Map from tokens to regexps + self.ignore = [] # List of token names to ignore in parsing + self.terminals = [] # List of token names (to maintain ordering) + for n,t in tokens: + if n == '#ignore': + n = t + self.ignore.append(n) + if n in self.tokens.keys() and self.tokens[n] != t: + print 'Warning: token', n, 'multiply defined.' + self.tokens[n] = t + self.terminals.append(n) + + self.rules = {} # Map from rule names to parser nodes + self.params = {} # Map from rule names to parameters + self.goals = [] # List of rule names (to maintain ordering) + for n,p,r in rules: + self.params[n] = p + self.rules[n] = r + self.goals.append(n) + + import sys + self.output = sys.stdout + + def __getitem__(self, name): + # Get options + return self.options.get(name, 0) + + def non_ignored_tokens(self): + return filter(lambda x, i=self.ignore: x not in i, self.terminals) + + def changed(self): + self.change_count = 1+self.change_count + + def subset(self, a, b): + "See if all elements of a are inside b" + for x in a: + if x not in b: return 0 + return 1 + + def equal_set(self, a, b): + "See if a and b have the same elements" + if len(a) != len(b): return 0 + if a == b: return 1 + return self.subset(a, b) and self.subset(b, a) + + def add_to(self, parent, additions): + "Modify parent to include all elements in additions" + for x in additions: + if x not in parent: + parent.append(x) + self.changed() + + def equate(self, a, b): + self.add_to(a, b) + self.add_to(b, a) + + def write(self, *args): + for a in args: + self.output.write(a) + + def in_test(self, x, full, b): + if not b: return '0' + if len(b)==1: return '%s == %s' % (x, `b[0]`) + if full and len(b) > len(full)/2: + # Reverse the sense of the test. + not_b = filter(lambda x, b=b: x not in b, full) + return self.not_in_test(x, full, not_b) + return '%s in %s' % (x, `b`) + + def not_in_test(self, x, full, b): + if not b: return '1' + if len(b)==1: return '%s != %s' % (x, `b[0]`) + return '%s not in %s' % (x, `b`) + + def peek_call(self, a): + a_set = (`a`[1:-1]) + if self.equal_set(a, self.non_ignored_tokens()): a_set = '' + if self['context-insensitive-scanner']: a_set = '' + return 'self._peek(%s)' % a_set + + def peek_test(self, a, b): + if self.subset(a, b): return '1' + if self['context-insensitive-scanner']: a = self.non_ignored_tokens() + return self.in_test(self.peek_call(a), a, b) + + def not_peek_test(self, a, b): + if self.subset(a, b): return '0' + return self.not_in_test(self.peek_call(a), a, b) + + def calculate(self): + while 1: + for r in self.goals: + self.rules[r].setup(self, r) + if self.change_count == 0: break + self.change_count = 0 + + while 1: + for r in self.goals: + self.rules[r].update(self) + if self.change_count == 0: break + self.change_count = 0 + + def dump_information(self): + self.calculate() + for r in self.goals: + print ' _____' + '_'*len(r) + print ('___/Rule '+r+'\\' + '_'*80)[:79] + queue = [self.rules[r]] + while queue: + top = queue[0] + del queue[0] + + print `top` + top.first.sort() + top.follow.sort() + eps = [] + if top.accepts_epsilon: eps = ['(null)'] + print ' FIRST:', join(top.first+eps, ', ') + print ' FOLLOW:', join(top.follow, ', ') + for x in top.get_children(): queue.append(x) + + def generate_output(self): + self.calculate() + self.write(self.preparser) + self.write("from string import *\n") + self.write("import re\n") + self.write("from yappsrt import *\n") + self.write("\n") + self.write("class ", self.name, "Scanner(Scanner):\n") + self.write(" patterns = [\n") + for p in self.terminals: + self.write(" (%s, re.compile(%s)),\n" % ( + `p`, `self.tokens[p]`)) + self.write(" ]\n") + self.write(" def __init__(self, str):\n") + self.write(" Scanner.__init__(self,None,%s,str)\n" % + `self.ignore`) + self.write("\n") + + self.write("class ", self.name, "(Parser):\n") + for r in self.goals: + self.write(INDENT, "def ", r, "(self") + if self.params[r]: self.write(", ", self.params[r]) + self.write("):\n") + self.rules[r].output(self, INDENT+INDENT) + self.write("\n") + + self.write("\n") + self.write("def parse(rule, text):\n") + self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n") + self.write(" return wrap_error_reporter(P, rule)\n") + self.write("\n") + if self.postparser is not None: + self.write(self.postparser) + else: + self.write("if __name__=='__main__':\n") + self.write(INDENT, "from sys import argv, stdin\n") + self.write(INDENT, "if len(argv) >= 2:\n") + self.write(INDENT*2, "if len(argv) >= 3:\n") + self.write(INDENT*3, "f = open(argv[2],'r')\n") + self.write(INDENT*2, "else:\n") + self.write(INDENT*3, "f = stdin\n") + self.write(INDENT*2, "print parse(argv[1], f.read())\n") + self.write(INDENT, "else: print 'Args: <rule> [<filename>]'\n") + +###################################################################### +class Node: + def __init__(self): + self.first = [] + self.follow = [] + self.accepts_epsilon = 0 + self.rule = '?' + + def setup(self, gen, rule): + # Setup will change accepts_epsilon, + # sometimes from 0 to 1 but never 1 to 0. + # It will take a finite number of steps to set things up + self.rule = rule + + def used(self, vars): + "Return two lists: one of vars used, and the other of vars assigned" + return vars, [] + + def get_children(self): + "Return a list of sub-nodes" + return [] + + def __repr__(self): + return str(self) + + def update(self, gen): + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + "Write out code to _gen_ with _indent_:string indentation" + gen.write(indent, "assert 0 # Invalid parser node\n") + +class Terminal(Node): + def __init__(self, token): + Node.__init__(self) + self.token = token + self.accepts_epsilon = 0 + + def __str__(self): + return self.token + + def update(self, gen): + Node.update(self, gen) + if self.first != [self.token]: + self.first = [self.token] + gen.changed() + + def output(self, gen, indent): + gen.write(indent) + if re.match('[a-zA-Z_]+$', self.token): + gen.write(self.token, " = ") + gen.write("self._scan(%s)\n" % `self.token`) + +class Eval(Node): + def __init__(self, expr): + Node.__init__(self) + self.expr = expr + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '{{ %s }}' % strip(self.expr) + + def output(self, gen, indent): + gen.write(indent, strip(self.expr), '\n') + +class NonTerminal(Node): + def __init__(self, name, args): + Node.__init__(self) + self.name = name + self.args = args + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + try: + self.target = gen.rules[self.name] + if self.accepts_epsilon != self.target.accepts_epsilon: + self.accepts_epsilon = self.target.accepts_epsilon + gen.changed() + except KeyError: # Oops, it's nonexistent + print 'Error: no rule <%s>' % self.name + self.target = self + + def __str__(self): + return '<%s>' % self.name + + def update(self, gen): + Node.update(self, gen) + gen.equate(self.first, self.target.first) + gen.equate(self.follow, self.target.follow) + + def output(self, gen, indent): + gen.write(indent) + gen.write(self.name, " = ") + gen.write("self.", self.name, "(", self.args, ")\n") + +class Sequence(Node): + def __init__(self, *children): + Node.__init__(self) + self.children = children + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + for c in self.children: c.setup(gen, rule) + + if not self.accepts_epsilon: + # If it's not already accepting epsilon, it might now do so. + for c in self.children: + # any non-epsilon means all is non-epsilon + if not c.accepts_epsilon: break + else: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % join(map(lambda x: str(x), self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + empty = 1 + for g_i in range(len(self.children)): + g = self.children[g_i] + + if empty: gen.add_to(self.first, g.first) + if not g.accepts_epsilon: empty = 0 + + if g_i == len(self.children)-1: + next = self.follow + else: + next = self.children[1+g_i].first + gen.add_to(g.follow, next) + + if self.children: + gen.add_to(self.follow, self.children[-1].follow) + + def output(self, gen, indent): + if self.children: + for c in self.children: + c.output(gen, indent) + else: + # Placeholder for empty sequences, just in case + gen.write(indent, 'pass\n') + +class Choice(Node): + def __init__(self, *children): + Node.__init__(self) + self.children = children + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + for c in self.children: c.setup(gen, rule) + + if not self.accepts_epsilon: + for c in self.children: + if c.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % join(map(lambda x: str(x), self.children), ' | ') + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + for g in self.children: + gen.add_to(self.first, g.first) + gen.add_to(self.follow, g.follow) + for g in self.children: + gen.add_to(g.follow, self.follow) + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + test = "if" + gen.write(indent, "_token_ = ", gen.peek_call(self.first), "\n") + tokens_seen = [] + tokens_unseen = self.first[:] + if gen['context-insensitive-scanner']: + # Context insensitive scanners can return ANY token, + # not only the ones in first. + tokens_unseen = gen.non_ignored_tokens() + for c in self.children: + testset = c.first[:] + removed = [] + for x in testset: + if x in tokens_seen: + testset.remove(x) + removed.append(x) + if x in tokens_unseen: tokens_unseen.remove(x) + tokens_seen = tokens_seen + testset + if removed: + if not testset: + print 'Error in rule', self.rule+':', c, 'never matches.' + else: + print 'Warning:', self + print ' * These tokens are being ignored:', join(removed, ', ') + print ' due to previous choices using them.' + + if testset: + if not tokens_unseen: # context sensitive scanners only! + if test=='if': + # if it's the first AND last test, then + # we can simply put the code without an if/else + c.output(gen, indent) + else: + gen.write(indent, "else: ") + t = gen.in_test('', [], testset) + if len(t) < 70-len(indent): + gen.write("#", t) + gen.write("\n") + c.output(gen, indent+INDENT) + else: + gen.write(indent, test, " ", + gen.in_test('_token_', tokens_unseen, testset), + ":\n") + c.output(gen, indent+INDENT) + test = "elif" + + if gen['context-insensitive-scanner'] and tokens_unseen: + gen.write(indent, "else:\n") + gen.write(indent, INDENT, "raise SyntaxError(self._pos, ") + gen.write("'Could not match ", self.rule, "')\n") + +class Wrapper(Node): + def __init__(self, child): + Node.__init__(self) + self.child = child + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + self.child.setup(gen, rule) + + def get_children(self): + return [self.child] + + def update(self, gen): + Node.update(self, gen) + self.child.update(gen) + gen.add_to(self.first, self.child.first) + gen.equate(self.follow, self.child.follow) + +class Option(Wrapper): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '[ %s ]' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+': contents may be empty.' + gen.write(indent, "if %s:\n" % + gen.peek_test(self.first, self.child.first)) + self.child.output(gen, indent+INDENT) + +class Plus(Wrapper): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if self.accepts_epsilon != self.child.accepts_epsilon: + self.accepts_epsilon = self.child.accepts_epsilon + gen.changed() + + def __str__(self): + return '%s+' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.follow, self.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+':' + print ' * The repeated pattern could be empty. The resulting' + print ' parser may not work properly.' + gen.write(indent, "while 1:\n") + self.child.output(gen, indent+INDENT) + union = self.first[:] + gen.add_to(union, self.follow) + gen.write(indent+INDENT, "if %s: break\n" % + gen.not_peek_test(union, self.child.first)) + +class Star(Plus): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '%s*' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+':' + print ' * The repeated pattern could be empty. The resulting' + print ' parser probably will not work properly.' + gen.write(indent, "while %s:\n" % + gen.peek_test(self.follow, self.child.first)) + self.child.output(gen, indent+INDENT) + +###################################################################### +# The remainder of this file is from parsedesc.{g,py} + +def append(lst, x): + "Imperative append" + lst.append(x) + return lst + +def add_inline_token(tokens, str): + tokens.insert( 0, (str, eval(str, {}, {})) ) + return Terminal(str) + +def cleanup_choice(lst): + if len(lst) == 0: return Sequence([]) + if len(lst) == 1: return lst[0] + return apply(Choice, tuple(lst)) + +def cleanup_sequence(lst): + if len(lst) == 1: return lst[0] + return apply(Sequence, tuple(lst)) + +def cleanup_rep(node, rep): + if rep == 'star': return Star(node) + elif rep == 'plus': return Plus(node) + else: return node + +def resolve_name(tokens, id, args): + if id in map(lambda x: x[0], tokens): + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return Terminal(id) + else: + # It's a name, so assume it's a nonterminal + return NonTerminal(id, args) + + +from string import * +import re +from yappsrt import * + +class ParserDescriptionScanner(Scanner): + def __init__(self, str): + Scanner.__init__(self,[ + ('"rule"', 'rule'), + ('"ignore"', 'ignore'), + ('"token"', 'token'), + ('"option"', 'option'), + ('":"', ':'), + ('"parser"', 'parser'), + ('[ \011\015\012]+', '[ \011\015\012]+'), + ('#.*?\015?\012', '#.*?\015?\012'), + ('END', '$'), + ('ATTR', '<<.+?>>'), + ('STMT', '{{.+?}}'), + ('ID', '[a-zA-Z_][a-zA-Z_0-9]*'), + ('STR', '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'), + ('LP', '\\('), + ('RP', '\\)'), + ('LB', '\\['), + ('RB', '\\]'), + ('OR', '[|]'), + ('STAR', '[*]'), + ('PLUS', '[+]'), + ], ['[ \011\015\012]+', '#.*?\015?\012'], str) + +class ParserDescription(Parser): + def Parser(self): + self._scan('"parser"') + ID = self._scan('ID') + self._scan('":"') + Options = self.Options() + Tokens = self.Tokens() + Rules = self.Rules(Tokens) + END = self._scan('END') + return Generator(ID,Options,Tokens,Rules) + + def Options(self): + opt = {} + while self._peek('"option"', '"token"', '"ignore"', 'END', '"rule"') == '"option"': + self._scan('"option"') + self._scan('":"') + Str = self.Str() + opt[Str] = 1 + return opt + + def Tokens(self): + tok = [] + while self._peek('"token"', '"ignore"', 'END', '"rule"') in ['"token"', '"ignore"']: + _token_ = self._peek('"token"', '"ignore"') + if _token_ == '"token"': + self._scan('"token"') + ID = self._scan('ID') + self._scan('":"') + Str = self.Str() + tok.append( (ID,Str) ) + else: # == '"ignore"' + self._scan('"ignore"') + self._scan('":"') + Str = self.Str() + tok.append( ('#ignore',Str) ) + return tok + + def Rules(self, tokens): + rul = [] + while self._peek('"rule"', 'END') == '"rule"': + self._scan('"rule"') + ID = self._scan('ID') + OptParam = self.OptParam() + self._scan('":"') + ClauseA = self.ClauseA(tokens) + rul.append( (ID,OptParam,ClauseA) ) + return rul + + def ClauseA(self, tokens): + ClauseB = self.ClauseB(tokens) + v = [ClauseB] + while self._peek('OR', 'RP', 'RB', '"rule"', 'END') == 'OR': + OR = self._scan('OR') + ClauseB = self.ClauseB(tokens) + v.append(ClauseB) + return cleanup_choice(v) + + def ClauseB(self, tokens): + v = [] + while self._peek('STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(tokens) + v.append(ClauseC) + return cleanup_sequence(v) + + def ClauseC(self, tokens): + ClauseD = self.ClauseD(tokens) + _token_ = self._peek('PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') + if _token_ == 'PLUS': + PLUS = self._scan('PLUS') + return Plus(ClauseD) + elif _token_ == 'STAR': + STAR = self._scan('STAR') + return Star(ClauseD) + else: + return ClauseD + + def ClauseD(self, tokens): + _token_ = self._peek('STR', 'ID', 'LP', 'LB', 'STMT') + if _token_ == 'STR': + STR = self._scan('STR') + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return Terminal(STR) + elif _token_ == 'ID': + ID = self._scan('ID') + OptParam = self.OptParam() + return resolve_name(tokens, ID, OptParam) + elif _token_ == 'LP': + LP = self._scan('LP') + ClauseA = self.ClauseA(tokens) + RP = self._scan('RP') + return ClauseA + elif _token_ == 'LB': + LB = self._scan('LB') + ClauseA = self.ClauseA(tokens) + RB = self._scan('RB') + return Option(ClauseA) + else: # == 'STMT' + STMT = self._scan('STMT') + return Eval(STMT[2:-2]) + + def OptParam(self): + if self._peek('ATTR', '":"', 'PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') == 'ATTR': + ATTR = self._scan('ATTR') + return ATTR[2:-2] + return '' + + def Str(self): + STR = self._scan('STR') + return eval(STR,{},{}) + + + + + +# This replaces the default main routine + +yapps_options = [ + ('context-insensitive-scanner', 'context-insensitive-scanner', + 'Scan all tokens (see docs)') + ] + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename[-2:]=='.g': outputfilename = inputfilename[:-2]+'.py' + else: raise "Invalid Filename", outputfilename + + print 'Input Grammar:', inputfilename + print 'Output File:', outputfilename + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = find(s, DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = find(s, DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner + p = ParserDescription(ParserDescriptionScanner(s)) + if not p: return + + # Now parse the file + t = wrap_error_reporter(p, 'Parser') + if not t: return # Error + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + +if __name__=='__main__': + import sys, getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump']) + if not args or len(args) > 2: + print 'Usage:' + print ' python', sys.argv[0], '[flags] input.g [output.py]' + print 'Flags:' + print (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + for flag, _, doc in yapps_options: + print (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + else: + print 'Warning - unrecognized option: ', opt[0], opt[1] + + apply(generate, tuple(args), flags) diff --git a/util/newconfig/yapps2.tex b/util/newconfig/yapps2.tex new file mode 100644 index 0000000000..9d2bddf19c --- /dev/null +++ b/util/newconfig/yapps2.tex @@ -0,0 +1,1225 @@ +\documentclass[10pt]{article} +\usepackage{palatino} +\usepackage{html} +\usepackage{color} + +\setlength{\headsep}{0in} +\setlength{\headheight}{0in} +\setlength{\textheight}{8.5in} +\setlength{\textwidth}{5.9in} +\setlength{\oddsidemargin}{0.25in} + +\definecolor{darkblue}{rgb}{0,0,0.6} +\definecolor{darkerblue}{rgb}{0,0,0.3} + +%% \newcommand{\mysection}[1]{\section{\textcolor{darkblue}{#1}}} +%% \newcommand{\mysubsection}[1]{\subsection{\textcolor{darkerblue}{#1}}} +\newcommand{\mysection}[1]{\section{#1}} +\newcommand{\mysubsection}[1]{\subsection{#1}} + +\bodytext{bgcolor=white text=black link=#004080 vlink=#006020} + +\newcommand{\first}{\textsc{first}} +\newcommand{\follow}{\textsc{follow}} + +\begin{document} + +\begin{center} +\hfill \begin{tabular}{c} +{\Large The \emph{Yapps} Parser Generator System}\\ +\verb|http://theory.stanford.edu/~amitp/Yapps/|\\ + Version 2\\ +\\ +Amit J. Patel\\ +\htmladdnormallink{http://www-cs-students.stanford.edu/~amitp/} +{http://www-cs-students.stanford.edu/~amitp/} + +\end{tabular} \hfill \rule{0in}{0in} +\end{center} + +\mysection{Introduction} + +\emph{Yapps} (\underline{Y}et \underline{A}nother \underline{P}ython +\underline{P}arser \underline{S}ystem) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including \texttt{PyLR, kjParsing, PyBison,} and \texttt{mcf.pars,} +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser. + +Some unusual features of Yapps that may be of interest are: + +\begin{enumerate} + + \item Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for \emph{Programming Python.} + + \item Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called \emph{attribute grammars.} In many + cases parameterized rules can be used to perform actions at ``parse + time'' that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. The types of defined variables can be + used in parsing as well---for example, if the type of {\tt X} is + known, we can determine whether {\tt X(1)} is an array reference or + a function call. + + \item Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice. + + Yapps grammars look similar to the notation used in the Python + reference manual, with operators like \verb:*:, \verb:+:, \verb:|:, + \verb:[]:, and \verb:(): for patterns, names ({\tt tim}) for rules, + regular expressions (\verb:"[a-z]+":) for tokens, and \verb:#: for + comments. + + \item The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support. + + \item Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor. + +\end{enumerate} + +There are several disadvantages of using Yapps over another parser system: + +\begin{enumerate} + + \item Yapps parsers are \texttt{ELL(1)} (Extended LL(1)), which is + less powerful than \texttt{LALR} (used by \texttt{PyLR}) or + \texttt{SLR} (used by \texttt{kjParsing}), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both \texttt{x := 5;} and \texttt{x;} as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only \texttt{x}, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section \ref{sec:future}.) + + \item The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string. + + \item Yapps is not designed with efficiency in mind. + +\end{enumerate} + +Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available. + +Yapps 2.0 is easier to use than Yapps 1.0. New features include a +less restrictive input syntax, which allows mixing of sequences, +choices, terminals, and nonterminals; optional matching; the ability +to insert single-line statements into the generated parser; and +looping constructs \verb|*| and \verb|+| similar to the repetitive +matching constructs in regular expressions. Unfortunately, the +addition of these constructs has made Yapps 2.0 incompatible with +Yapps 1.0, so grammars will have to be rewritten. See section +\ref{sec:Upgrading} for tips on changing Yapps 1.0 grammars for use +with Yapps 2.0. + +\mysection{Examples} + +In this section are several examples that show the use of Yapps. +First, an introduction shows how to construct grammars and write them +in Yapps form. This example can be skipped by someone familiar with +grammars and parsing. Next is a Lisp expression grammar that produces +a parse tree as output. This example demonstrates the use of tokens +and rules, as well as returning values from rules. The third example +is a expression evaluation grammar that evaluates during parsing +(instead of producing a parse tree). + +\mysubsection{Introduction to Grammars} + +A \emph{grammar} for a natural language specifies how words can be put +together to form large structures, such as phrases and sentences. A +grammar for a computer language is similar in that it specifies how +small components (called \emph{tokens}) can be put together to form +larger structures. In this section we will write a grammar for a tiny +subset of English. + +Simple English sentences can be described as being a noun phrase +followed by a verb followed by a noun phrase. For example, in the +sentence, ``Jack sank the blue ship,'' the word ``Jack'' is the first +noun phrase, ``sank'' is the verb, and ``the blue ship'' is the second +noun phrase. In addition we should say what a noun phrase is; for +this example we shall say that a noun phrase is an optional article +(a, an, the) followed by any number of adjectives followed by a noun. +The tokens in our language are the articles, nouns, verbs, and +adjectives. The \emph{rules} in our language will tell us how to +combine the tokens together to form lists of adjectives, noun phrases, +and sentences: + +\begin{itemize} + \item \texttt{sentence: noun\_phrase verb noun\_phrase} + \item \texttt{noun\_phrase: [article] adjective* noun} +\end{itemize} + +Notice that some things that we said easily in English, such as +``optional article,'' are expressed using special syntax, such as +brackets. When we said, ``any number of adjectives,'' we wrote +\texttt{adjective*}, where the \texttt{*} means ``zero or more of the +preceding pattern''. + +The grammar given above is close to a Yapps grammar. We also have to +specify what the tokens are, and what to do when a pattern is matched. +For this example, we will do nothing when patterns are matched; the +next example will explain how to perform match actions. + +\begin{verbatim} +parser TinyEnglish: + ignore: "\\W+" + token noun: "(Jack|spam|ship)" + token verb: "(sank|threw)" + token article: "(a|an|the)" + token adjective: "(blue|red|green)" + + rule sentence: noun_phrase verb noun_phrase + rule noun_phrase: [article] adjective* noun +\end{verbatim} + +The tokens are specified as Python \emph{regular expressions}. Since +Yapps produces Python code, you can write any regular expression that +would be accepted by Python. (\emph{Note:} These are Python 1.5 +regular expressions from the \texttt{re} module, not Python 1.4 +regular expressions from the \texttt{regex} module.) In addition to +tokens that you want to see (which are given names), you can also +specify tokens to ignore, marked by the \texttt{ignore} keyword. In +this parser we want to ignore whitespace. + +The TinyEnglish grammar shows how you define tokens and rules, but it +does not specify what should happen once we've matched the rules. In +the next example, we will take a grammar and produce a \emph{parse +tree} from it. + +\mysubsection{Lisp Expressions} + +Lisp syntax, although hated by many, has a redeeming quality: it is +simple to parse. In this section we will construct a Yapps grammar to +parse Lisp expressions and produce a parse tree as output. + +\subsubsection*{Defining the Grammar} + +The syntax of Lisp is simple. It has expressions, which are +identifiers, strings, numbers, and lists. A list is a left +parenthesis followed by some number of expressions (separated by +spaces) followed by a right parenthesis. For example, \verb|5|, +\verb|"ni"|, and \verb|(print "1+2 = " (+ 1 2))| are Lisp expressions. +Written as a grammar, + +\begin{verbatim} + expr: ID | STR | NUM | list + list: ( expr* ) +\end{verbatim} + +In addition to having a grammar, we need to specify what to do every +time something is matched. For the tokens, which are strings, we just +want to get the ``value'' of the token, attach its type (identifier, +string, or number) in some way, and return it. For the lists, we want +to construct and return a Python list. + +Once some pattern is matched, we enclose a return statement enclosed +in \verb|{{...}}|. The braces allow us to insert any one-line +statement into the parser. Within this statement, we can refer to the +values returned by matching each part of the rule. After matching a +token such as \texttt{ID}, ``ID'' will be bound to the text of the +matched token. Let's take a look at the rule: + +\begin{verbatim} + rule expr: ID {{ return ('id', ID) }} + ... +\end{verbatim} + +In a rule, tokens return the text that was matched. For identifiers, +we just return the identifier, along with a ``tag'' telling us that +this is an identifier and not a string or some other value. Sometimes +we may need to convert this text to a different form. For example, if +a string is matched, we want to remove quotes and handle special forms +like \verb|\n|. If a number is matched, we want to convert it into a +number. Let's look at the return values for the other tokens: + +\begin{verbatim} + ... + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + ... +\end{verbatim} + +If we get a string, we want to remove the quotes and process any +special backslash codes, so we run \texttt{eval} on the quoted string. +If we get a number, we convert it to an integer with \texttt{atoi} and +then return the number along with its type tag. + +For matching a list, we need to do something slightly more +complicated. If we match a Lisp list of expressions, we want to +create a Python list with those values. + +\begin{verbatim} + rule list: "\\(" # Match the opening parenthesis + {{ result = [] }} # Create a Python list + ( + expr # When we match an expression, + {{ result.append(expr) }} # add it to the list + )* # * means repeat this if needed + "\\)" # Match the closing parenthesis + {{ return result }} # Return the Python list +\end{verbatim} + +In this rule we first match the opening parenthesis, then go into a +loop. In this loop we match expressions and add them to the list. +When there are no more expressions to match, we match the closing +parenthesis and return the resulting. Note that \verb:#: is used for +comments, just as in Python. + +The complete grammar is specified as follows: +\begin{verbatim} +parser Lisp: + ignore: '\\s+' + token NUM: '[0-9]+' + token ID: '[-+*/!@%^&=.a-zA-Z0-9_]+' + token STR: '"([^\\"]+|\\\\.)*"' + + rule expr: ID {{ return ('id', ID) }} + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + | list {{ return list }} + rule list: "\\(" {{ result = [] }} + ( expr {{ result.append(expr) }} + )* + "\\)" {{ return result }} +\end{verbatim} + +One thing you may have noticed is that \verb|"\\("| and \verb|"\\)"| +appear in the \texttt{list} rule. These are \emph{inline tokens}: +they appear in the rules without being given a name with the +\texttt{token} keyword. Inline tokens are more convenient to use, but +since they do not have a name, the text that is matched cannot be used +in the return value. They are best used for short simple patterns +(usually punctuation or keywords). + +Another thing to notice is that the number and identifier tokens +overlap. For example, ``487'' matches both NUM and ID. In Yapps, the +scanner only tries to match tokens that are acceptable to the parser. +This rule doesn't help here, since both NUM and ID can appear in the +same place in the grammar. There are two rules used to pick tokens if +more than one matches. One is that the \emph{longest} match is +preferred. For example, ``487x'' will match as an ID (487x) rather +than as a NUM (487) followed by an ID (x). The second rule is that if +the two matches are the same length, the \emph{first} one listed in +the grammar is preferred. For example, ``487'' will match as an NUM +rather than an ID because NUM is listed first in the grammar. Inline +tokens have preference over any tokens you have listed. + +Now that our grammar is defined, we can run Yapps to produce a parser, +and then run the parser to produce a parse tree. + +\subsubsection*{Running Yapps} + +In the Yapps module is a function \texttt{generate} that takes an +input filename and writes a parser to another file. We can use this +function to generate the Lisp parser, which is assumed to be in +\texttt{lisp.g}. + +\begin{verbatim} +% python +Python 1.5.1 (#1, Sep 3 1998, 22:51:17) [GCC 2.7.2.3] on linux-i386 +Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam +>>> import yapps +>>> yapps.generate('lisp.g') +\end{verbatim} + +At this point, Yapps has written a file \texttt{lisp.py} that contains +the parser. In that file are two classes (one scanner and one parser) +and a function (called \texttt{parse}) that puts things together for +you. + +Alternatively, we can run Yapps from the command line to generate the +parser file: + +\begin{verbatim} +% python yapps.py lisp.g +\end{verbatim} + +After running Yapps either from within Python or from the command +line, we can use the Lisp parser by calling the \texttt{parse} +function. The first parameter should be the rule we want to match, +and the second parameter should be the string to parse. + +\begin{verbatim} +>>> import lisp +>>> lisp.parse('expr', '(+ 3 4)') +[('id', '+'), ('num', 3), ('num', 4)] +>>> lisp.parse('expr', '(print "3 = " (+ 1 2))') +[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]] +\end{verbatim} + +The \texttt{parse} function is not the only way to use the parser; +section \ref{sec:Parser-Objects} describes how to access parser objects +directly. + +We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree. + +\mysubsection{Calculator} + +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse. + +Unlike \texttt{yacc}, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: + +\begin{verbatim} + expr: factor ( ("+"|"-") factor )* + factor: term ( ("*"|"/") term )* + term: NUM | "(" expr ")" +\end{verbatim} + +In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a ``result'' variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. The full grammar is given below: + +\begin{verbatim} +parser Calculator: + token END: "$" # $ means end of string + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} +\end{verbatim} + +The top-level rule is \emph{goal}, which says that we are looking for +an expression followed by the end of the string. The \texttt{END} +token is needed because without it, it isn't clear when to stop +parsing. For example, the string ``1+3'' could be parsed either as +the expression ``1'' followed by the string ``+3'' or it could be +parsed as the expression ``1+3''. By requiring expressions to end +with \texttt{END}, the parser is forced to take ``1+3''. + +In the two rules with repetition, the accumulator is named \texttt{v}. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. At the end of the rule, we +return the accumulator. + +The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators. + +\emph{Note:} It's often important to put the \texttt{END} token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program. + +\mysubsection{Calculator with Memory} + +In the previous example we learned how to write a calculator that +evaluates simple numerical expressions. In this section we will +extend the example to support both local and global variables. + +To support global variables, we will add assignment statements to the +``goal'' rule. + +\begin{verbatim} + rule goal: expr END {{ return expr }} + | 'set' ID expr END {{ global_vars[ID] = expr }} + {{ return expr }} +\end{verbatim} + +To use these variables, we need a new kind of terminal: + +\begin{verbatim} + rule term: ... | ID {{ return global_vars[ID] }} +\end{verbatim} + +So far, these changes are straightforward. We simply have a global +dictionary \texttt{global\_vars} that stores the variables and values, +we modify it when there is an assignment statement, and we look up +variables in it when we see a variable name. + +To support local variables, we will add variable declarations to the +set of allowed expressions. + +\begin{verbatim} + rule term: ... | 'let' VAR '=' expr 'in' expr ... +\end{verbatim} + +This is where it becomes tricky. Local variables should be stored in +a local dictionary, not in the global one. One trick would be to save +a copy of the global dictionary, modify it, and then restore it +later. In this example we will instead use \emph{attributes} to +create local information and pass it to subrules. + +A rule can optionally take parameters. When we invoke the rule, we +must pass in arguments. For local variables, let's use a single +parameter, \texttt{local\_vars}: + +\begin{verbatim} + rule expr<<local_vars>>: ... + rule factor<<local_vars>>: ... + rule term<<local_vars>>: ... +\end{verbatim} + +Each time we want to match \texttt{expr}, \texttt{factor}, or +\texttt{term}, we will pass the local variables in the current rule to +the subrule. One interesting case is when we pass as an argument +something \emph{other} than \texttt{local\_vars}: + +\begin{verbatim} + rule term<<local_vars>>: ... + | 'let' VAR '=' expr<<local_vars>> + {{ local_vars = [(VAR, expr)] + local_vars }} + 'in' expr<<local_vars>> + {{ return expr }} +\end{verbatim} + +Note that the assignment to the local variables list does not modify +the original list. This is important to keep local variables from +being seen outside the ``let''. + +The other interesting case is when we find a variable: + +\begin{verbatim} +global_vars = {} + +def lookup(map, name): + for x,v in map: if x==name: return v + return global_vars[name] +%% + ... + rule term<<local_vars>: ... + | VAR {{ return lookup(local_vars, VAR) }} +\end{verbatim} + +The lookup function will search through the local variable list, and +if it cannot find the name there, it will look it up in the global +variable dictionary. + +A complete grammar for this example, including a read-eval-print loop +for interacting with the calculator, can be found in the examples +subdirectory included with Yapps. + +In this section we saw how to insert code before the parser. We also +saw how to use attributes to transmit local information from one rule +to its subrules. + +\mysection{Grammars} + +Each Yapps grammar has a name, a list of tokens, and a set of +production rules. A grammar named \texttt{X} will be used to produce +a parser named \texttt{X} and a scanner anmed \texttt{XScanner}. As +in Python, names are case sensitive, start with a letter, and contain +letters, numbers, and underscores (\_). + +There are three kinds of tokens in Yapps: named, inline, and ignored. +As their name implies, named tokens are given a name, using the token +construct: \texttt{token \emph{name} : \emph{regexp}}. In a rule, the +token can be matched by using the name. Inline tokens are regular +expressions that are used in rules without being declared. Ignored +tokens are declared using the ignore construct: \texttt{ignore: + \emph{regexp}}. These tokens are ignored by the scanner, and are +not seen by the parser. Often whitespace is an ignored token. The +regular expressions used to define tokens should use the syntax +defined in the \texttt{re} module, so some symbols may have to be +backslashed. + +Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in \verb|<<...>>|. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in \verb|<<...>>|, if the rule +has parameters), and single line Python statements (\verb|{{...}}|). +Compound patterns are sequences (\verb|A B C ...|), choices ( +\verb:A | B | C | ...:), options (\verb|[...]|), zero-or-more repetitions +(\verb|...*|), and one-or-more repetitions (\verb|...+|). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices. + +Whenever \verb|{{...}}| is used, a legal one-line Python statement +should be put inside the braces. The token \verb|}}| should not +appear within the \verb|{{...}}| section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together (\verb|"}" "}"|), or to use +backslashes (\verb|"}\}"|). At the end of a rule you should use a +\verb|{{ return X }}| statement to return a value. However, you +should \emph{not} use any control statements (\texttt{return}, +\texttt{continue}, \texttt{break}) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps. + +The \verb|<<...>>| form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (\verb|*args| and \verb|**kwargs|). +Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +\verb|>>| should not appear within the \verb|<<...>>| section. + +In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name \texttt{X} is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once. + +\mysubsection{Left Factoring} +\label{sec:Left-Factoring} + +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. The classic example is the +\emph{if/then/else} construct in Pascal: + +\begin{verbatim} +rule stmt: "if" expr "then" stmt {{ then_part = stmt }} + "else" stmt {{ return ('If',expr,then_part,stmt) }} + | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }} +\end{verbatim} + +(Note that we have to save the first \texttt{stmt} into a variable +because there is another \texttt{stmt} that will be matched.) The +left portions of the two clauses are the same, which presents a +problem for the parser. The solution is \emph{left-factoring}: the +common parts are put together, and \emph{then} a choice is made about +the remaining part: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Unfortunately, the classic \emph{if/then/else} situation is +\emph{still} ambiguous when you left-factor. Yapps can deal with this +situation, but will report a warning; see section +\ref{sec:Ambiguous-Grammars} for details. + +In general, replace rules of the form: + +\begin{verbatim} +rule A: a b1 {{ return E1 }} + | a b2 {{ return E2 }} + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: a ( b1 {{ return E1 }} + | b2 {{ return E2 }} + ) + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +\mysubsection{Left Recursion} + +A common construct in grammars is for matching a list of patterns, +sometimes separated with delimiters such as commas or semicolons. In +LR-based parser systems, we can parse a list with something like this: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | sum "+" NUM {{ return (sum, NUM) }} +\end{verbatim} + +Parsing \texttt{1+2+3+4} would produce the output +\texttt{(((1,2),3),4)}, which is what we want from a left-associative +addition operator. Unfortunately, this grammar is \emph{left +recursive,} because the \texttt{sum} rule contains a clause that +begins with \texttt{sum}. (The recursion occurs at the left side of +the clause.) + +We must restructure this grammar to be \emph{right recursive} instead: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | NUM "+" sum {{ return (NUM, sum) }} +\end{verbatim} + +Unfortunately, using this grammar, \texttt{1+2+3+4} would be parsed as +\texttt{(1,(2,(3,4)))}, which no longer follows left associativity. +The rule also needs to be left-factored. Instead, we write the +pattern as a loop instead: + +\begin{verbatim} +rule sum: NUM {{ v = NUM }} + ( "[+]" NUM {{ v = (v,NUM) }} )* + {{ return v }} +\end{verbatim} + +In general, replace rules of the form: + +\begin{verbatim} +rule A: A a1 -> << E1 >> + | A a2 -> << E2 >> + | b3 -> << E3 >> + | b4 -> << E4 >> +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: ( b3 {{ A = E3 }} + | b4 {{ A = E4 }} ) + ( a1 {{ A = E1 }} + | a2 {{ A = E2 }} )* + {{ return A }} +\end{verbatim} + +We have taken a rule that proved problematic for with recursion and +turned it into a rule that works well with looping constructs. + +\mysubsection{Ambiguous Grammars} +\label{sec:Ambiguous-Grammars} + +In section \ref{sec:Left-Factoring} we saw the classic if/then/else +ambiguity, which occurs because the ``else \ldots'' portion of an ``if +\ldots then \ldots else \ldots'' construct is optional. Programs with +nested if/then/else constructs can be ambiguous when one of the else +clauses is missing: +\begin{verbatim} +if 1 then if 1 then + if 5 then if 5 then + x := 1; x := 1; + else else + y := 9; y := 9; +\end{verbatim} + +The indentation shows that the program can be parsed in two different +ways. (Of course, if we all would adopt Python's indentation-based +structuring, this would never happen!) Usually we want the parsing on +the left: the ``else'' should be associated with the closest ``if'' +statement. In section \ref{sec:Left-Factoring} we ``solved'' the +problem by using the following grammar: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Here, we have an optional match of ``else'' followed by a statement. +The ambiguity is that if an ``else'' is present, it is not clear +whether you want it parsed immediately or if you want it to be parsed +by the outer ``if''. + +Yapps will deal with the situation by matching when the else pattern +when it can. The parser will work in this case because it prefers the +\emph{first} matching clause, which tells Yapps to parse the ``else''. +That is exactly what we want! + +For ambiguity cases with choices, Yapps will choose the \emph{first} +matching choice. However, remember that Yapps only looks at the first +token to determine its decision, so {\tt (a b | a c)} will result in +Yapps choosing {\tt a b} even when the input is {\tt a c}. It only +looks at the first token, {\tt a}, to make its decision. + +\mysection{Customization} + +Both the parsers and the scanners can be customized. The parser is +usually extended by subclassing, and the scanner can either be +subclassed or completely replaced. + +\mysubsection{Customizing Parsers} + +If additional fields and methods are needed in order for a parser to +work, Python subclassing can be used. (This is unlike parser classes +written in static languages, in which these fields and methods must be +defined in the generated parser class.) We simply subclass the +generated parser, and add any fields or methods required. Expressions +in the grammar can call methods of the subclass to perform any actions +that cannot be expressed as a simple expression. For example, +consider this simple grammar: + +\begin{verbatim} +parser X: + rule goal: "something" {{ self.printmsg() }} +\end{verbatim} + +The \texttt{printmsg} function need not be implemented in the parser +class \texttt{X}; it can be implemented in a subclass: + +\begin{verbatim} +import Xparser + +class MyX(Xparser.X): + def printmsg(self): + print "Hello!" +\end{verbatim} + +\mysubsection{Customizing Scanners} + +The generated parser class is not dependent on the generated scanner +class. A scanner object is passed to the parser object's constructor +in the \texttt{parse} function. To use a different scanner, write +your own function to construct parser objects, with an instance of a +different scanner. Scanner objects must have a \texttt{token} method +that accepts an integer \texttt{N} as well as a list of allowed token +types, and returns the Nth token, as a tuple. The default scanner +raises \texttt{NoMoreTokens} if no tokens are available, and +\texttt{SyntaxError} if no token could be matched. However, the +parser does not rely on these exceptions; only the \texttt{parse} +convenience function (which calls \texttt{wrap\_error\_reporter}) and +the \texttt{print\_error} error display function use those exceptions. + +The tuples representing tokens have four elements. The first two are +the beginning and ending indices of the matched text in the input +string. The third element is the type tag, matching either the name +of a named token or the quoted regexp of an inline or ignored token. +The fourth element of the token tuple is the matched text. If the +input string is \texttt{s}, and the token tuple is +\texttt{(b,e,type,val)}, then \texttt{val} should be equal to +\texttt{s[b:e]}. + +The generated parsers do not the beginning or ending index. They use +only the token type and value. However, the default error reporter +uses the beginning and ending index to show the user where the error +is. + +\mysection{Parser Mechanics} + +The base parser class (Parser) defines two methods, \texttt{\_scan} +and \texttt{\_peek}, and two fields, \texttt{\_pos} and +\texttt{\_scanner}. The generated parser inherits from the base +parser, and contains one method for each rule in the grammar. To +avoid name clashes, do not use names that begin with an underscore +(\texttt{\_}). + +\mysubsection{Parser Objects} +\label{sec:Parser-Objects} + +Yapps produces as output two exception classes, a scanner class, a +parser class, and a function \texttt{parse} that puts everything +together. The \texttt{parse} function does not have to be used; +instead, one can create a parser and scanner object and use them +together for parsing. + +\begin{verbatim} + def parse(rule, text): + P = X(XScanner(text)) + return wrap_error_reporter(P, rule) +\end{verbatim} + +The \texttt{parse} function takes a name of a rule and an input string +as input. It creates a scanner and parser object, then calls +\texttt{wrap\_error\_reporter} to execute the method in the parser +object named \texttt{rule}. The wrapper function will call the +appropriate parser rule and report any parsing errors to standard +output. + +There are several situations in which the \texttt{parse} function +would not be useful. If a different parser or scanner is being used, +or exceptions are to be handled differently, a new \texttt{parse} +function would be required. The supplied \texttt{parse} function can +be used as a template for writing a function for your own needs. An +example of a custom parse function is the \texttt{generate} function +in \texttt{Yapps.py}. + +\mysubsection{Context Sensitive Scanner} + +Unlike most scanners, the scanner produced by Yapps can take into +account the context in which tokens are needed, and try to match only +good tokens. For example, in the grammar: + +\begin{verbatim} +parser IniFile: + token ID: "[a-zA-Z_0-9]+" + token VAL: ".*" + + rule pair: ID "[ \t]*=[ \t]*" VAL "\n" +\end{verbatim} + +we would like to scan lines of text and pick out a name/value pair. +In a conventional scanner, the input string \texttt{shell=progman.exe} +would be turned into a single token of type \texttt{VAL}. The Yapps +scanner, however, knows that at the beginning of the line, an +\texttt{ID} is expected, so it will return \texttt{"shell"} as a token +of type \texttt{ID}. Later, it will return \texttt{"progman.exe"} as +a token of type \texttt{VAL}. + +Context sensitivity decreases the separation between scanner and +parser, but it is useful in parsers like \texttt{IniFile}, where the +tokens themselves are not unambiguous, but \emph{are} unambiguous +given a particular stage in the parsing process. + +Unfortunately, context sensitivity can make it more difficult to +detect errors in the input. For example, in parsing a Pascal-like +language with ``begin'' and ``end'' as keywords, a context sensitive +scanner would only match ``end'' as the END token if the parser is in +a place that will accept the END token. If not, then the scanner +would match ``end'' as an identifier. To disable the context +sensitive scanner in Yapps, add the +\texttt{context-insensitive-scanner} option to the grammar: + +\begin{verbatim} +Parser X: + option: "context-insensitive-scanner" +\end{verbatim} + +Context-insensitive scanning makes the parser look cleaner as well. + +\mysubsection{Internal Variables} + +There are two internal fields that may be of use. The parser object +has two fields, \texttt{\_pos}, which is the index of the current +token being matched, and \texttt{\_scanner}, which is the scanner +object. The token itself can be retrieved by accessing the scanner +object and calling the \texttt{token} method with the token index. However, if you call \texttt{token} before the token has been requested by the parser, it may mess up a context-sensitive scanner.\footnote{When using a context-sensitive scanner, the parser tells the scanner what the valid token types are at each point. If you call \texttt{token} before the parser can tell the scanner the valid token types, the scanner will attempt to match without considering the context.} A +potentially useful combination of these fields is to extract the +portion of the input matched by the current rule. To do this, just save the scanner state (\texttt{\_scanner.pos}) before the text is matched and then again after the text is matched: + +\begin{verbatim} + rule R: + {{ start = self._scanner.pos }} + a b c + {{ end = self._scanner.pos }} + {{ print 'Text is', self._scanner.input[start:end] }} +\end{verbatim} + +\mysubsection{Pre- and Post-Parser Code} + +Sometimes the parser code needs to rely on helper variables, +functions, and classes. A Yapps grammar can optionally be surrounded +by double percent signs, to separate the grammar from Python code. + +\begin{verbatim} +... Python code ... +%% +... Yapps grammar ... +%% +... Python code ... +\end{verbatim} + +The second \verb|%%| can be omitted if there is no Python code at the +end, and the first \verb|%%| can be omitted if there is no extra +Python code at all. (To have code only at the end, both separators +are required.) + +If the second \verb|%%| is omitted, Yapps will insert testing code +that allows you to use the generated parser to parse a file. + +The extended calculator example in the Yapps examples subdirectory +includes both pre-parser and post-parser code. + +\mysubsection{Representation of Grammars} + +For each kind of pattern there is a class derived from Pattern. Yapps +has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus, +Star, and Eval. Each of these classes has the following interface: + +\begin{itemize} + \item[setup(\emph{gen})] Set accepts-$\epsilon$, and call + \emph{gen.changed()} if it changed. This function can change the + flag from false to true but \emph{not} from true to false. + \item[update(\emph(gen))] Set \first and \follow, and call + \emph{gen.changed()} if either changed. This function can add to + the sets but \emph{not} remove from them. + \item[output(\emph{gen}, \emph{indent})] Generate code for matching + this rule, using \emph{indent} as the current indentation level. + Writes are performed using \emph{gen.write}. + \item[used(\emph{vars})] Given a list of variables \emph{vars}, + return two lists: one containing the variables that are used, and + one containing the variables that are assigned. This function is + used for optimizing the resulting code. +\end{itemize} + +Both \emph{setup} and \emph{update} monotonically increase the +variables they modify. Since the variables can only increase a finite +number of times, we can repeatedly call the function until the +variable stabilized. The \emph{used} function is not currently +implemented. + +With each pattern in the grammar Yapps associates three pieces of +information: the \first set, the \follow set, and the +accepts-$\epsilon$ flag. + +The \first set contains the tokens that can appear as we start +matching the pattern. The \follow set contains the tokens that can +appear immediately after we match the pattern. The accepts-$\epsilon$ +flag is true if the pattern can match no tokens. In this case, \first +will contain all the elements in \follow. The \follow set is not +needed when accepts-$\epsilon$ is false, and may not be accurate in +those cases. + +Yapps does not compute these sets precisely. Its approximation can +miss certain cases, such as this one: + +\begin{verbatim} + rule C: ( A* | B ) + rule B: C [A] +\end{verbatim} + +Yapps will calculate {\tt C}'s \follow set to include {\tt A}. +However, {\tt C} will always match all the {\tt A}'s, so {\tt A} will +never follow it. Yapps 2.0 does not properly handle this construct, +but if it seems important, I may add support for it in a future +version. + +Yapps also cannot handle constructs that depend on the calling +sequence. For example: + +\begin{verbatim} + rule R: U | 'b' + rule S: | 'c' + rule T: S 'b' + rule U: S 'a' +\end{verbatim} + +The \follow set for {\tt S} includes {\tt a} and {\tt b}. Since {\tt + S} can be empty, the \first set for {\tt S} should include {\tt a}, +{\tt b}, and {\tt c}. However, when parsing {\tt R}, if the lookahead +is {\tt b} we should \emph{not} parse {\tt U}. That's because in {\tt + U}, {\tt S} is followed by {\tt a} and not {\tt b}. Therefore in +{\tt R}, we should choose rule {\tt U} only if there is an {\tt a} or +{\tt c}, but not if there is a {\tt b}. Yapps and many other LL(1) +systems do not distinguish {\tt S b} and {\tt S a}, making {\tt + S}'s \follow set {\tt a, b}, and making {\tt R} always try to match +{\tt U}. In this case we can solve the problem by changing {\tt R} to +\verb:'b' | U: but it may not always be possible to solve all such +problems in this way. + +\appendix + +\mysection{Grammar for Parsers} + +This is the grammar for parsers, without any Python code mixed in. +The complete grammar can be found in \texttt{parsedesc.g} in the Yapps +distribution. + +\begin{verbatim} +parser ParserDescription: + ignore: "\\s+" + ignore: "#.*?\r?\n" + token END: "$" # $ means end of string + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + + rule Parser: "parser" ID ":" + Options + Tokens + Rules + END + + rule Options: ( "option" ":" STR )* + rule Tokens: ( "token" ID ":" STR | "ignore" ":" STR )* + rule Rules: ( "rule" ID OptParam ":" ClauseA )* + + rule ClauseA: ClauseB ( '[|]' ClauseB )* + rule ClauseB: ClauseC* + rule ClauseC: ClauseD [ '[+]' | '[*]' ] + rule ClauseD: STR | ID [ATTR] | STMT + | '\\(' ClauseA '\\) | '\\[' ClauseA '\\]' +\end{verbatim} + +\mysection{Upgrading} + +Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section +are some tips for upgrading: + +\begin{enumerate} + \item Yapps 1.0 was distributed as a single file. Yapps 2.0 is + instead distributed as two Python files: a \emph{parser generator} + (26k) and a \emph{parser runtime} (5k). You need both files to + create parsers, but you need only the runtime (\texttt{yappsrt.py}) + to use the parsers. + + \item Yapps 1.0 supported Python 1.4 regular expressions from the + \texttt{regex} module. Yapps 2.0 uses Python 1.5 regular + expressions from the \texttt{re} module. \emph{The new syntax for + regular expressions is not compatible with the old syntax.} + Andrew Kuchling has a \htmladdnormallink{guide to converting + regular + expressions}{http://www.python.org/doc/howto/regex-to-re/} on his + web page. + + \item Yapps 1.0 wants a pattern and then a return value in \verb|->| + \verb|<<...>>|. Yapps 2.0 allows patterns and Python statements to + be mixed. To convert a rule like this: + +\begin{verbatim} +rule R: A B C -> << E1 >> + | X Y Z -> << E2 >> +\end{verbatim} + + to Yapps 2.0 form, replace the return value specifiers with return + statements: + +\begin{verbatim} +rule R: A B C {{ return E1 }} + | X Y Z {{ return E2 }} +\end{verbatim} + + \item Yapps 2.0 does not perform tail recursion elimination. This + means any recursive rules you write will be turned into recursive + methods in the parser. The parser will work, but may be slower. + It can be made faster by rewriting recursive rules, using instead + the looping operators \verb|*| and \verb|+| provided in Yapps 2.0. + +\end{enumerate} + +\mysection{Troubleshooting} + +\begin{itemize} + \item A common error is to write a grammar that doesn't have an END + token. End tokens are needed when it is not clear when to stop + parsing. For example, when parsing the expression {\tt 3+5}, it is + not clear after reading {\tt 3} whether to treat it as a complete + expression or whether the parser should continue reading. + Therefore the grammar for numeric expressions should include an end + token. Another example is the grammar for Lisp expressions. In + Lisp, it is always clear when you should stop parsing, so you do + \emph{not} need an end token. In fact, it may be more useful not + to have an end token, so that you can read in several Lisp expressions. + \item If there is a chance of ambiguity, make sure to put the choices + in the order you want them checked. Usually the most specific + choice should be first. Empty sequences should usually be last. + \item The context sensitive scanner is not appropriate for all + grammars. You might try using the insensitive scanner with the + {\tt context-insensitive-scanner} option in the grammar. + \item If performance turns out to be a problem, try writing a custom + scanner. The Yapps scanner is rather slow (but flexible and easy + to understand). +\end{itemize} + +\mysection{History} + +Yapps 1 had several limitations that bothered me while writing +parsers: + +\begin{enumerate} + \item It was not possible to insert statements into the generated + parser. A common workaround was to write an auxilliary function + that executed those statements, and to call that function as part + of the return value calculation. For example, several of my + parsers had an ``append(x,y)'' function that existed solely to call + ``x.append(y)''. + \item The way in which grammars were specified was rather + restrictive: a rule was a choice of clauses. Each clause was a + sequence of tokens and rule names, followed by a return value. + \item Optional matching had to be put into a separate rule because + choices were only made at the beginning of a rule. + \item Repetition had to be specified in terms of recursion. Not only + was this awkward (sometimes requiring additional rules), I had to + add a tail recursion optimization to Yapps to transform the + recursion back into a loop. +\end{enumerate} + +Yapps 2 addresses each of these limitations. + +\begin{enumerate} + \item Statements can occur anywhere within a rule. (However, only + one-line statements are allowed; multiline blocks marked by + indentation are not.) + \item Grammars can be specified using any mix of sequences, choices, + tokens, and rule names. To allow for complex structures, + parentheses can be used for grouping. + \item Given choices and parenthesization, optional matching can be + expressed as a choice between some pattern and nothing. In + addition, Yapps 2 has the convenience syntax \verb|[A B ...]| for + matching \verb|A B ...| optionally. + \item Repetition operators \verb|*| for zero or more and \verb|+| for + one or more make it easy to specify repeating patterns. +\end{enumerate} + +It is my hope that Yapps 2 will be flexible enough to meet my needs +for another year, yet simple enough that I do not hesitate to use it. + +\mysection{Future Extensions} +\label{sec:future} + +I am still investigating the possibility of LL(2) and higher +lookahead. However, it looks like the resulting parsers will be +somewhat ugly. + +It would be nice to control choices with user-defined predicates. + +The most likely future extension is backtracking. A grammar pattern +like \verb|(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}| +would turn into code that attempted to match \verb|VAR ':=' expr|. If +it succeeded, it would run \verb|{{ return ... }}|. If it failed, it +would match \verb|expr {{ return expr }}|. Backtracking may make it +less necessary to write LL(2) grammars. + +\mysection{References} + +\begin{enumerate} + \item The \htmladdnormallink{Python-Parser + SIG}{http://www.python.org/sigs/parser-sig/} is the first place + to look for a list of parser systems for Python. + + \item ANTLR/PCCTS, by Terrence Parr, is available at + \htmladdnormallink{The ANTLR Home Page}{http://www.antlr.org/}. + + \item PyLR, by Scott Cotton, is at \htmladdnormallink{his Starship + page}{http://starship.skyport.net/crew/scott/PyLR.html}. + + \item John Aycock's \htmladdnormallink{Compiling Little Languages + Framework}{http://www.csr.UVic.CA/~aycock/python/}. + + \item PyBison, by Scott Hassan, can be found at + \htmladdnormallink{his Python Projects + page}{http://coho.stanford.edu/\~{}hassan/Python/}. + + \item mcf.pars, by Mike C. Fletcher, is available at + \htmladdnormallink{his web + page}{http://www.golden.net/\~{}mcfletch/programming/}. + + \item kwParsing, by Aaron Watters, is available at + \htmladdnormallink{his Starship + page}{http://starship.skyport.net/crew/aaron_watters/kwParsing/}. +\end{enumerate} + +\end{document} diff --git a/util/newconfig/yappsrt.py b/util/newconfig/yappsrt.py new file mode 100644 index 0000000000..2ce2480f08 --- /dev/null +++ b/util/newconfig/yappsrt.py @@ -0,0 +1,172 @@ +# Yapps 2.0 Runtime +# +# This module is needed to run generated parsers. + +from string import * +import exceptions +import re + +class SyntaxError(Exception): + """When we run into an unexpected token, this is the exception to use""" + def __init__(self, pos=-1, msg="Bad Token"): + self.pos = pos + self.msg = msg + def __repr__(self): + if self.pos < 0: return "#<syntax-error>" + else: return "SyntaxError[@ char " + `self.pos` + ": " + self.msg + "]" + +class NoMoreTokens(Exception): + """Another exception object, for when we run out of tokens""" + pass + +class Scanner: + def __init__(self, patterns, ignore, input): + """Patterns is [(terminal,regex)...] + Ignore is [terminal,...]; + Input is a string""" + self.tokens = [] + self.restrictions = [] + self.input = input + self.pos = 0 + self.ignore = ignore + # The stored patterns are a pair (compiled regex,source + # regex). If the patterns variable passed in to the + # constructor is None, we assume that the class already has a + # proper .patterns list constructed + if patterns is not None: + self.patterns = [] + for k,r in patterns: + self.patterns.append( (k, re.compile(r)) ) + + def token(self, i, restrict=0): + """Get the i'th token, and if i is one past the end, then scan + for another token; restrict is a list of tokens that + are allowed, or 0 for any token.""" + if i == len(self.tokens): self.scan(restrict) + if i < len(self.tokens): + # Make sure the restriction is more restricted + if restrict and self.restrictions[i]: + for r in restrict: + if r not in self.restrictions[i]: + raise "Unimplemented: restriction set changed" + return self.tokens[i] + raise NoMoreTokens() + + def __repr__(self): + """Print the last 10 tokens that have been scanned in""" + output = '' + for t in self.tokens[-10:]: + output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],`t[3]`) + return output + + def scan(self, restrict): + """Should scan another token and add it to the list, self.tokens, + and add the restriction to self.restrictions""" + # Keep looking for a token, ignoring any in self.ignore + while 1: + # Search the patterns for the longest match, with earlier + # tokens in the list having preference + best_match = -1 + best_pat = '(error)' + for p, regexp in self.patterns: + # First check to see if we're ignoring this token + if restrict and p not in restrict and p not in self.ignore: + continue + m = regexp.match(self.input, self.pos) + if m and len(m.group(0)) > best_match: + # We got a match that's better than the previous one + best_pat = p + best_match = len(m.group(0)) + + # If we didn't find anything, raise an error + if best_pat == '(error)' and best_match < 0: + msg = "Bad Token" + if restrict: + msg = "Trying to find one of "+join(restrict,", ") + raise SyntaxError(self.pos, msg) + + # If we found something that isn't to be ignored, return it + if best_pat not in self.ignore: + # Create a token with this data + token = (self.pos, self.pos+best_match, best_pat, + self.input[self.pos:self.pos+best_match]) + self.pos = self.pos + best_match + # Only add this token if it's not in the list + # (to prevent looping) + if not self.tokens or token != self.tokens[-1]: + self.tokens.append(token) + self.restrictions.append(restrict) + return + else: + # This token should be ignored .. + self.pos = self.pos + best_match + +class Parser: + def __init__(self, scanner): + self._scanner = scanner + self._pos = 0 + + def _peek(self, *types): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + tok = self._scanner.token(self._pos, types) + return tok[2] + + def _scan(self, type): + """Returns the matched text, and moves to the next token""" + tok = self._scanner.token(self._pos, [type]) + if tok[2] != type: + raise SyntaxError(tok[0], 'Trying to find '+type) + self._pos = 1+self._pos + return tok[3] + + + +def print_error(input, err, scanner): + """This is a really dumb long function to print error messages nicely.""" + p = err.pos + # Figure out the line number + line = count(input[:p], '\n') + print err.msg+" on line "+`line+1`+":" + # Now try printing part of the line + text = input[max(p-80,0):p+80] + p = p - max(p-80,0) + + # Strip to the left + i = rfind(text[:p],'\n') + j = rfind(text[:p],'\r') + if i < 0 or (j < i and j >= 0): i = j + if i >= 0 and i < p: + p = p - i - 1 + text = text[i+1:] + + # Strip to the right + i = find(text,'\n',p) + j = find(text,'\r',p) + if i < 0 or (j < i and j >= 0): i = j + if i >= 0: + text = text[:i] + + # Now shorten the text + while len(text) > 70 and p > 60: + # Cut off 10 chars + text = "..." + text[10:] + p = p - 7 + + # Now print the string, along with an indicator + print '> ',text + print '> ',' '*p + '^' + print 'List of nearby tokens:', scanner + +def wrap_error_reporter(parser, rule): + try: return getattr(parser, rule)() + except SyntaxError, s: + input = parser._scanner.input + try: + print_error(input, s, parser._scanner) + except ImportError: + print 'Syntax Error',s.msg,'on line',1+count(input[:s.pos], '\n') + except NoMoreTokens: + print 'Could not complete parsing; stopped around here:' + print parser._scanner + |