diff options
Diffstat (limited to 'util/sconfig')
-rw-r--r-- | util/sconfig/LICENSE | 18 | ||||
-rw-r--r-- | util/sconfig/Makefile | 31 | ||||
-rw-r--r-- | util/sconfig/NOTES | 46 | ||||
-rw-r--r-- | util/sconfig/config.g | 1029 | ||||
-rw-r--r-- | util/sconfig/parsedesc.g | 196 | ||||
-rw-r--r-- | util/sconfig/test.config | 6 | ||||
-rw-r--r-- | util/sconfig/yapps2.py | 779 | ||||
-rw-r--r-- | util/sconfig/yapps2.tex | 1225 | ||||
-rw-r--r-- | util/sconfig/yappsrt.py | 172 |
9 files changed, 3502 insertions, 0 deletions
diff --git a/util/sconfig/LICENSE b/util/sconfig/LICENSE new file mode 100644 index 0000000000..64f38b89f2 --- /dev/null +++ b/util/sconfig/LICENSE @@ -0,0 +1,18 @@ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/util/sconfig/Makefile b/util/sconfig/Makefile new file mode 100644 index 0000000000..3328380569 --- /dev/null +++ b/util/sconfig/Makefile @@ -0,0 +1,31 @@ +ALL: $(shell echo *.g | sed s/\\.g/\\.py/g ) + +%.py: %.g yapps2.py yappsrt.py Makefile + python yapps2.py $< + +DOC: yapps2.ps yapps2.pdf manual/index.html + +yapps2.ps: yapps2.dvi + dvips -q yapps2.dvi -o yapps2.ps + +yapps2.pdf: yapps2.ps + ps2pdf yapps2.ps + +yapps2.dvi: yapps2.tex + latex yapps2.tex + +manual/index.html: yapps2.aux yapps2.tex + rm manual/yapps2.css + latex2html -dir 'manual' -mkdir -lcase_tags -font_size 12pt -split 4 -toc_depth 4 -html_version 4.0,unicode,table -t 'Yapps 2.0 Manual' -address 'Amit J Patel, amitp@cs.stanford.edu' -info 0 -show_section_numbers -up_title 'Yapps Page' -up_url 'http://theory.stanford.edu/~amitp/yapps/' -strict -image_type png yapps2.tex + echo '@import url("http://www-cs-students.stanford.edu/~amitp/amitp.css");' > manual/yapps2-new.css + echo 'hr { display:none; }' >> manual/yapps2-new.css + echo 'h1 br, h2 br { display:none; }' >>manual/yapps2-new.css + cat manual/yapps2.css >> manual/yapps2-new.css + rm manual/yapps2.css + mv manual/yapps2-new.css manual/yapps2.css + +DISTRIB: + cd ..; zip -u yapps2.zip yapps2/{LICENSE,yapps2.py,yappsrt.py,parsedesc.g,examples/*.g,NOTES,yapps2.tex,Makefile,manual/*.html,manual/*.css,manual/*.png} + +clean: + rm -f config.py yappsrt.pyc parsedesc.py diff --git a/util/sconfig/NOTES b/util/sconfig/NOTES new file mode 100644 index 0000000000..325e76a479 --- /dev/null +++ b/util/sconfig/NOTES @@ -0,0 +1,46 @@ +April 14, 2002: + +I haven't worked on Yapps for a while, mainly because I spent all my energy +on trying to graduate. Now that I've finished school, I have several projects +I want to start working on again, including Yapps. + +Notes for myself: + +Add a debugging mode that helps you understand how the grammar + is constructed and how things are being parsed +Look into an English output mode that would use natural language + to describe a grammar +Optimize unused variables +Add a convenience to automatically gather up the values returned + from subpatterns, put them into a list, and return them +Improve the documentation +Write some larger examples +Get rid of old-style regex support +Use SRE's lex support to speed up lexing (this may be hard given that + yapps allows for context-sensitive lexers) +Look over Dan Connoly's experience with Yapps (bugs, frustrations, etc.) + and see what improvements could be made +Add something to pretty-print the grammar (without the actions) +Maybe conditionals? Follow this rule only if <condition> holds. + But this would be useful mainly when multiple rules match, and we + want the first matching rule. The conditional would mean we skip to + the next rule. Maybe this is part of the attribute grammar system, + where rule X<0> can be specified separately from X<N>. +Convenience functions that could build return values for all rules + without specifying the code for each rule individually +Patterns (abstractions over rules) -- for example, comma separated values + have a certain rule pattern that gets replicated all over the place +"Gather" mode that simply outputs the return values for certain nodes. + For example, if you just want all expressions, you could ask yapps + to gather the results of the 'expr' rule into a list. This would + ignore all the higher level structure. +Look at everyone's Yapps grammars, and come up with larger examples + http://www.w3.org/2000/10/swap/SemEnglish.g + http://www.w3.org/2000/10/swap/kifExpr.g + http://www.w3.org/2000/10/swap/rdfn3.g +It would be nice if you could feed text into Yapps (push model) instead + of Yapps reading text out of a string (pull model). However, I think + that would make the resulting parser code mostly unreadable + (like yacc, etc.). Coroutines/stacklesspython may be the answer. + + diff --git a/util/sconfig/config.g b/util/sconfig/config.g new file mode 100644 index 0000000000..db3c516583 --- /dev/null +++ b/util/sconfig/config.g @@ -0,0 +1,1029 @@ +# -*- python -*- +import sys +import os +import re +import string +import types + +import traceback + +warnings = 0 +errors = 0 + +treetop = '' +full_mainboard_path = '' +mainboard_path = '' +romimages = {} +curimage = 0 + +# ----------------------------------------------------------------------------- +# Utility Classes +# ----------------------------------------------------------------------------- + +class stack: + """Used to keep track of the current part or dir""" + class __stack_iter: + def __init__ (self, stack): + self.index = 0 + self.len = len(stack) + self.stack = stack + + def __iter__ (self): + return self + + def next (self): + if (self.index < self.len): + s = self.stack[self.index] + self.index = self.index + 1 + return s + raise StopIteration + + def __init__ (self): + self.stack = [] + + def __len__ (self): + return len(self.stack) + + def __getitem__ (self, i): + return self.stack[i] + + def __iter__ (self): + return self.__stack_iter(self.stack) + + def push(self, part): + self.stack.append(part) + + def pop(self): + try: + return self.stack.pop() + except IndexError: + return 0 + + def tos(self): + try: + return self.stack[-1] + except IndexError: + return 0 + + def empty(self): + return (len(self.stack) == 0) +partstack = stack() + +class debug_info: + none = 0 + gencode = 1 + dumptree = 2 + object = 3 + dict = 4 + statement = 5 + dump = 6 + gengraph = 7 + + def __init__(self, *level): + self.__level = level + + def setdebug(self, *level): + self.__level = level + + def level(self, level): + return level in self.__level + + def info(self, level, str): + if level in self.__level: + print str + +global debug +debug = debug_info(debug_info.none) +#debug = debug_info(debug_info.dumptree) +#debug = debug_info(debug_info.object) +#debug = debug_info(debug_info.gencode) + +# ----------------------------------------------------------------------------- +# Error Handling +# ----------------------------------------------------------------------------- + +def error(string): + """Print error message""" + global errors, loc + errors = errors + 1 + print "===> ERROR: %s" % string + +def fatal(string): + """Print error message and exit""" + error(string) + exitiferrors() + +def warning(string): + """Print warning message""" + global warnings, loc + warnings = warnings + 1 + print "===> WARNING: %s" % string + +def exitiferrors(): + """Exit parser if an error has been encountered""" + if (errors != 0): + sys.exit(1) + +def safe_open(file, mode): + try: + return open(file, mode) + except IOError: + fatal("Could not open file \"%s\"" % file) + +# ----------------------------------------------------------------------------- +# Main classes +# ----------------------------------------------------------------------------- + +class romimage: + """A rom image is the ultimate goal of coreboot""" + def __init__ (self, name): + # name of this rom image + self.name = name + + # instance counter for parts + self.partinstance = 0 + + # chip config files included by the 'config' directive + self.configincludes = {} + + # root of part tree + self.root = 0 + + # Last device built + self.last_device = 0 + + def getname(self): + return self.name + + def addconfiginclude(self, part, path): + setdict(self.configincludes, part, path) + + def getconfigincludes(self): + return self.configincludes + + def getincludefilename(self): + if (self.useinitincludes): + return "crt0.S" + else: + return "crt0_includes.h" + + def newformat(self): + return self.useinitincludes + + def numparts(self): + return self.partinstance + + def newpartinstance(self): + i = self.partinstance + self.partinstance = self.partinstance + 1 + return i + + def setroot(self, part): + self.root = part + + def getroot(self): + return self.root + +class partobj: + """A configuration part""" + def __init__ (self, image, dir, parent, part, type_name, instance_name, chip_or_device): + if (parent): + debug.info(debug.object, "partobj dir %s parent %s part %s" \ + % (dir, parent.instance_name, part)) + else: + debug.info(debug.object, "partobj dir %s part %s" \ + % (dir, part)) + + # romimage that is configuring this part + self.image = image + + # links for static device tree + self.children = 0 + self.prev_sibling = 0 + self.next_sibling = 0 + self.prev_device = 0 + self.next_device = 0 + self.chip_or_device = chip_or_device + + # initializers for static device tree + self.registercode = {} + + # part name + self.part = part + + # type name of this part + self.type_name = type_name + + # directory containing part files + self.dir = dir + + # instance number, used to distinguish anonymous + # instances of this part + self.instance = image.newpartinstance() + debug.info(debug.object, "INSTANCE %d" % self.instance) + + # Name of chip config file (0 if not needed) + self.chipconfig = 0 + + # Flag to indicate that we have generated type + # definitions for this part (only want to do it once) + self.done_types = 0 + + # Path to the device + self.path = "" + + # Resources of the device + self.resoruce = "" + self.resources = 0 + + # Enabled state of the device + self.enabled = 1 + + # Flag if I am a duplicate device + self.dup = 0 + + # If there is a chip.h file, we will create an + # include for it. + if (dir): + chiph = os.path.join(dir, "chip.h") + if (os.path.exists(chiph)): + debug.info(debug.object, "%s has chip at %s" % (self, dir)) + self.addconfig(chiph) + + # If no instance name is supplied then generate + # a unique name + if (instance_name == 0): + self.instance_name = self.type_name + \ + "_dev%d" % self.instance + self.chipinfo_name = "%s_info_%d" \ + % (self.type_name, self.instance) + else: + self.instance_name = instance_name + self.chipinfo_name = "%s_info_%d" % (self.instance_name, self.instance) + + # Link this part into the device list + if (self.chip_or_device == 'device'): + if (image.last_device): + image.last_device.next_device = self + self.prev_device = image.last_device + image.last_device = self + + # Link this part into the tree + if (parent and (part != 'arch')): + debug.info(debug.gencode, "add to parent") + self.parent = parent + # add current child as my sibling, + # me as the child. + if (parent.children): + debug.info(debug.gencode, "add %s (%d) as sibling" % (parent.children.dir, parent.children.instance)) + youngest = parent.children + while(youngest.next_sibling): + youngest = youngest.next_sibling + youngest.next_sibling = self + self.prev_sibling = youngest + else: + parent.children = self + else: + self.parent = self + + def info(self): + return "%s: %s" % (self.part, self.type) + def type(self): + return self.chip_or_device + + def readable_name(self): + name = "" + name = "%s_%d" % (self.type_name, self.instance) + if (self.chip_or_device == 'chip'): + name = "%s %s %s" % (name, self.part, self.dir) + else: + name = "%s %s" % (name, self.path) + return name + + def graph_name(self): + name = "{ {_dev%d|" % self.instance + if (self.part): + name = "%s%s" % (name, self.part) + else: + name = "%s%s" % (name, self.chip_or_device) + if (self.type_name): + name = "%s}|%s}" % (name, self.type_name) + else: + name = "%s}|%s}" % (name, self.parent.type_name) + return name + + def dumpme(self, lvl): + """Dump information about this part for debugging""" + print "%d: %s" % (lvl, self.readable_name()) + print "%d: part %s" % (lvl, self.part) + print "%d: instance %d" % (lvl, self.instance) + print "%d: chip_or_device %s" % (lvl, self.chip_or_device) + print "%d: dir %s" % (lvl,self.dir) + print "%d: type_name %s" % (lvl,self.type_name) + print "%d: parent: %s" % (lvl, self.parent.readable_name()) + if (self.children): + print "%d: child %s" % (lvl, self.children.readable_name()) + if (self.next_sibling): + print "%d: siblings %s" % (lvl, self.next_sibling.readable_name()) + print "%d: registercode " % lvl + for f, v in self.registercode.items(): + print "\t%s = %s" % (f, v) + print "%d: chipconfig %s" % (lvl, self.chipconfig) + print "\n" + + def firstchilddevice(self): + """Find the first device in the children link.""" + kid = self.children + while (kid): + if (kid.chip_or_device == 'device'): + return kid + else: + kid = kid.children + return 0 + + def firstparentdevice(self): + """Find the first device in the parent link.""" + parent = self.parent + while (parent and (parent.parent != parent) and (parent.chip_or_device != 'device')): + parent = parent.parent + if ((parent.parent != parent) and (parent.chip_or_device != 'device')): + parent = 0 + while(parent and (parent.dup == 1)): + parent = parent.prev_sibling + if (not parent): + fatal("Device %s has no device parent; this is a config file error" % self.readable_name()) + return parent + + def firstparentdevicelink(self): + """Find the first device in the parent link and record which link it is.""" + link = 0 + parent = self.parent + while (parent and (parent.parent != parent) and (parent.chip_or_device != 'device')): + parent = parent.parent + if ((parent.parent != parent) and (parent.chip_or_device != 'device')): + parent = 0 + while(parent and (parent.dup == 1)): + parent = parent.prev_sibling + link = link + 1 + if (not parent): + fatal("Device %s has no device parent; this is a config file error" % self.readable_name()) + return link + + + def firstparentchip(self): + """Find the first chip in the parent link.""" + parent = self.parent + while (parent): + if ((parent.parent == parent) or (parent.chip_or_device == 'chip')): + return parent + else: + parent = parent.parent + fatal("Device %s has no chip parent; this is a config file error" % self.readable_name()) + + def firstsiblingdevice(self): + """Find the first device in the sibling link.""" + sibling = self.next_sibling + while(sibling and (sibling.path == self.path)): + sibling = sibling.next_sibling + if ((not sibling) and (self.parent.chip_or_device == 'chip')): + sibling = self.parent.next_sibling + while(sibling): + if (sibling.chip_or_device == 'device'): + return sibling + else: + sibling = sibling.children + return 0 + + def gencode(self, file, pass_num): + """Generate static initalizer code for this part. Two passes + are used - the first generates type information, and the second + generates instance information""" + if (pass_num == 0): + if (self.chip_or_device == 'chip'): + return; + else: + if (self.instance): + file.write("struct device %s;\n" \ + % self.instance_name) + else: + file.write("struct device dev_root;\n") + return + # This is pass the second, which is pass number 1 + # this is really just a case statement ... + + if (self.chip_or_device == 'chip'): + if (self.chipconfig): + debug.info(debug.gencode, "gencode: chipconfig(%d)" % \ + self.instance) + file.write("struct %s_config %s" % (self.type_name ,\ + self.chipinfo_name)) + if (self.registercode): + file.write("\t= {\n") + for f, v in self.registercode.items(): + file.write( "\t.%s = %s,\n" % (f, v)) + file.write("};\n") + else: + file.write(";") + file.write("\n") + + if (self.instance == 0): + self.instance_name = "dev_root" + file.write("struct device **last_dev_p = &%s.next;\n" % (self.image.last_device.instance_name)) + file.write("struct device dev_root = {\n") + file.write("\t.ops = &default_dev_ops_root,\n") + file.write("\t.bus = &dev_root.link[0],\n") + file.write("\t.path = { .type = DEVICE_PATH_ROOT },\n") + file.write("\t.enabled = 1,\n\t.links = 1,\n") + file.write("\t.on_mainboard = 1,\n") + file.write("\t.link = {\n\t\t[0] = {\n") + file.write("\t\t\t.dev=&dev_root,\n\t\t\t.link = 0,\n") + file.write("\t\t\t.children = &%s,\n" % self.firstchilddevice().instance_name) + file.write("\t\t},\n") + file.write("\t},\n") + if (self.chipconfig): + file.write("\t.chip_ops = &%s_ops,\n" % self.type_name) + file.write("\t.chip_info = &%s_info_%s,\n" % (self.type_name, self.instance)) + file.write("\t.next = &%s,\n" % self.firstchilddevice().instance_name) + file.write("};\n") + return + + # Don't print duplicate devices, just print their children + if (self.dup): + return + + file.write("struct device %s = {\n" % self.instance_name) + file.write("\t.ops = 0,\n") + file.write("\t.bus = &%s.link[%d],\n" % \ + (self.firstparentdevice().instance_name, \ + self.firstparentdevicelink())) + file.write("\t.path = {%s},\n" % self.path) + file.write("\t.enabled = %d,\n" % self.enabled) + file.write("\t.on_mainboard = 1,\n") + if (self.resources): + file.write("\t.resources = %d,\n" % self.resources) + file.write("\t.resource = {%s\n\t },\n" % self.resource) + file.write("\t.link = {\n"); + links = 0 + bus = self + while(bus and (bus.path == self.path)): + child = bus.firstchilddevice() + if (child or (bus != self) or (bus.next_sibling and (bus.next_sibling.path == self.path))): + file.write("\t\t[%d] = {\n" % links) + file.write("\t\t\t.link = %d,\n" % links) + file.write("\t\t\t.dev = &%s,\n" % self.instance_name) + if (child): + file.write("\t\t\t.children = &%s,\n" %child.instance_name) + file.write("\t\t},\n") + links = links + 1 + if (1): + bus = bus.next_sibling + else: + bus = 0 + file.write("\t},\n") + file.write("\t.links = %d,\n" % (links)) + sibling = self.firstsiblingdevice(); + if (sibling): + file.write("\t.sibling = &%s,\n" % sibling.instance_name) + chip = self.firstparentchip() + if (chip and chip.chipconfig): + file.write("\t.chip_ops = &%s_ops,\n" % chip.type_name) + file.write("\t.chip_info = &%s_info_%s,\n" % (chip.type_name, chip.instance)) + if (self.next_device): + file.write("\t.next=&%s\n" % self.next_device.instance_name) + file.write("};\n") + return + + def addconfig(self, path): + """Add chip config file to this part""" + self.chipconfig = os.path.join(self.dir, path) + self.image.addconfiginclude(self.type_name, self.chipconfig) + + def addregister(self, field, value): + """Register static initialization information""" + if (self.chip_or_device != 'chip'): + fatal("Only chips can have register values") + field = dequote(field) + value = dequote(value) + setdict(self.registercode, field, value) + + def set_enabled(self, enabled): + self.enabled = enabled + + def start_resources(self): + self.resource = "" + self.resources = 0 + + def end_resources(self): + self.resource = "%s" % (self.resource) + + def add_resource(self, type, index, value): + """ Add a resource to a device """ + self.resource = "%s\n\t\t{ .flags=%s, .index=0x%x, .base=0x%x}," % (self.resource, type, index, value) + self.resources = self.resources + 1 + + def set_path(self, path): + self.path = path + if (self.prev_sibling and (self.prev_sibling.path == self.path)): + self.dup = 1 + if (self.prev_device): + self.prev_device.next_device = self.next_device + if (self.next_device): + self.next_device.prev_device = self.prev_device + if (self.image.last_device == self): + self.image.last_device = self.prev_device + self.prev_device = 0 + self.next_device = 0 + + def addpcipath(self, slot, function): + """ Add a relative pci style path from our parent to this device """ + if ((slot < 0) or (slot > 0x1f)): + fatal("Invalid device id") + if ((function < 0) or (function > 7)): + fatal("Invalid pci function %s" % function ) + self.set_path(".type=DEVICE_PATH_PCI,{.pci={ .devfn = PCI_DEVFN(0x%x,%d)}}" % (slot, function)) + + def addpnppath(self, port, device): + """ Add a relative path to a pnp device hanging off our parent """ + if ((port < 0) or (port > 65536)): + fatal("Invalid port") + if ((device < 0) or (device > 0xffff)): + fatal("Invalid device") + self.set_path(".type=DEVICE_PATH_PNP,{.pnp={ .port = 0x%x, .device = 0x%x }}" % (port, device)) + + def addi2cpath(self, device): + """ Add a relative path to a i2c device hanging off our parent """ + if ((device < 0) or (device > 0x7f)): + fatal("Invalid device") + self.set_path(".type=DEVICE_PATH_I2C,{.i2c={ .device = 0x%x }}" % (device)) + + def addapicpath(self, apic_id): + """ Add a relative path to a cpu device hanging off our parent """ + if ((apic_id < 0) or (apic_id > 255)): + fatal("Invalid device") + self.set_path(".type=DEVICE_PATH_APIC,{.apic={ .apic_id = 0x%x }}" % (apic_id)) + + def addpci_domainpath(self, pci_domain): + """ Add a pci_domain number to a chip """ + if ((pci_domain < 0) or (pci_domain > 0xffff)): + fatal("Invalid pci_domain: 0x%x is out of the range 0 to 0xffff" % pci_domain) + self.set_path(".type=DEVICE_PATH_PCI_DOMAIN,{.pci_domain={ .domain = 0x%x }}" % (pci_domain)) + + def addapic_clusterpath(self, cluster): + """ Add an apic cluster to a chip """ + if ((cluster < 0) or (cluster > 15)): + fatal("Invalid apic cluster: %d is out of the range 0 to ff" % cluster) + self.set_path(".type=DEVICE_PATH_APIC_CLUSTER,{.apic_cluster={ .cluster = 0x%x }}" % (cluster)) + + def addcpupath(self, cpu_id): + """ Add a relative path to a cpu device hanging off our parent """ + if ((cpu_id < 0) or (cpu_id > 255)): + fatal("Invalid device") + self.set_path(".type=DEVICE_PATH_CPU,{.cpu={ .id = 0x%x }}" % (cpu_id)) + + + def addcpu_buspath(self, id): + """ Add a cpu_bus to a chip """ + if ((id < 0) or (id > 255)): + fatal("Invalid device") + self.set_path(".type=DEVICE_PATH_CPU_BUS,{.cpu_bus={ .id = 0x%x }}" % (id)) + + +# ----------------------------------------------------------------------------- +# statements +# ----------------------------------------------------------------------------- + +def getdict(dict, name): + if name not in dict.keys(): + debug.info(debug.dict, "Undefined: %s" % name) + return 0 + v = dict.get(name, 0) + debug.info(debug.dict, "getdict %s returning %s" % (name, v)) + return v + +def setdict(dict, name, value): + debug.info(debug.dict, "setdict sets %s to %s" % (name, value)) + if name in dict.keys(): + print "Duplicate in dict: %s" % name + dict[name] = value + + +def addconfig(path): + global partstack + curpart = partstack.tos() + curpart.addconfig(path) + +def addregister(field, value): + global partstack + curpart = partstack.tos() + curpart.addregister(field, value) + +def devicepart(type): + global curimage, partstack + newpart = partobj(curimage, 0, partstack.tos(), type, \ + '', 0, 'device') + #print "Configuring PART %s" % (type) + partstack.push(newpart) + #print " new PART tos is now %s\n" %partstack.tos().info() + # just push TOS, so that we can pop later. + +def part(type, path, file, name): + global curimage, partstack + partdir = os.path.join(type, path) + srcdir = os.path.join(treetop, 'src') + fulldir = os.path.join(srcdir, partdir) + type_name = flatten_name(partdir) + #print "PART(%s, %s, %s, %s)\n" % (type, path, file, name) + newpart = partobj(curimage, fulldir, partstack.tos(), type, \ + type_name, name, 'chip') + #print "Configuring PART %s, path %s" % (type, path) + partstack.push(newpart) + +def partpop(): + global partstack + curpart = partstack.tos() + if (curpart == 0): + fatal("Trying to pop non-existent part") + #print "End PART %s" % curpart.part + oldpart = partstack.pop() + #print "partstack.pop, TOS is now %s\n" % oldpart.info() + +#============================================================================= +# MISC FUNCTIONS +#============================================================================= +def dequote(str): + a = re.sub("^\"", "", str) + a = re.sub("\"$", "", a) + # highly un-intuitive, need four \! + a = re.sub("\\\\\"", "\"", a) + return a + +def flatten_name(str): + a = re.sub("[/-]", "_", str) + return a +%% +parser Config: + ignore: r'\s+' + ignore: "#.*?\r?\n" + + # less general tokens should come first, otherwise they get matched + # by the re's + token COMMENT: 'comment' + token CPU: 'cpu' + token CPU_BUS: 'cpu_bus' + token CHIP: 'chip' + token DEVICE: 'device' + token DEVICE_ID: 'device_id' + token DRQ: 'drq' + token END: 'end' + token EOF: '$' + token EQ: '=' + token FORMAT: 'format' + token IO: 'io' + token IRQ: 'irq' + token MEM: 'mem' + token NEVER: 'never' + token NONE: 'none' + token PMC: 'pmc' + token PRINT: 'print' + token REGISTER: 'register' + token VENDOR_ID: 'vendor_id' + token WRITE: 'write' + token NUM: '[0-9]+' + token HEX_NUM: '[0-9a-fA-F]+' + token HEX_PREFIX: '0x' + # Why is path separate? Because paths to resources have to at least + # have a slash, we thinks + token PATH: r'[-a-zA-Z0-9_.][-a-zA-Z0-9/_.]+[-a-zA-Z0-9_.]+' + # Dir's on the other hand are abitrary + # this may all be stupid. + token RULE: r'[-a-zA-Z0-9_$()./]+[-a-zA-Z0-9_ $()./]+[-a-zA-Z0-9_$()./]+' + token ID: r'[a-zA-Z_.]+[a-zA-Z0-9_.]*' + token STR: r'"([^\\"]+|\\.)*"' + token RAWTEXT: r'.*' + token ON: 'on' + token OFF: 'off' + token PCI: 'pci' + token PNP: 'pnp' + token I2C: 'i2c' + token APIC: 'apic' + token APIC_CLUSTER: 'apic_cluster' + token CPU: 'cpu' + token CPU_BUS: 'cpu_bus' + token PCI_DOMAIN: 'pci_domain' + + + rule expr: logical {{ l = logical }} + ( "&&" logical {{ l = l and logical }} + | "[|][|]" logical {{ l = l or logical }} + )* {{ return l }} + + rule logical: factor {{ n = factor }} + ( "[+]" factor {{ n = n+factor }} + | "-" factor {{ n = n-factor }} + )* {{ return n }} + + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + | "<<" term {{ v = v << term }} + | ">=" term {{ v = (v < term)}} + )* {{ return v }} + + # A term is a number, variable, or an expression surrounded by parentheses + rule term: NUM {{ return long(NUM, 10) }} + | HEX_PREFIX HEX_NUM {{ return long(HEX_NUM, 16) }} + | ID {{ return lookup(ID) }} + | unop {{ return unop }} + | "\\(" expr "\\)" {{ return expr }} + + rule unop: "!" expr {{ return not(expr) }} + + rule partend<<C>>: (stmt<<C>>)* END {{ if (C): partpop()}} + + # This is needed because the legacy cpu command could not distinguish + # between cpu vendors. It should just be PATH, but getting this change + # into the source tree will be tricky... + # DO NOT USE ID AS IT MAY GO AWAY IN THE FUTURE + rule partid: ID {{ return ID }} + | PATH {{ return PATH }} + + rule parttype: CHIP {{ return '' }} + + rule partdef<<C>>: {{ name = 0 }} + parttype partid + [ STR {{ name = dequote(STR) }} + ] {{ if (C): part(parttype, partid, 'Config.lb', name) }} + partend<<C>> + + rule field: STR {{ return STR }} + + rule register<<C>>: REGISTER field '=' STR {{ if (C): addregister(field, STR) }} + + rule enable<<C>>: {{ val = 1 }} + ( ON {{ val = 1 }} + | OFF {{ val = 0 }} + ) {{ if(C): partstack.tos().set_enabled(val) }} + + rule resource<<C>>: {{ type = "" }} + ( IO {{ type = "IORESOURCE_FIXED | IORESOURCE_ASSIGNED | IORESOURCE_IO" }} + | MEM {{ type = "IORESOURCE_FIXED | IORESOURCE_ASSIGNED | IORESOURCE_MEM" }} + | IRQ {{ type = "IORESOURCE_FIXED | IORESOURCE_ASSIGNED | IORESOURCE_IRQ" }} + | DRQ {{ type = "IORESOURCE_FIXED | IORESOURCE_ASSIGNED | IORESOURCE_DRQ" }} + ) + term '=' {{ index = term }} + term {{ value = term }} + {{ if (C): partstack.tos().add_resource(type, index, value) }} + + + rule resources<<C>>: {{ if (C): partstack.tos().start_resources() }} + ( resource<<C>> )* + {{ if (C): partstack.tos().end_resources() }} + + + rule pci<<C>>: PCI {{ if (C): devicepart('pci') }} + + HEX_NUM {{ slot = int(HEX_NUM,16) }} + '.' HEX_NUM {{ function = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addpcipath(slot, function) }} + rule pci_domain<<C>>: + PCI_DOMAIN {{ if (C): devicepart('pci_domain') }} + HEX_NUM {{ pci_domain = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addpci_domainpath(pci_domain) }} + + rule pnp<<C>>: PNP {{ if (C): devicepart('pnp') }} + HEX_NUM {{ port = int(HEX_NUM,16) }} + '.' HEX_NUM {{ device = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addpnppath(port, device) }} + + rule i2c<<C>>: I2C {{ if (C): devicepart('i2c') }} + HEX_NUM {{ device = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addi2cpath(device) }} + + rule apic<<C>>: APIC {{ if (C): devicepart('apic') }} + HEX_NUM {{ apic_id = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addapicpath(apic_id) }} + + rule apic_cluster<<C>>: APIC_CLUSTER {{ if (C): devicepart('apic_cluster') }} + HEX_NUM {{ cluster = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addapic_clusterpath(cluster) }} + + rule cpu<<C>>: CPU {{ if (C): devicepart('cpu') }} + HEX_NUM {{ id = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addcpupath(id) }} + + rule cpu_bus<<C>>: CPU_BUS {{ if (C): devicepart('cpu_bus') }} + HEX_NUM {{ id = int(HEX_NUM, 16) }} + {{ if (C): partstack.tos().addcpu_buspath(id) }} + + rule dev_path<<C>>: + pci<<C>> {{ return pci }} + | pci_domain<<C>> {{ return pci_domain }} + | pnp<<C>> {{ return pnp }} + | i2c<<C>> {{ return i2c }} + | apic<<C>> {{ return apic }} + | apic_cluster<<C>> {{ return apic_cluster }} + | cpu<<C>> {{ return cpu }} + | cpu_bus<<C>> {{ return cpu_bus }} + + rule prtval: expr {{ return str(expr) }} + | STR {{ return STR }} + + rule prtlist: prtval {{ el = "%(" + prtval }} + ( "," prtval {{ el = el + "," + prtval }} + )* {{ return el + ")" }} + + rule prtstmt<<C>>: PRINT STR {{ val = STR }} + [ "," prtlist {{ val = val + prtlist }} + ] {{ if (C): print eval(val) }} + + rule device<<C>>: DEVICE dev_path<<C>> + enable<<C>> + resources<<C>> + partend<<C>> + + rule stmt<<C>>: + partdef<<C>> {{ return partdef }} + | prtstmt<<C>> {{ return prtstmt }} + | register<<C>> {{ return register }} + | device<<C>> {{ return device }} + + rule value: STR {{ return dequote(STR) }} + | expr {{ return expr }} + + rule devicetree: partdef<<1>> + EOF {{ return 1 }} + + rule wrstr<<ID>>: STR {{ setwrite(ID, dequote(STR)) }} + +%% + +#============================================================================= +# FILE OUTPUT +#============================================================================= + +def dumptree(part, lvl): + debug.info(debug.dumptree, "DUMPTREE ME is") + print "%s " % part + part.dumpme(lvl) + # dump the siblings -- actually are there any? not sure + # siblings are: + debug.info(debug.dumptree, "DUMPTREE SIBLINGS are") + kid = part.next_sibling + while (kid): + kid.dumpme(lvl) + kid = kid.next_sibling + # dump the kids + debug.info(debug.dumptree, "DUMPTREE KIDS are") + #for kid in part.children: + if (part.children): + dumptree(part.children, lvl+1) + kid = part.next_sibling + while (kid): + if (kid.children): + dumptree(kid.children, lvl + 1) + kid = kid.next_sibling + debug.info(debug.dumptree, "DONE DUMPTREE") + +def writecode(image): + filename = os.path.join(img_dir, "static.c") + print "Creating", filename + file = safe_open(filename, 'w+') + file.write("#include <device/device.h>\n") + file.write("#include <device/pci.h>\n") + for path in image.getconfigincludes().values(): + file.write("#include \"%s\"\n" % path) + file.write("\n/* pass 0 */\n") + gencode(image.getroot(), file, 0) + file.write("\n/* pass 1 */\n") + gencode(image.getroot(), file, 1) + file.close() + +def gencode(part, file, pass_num): + debug.info(debug.gencode, "GENCODE ME is") + part.gencode(file, pass_num) + # dump the siblings -- actually are there any? not sure + debug.info(debug.gencode, "GENCODE SIBLINGS are") + kid = part.next_sibling + while (kid): + kid.gencode(file, pass_num) + kid = kid.next_sibling + # now dump the children + debug.info(debug.gencode, "GENCODE KIDS are") + if (part.children): + gencode(part.children, file, pass_num) + kid = part.next_sibling + while (kid): + if (kid.children): + gencode(kid.children, file, pass_num) + kid = kid.next_sibling + debug.info(debug.gencode, "DONE GENCODE") + +def writegraph(image): + filename = os.path.join(img_dir, "static.dot") + print "Creating", filename + file = safe_open(filename, 'w+') + file.write("digraph devicetree {\n") + file.write(" rankdir=LR\n") + genranks(image.getroot(), file, 0) + gennodes(image.getroot(), file) + gengraph(image.getroot(), file) + file.write("}\n") + file.close() + +def genranks(part, file, level): + #file.write(" # Level %d\n" % level ) + file.write(" { rank = same; \"dev_%s_%d\"" % (part.type_name,part.instance )) + sib = part.next_sibling + while (sib): + file.write("; \"dev_%s_%d\"" % (sib.type_name, sib.instance)) + sib = sib.next_sibling + file.write("}\n" ) + # now dump the children + if (part.children): + genranks(part.children, file, level + 1) + + kid = part.next_sibling + while (kid): + if (kid.children): + genranks(kid.children, file, level + 1) + kid = kid.next_sibling + + +def gennodes(part, file): + file.write(" dev_%s_%d[shape=record, label=\"%s\"];\n" % (part.type_name,part.instance,part.graph_name() )) + sib = part.next_sibling + while (sib): + file.write(" dev_%s_%d[shape=record, label=\"%s\"];\n" % (sib.type_name,sib.instance,sib.graph_name() )) + sib = sib.next_sibling + # now dump the children + if (part.children): + gennodes(part.children, file) + + kid = part.next_sibling + while (kid): + if (kid.children): + gennodes(kid.children, file) + kid = kid.next_sibling + + +def gengraph(part, file): + if (part.parent != part): + file.write(" dev_%s_%d -> dev_%s_%d;\n" % \ + (part.parent.type_name, part.parent.instance, \ + part.type_name, part.instance )) + sib = part.next_sibling + while (sib): + file.write(" dev_%s_%d -> dev_%s_%d;\n" % \ + (sib.parent.type_name, sib.parent.instance, \ + sib.type_name, sib.instance )) + sib = sib.next_sibling + + kid = part.next_sibling + while (kid): + if (kid.children): + gengraph(kid.children, file) + kid = kid.next_sibling + + if (part.children): + gengraph(part.children, file) + +#============================================================================= +# MAIN PROGRAM +#============================================================================= +if __name__=='__main__': + from sys import argv + if (len(argv) < 4): + fatal("Args: <file> <path to coreboot> <output-dir>") + + file = "devicetree.cb" + partdir = os.path.join("mainboard", sys.argv[1]) + treetop = argv[2] + srcdir = os.path.join(treetop, 'src') + fulldir = os.path.join(srcdir, partdir) + type_name = flatten_name(partdir) + config_file = os.path.join(fulldir, file) + + curimage = romimage("new") + image = curimage + + newpart = partobj(curimage, fulldir, partstack.tos(), 'mainboard', \ + 'mainboard', 0, 'chip') + #print "Configuring PART %s, path %s" % (type, path) + image.setroot(newpart); + partstack.push(newpart) + + fp = safe_open(config_file, 'r') + if (not parse('devicetree', fp.read())): + fatal("Could not parse file") + print "PARSED THE TREE" + partstack.pop() + + img_dir = argv[3] + + #debug.info(debug.dumptree, "DEVICE TREE:") + #dumptree(curimage.getroot(), 0) + + writecode(image) + writegraph(image) + + sys.exit(0) diff --git a/util/sconfig/parsedesc.g b/util/sconfig/parsedesc.g new file mode 100644 index 0000000000..7113c6d6f3 --- /dev/null +++ b/util/sconfig/parsedesc.g @@ -0,0 +1,196 @@ +###################################################################### +# The remainder of this file is from parsedesc.{g,py} + +def append(lst, x): + "Imperative append" + lst.append(x) + return lst + +def add_inline_token(tokens, str): + tokens.insert( 0, (str, eval(str, {}, {})) ) + return Terminal(str) + +def cleanup_choice(lst): + if len(lst) == 0: return Sequence([]) + if len(lst) == 1: return lst[0] + return apply(Choice, tuple(lst)) + +def cleanup_sequence(lst): + if len(lst) == 1: return lst[0] + return apply(Sequence, tuple(lst)) + +def cleanup_rep(node, rep): + if rep == 'star': return Star(node) + elif rep == 'plus': return Plus(node) + else: return node + +def resolve_name(tokens, id, args): + if id in map(lambda x: x[0], tokens): + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return Terminal(id) + else: + # It's a name, so assume it's a nonterminal + return NonTerminal(id, args) + +%% +parser ParserDescription: + option: "context-insensitive-scanner" + + ignore: "[ \t\r\n]+" + ignore: "#.*?\r?\n" + token END: "$" + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + token LP: '\\(' + token RP: '\\)' + token LB: '\\[' + token RB: '\\]' + token OR: '[|]' + token STAR: '[*]' + token PLUS: '[+]' + token QUEST: '[?]' + token COLON: ':' + + rule Parser: "parser" ID ":" + Options + Tokens + Rules<<Tokens>> + END + {{ return Generator(ID,Options,Tokens,Rules) }} + + rule Options: {{ opt = {} }} + ( "option" ":" Str {{ opt[Str] = 1 }} )* + {{ return opt }} + + rule Tokens: {{ tok = [] }} + ( + "token" ID ":" Str {{ tok.append( (ID,Str) ) }} + | "ignore" ":" Str {{ tok.append( ('#ignore',Str) ) }} + )* + {{ return tok }} + + rule Rules<<tokens>>: + {{ rul = [] }} + ( + "rule" ID OptParam ":" ClauseA<<tokens>> + {{ rul.append( (ID,OptParam,ClauseA) ) }} + )* + {{ return rul }} + + rule ClauseA<<tokens>>: + ClauseB<<tokens>> + {{ v = [ClauseB] }} + ( OR ClauseB<<tokens>> {{ v.append(ClauseB) }} )* + {{ return cleanup_choice(v) }} + + rule ClauseB<<tokens>>: + {{ v = [] }} + ( ClauseC<<tokens>> {{ v.append(ClauseC) }} )* + {{ return cleanup_sequence(v) }} + + rule ClauseC<<tokens>>: + ClauseD<<tokens>> + ( PLUS {{ return Plus(ClauseD) }} + | STAR {{ return Star(ClauseD) }} + | {{ return ClauseD }} ) + + rule ClauseD<<tokens>>: + STR {{ t = (STR, eval(STR,{},{})) }} + {{ if t not in tokens: tokens.insert( 0, t ) }} + {{ return Terminal(STR) }} + | ID OptParam {{ return resolve_name(tokens, ID, OptParam) }} + | LP ClauseA<<tokens>> RP {{ return ClauseA }} + | LB ClauseA<<tokens>> RB {{ return Option(ClauseA) }} + | STMT {{ return Eval(STMT[2:-2]) }} + + rule OptParam: [ ATTR {{ return ATTR[2:-2] }} ] {{ return '' }} + rule Str: STR {{ return eval(STR,{},{}) }} +%% + +# This replaces the default main routine + +yapps_options = [ + ('context-insensitive-scanner', 'context-insensitive-scanner', + 'Scan all tokens (see docs)') + ] + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename[-2:]=='.g': outputfilename = inputfilename[:-2]+'.py' + else: raise "Invalid Filename", outputfilename + + print 'Input Grammar:', inputfilename + print 'Output File:', outputfilename + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = find(s, DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = find(s, DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner + p = ParserDescription(ParserDescriptionScanner(s)) + if not p: return + + # Now parse the file + t = wrap_error_reporter(p, 'Parser') + if not t: return # Error + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + +if __name__=='__main__': + import sys, getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump']) + if not args or len(args) > 2: + print 'Usage:' + print ' python', sys.argv[0], '[flags] input.g [output.py]' + print 'Flags:' + print (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + for flag, _, doc in yapps_options: + print (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + else: + print 'Warning - unrecognized option: ', opt[0], opt[1] + + apply(generate, tuple(args), flags) diff --git a/util/sconfig/test.config b/util/sconfig/test.config new file mode 100644 index 0000000000..c492f200fa --- /dev/null +++ b/util/sconfig/test.config @@ -0,0 +1,6 @@ +target x +mainboard amd/solo +# option X=1 +# makerule x y "z" + payload /dev/null +end diff --git a/util/sconfig/yapps2.py b/util/sconfig/yapps2.py new file mode 100644 index 0000000000..71bfa05ca6 --- /dev/null +++ b/util/sconfig/yapps2.py @@ -0,0 +1,779 @@ +# Yapps 2.0 - yet another python parser system +# Amit J Patel, January 1999 +# See http://theory.stanford.edu/~amitp/Yapps/ for documentation and updates + +# v2.0.1 changes (October 2001): +# * The exceptions inherit the standard Exception class (thanks Rich Salz) +# * The scanner can use either a different set of regular expressions +# per instance, or allows the subclass to define class fields with +# the patterns. This improves performance when many Scanner objects +# are being created, because the regular expressions don't have to +# be recompiled each time. (thanks Amaury Forgeot d'Arc) +# v2.0.2 changes (April 2002) +# * Fixed a bug in generating the 'else' clause when the comment was too +# long. v2.0.1 was missing a newline. (thanks Steven Engelhardt) +# v2.0.3 changes (August 2002) +# * Fixed a bug with inline tokens using the r"" syntax. + +from string import * +from yappsrt import * +import re + +INDENT = " "*4 + +class Generator: + def __init__(self, name, options, tokens, rules): + self.change_count = 0 + self.name = name + self.options = options + self.preparser = '' + self.postparser = None + + self.tokens = {} # Map from tokens to regexps + self.ignore = [] # List of token names to ignore in parsing + self.terminals = [] # List of token names (to maintain ordering) + for n,t in tokens: + if n == '#ignore': + n = t + self.ignore.append(n) + if n in self.tokens.keys() and self.tokens[n] != t: + print 'Warning: token', n, 'multiply defined.' + self.tokens[n] = t + self.terminals.append(n) + + self.rules = {} # Map from rule names to parser nodes + self.params = {} # Map from rule names to parameters + self.goals = [] # List of rule names (to maintain ordering) + for n,p,r in rules: + self.params[n] = p + self.rules[n] = r + self.goals.append(n) + + import sys + self.output = sys.stdout + + def __getitem__(self, name): + # Get options + return self.options.get(name, 0) + + def non_ignored_tokens(self): + return filter(lambda x, i=self.ignore: x not in i, self.terminals) + + def changed(self): + self.change_count = 1+self.change_count + + def subset(self, a, b): + "See if all elements of a are inside b" + for x in a: + if x not in b: return 0 + return 1 + + def equal_set(self, a, b): + "See if a and b have the same elements" + if len(a) != len(b): return 0 + if a == b: return 1 + return self.subset(a, b) and self.subset(b, a) + + def add_to(self, parent, additions): + "Modify parent to include all elements in additions" + for x in additions: + if x not in parent: + parent.append(x) + self.changed() + + def equate(self, a, b): + self.add_to(a, b) + self.add_to(b, a) + + def write(self, *args): + for a in args: + self.output.write(a) + + def in_test(self, x, full, b): + if not b: return '0' + if len(b)==1: return '%s == %s' % (x, `b[0]`) + if full and len(b) > len(full)/2: + # Reverse the sense of the test. + not_b = filter(lambda x, b=b: x not in b, full) + return self.not_in_test(x, full, not_b) + return '%s in %s' % (x, `b`) + + def not_in_test(self, x, full, b): + if not b: return '1' + if len(b)==1: return '%s != %s' % (x, `b[0]`) + return '%s not in %s' % (x, `b`) + + def peek_call(self, a): + a_set = (`a`[1:-1]) + if self.equal_set(a, self.non_ignored_tokens()): a_set = '' + if self['context-insensitive-scanner']: a_set = '' + return 'self._peek(%s)' % a_set + + def peek_test(self, a, b): + if self.subset(a, b): return '1' + if self['context-insensitive-scanner']: a = self.non_ignored_tokens() + return self.in_test(self.peek_call(a), a, b) + + def not_peek_test(self, a, b): + if self.subset(a, b): return '0' + return self.not_in_test(self.peek_call(a), a, b) + + def calculate(self): + while 1: + for r in self.goals: + self.rules[r].setup(self, r) + if self.change_count == 0: break + self.change_count = 0 + + while 1: + for r in self.goals: + self.rules[r].update(self) + if self.change_count == 0: break + self.change_count = 0 + + def dump_information(self): + self.calculate() + for r in self.goals: + print ' _____' + '_'*len(r) + print ('___/Rule '+r+'\\' + '_'*80)[:79] + queue = [self.rules[r]] + while queue: + top = queue[0] + del queue[0] + + print `top` + top.first.sort() + top.follow.sort() + eps = [] + if top.accepts_epsilon: eps = ['(null)'] + print ' FIRST:', join(top.first+eps, ', ') + print ' FOLLOW:', join(top.follow, ', ') + for x in top.get_children(): queue.append(x) + + def generate_output(self): + self.calculate() + self.write(self.preparser) + self.write("from string import *\n") + self.write("import re\n") + self.write("from yappsrt import *\n") + self.write("\n") + self.write("class ", self.name, "Scanner(Scanner):\n") + self.write(" patterns = [\n") + for p in self.terminals: + self.write(" (%s, re.compile(%s)),\n" % ( + `p`, `self.tokens[p]`)) + self.write(" ]\n") + self.write(" def __init__(self, str):\n") + self.write(" Scanner.__init__(self,None,%s,str)\n" % + `self.ignore`) + self.write("\n") + + self.write("class ", self.name, "(Parser):\n") + for r in self.goals: + self.write(INDENT, "def ", r, "(self") + if self.params[r]: self.write(", ", self.params[r]) + self.write("):\n") + self.rules[r].output(self, INDENT+INDENT) + self.write("\n") + + self.write("\n") + self.write("def parse(rule, text):\n") + self.write(" P = ", self.name, "(", self.name, "Scanner(text))\n") + self.write(" return wrap_error_reporter(P, rule)\n") + self.write("\n") + if self.postparser is not None: + self.write(self.postparser) + else: + self.write("if __name__=='__main__':\n") + self.write(INDENT, "from sys import argv, stdin\n") + self.write(INDENT, "if len(argv) >= 2:\n") + self.write(INDENT*2, "if len(argv) >= 3:\n") + self.write(INDENT*3, "f = open(argv[2],'r')\n") + self.write(INDENT*2, "else:\n") + self.write(INDENT*3, "f = stdin\n") + self.write(INDENT*2, "print parse(argv[1], f.read())\n") + self.write(INDENT, "else: print 'Args: <rule> [<filename>]'\n") + +###################################################################### +class Node: + def __init__(self): + self.first = [] + self.follow = [] + self.accepts_epsilon = 0 + self.rule = '?' + + def setup(self, gen, rule): + # Setup will change accepts_epsilon, + # sometimes from 0 to 1 but never 1 to 0. + # It will take a finite number of steps to set things up + self.rule = rule + + def used(self, vars): + "Return two lists: one of vars used, and the other of vars assigned" + return vars, [] + + def get_children(self): + "Return a list of sub-nodes" + return [] + + def __repr__(self): + return str(self) + + def update(self, gen): + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + "Write out code to _gen_ with _indent_:string indentation" + gen.write(indent, "assert 0 # Invalid parser node\n") + +class Terminal(Node): + def __init__(self, token): + Node.__init__(self) + self.token = token + self.accepts_epsilon = 0 + + def __str__(self): + return self.token + + def update(self, gen): + Node.update(self, gen) + if self.first != [self.token]: + self.first = [self.token] + gen.changed() + + def output(self, gen, indent): + gen.write(indent) + if re.match('[a-zA-Z_]+$', self.token): + gen.write(self.token, " = ") + gen.write("self._scan(%s)\n" % `self.token`) + +class Eval(Node): + def __init__(self, expr): + Node.__init__(self) + self.expr = expr + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '{{ %s }}' % strip(self.expr) + + def output(self, gen, indent): + gen.write(indent, strip(self.expr), '\n') + +class NonTerminal(Node): + def __init__(self, name, args): + Node.__init__(self) + self.name = name + self.args = args + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + try: + self.target = gen.rules[self.name] + if self.accepts_epsilon != self.target.accepts_epsilon: + self.accepts_epsilon = self.target.accepts_epsilon + gen.changed() + except KeyError: # Oops, it's nonexistent + print 'Error: no rule <%s>' % self.name + self.target = self + + def __str__(self): + return '<%s>' % self.name + + def update(self, gen): + Node.update(self, gen) + gen.equate(self.first, self.target.first) + gen.equate(self.follow, self.target.follow) + + def output(self, gen, indent): + gen.write(indent) + gen.write(self.name, " = ") + gen.write("self.", self.name, "(", self.args, ")\n") + +class Sequence(Node): + def __init__(self, *children): + Node.__init__(self) + self.children = children + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + for c in self.children: c.setup(gen, rule) + + if not self.accepts_epsilon: + # If it's not already accepting epsilon, it might now do so. + for c in self.children: + # any non-epsilon means all is non-epsilon + if not c.accepts_epsilon: break + else: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % join(map(lambda x: str(x), self.children)) + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + empty = 1 + for g_i in range(len(self.children)): + g = self.children[g_i] + + if empty: gen.add_to(self.first, g.first) + if not g.accepts_epsilon: empty = 0 + + if g_i == len(self.children)-1: + next = self.follow + else: + next = self.children[1+g_i].first + gen.add_to(g.follow, next) + + if self.children: + gen.add_to(self.follow, self.children[-1].follow) + + def output(self, gen, indent): + if self.children: + for c in self.children: + c.output(gen, indent) + else: + # Placeholder for empty sequences, just in case + gen.write(indent, 'pass\n') + +class Choice(Node): + def __init__(self, *children): + Node.__init__(self) + self.children = children + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + for c in self.children: c.setup(gen, rule) + + if not self.accepts_epsilon: + for c in self.children: + if c.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def get_children(self): + return self.children + + def __str__(self): + return '( %s )' % join(map(lambda x: str(x), self.children), ' | ') + + def update(self, gen): + Node.update(self, gen) + for g in self.children: + g.update(gen) + + for g in self.children: + gen.add_to(self.first, g.first) + gen.add_to(self.follow, g.follow) + for g in self.children: + gen.add_to(g.follow, self.follow) + if self.accepts_epsilon: + gen.add_to(self.first, self.follow) + + def output(self, gen, indent): + test = "if" + gen.write(indent, "_token_ = ", gen.peek_call(self.first), "\n") + tokens_seen = [] + tokens_unseen = self.first[:] + if gen['context-insensitive-scanner']: + # Context insensitive scanners can return ANY token, + # not only the ones in first. + tokens_unseen = gen.non_ignored_tokens() + for c in self.children: + testset = c.first[:] + removed = [] + for x in testset: + if x in tokens_seen: + testset.remove(x) + removed.append(x) + if x in tokens_unseen: tokens_unseen.remove(x) + tokens_seen = tokens_seen + testset + if removed: + if not testset: + print 'Error in rule', self.rule+':', c, 'never matches.' + else: + print 'Warning:', self + print ' * These tokens are being ignored:', join(removed, ', ') + print ' due to previous choices using them.' + + if testset: + if not tokens_unseen: # context sensitive scanners only! + if test=='if': + # if it's the first AND last test, then + # we can simply put the code without an if/else + c.output(gen, indent) + else: + gen.write(indent, "else: ") + t = gen.in_test('', [], testset) + if len(t) < 70-len(indent): + gen.write("#", t) + gen.write("\n") + c.output(gen, indent+INDENT) + else: + gen.write(indent, test, " ", + gen.in_test('_token_', tokens_unseen, testset), + ":\n") + c.output(gen, indent+INDENT) + test = "elif" + + if gen['context-insensitive-scanner'] and tokens_unseen: + gen.write(indent, "else:\n") + gen.write(indent, INDENT, "raise SyntaxError(self._pos, ") + gen.write("'Could not match ", self.rule, "')\n") + +class Wrapper(Node): + def __init__(self, child): + Node.__init__(self) + self.child = child + + def setup(self, gen, rule): + Node.setup(self, gen, rule) + self.child.setup(gen, rule) + + def get_children(self): + return [self.child] + + def update(self, gen): + Node.update(self, gen) + self.child.update(gen) + gen.add_to(self.first, self.child.first) + gen.equate(self.follow, self.child.follow) + +class Option(Wrapper): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '[ %s ]' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+': contents may be empty.' + gen.write(indent, "if %s:\n" % + gen.peek_test(self.first, self.child.first)) + self.child.output(gen, indent+INDENT) + +class Plus(Wrapper): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if self.accepts_epsilon != self.child.accepts_epsilon: + self.accepts_epsilon = self.child.accepts_epsilon + gen.changed() + + def __str__(self): + return '%s+' % str(self.child) + + def update(self, gen): + Wrapper.update(self, gen) + gen.add_to(self.follow, self.first) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+':' + print ' * The repeated pattern could be empty. The resulting' + print ' parser may not work properly.' + gen.write(indent, "while 1:\n") + self.child.output(gen, indent+INDENT) + union = self.first[:] + gen.add_to(union, self.follow) + gen.write(indent+INDENT, "if %s: break\n" % + gen.not_peek_test(union, self.child.first)) + +class Star(Plus): + def setup(self, gen, rule): + Wrapper.setup(self, gen, rule) + if not self.accepts_epsilon: + self.accepts_epsilon = 1 + gen.changed() + + def __str__(self): + return '%s*' % str(self.child) + + def output(self, gen, indent): + if self.child.accepts_epsilon: + print 'Warning in rule', self.rule+':' + print ' * The repeated pattern could be empty. The resulting' + print ' parser probably will not work properly.' + gen.write(indent, "while %s:\n" % + gen.peek_test(self.follow, self.child.first)) + self.child.output(gen, indent+INDENT) + +###################################################################### +# The remainder of this file is from parsedesc.{g,py} + +def append(lst, x): + "Imperative append" + lst.append(x) + return lst + +def add_inline_token(tokens, str): + tokens.insert( 0, (str, eval(str, {}, {})) ) + return Terminal(str) + +def cleanup_choice(lst): + if len(lst) == 0: return Sequence([]) + if len(lst) == 1: return lst[0] + return apply(Choice, tuple(lst)) + +def cleanup_sequence(lst): + if len(lst) == 1: return lst[0] + return apply(Sequence, tuple(lst)) + +def cleanup_rep(node, rep): + if rep == 'star': return Star(node) + elif rep == 'plus': return Plus(node) + else: return node + +def resolve_name(tokens, id, args): + if id in map(lambda x: x[0], tokens): + # It's a token + if args: + print 'Warning: ignoring parameters on TOKEN %s<<%s>>' % (id, args) + return Terminal(id) + else: + # It's a name, so assume it's a nonterminal + return NonTerminal(id, args) + + +from string import * +import re +from yappsrt import * + +class ParserDescriptionScanner(Scanner): + def __init__(self, str): + Scanner.__init__(self,[ + ('"rule"', 'rule'), + ('"ignore"', 'ignore'), + ('"token"', 'token'), + ('"option"', 'option'), + ('":"', ':'), + ('"parser"', 'parser'), + ('[ \011\015\012]+', '[ \011\015\012]+'), + ('#.*?\015?\012', '#.*?\015?\012'), + ('END', '$'), + ('ATTR', '<<.+?>>'), + ('STMT', '{{.+?}}'), + ('ID', '[a-zA-Z_][a-zA-Z_0-9]*'), + ('STR', '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"'), + ('LP', '\\('), + ('RP', '\\)'), + ('LB', '\\['), + ('RB', '\\]'), + ('OR', '[|]'), + ('STAR', '[*]'), + ('PLUS', '[+]'), + ], ['[ \011\015\012]+', '#.*?\015?\012'], str) + +class ParserDescription(Parser): + def Parser(self): + self._scan('"parser"') + ID = self._scan('ID') + self._scan('":"') + Options = self.Options() + Tokens = self.Tokens() + Rules = self.Rules(Tokens) + END = self._scan('END') + return Generator(ID,Options,Tokens,Rules) + + def Options(self): + opt = {} + while self._peek('"option"', '"token"', '"ignore"', 'END', '"rule"') == '"option"': + self._scan('"option"') + self._scan('":"') + Str = self.Str() + opt[Str] = 1 + return opt + + def Tokens(self): + tok = [] + while self._peek('"token"', '"ignore"', 'END', '"rule"') in ['"token"', '"ignore"']: + _token_ = self._peek('"token"', '"ignore"') + if _token_ == '"token"': + self._scan('"token"') + ID = self._scan('ID') + self._scan('":"') + Str = self.Str() + tok.append( (ID,Str) ) + else: # == '"ignore"' + self._scan('"ignore"') + self._scan('":"') + Str = self.Str() + tok.append( ('#ignore',Str) ) + return tok + + def Rules(self, tokens): + rul = [] + while self._peek('"rule"', 'END') == '"rule"': + self._scan('"rule"') + ID = self._scan('ID') + OptParam = self.OptParam() + self._scan('":"') + ClauseA = self.ClauseA(tokens) + rul.append( (ID,OptParam,ClauseA) ) + return rul + + def ClauseA(self, tokens): + ClauseB = self.ClauseB(tokens) + v = [ClauseB] + while self._peek('OR', 'RP', 'RB', '"rule"', 'END') == 'OR': + OR = self._scan('OR') + ClauseB = self.ClauseB(tokens) + v.append(ClauseB) + return cleanup_choice(v) + + def ClauseB(self, tokens): + v = [] + while self._peek('STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') in ['STR', 'ID', 'LP', 'LB', 'STMT']: + ClauseC = self.ClauseC(tokens) + v.append(ClauseC) + return cleanup_sequence(v) + + def ClauseC(self, tokens): + ClauseD = self.ClauseD(tokens) + _token_ = self._peek('PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') + if _token_ == 'PLUS': + PLUS = self._scan('PLUS') + return Plus(ClauseD) + elif _token_ == 'STAR': + STAR = self._scan('STAR') + return Star(ClauseD) + else: + return ClauseD + + def ClauseD(self, tokens): + _token_ = self._peek('STR', 'ID', 'LP', 'LB', 'STMT') + if _token_ == 'STR': + STR = self._scan('STR') + t = (STR, eval(STR,{},{})) + if t not in tokens: tokens.insert( 0, t ) + return Terminal(STR) + elif _token_ == 'ID': + ID = self._scan('ID') + OptParam = self.OptParam() + return resolve_name(tokens, ID, OptParam) + elif _token_ == 'LP': + LP = self._scan('LP') + ClauseA = self.ClauseA(tokens) + RP = self._scan('RP') + return ClauseA + elif _token_ == 'LB': + LB = self._scan('LB') + ClauseA = self.ClauseA(tokens) + RB = self._scan('RB') + return Option(ClauseA) + else: # == 'STMT' + STMT = self._scan('STMT') + return Eval(STMT[2:-2]) + + def OptParam(self): + if self._peek('ATTR', '":"', 'PLUS', 'STAR', 'STR', 'ID', 'LP', 'LB', 'STMT', 'OR', 'RP', 'RB', '"rule"', 'END') == 'ATTR': + ATTR = self._scan('ATTR') + return ATTR[2:-2] + return '' + + def Str(self): + STR = self._scan('STR') + return eval(STR,{},{}) + + + + + +# This replaces the default main routine + +yapps_options = [ + ('context-insensitive-scanner', 'context-insensitive-scanner', + 'Scan all tokens (see docs)') + ] + +def generate(inputfilename, outputfilename='', dump=0, **flags): + """Generate a grammar, given an input filename (X.g) + and an output filename (defaulting to X.py).""" + + if not outputfilename: + if inputfilename[-2:]=='.g': outputfilename = inputfilename[:-2]+'.py' + else: raise "Invalid Filename", outputfilename + + print 'Input Grammar:', inputfilename + print 'Output File:', outputfilename + + DIVIDER = '\n%%\n' # This pattern separates the pre/post parsers + preparser, postparser = None, None # Code before and after the parser desc + + # Read the entire file + s = open(inputfilename,'r').read() + + # See if there's a separation between the pre-parser and parser + f = find(s, DIVIDER) + if f >= 0: preparser, s = s[:f]+'\n\n', s[f+len(DIVIDER):] + + # See if there's a separation between the parser and post-parser + f = find(s, DIVIDER) + if f >= 0: s, postparser = s[:f], '\n\n'+s[f+len(DIVIDER):] + + # Create the parser and scanner + p = ParserDescription(ParserDescriptionScanner(s)) + if not p: return + + # Now parse the file + t = wrap_error_reporter(p, 'Parser') + if not t: return # Error + if preparser is not None: t.preparser = preparser + if postparser is not None: t.postparser = postparser + + # Check the options + for f in t.options.keys(): + for opt,_,_ in yapps_options: + if f == opt: break + else: + print 'Warning: unrecognized option', f + # Add command line options to the set + for f in flags.keys(): t.options[f] = flags[f] + + # Generate the output + if dump: + t.dump_information() + else: + t.output = open(outputfilename, 'w') + t.generate_output() + +if __name__=='__main__': + import sys, getopt + optlist, args = getopt.getopt(sys.argv[1:], 'f:', ['dump']) + if not args or len(args) > 2: + print 'Usage:' + print ' python', sys.argv[0], '[flags] input.g [output.py]' + print 'Flags:' + print (' --dump' + ' '*40)[:35] + 'Dump out grammar information' + for flag, _, doc in yapps_options: + print (' -f' + flag + ' '*40)[:35] + doc + else: + # Read in the options and create a list of flags + flags = {} + for opt in optlist: + for flag, name, _ in yapps_options: + if opt == ('-f', flag): + flags[name] = 1 + break + else: + if opt == ('--dump', ''): + flags['dump'] = 1 + else: + print 'Warning - unrecognized option: ', opt[0], opt[1] + + apply(generate, tuple(args), flags) diff --git a/util/sconfig/yapps2.tex b/util/sconfig/yapps2.tex new file mode 100644 index 0000000000..9d2bddf19c --- /dev/null +++ b/util/sconfig/yapps2.tex @@ -0,0 +1,1225 @@ +\documentclass[10pt]{article} +\usepackage{palatino} +\usepackage{html} +\usepackage{color} + +\setlength{\headsep}{0in} +\setlength{\headheight}{0in} +\setlength{\textheight}{8.5in} +\setlength{\textwidth}{5.9in} +\setlength{\oddsidemargin}{0.25in} + +\definecolor{darkblue}{rgb}{0,0,0.6} +\definecolor{darkerblue}{rgb}{0,0,0.3} + +%% \newcommand{\mysection}[1]{\section{\textcolor{darkblue}{#1}}} +%% \newcommand{\mysubsection}[1]{\subsection{\textcolor{darkerblue}{#1}}} +\newcommand{\mysection}[1]{\section{#1}} +\newcommand{\mysubsection}[1]{\subsection{#1}} + +\bodytext{bgcolor=white text=black link=#004080 vlink=#006020} + +\newcommand{\first}{\textsc{first}} +\newcommand{\follow}{\textsc{follow}} + +\begin{document} + +\begin{center} +\hfill \begin{tabular}{c} +{\Large The \emph{Yapps} Parser Generator System}\\ +\verb|http://theory.stanford.edu/~amitp/Yapps/|\\ + Version 2\\ +\\ +Amit J. Patel\\ +\htmladdnormallink{http://www-cs-students.stanford.edu/~amitp/} +{http://www-cs-students.stanford.edu/~amitp/} + +\end{tabular} \hfill \rule{0in}{0in} +\end{center} + +\mysection{Introduction} + +\emph{Yapps} (\underline{Y}et \underline{A}nother \underline{P}ython +\underline{P}arser \underline{S}ystem) is an easy to use parser +generator that is written in Python and generates Python code. There +are several parser generator systems already available for Python, +including \texttt{PyLR, kjParsing, PyBison,} and \texttt{mcf.pars,} +but I had different goals for my parser. Yapps is simple, is easy to +use, and produces human-readable parsers. It is not the fastest or +most powerful parser. Yapps is designed to be used when regular +expressions are not enough and other parser systems are too much: +situations where you may write your own recursive descent parser. + +Some unusual features of Yapps that may be of interest are: + +\begin{enumerate} + + \item Yapps produces recursive descent parsers that are readable by + humans, as opposed to table-driven parsers that are difficult to + read. A Yapps parser for a simple calculator looks similar to the + one that Mark Lutz wrote by hand for \emph{Programming Python.} + + \item Yapps also allows for rules that accept parameters and pass + arguments to be used while parsing subexpressions. Grammars that + allow for arguments to be passed to subrules and for values to be + passed back are often called \emph{attribute grammars.} In many + cases parameterized rules can be used to perform actions at ``parse + time'' that are usually delayed until later. For example, + information about variable declarations can be passed into the + rules that parse a procedure body, so that undefined variables can + be detected at parse time. The types of defined variables can be + used in parsing as well---for example, if the type of {\tt X} is + known, we can determine whether {\tt X(1)} is an array reference or + a function call. + + \item Yapps grammars are fairly easy to write, although there are + some inconveniences having to do with ELL(1) parsing that have to be + worked around. For example, rules have to be left factored and + rules may not be left recursive. However, neither limitation seems + to be a problem in practice. + + Yapps grammars look similar to the notation used in the Python + reference manual, with operators like \verb:*:, \verb:+:, \verb:|:, + \verb:[]:, and \verb:(): for patterns, names ({\tt tim}) for rules, + regular expressions (\verb:"[a-z]+":) for tokens, and \verb:#: for + comments. + + \item The Yapps parser generator is written as a single Python module + with no C extensions. Yapps produces parsers that are written + entirely in Python, and require only the Yapps run-time module (5k) + for support. + + \item Yapps's scanner is context-sensitive, picking tokens based on + the types of the tokens accepted by the parser. This can be + helpful when implementing certain kinds of parsers, such as for a + preprocessor. + +\end{enumerate} + +There are several disadvantages of using Yapps over another parser system: + +\begin{enumerate} + + \item Yapps parsers are \texttt{ELL(1)} (Extended LL(1)), which is + less powerful than \texttt{LALR} (used by \texttt{PyLR}) or + \texttt{SLR} (used by \texttt{kjParsing}), so Yapps would not be a + good choice for parsing complex languages. For example, allowing + both \texttt{x := 5;} and \texttt{x;} as statements is difficult + because we must distinguish based on only one token of lookahead. + Seeing only \texttt{x}, we cannot decide whether we have an + assignment statement or an expression statement. (Note however + that this kind of grammar can be matched with backtracking; see + section \ref{sec:future}.) + + \item The scanner that Yapps provides can only read from strings, not + files, so an entire file has to be read in before scanning can + begin. It is possible to build a custom scanner, though, so in + cases where stream input is needed (from the console, a network, or + a large file are examples), the Yapps parser can be given a custom + scanner that reads from a stream instead of a string. + + \item Yapps is not designed with efficiency in mind. + +\end{enumerate} + +Yapps provides an easy to use parser generator that produces parsers +similar to what you might write by hand. It is not meant to be a +solution for all parsing problems, but instead an aid for those times +you would write a parser by hand rather than using one of the more +powerful parsing packages available. + +Yapps 2.0 is easier to use than Yapps 1.0. New features include a +less restrictive input syntax, which allows mixing of sequences, +choices, terminals, and nonterminals; optional matching; the ability +to insert single-line statements into the generated parser; and +looping constructs \verb|*| and \verb|+| similar to the repetitive +matching constructs in regular expressions. Unfortunately, the +addition of these constructs has made Yapps 2.0 incompatible with +Yapps 1.0, so grammars will have to be rewritten. See section +\ref{sec:Upgrading} for tips on changing Yapps 1.0 grammars for use +with Yapps 2.0. + +\mysection{Examples} + +In this section are several examples that show the use of Yapps. +First, an introduction shows how to construct grammars and write them +in Yapps form. This example can be skipped by someone familiar with +grammars and parsing. Next is a Lisp expression grammar that produces +a parse tree as output. This example demonstrates the use of tokens +and rules, as well as returning values from rules. The third example +is a expression evaluation grammar that evaluates during parsing +(instead of producing a parse tree). + +\mysubsection{Introduction to Grammars} + +A \emph{grammar} for a natural language specifies how words can be put +together to form large structures, such as phrases and sentences. A +grammar for a computer language is similar in that it specifies how +small components (called \emph{tokens}) can be put together to form +larger structures. In this section we will write a grammar for a tiny +subset of English. + +Simple English sentences can be described as being a noun phrase +followed by a verb followed by a noun phrase. For example, in the +sentence, ``Jack sank the blue ship,'' the word ``Jack'' is the first +noun phrase, ``sank'' is the verb, and ``the blue ship'' is the second +noun phrase. In addition we should say what a noun phrase is; for +this example we shall say that a noun phrase is an optional article +(a, an, the) followed by any number of adjectives followed by a noun. +The tokens in our language are the articles, nouns, verbs, and +adjectives. The \emph{rules} in our language will tell us how to +combine the tokens together to form lists of adjectives, noun phrases, +and sentences: + +\begin{itemize} + \item \texttt{sentence: noun\_phrase verb noun\_phrase} + \item \texttt{noun\_phrase: [article] adjective* noun} +\end{itemize} + +Notice that some things that we said easily in English, such as +``optional article,'' are expressed using special syntax, such as +brackets. When we said, ``any number of adjectives,'' we wrote +\texttt{adjective*}, where the \texttt{*} means ``zero or more of the +preceding pattern''. + +The grammar given above is close to a Yapps grammar. We also have to +specify what the tokens are, and what to do when a pattern is matched. +For this example, we will do nothing when patterns are matched; the +next example will explain how to perform match actions. + +\begin{verbatim} +parser TinyEnglish: + ignore: "\\W+" + token noun: "(Jack|spam|ship)" + token verb: "(sank|threw)" + token article: "(a|an|the)" + token adjective: "(blue|red|green)" + + rule sentence: noun_phrase verb noun_phrase + rule noun_phrase: [article] adjective* noun +\end{verbatim} + +The tokens are specified as Python \emph{regular expressions}. Since +Yapps produces Python code, you can write any regular expression that +would be accepted by Python. (\emph{Note:} These are Python 1.5 +regular expressions from the \texttt{re} module, not Python 1.4 +regular expressions from the \texttt{regex} module.) In addition to +tokens that you want to see (which are given names), you can also +specify tokens to ignore, marked by the \texttt{ignore} keyword. In +this parser we want to ignore whitespace. + +The TinyEnglish grammar shows how you define tokens and rules, but it +does not specify what should happen once we've matched the rules. In +the next example, we will take a grammar and produce a \emph{parse +tree} from it. + +\mysubsection{Lisp Expressions} + +Lisp syntax, although hated by many, has a redeeming quality: it is +simple to parse. In this section we will construct a Yapps grammar to +parse Lisp expressions and produce a parse tree as output. + +\subsubsection*{Defining the Grammar} + +The syntax of Lisp is simple. It has expressions, which are +identifiers, strings, numbers, and lists. A list is a left +parenthesis followed by some number of expressions (separated by +spaces) followed by a right parenthesis. For example, \verb|5|, +\verb|"ni"|, and \verb|(print "1+2 = " (+ 1 2))| are Lisp expressions. +Written as a grammar, + +\begin{verbatim} + expr: ID | STR | NUM | list + list: ( expr* ) +\end{verbatim} + +In addition to having a grammar, we need to specify what to do every +time something is matched. For the tokens, which are strings, we just +want to get the ``value'' of the token, attach its type (identifier, +string, or number) in some way, and return it. For the lists, we want +to construct and return a Python list. + +Once some pattern is matched, we enclose a return statement enclosed +in \verb|{{...}}|. The braces allow us to insert any one-line +statement into the parser. Within this statement, we can refer to the +values returned by matching each part of the rule. After matching a +token such as \texttt{ID}, ``ID'' will be bound to the text of the +matched token. Let's take a look at the rule: + +\begin{verbatim} + rule expr: ID {{ return ('id', ID) }} + ... +\end{verbatim} + +In a rule, tokens return the text that was matched. For identifiers, +we just return the identifier, along with a ``tag'' telling us that +this is an identifier and not a string or some other value. Sometimes +we may need to convert this text to a different form. For example, if +a string is matched, we want to remove quotes and handle special forms +like \verb|\n|. If a number is matched, we want to convert it into a +number. Let's look at the return values for the other tokens: + +\begin{verbatim} + ... + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + ... +\end{verbatim} + +If we get a string, we want to remove the quotes and process any +special backslash codes, so we run \texttt{eval} on the quoted string. +If we get a number, we convert it to an integer with \texttt{atoi} and +then return the number along with its type tag. + +For matching a list, we need to do something slightly more +complicated. If we match a Lisp list of expressions, we want to +create a Python list with those values. + +\begin{verbatim} + rule list: "\\(" # Match the opening parenthesis + {{ result = [] }} # Create a Python list + ( + expr # When we match an expression, + {{ result.append(expr) }} # add it to the list + )* # * means repeat this if needed + "\\)" # Match the closing parenthesis + {{ return result }} # Return the Python list +\end{verbatim} + +In this rule we first match the opening parenthesis, then go into a +loop. In this loop we match expressions and add them to the list. +When there are no more expressions to match, we match the closing +parenthesis and return the resulting. Note that \verb:#: is used for +comments, just as in Python. + +The complete grammar is specified as follows: +\begin{verbatim} +parser Lisp: + ignore: '\\s+' + token NUM: '[0-9]+' + token ID: '[-+*/!@%^&=.a-zA-Z0-9_]+' + token STR: '"([^\\"]+|\\\\.)*"' + + rule expr: ID {{ return ('id', ID) }} + | STR {{ return ('str', eval(STR)) }} + | NUM {{ return ('num', atoi(NUM)) }} + | list {{ return list }} + rule list: "\\(" {{ result = [] }} + ( expr {{ result.append(expr) }} + )* + "\\)" {{ return result }} +\end{verbatim} + +One thing you may have noticed is that \verb|"\\("| and \verb|"\\)"| +appear in the \texttt{list} rule. These are \emph{inline tokens}: +they appear in the rules without being given a name with the +\texttt{token} keyword. Inline tokens are more convenient to use, but +since they do not have a name, the text that is matched cannot be used +in the return value. They are best used for short simple patterns +(usually punctuation or keywords). + +Another thing to notice is that the number and identifier tokens +overlap. For example, ``487'' matches both NUM and ID. In Yapps, the +scanner only tries to match tokens that are acceptable to the parser. +This rule doesn't help here, since both NUM and ID can appear in the +same place in the grammar. There are two rules used to pick tokens if +more than one matches. One is that the \emph{longest} match is +preferred. For example, ``487x'' will match as an ID (487x) rather +than as a NUM (487) followed by an ID (x). The second rule is that if +the two matches are the same length, the \emph{first} one listed in +the grammar is preferred. For example, ``487'' will match as an NUM +rather than an ID because NUM is listed first in the grammar. Inline +tokens have preference over any tokens you have listed. + +Now that our grammar is defined, we can run Yapps to produce a parser, +and then run the parser to produce a parse tree. + +\subsubsection*{Running Yapps} + +In the Yapps module is a function \texttt{generate} that takes an +input filename and writes a parser to another file. We can use this +function to generate the Lisp parser, which is assumed to be in +\texttt{lisp.g}. + +\begin{verbatim} +% python +Python 1.5.1 (#1, Sep 3 1998, 22:51:17) [GCC 2.7.2.3] on linux-i386 +Copyright 1991-1995 Stichting Mathematisch Centrum, Amsterdam +>>> import yapps +>>> yapps.generate('lisp.g') +\end{verbatim} + +At this point, Yapps has written a file \texttt{lisp.py} that contains +the parser. In that file are two classes (one scanner and one parser) +and a function (called \texttt{parse}) that puts things together for +you. + +Alternatively, we can run Yapps from the command line to generate the +parser file: + +\begin{verbatim} +% python yapps.py lisp.g +\end{verbatim} + +After running Yapps either from within Python or from the command +line, we can use the Lisp parser by calling the \texttt{parse} +function. The first parameter should be the rule we want to match, +and the second parameter should be the string to parse. + +\begin{verbatim} +>>> import lisp +>>> lisp.parse('expr', '(+ 3 4)') +[('id', '+'), ('num', 3), ('num', 4)] +>>> lisp.parse('expr', '(print "3 = " (+ 1 2))') +[('id', 'print'), ('str', '3 = '), [('id', '+'), ('num', 1), ('num', 2)]] +\end{verbatim} + +The \texttt{parse} function is not the only way to use the parser; +section \ref{sec:Parser-Objects} describes how to access parser objects +directly. + +We've now gone through the steps in creating a grammar, writing a +grammar file for Yapps, producing a parser, and using the parser. In +the next example we'll see how rules can take parameters and also how +to do computations instead of just returning a parse tree. + +\mysubsection{Calculator} + +A common example parser given in many textbooks is that for simple +expressions, with numbers, addition, subtraction, multiplication, +division, and parenthesization of subexpressions. We'll write this +example in Yapps, evaluating the expression as we parse. + +Unlike \texttt{yacc}, Yapps does not have any way to specify +precedence rules, so we have to do it ourselves. We say that an +expression is the sum of terms, and that a term is the product of +factors, and that a factor is a number or a parenthesized expression: + +\begin{verbatim} + expr: factor ( ("+"|"-") factor )* + factor: term ( ("*"|"/") term )* + term: NUM | "(" expr ")" +\end{verbatim} + +In order to evaluate the expression as we go, we should keep along an +accumulator while evaluating the lists of terms or factors. Just as +we kept a ``result'' variable to build a parse tree for Lisp +expressions, we will use a variable to evaluate numerical +expressions. The full grammar is given below: + +\begin{verbatim} +parser Calculator: + token END: "$" # $ means end of string + token NUM: "[0-9]+" + + rule goal: expr END {{ return expr }} + + # An expression is the sum and difference of factors + rule expr: factor {{ v = factor }} + ( "[+]" factor {{ v = v+factor }} + | "-" factor {{ v = v-factor }} + )* {{ return v }} + + # A factor is the product and division of terms + rule factor: term {{ v = term }} + ( "[*]" term {{ v = v*term }} + | "/" term {{ v = v/term }} + )* {{ return v }} + + # A term is either a number or an expression surrounded by parentheses + rule term: NUM {{ return atoi(NUM) }} + | "\\(" expr "\\)" {{ return expr }} +\end{verbatim} + +The top-level rule is \emph{goal}, which says that we are looking for +an expression followed by the end of the string. The \texttt{END} +token is needed because without it, it isn't clear when to stop +parsing. For example, the string ``1+3'' could be parsed either as +the expression ``1'' followed by the string ``+3'' or it could be +parsed as the expression ``1+3''. By requiring expressions to end +with \texttt{END}, the parser is forced to take ``1+3''. + +In the two rules with repetition, the accumulator is named \texttt{v}. +After reading in one expression, we initialize the accumulator. Each +time through the loop, we modify the accumulator by adding, +subtracting, multiplying by, or dividing the previous accumulator by +the expression that has been parsed. At the end of the rule, we +return the accumulator. + +The calculator example shows how to process lists of elements using +loops, as well as how to handle precedence of operators. + +\emph{Note:} It's often important to put the \texttt{END} token in, so +put it in unless you are sure that your grammar has some other +non-ambiguous token marking the end of the program. + +\mysubsection{Calculator with Memory} + +In the previous example we learned how to write a calculator that +evaluates simple numerical expressions. In this section we will +extend the example to support both local and global variables. + +To support global variables, we will add assignment statements to the +``goal'' rule. + +\begin{verbatim} + rule goal: expr END {{ return expr }} + | 'set' ID expr END {{ global_vars[ID] = expr }} + {{ return expr }} +\end{verbatim} + +To use these variables, we need a new kind of terminal: + +\begin{verbatim} + rule term: ... | ID {{ return global_vars[ID] }} +\end{verbatim} + +So far, these changes are straightforward. We simply have a global +dictionary \texttt{global\_vars} that stores the variables and values, +we modify it when there is an assignment statement, and we look up +variables in it when we see a variable name. + +To support local variables, we will add variable declarations to the +set of allowed expressions. + +\begin{verbatim} + rule term: ... | 'let' VAR '=' expr 'in' expr ... +\end{verbatim} + +This is where it becomes tricky. Local variables should be stored in +a local dictionary, not in the global one. One trick would be to save +a copy of the global dictionary, modify it, and then restore it +later. In this example we will instead use \emph{attributes} to +create local information and pass it to subrules. + +A rule can optionally take parameters. When we invoke the rule, we +must pass in arguments. For local variables, let's use a single +parameter, \texttt{local\_vars}: + +\begin{verbatim} + rule expr<<local_vars>>: ... + rule factor<<local_vars>>: ... + rule term<<local_vars>>: ... +\end{verbatim} + +Each time we want to match \texttt{expr}, \texttt{factor}, or +\texttt{term}, we will pass the local variables in the current rule to +the subrule. One interesting case is when we pass as an argument +something \emph{other} than \texttt{local\_vars}: + +\begin{verbatim} + rule term<<local_vars>>: ... + | 'let' VAR '=' expr<<local_vars>> + {{ local_vars = [(VAR, expr)] + local_vars }} + 'in' expr<<local_vars>> + {{ return expr }} +\end{verbatim} + +Note that the assignment to the local variables list does not modify +the original list. This is important to keep local variables from +being seen outside the ``let''. + +The other interesting case is when we find a variable: + +\begin{verbatim} +global_vars = {} + +def lookup(map, name): + for x,v in map: if x==name: return v + return global_vars[name] +%% + ... + rule term<<local_vars>: ... + | VAR {{ return lookup(local_vars, VAR) }} +\end{verbatim} + +The lookup function will search through the local variable list, and +if it cannot find the name there, it will look it up in the global +variable dictionary. + +A complete grammar for this example, including a read-eval-print loop +for interacting with the calculator, can be found in the examples +subdirectory included with Yapps. + +In this section we saw how to insert code before the parser. We also +saw how to use attributes to transmit local information from one rule +to its subrules. + +\mysection{Grammars} + +Each Yapps grammar has a name, a list of tokens, and a set of +production rules. A grammar named \texttt{X} will be used to produce +a parser named \texttt{X} and a scanner anmed \texttt{XScanner}. As +in Python, names are case sensitive, start with a letter, and contain +letters, numbers, and underscores (\_). + +There are three kinds of tokens in Yapps: named, inline, and ignored. +As their name implies, named tokens are given a name, using the token +construct: \texttt{token \emph{name} : \emph{regexp}}. In a rule, the +token can be matched by using the name. Inline tokens are regular +expressions that are used in rules without being declared. Ignored +tokens are declared using the ignore construct: \texttt{ignore: + \emph{regexp}}. These tokens are ignored by the scanner, and are +not seen by the parser. Often whitespace is an ignored token. The +regular expressions used to define tokens should use the syntax +defined in the \texttt{re} module, so some symbols may have to be +backslashed. + +Production rules in Yapps have a name and a pattern to match. If the +rule is parameterized, the name should be followed by a list of +parameter names in \verb|<<...>>|. A pattern can be a simple pattern +or a compound pattern. Simple patterns are the name of a named token, +a regular expression in quotes (inline token), the name of a +production rule (followed by arguments in \verb|<<...>>|, if the rule +has parameters), and single line Python statements (\verb|{{...}}|). +Compound patterns are sequences (\verb|A B C ...|), choices ( +\verb:A | B | C | ...:), options (\verb|[...]|), zero-or-more repetitions +(\verb|...*|), and one-or-more repetitions (\verb|...+|). Like +regular expressions, repetition operators have a higher precedence +than sequences, and sequences have a higher precedence than choices. + +Whenever \verb|{{...}}| is used, a legal one-line Python statement +should be put inside the braces. The token \verb|}}| should not +appear within the \verb|{{...}}| section, even within a string, since +Yapps does not attempt to parse the Python statement. A workaround +for strings is to put two strings together (\verb|"}" "}"|), or to use +backslashes (\verb|"}\}"|). At the end of a rule you should use a +\verb|{{ return X }}| statement to return a value. However, you +should \emph{not} use any control statements (\texttt{return}, +\texttt{continue}, \texttt{break}) in the middle of a rule. Yapps +needs to make assumptions about the control flow to generate a parser, +and any changes to the control flow will confuse Yapps. + +The \verb|<<...>>| form can occur in two places: to define parameters +to a rule and to give arguments when matching a rule. Parameters use +the syntax used for Python functions, so they can include default +arguments and the special forms (\verb|*args| and \verb|**kwargs|). +Arguments use the syntax for Python function call arguments, so they +can include normal arguments and keyword arguments. The token +\verb|>>| should not appear within the \verb|<<...>>| section. + +In both the statements and rule arguments, you can use names defined +by the parser to refer to matched patterns. You can refer to the text +matched by a named token by using the token name. You can use the +value returned by a production rule by using the name of that rule. +If a name \texttt{X} is matched more than once (such as in loops), you +will have to save the earlier value(s) in a temporary variable, and +then use that temporary variable in the return value. The next +section has an example of a name that occurs more than once. + +\mysubsection{Left Factoring} +\label{sec:Left-Factoring} + +Yapps produces ELL(1) parsers, which determine which clause to match +based on the first token available. Sometimes the leftmost tokens of +several clauses may be the same. The classic example is the +\emph{if/then/else} construct in Pascal: + +\begin{verbatim} +rule stmt: "if" expr "then" stmt {{ then_part = stmt }} + "else" stmt {{ return ('If',expr,then_part,stmt) }} + | "if" expr "then" stmt {{ return ('If',expr,stmt,[]) }} +\end{verbatim} + +(Note that we have to save the first \texttt{stmt} into a variable +because there is another \texttt{stmt} that will be matched.) The +left portions of the two clauses are the same, which presents a +problem for the parser. The solution is \emph{left-factoring}: the +common parts are put together, and \emph{then} a choice is made about +the remaining part: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Unfortunately, the classic \emph{if/then/else} situation is +\emph{still} ambiguous when you left-factor. Yapps can deal with this +situation, but will report a warning; see section +\ref{sec:Ambiguous-Grammars} for details. + +In general, replace rules of the form: + +\begin{verbatim} +rule A: a b1 {{ return E1 }} + | a b2 {{ return E2 }} + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: a ( b1 {{ return E1 }} + | b2 {{ return E2 }} + ) + | c3 {{ return E3 }} + | c4 {{ return E4 }} +\end{verbatim} + +\mysubsection{Left Recursion} + +A common construct in grammars is for matching a list of patterns, +sometimes separated with delimiters such as commas or semicolons. In +LR-based parser systems, we can parse a list with something like this: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | sum "+" NUM {{ return (sum, NUM) }} +\end{verbatim} + +Parsing \texttt{1+2+3+4} would produce the output +\texttt{(((1,2),3),4)}, which is what we want from a left-associative +addition operator. Unfortunately, this grammar is \emph{left +recursive,} because the \texttt{sum} rule contains a clause that +begins with \texttt{sum}. (The recursion occurs at the left side of +the clause.) + +We must restructure this grammar to be \emph{right recursive} instead: + +\begin{verbatim} +rule sum: NUM {{ return NUM }} + | NUM "+" sum {{ return (NUM, sum) }} +\end{verbatim} + +Unfortunately, using this grammar, \texttt{1+2+3+4} would be parsed as +\texttt{(1,(2,(3,4)))}, which no longer follows left associativity. +The rule also needs to be left-factored. Instead, we write the +pattern as a loop instead: + +\begin{verbatim} +rule sum: NUM {{ v = NUM }} + ( "[+]" NUM {{ v = (v,NUM) }} )* + {{ return v }} +\end{verbatim} + +In general, replace rules of the form: + +\begin{verbatim} +rule A: A a1 -> << E1 >> + | A a2 -> << E2 >> + | b3 -> << E3 >> + | b4 -> << E4 >> +\end{verbatim} + +with rules of the form: + +\begin{verbatim} +rule A: ( b3 {{ A = E3 }} + | b4 {{ A = E4 }} ) + ( a1 {{ A = E1 }} + | a2 {{ A = E2 }} )* + {{ return A }} +\end{verbatim} + +We have taken a rule that proved problematic for with recursion and +turned it into a rule that works well with looping constructs. + +\mysubsection{Ambiguous Grammars} +\label{sec:Ambiguous-Grammars} + +In section \ref{sec:Left-Factoring} we saw the classic if/then/else +ambiguity, which occurs because the ``else \ldots'' portion of an ``if +\ldots then \ldots else \ldots'' construct is optional. Programs with +nested if/then/else constructs can be ambiguous when one of the else +clauses is missing: +\begin{verbatim} +if 1 then if 1 then + if 5 then if 5 then + x := 1; x := 1; + else else + y := 9; y := 9; +\end{verbatim} + +The indentation shows that the program can be parsed in two different +ways. (Of course, if we all would adopt Python's indentation-based +structuring, this would never happen!) Usually we want the parsing on +the left: the ``else'' should be associated with the closest ``if'' +statement. In section \ref{sec:Left-Factoring} we ``solved'' the +problem by using the following grammar: + +\begin{verbatim} +rule stmt: "if" expr + "then" stmt {{ then_part = stmt }} + {{ else_part = [] }} + [ "else" stmt {{ else_part = stmt }} ] + {{ return ('If', expr, then_part, else_part) }} +\end{verbatim} + +Here, we have an optional match of ``else'' followed by a statement. +The ambiguity is that if an ``else'' is present, it is not clear +whether you want it parsed immediately or if you want it to be parsed +by the outer ``if''. + +Yapps will deal with the situation by matching when the else pattern +when it can. The parser will work in this case because it prefers the +\emph{first} matching clause, which tells Yapps to parse the ``else''. +That is exactly what we want! + +For ambiguity cases with choices, Yapps will choose the \emph{first} +matching choice. However, remember that Yapps only looks at the first +token to determine its decision, so {\tt (a b | a c)} will result in +Yapps choosing {\tt a b} even when the input is {\tt a c}. It only +looks at the first token, {\tt a}, to make its decision. + +\mysection{Customization} + +Both the parsers and the scanners can be customized. The parser is +usually extended by subclassing, and the scanner can either be +subclassed or completely replaced. + +\mysubsection{Customizing Parsers} + +If additional fields and methods are needed in order for a parser to +work, Python subclassing can be used. (This is unlike parser classes +written in static languages, in which these fields and methods must be +defined in the generated parser class.) We simply subclass the +generated parser, and add any fields or methods required. Expressions +in the grammar can call methods of the subclass to perform any actions +that cannot be expressed as a simple expression. For example, +consider this simple grammar: + +\begin{verbatim} +parser X: + rule goal: "something" {{ self.printmsg() }} +\end{verbatim} + +The \texttt{printmsg} function need not be implemented in the parser +class \texttt{X}; it can be implemented in a subclass: + +\begin{verbatim} +import Xparser + +class MyX(Xparser.X): + def printmsg(self): + print "Hello!" +\end{verbatim} + +\mysubsection{Customizing Scanners} + +The generated parser class is not dependent on the generated scanner +class. A scanner object is passed to the parser object's constructor +in the \texttt{parse} function. To use a different scanner, write +your own function to construct parser objects, with an instance of a +different scanner. Scanner objects must have a \texttt{token} method +that accepts an integer \texttt{N} as well as a list of allowed token +types, and returns the Nth token, as a tuple. The default scanner +raises \texttt{NoMoreTokens} if no tokens are available, and +\texttt{SyntaxError} if no token could be matched. However, the +parser does not rely on these exceptions; only the \texttt{parse} +convenience function (which calls \texttt{wrap\_error\_reporter}) and +the \texttt{print\_error} error display function use those exceptions. + +The tuples representing tokens have four elements. The first two are +the beginning and ending indices of the matched text in the input +string. The third element is the type tag, matching either the name +of a named token or the quoted regexp of an inline or ignored token. +The fourth element of the token tuple is the matched text. If the +input string is \texttt{s}, and the token tuple is +\texttt{(b,e,type,val)}, then \texttt{val} should be equal to +\texttt{s[b:e]}. + +The generated parsers do not the beginning or ending index. They use +only the token type and value. However, the default error reporter +uses the beginning and ending index to show the user where the error +is. + +\mysection{Parser Mechanics} + +The base parser class (Parser) defines two methods, \texttt{\_scan} +and \texttt{\_peek}, and two fields, \texttt{\_pos} and +\texttt{\_scanner}. The generated parser inherits from the base +parser, and contains one method for each rule in the grammar. To +avoid name clashes, do not use names that begin with an underscore +(\texttt{\_}). + +\mysubsection{Parser Objects} +\label{sec:Parser-Objects} + +Yapps produces as output two exception classes, a scanner class, a +parser class, and a function \texttt{parse} that puts everything +together. The \texttt{parse} function does not have to be used; +instead, one can create a parser and scanner object and use them +together for parsing. + +\begin{verbatim} + def parse(rule, text): + P = X(XScanner(text)) + return wrap_error_reporter(P, rule) +\end{verbatim} + +The \texttt{parse} function takes a name of a rule and an input string +as input. It creates a scanner and parser object, then calls +\texttt{wrap\_error\_reporter} to execute the method in the parser +object named \texttt{rule}. The wrapper function will call the +appropriate parser rule and report any parsing errors to standard +output. + +There are several situations in which the \texttt{parse} function +would not be useful. If a different parser or scanner is being used, +or exceptions are to be handled differently, a new \texttt{parse} +function would be required. The supplied \texttt{parse} function can +be used as a template for writing a function for your own needs. An +example of a custom parse function is the \texttt{generate} function +in \texttt{Yapps.py}. + +\mysubsection{Context Sensitive Scanner} + +Unlike most scanners, the scanner produced by Yapps can take into +account the context in which tokens are needed, and try to match only +good tokens. For example, in the grammar: + +\begin{verbatim} +parser IniFile: + token ID: "[a-zA-Z_0-9]+" + token VAL: ".*" + + rule pair: ID "[ \t]*=[ \t]*" VAL "\n" +\end{verbatim} + +we would like to scan lines of text and pick out a name/value pair. +In a conventional scanner, the input string \texttt{shell=progman.exe} +would be turned into a single token of type \texttt{VAL}. The Yapps +scanner, however, knows that at the beginning of the line, an +\texttt{ID} is expected, so it will return \texttt{"shell"} as a token +of type \texttt{ID}. Later, it will return \texttt{"progman.exe"} as +a token of type \texttt{VAL}. + +Context sensitivity decreases the separation between scanner and +parser, but it is useful in parsers like \texttt{IniFile}, where the +tokens themselves are not unambiguous, but \emph{are} unambiguous +given a particular stage in the parsing process. + +Unfortunately, context sensitivity can make it more difficult to +detect errors in the input. For example, in parsing a Pascal-like +language with ``begin'' and ``end'' as keywords, a context sensitive +scanner would only match ``end'' as the END token if the parser is in +a place that will accept the END token. If not, then the scanner +would match ``end'' as an identifier. To disable the context +sensitive scanner in Yapps, add the +\texttt{context-insensitive-scanner} option to the grammar: + +\begin{verbatim} +Parser X: + option: "context-insensitive-scanner" +\end{verbatim} + +Context-insensitive scanning makes the parser look cleaner as well. + +\mysubsection{Internal Variables} + +There are two internal fields that may be of use. The parser object +has two fields, \texttt{\_pos}, which is the index of the current +token being matched, and \texttt{\_scanner}, which is the scanner +object. The token itself can be retrieved by accessing the scanner +object and calling the \texttt{token} method with the token index. However, if you call \texttt{token} before the token has been requested by the parser, it may mess up a context-sensitive scanner.\footnote{When using a context-sensitive scanner, the parser tells the scanner what the valid token types are at each point. If you call \texttt{token} before the parser can tell the scanner the valid token types, the scanner will attempt to match without considering the context.} A +potentially useful combination of these fields is to extract the +portion of the input matched by the current rule. To do this, just save the scanner state (\texttt{\_scanner.pos}) before the text is matched and then again after the text is matched: + +\begin{verbatim} + rule R: + {{ start = self._scanner.pos }} + a b c + {{ end = self._scanner.pos }} + {{ print 'Text is', self._scanner.input[start:end] }} +\end{verbatim} + +\mysubsection{Pre- and Post-Parser Code} + +Sometimes the parser code needs to rely on helper variables, +functions, and classes. A Yapps grammar can optionally be surrounded +by double percent signs, to separate the grammar from Python code. + +\begin{verbatim} +... Python code ... +%% +... Yapps grammar ... +%% +... Python code ... +\end{verbatim} + +The second \verb|%%| can be omitted if there is no Python code at the +end, and the first \verb|%%| can be omitted if there is no extra +Python code at all. (To have code only at the end, both separators +are required.) + +If the second \verb|%%| is omitted, Yapps will insert testing code +that allows you to use the generated parser to parse a file. + +The extended calculator example in the Yapps examples subdirectory +includes both pre-parser and post-parser code. + +\mysubsection{Representation of Grammars} + +For each kind of pattern there is a class derived from Pattern. Yapps +has classes for Terminal, NonTerminal, Sequence, Choice, Option, Plus, +Star, and Eval. Each of these classes has the following interface: + +\begin{itemize} + \item[setup(\emph{gen})] Set accepts-$\epsilon$, and call + \emph{gen.changed()} if it changed. This function can change the + flag from false to true but \emph{not} from true to false. + \item[update(\emph(gen))] Set \first and \follow, and call + \emph{gen.changed()} if either changed. This function can add to + the sets but \emph{not} remove from them. + \item[output(\emph{gen}, \emph{indent})] Generate code for matching + this rule, using \emph{indent} as the current indentation level. + Writes are performed using \emph{gen.write}. + \item[used(\emph{vars})] Given a list of variables \emph{vars}, + return two lists: one containing the variables that are used, and + one containing the variables that are assigned. This function is + used for optimizing the resulting code. +\end{itemize} + +Both \emph{setup} and \emph{update} monotonically increase the +variables they modify. Since the variables can only increase a finite +number of times, we can repeatedly call the function until the +variable stabilized. The \emph{used} function is not currently +implemented. + +With each pattern in the grammar Yapps associates three pieces of +information: the \first set, the \follow set, and the +accepts-$\epsilon$ flag. + +The \first set contains the tokens that can appear as we start +matching the pattern. The \follow set contains the tokens that can +appear immediately after we match the pattern. The accepts-$\epsilon$ +flag is true if the pattern can match no tokens. In this case, \first +will contain all the elements in \follow. The \follow set is not +needed when accepts-$\epsilon$ is false, and may not be accurate in +those cases. + +Yapps does not compute these sets precisely. Its approximation can +miss certain cases, such as this one: + +\begin{verbatim} + rule C: ( A* | B ) + rule B: C [A] +\end{verbatim} + +Yapps will calculate {\tt C}'s \follow set to include {\tt A}. +However, {\tt C} will always match all the {\tt A}'s, so {\tt A} will +never follow it. Yapps 2.0 does not properly handle this construct, +but if it seems important, I may add support for it in a future +version. + +Yapps also cannot handle constructs that depend on the calling +sequence. For example: + +\begin{verbatim} + rule R: U | 'b' + rule S: | 'c' + rule T: S 'b' + rule U: S 'a' +\end{verbatim} + +The \follow set for {\tt S} includes {\tt a} and {\tt b}. Since {\tt + S} can be empty, the \first set for {\tt S} should include {\tt a}, +{\tt b}, and {\tt c}. However, when parsing {\tt R}, if the lookahead +is {\tt b} we should \emph{not} parse {\tt U}. That's because in {\tt + U}, {\tt S} is followed by {\tt a} and not {\tt b}. Therefore in +{\tt R}, we should choose rule {\tt U} only if there is an {\tt a} or +{\tt c}, but not if there is a {\tt b}. Yapps and many other LL(1) +systems do not distinguish {\tt S b} and {\tt S a}, making {\tt + S}'s \follow set {\tt a, b}, and making {\tt R} always try to match +{\tt U}. In this case we can solve the problem by changing {\tt R} to +\verb:'b' | U: but it may not always be possible to solve all such +problems in this way. + +\appendix + +\mysection{Grammar for Parsers} + +This is the grammar for parsers, without any Python code mixed in. +The complete grammar can be found in \texttt{parsedesc.g} in the Yapps +distribution. + +\begin{verbatim} +parser ParserDescription: + ignore: "\\s+" + ignore: "#.*?\r?\n" + token END: "$" # $ means end of string + token ATTR: "<<.+?>>" + token STMT: "{{.+?}}" + token ID: '[a-zA-Z_][a-zA-Z_0-9]*' + token STR: '[rR]?\'([^\\n\'\\\\]|\\\\.)*\'|[rR]?"([^\\n"\\\\]|\\\\.)*"' + + rule Parser: "parser" ID ":" + Options + Tokens + Rules + END + + rule Options: ( "option" ":" STR )* + rule Tokens: ( "token" ID ":" STR | "ignore" ":" STR )* + rule Rules: ( "rule" ID OptParam ":" ClauseA )* + + rule ClauseA: ClauseB ( '[|]' ClauseB )* + rule ClauseB: ClauseC* + rule ClauseC: ClauseD [ '[+]' | '[*]' ] + rule ClauseD: STR | ID [ATTR] | STMT + | '\\(' ClauseA '\\) | '\\[' ClauseA '\\]' +\end{verbatim} + +\mysection{Upgrading} + +Yapps 2.0 is not backwards compatible with Yapps 1.0. In this section +are some tips for upgrading: + +\begin{enumerate} + \item Yapps 1.0 was distributed as a single file. Yapps 2.0 is + instead distributed as two Python files: a \emph{parser generator} + (26k) and a \emph{parser runtime} (5k). You need both files to + create parsers, but you need only the runtime (\texttt{yappsrt.py}) + to use the parsers. + + \item Yapps 1.0 supported Python 1.4 regular expressions from the + \texttt{regex} module. Yapps 2.0 uses Python 1.5 regular + expressions from the \texttt{re} module. \emph{The new syntax for + regular expressions is not compatible with the old syntax.} + Andrew Kuchling has a \htmladdnormallink{guide to converting + regular + expressions}{http://www.python.org/doc/howto/regex-to-re/} on his + web page. + + \item Yapps 1.0 wants a pattern and then a return value in \verb|->| + \verb|<<...>>|. Yapps 2.0 allows patterns and Python statements to + be mixed. To convert a rule like this: + +\begin{verbatim} +rule R: A B C -> << E1 >> + | X Y Z -> << E2 >> +\end{verbatim} + + to Yapps 2.0 form, replace the return value specifiers with return + statements: + +\begin{verbatim} +rule R: A B C {{ return E1 }} + | X Y Z {{ return E2 }} +\end{verbatim} + + \item Yapps 2.0 does not perform tail recursion elimination. This + means any recursive rules you write will be turned into recursive + methods in the parser. The parser will work, but may be slower. + It can be made faster by rewriting recursive rules, using instead + the looping operators \verb|*| and \verb|+| provided in Yapps 2.0. + +\end{enumerate} + +\mysection{Troubleshooting} + +\begin{itemize} + \item A common error is to write a grammar that doesn't have an END + token. End tokens are needed when it is not clear when to stop + parsing. For example, when parsing the expression {\tt 3+5}, it is + not clear after reading {\tt 3} whether to treat it as a complete + expression or whether the parser should continue reading. + Therefore the grammar for numeric expressions should include an end + token. Another example is the grammar for Lisp expressions. In + Lisp, it is always clear when you should stop parsing, so you do + \emph{not} need an end token. In fact, it may be more useful not + to have an end token, so that you can read in several Lisp expressions. + \item If there is a chance of ambiguity, make sure to put the choices + in the order you want them checked. Usually the most specific + choice should be first. Empty sequences should usually be last. + \item The context sensitive scanner is not appropriate for all + grammars. You might try using the insensitive scanner with the + {\tt context-insensitive-scanner} option in the grammar. + \item If performance turns out to be a problem, try writing a custom + scanner. The Yapps scanner is rather slow (but flexible and easy + to understand). +\end{itemize} + +\mysection{History} + +Yapps 1 had several limitations that bothered me while writing +parsers: + +\begin{enumerate} + \item It was not possible to insert statements into the generated + parser. A common workaround was to write an auxilliary function + that executed those statements, and to call that function as part + of the return value calculation. For example, several of my + parsers had an ``append(x,y)'' function that existed solely to call + ``x.append(y)''. + \item The way in which grammars were specified was rather + restrictive: a rule was a choice of clauses. Each clause was a + sequence of tokens and rule names, followed by a return value. + \item Optional matching had to be put into a separate rule because + choices were only made at the beginning of a rule. + \item Repetition had to be specified in terms of recursion. Not only + was this awkward (sometimes requiring additional rules), I had to + add a tail recursion optimization to Yapps to transform the + recursion back into a loop. +\end{enumerate} + +Yapps 2 addresses each of these limitations. + +\begin{enumerate} + \item Statements can occur anywhere within a rule. (However, only + one-line statements are allowed; multiline blocks marked by + indentation are not.) + \item Grammars can be specified using any mix of sequences, choices, + tokens, and rule names. To allow for complex structures, + parentheses can be used for grouping. + \item Given choices and parenthesization, optional matching can be + expressed as a choice between some pattern and nothing. In + addition, Yapps 2 has the convenience syntax \verb|[A B ...]| for + matching \verb|A B ...| optionally. + \item Repetition operators \verb|*| for zero or more and \verb|+| for + one or more make it easy to specify repeating patterns. +\end{enumerate} + +It is my hope that Yapps 2 will be flexible enough to meet my needs +for another year, yet simple enough that I do not hesitate to use it. + +\mysection{Future Extensions} +\label{sec:future} + +I am still investigating the possibility of LL(2) and higher +lookahead. However, it looks like the resulting parsers will be +somewhat ugly. + +It would be nice to control choices with user-defined predicates. + +The most likely future extension is backtracking. A grammar pattern +like \verb|(VAR ':=' expr)? {{ return Assign(VAR,expr) }} : expr {{ return expr }}| +would turn into code that attempted to match \verb|VAR ':=' expr|. If +it succeeded, it would run \verb|{{ return ... }}|. If it failed, it +would match \verb|expr {{ return expr }}|. Backtracking may make it +less necessary to write LL(2) grammars. + +\mysection{References} + +\begin{enumerate} + \item The \htmladdnormallink{Python-Parser + SIG}{http://www.python.org/sigs/parser-sig/} is the first place + to look for a list of parser systems for Python. + + \item ANTLR/PCCTS, by Terrence Parr, is available at + \htmladdnormallink{The ANTLR Home Page}{http://www.antlr.org/}. + + \item PyLR, by Scott Cotton, is at \htmladdnormallink{his Starship + page}{http://starship.skyport.net/crew/scott/PyLR.html}. + + \item John Aycock's \htmladdnormallink{Compiling Little Languages + Framework}{http://www.csr.UVic.CA/~aycock/python/}. + + \item PyBison, by Scott Hassan, can be found at + \htmladdnormallink{his Python Projects + page}{http://coho.stanford.edu/\~{}hassan/Python/}. + + \item mcf.pars, by Mike C. Fletcher, is available at + \htmladdnormallink{his web + page}{http://www.golden.net/\~{}mcfletch/programming/}. + + \item kwParsing, by Aaron Watters, is available at + \htmladdnormallink{his Starship + page}{http://starship.skyport.net/crew/aaron_watters/kwParsing/}. +\end{enumerate} + +\end{document} diff --git a/util/sconfig/yappsrt.py b/util/sconfig/yappsrt.py new file mode 100644 index 0000000000..2ce2480f08 --- /dev/null +++ b/util/sconfig/yappsrt.py @@ -0,0 +1,172 @@ +# Yapps 2.0 Runtime +# +# This module is needed to run generated parsers. + +from string import * +import exceptions +import re + +class SyntaxError(Exception): + """When we run into an unexpected token, this is the exception to use""" + def __init__(self, pos=-1, msg="Bad Token"): + self.pos = pos + self.msg = msg + def __repr__(self): + if self.pos < 0: return "#<syntax-error>" + else: return "SyntaxError[@ char " + `self.pos` + ": " + self.msg + "]" + +class NoMoreTokens(Exception): + """Another exception object, for when we run out of tokens""" + pass + +class Scanner: + def __init__(self, patterns, ignore, input): + """Patterns is [(terminal,regex)...] + Ignore is [terminal,...]; + Input is a string""" + self.tokens = [] + self.restrictions = [] + self.input = input + self.pos = 0 + self.ignore = ignore + # The stored patterns are a pair (compiled regex,source + # regex). If the patterns variable passed in to the + # constructor is None, we assume that the class already has a + # proper .patterns list constructed + if patterns is not None: + self.patterns = [] + for k,r in patterns: + self.patterns.append( (k, re.compile(r)) ) + + def token(self, i, restrict=0): + """Get the i'th token, and if i is one past the end, then scan + for another token; restrict is a list of tokens that + are allowed, or 0 for any token.""" + if i == len(self.tokens): self.scan(restrict) + if i < len(self.tokens): + # Make sure the restriction is more restricted + if restrict and self.restrictions[i]: + for r in restrict: + if r not in self.restrictions[i]: + raise "Unimplemented: restriction set changed" + return self.tokens[i] + raise NoMoreTokens() + + def __repr__(self): + """Print the last 10 tokens that have been scanned in""" + output = '' + for t in self.tokens[-10:]: + output = '%s\n (@%s) %s = %s' % (output,t[0],t[2],`t[3]`) + return output + + def scan(self, restrict): + """Should scan another token and add it to the list, self.tokens, + and add the restriction to self.restrictions""" + # Keep looking for a token, ignoring any in self.ignore + while 1: + # Search the patterns for the longest match, with earlier + # tokens in the list having preference + best_match = -1 + best_pat = '(error)' + for p, regexp in self.patterns: + # First check to see if we're ignoring this token + if restrict and p not in restrict and p not in self.ignore: + continue + m = regexp.match(self.input, self.pos) + if m and len(m.group(0)) > best_match: + # We got a match that's better than the previous one + best_pat = p + best_match = len(m.group(0)) + + # If we didn't find anything, raise an error + if best_pat == '(error)' and best_match < 0: + msg = "Bad Token" + if restrict: + msg = "Trying to find one of "+join(restrict,", ") + raise SyntaxError(self.pos, msg) + + # If we found something that isn't to be ignored, return it + if best_pat not in self.ignore: + # Create a token with this data + token = (self.pos, self.pos+best_match, best_pat, + self.input[self.pos:self.pos+best_match]) + self.pos = self.pos + best_match + # Only add this token if it's not in the list + # (to prevent looping) + if not self.tokens or token != self.tokens[-1]: + self.tokens.append(token) + self.restrictions.append(restrict) + return + else: + # This token should be ignored .. + self.pos = self.pos + best_match + +class Parser: + def __init__(self, scanner): + self._scanner = scanner + self._pos = 0 + + def _peek(self, *types): + """Returns the token type for lookahead; if there are any args + then the list of args is the set of token types to allow""" + tok = self._scanner.token(self._pos, types) + return tok[2] + + def _scan(self, type): + """Returns the matched text, and moves to the next token""" + tok = self._scanner.token(self._pos, [type]) + if tok[2] != type: + raise SyntaxError(tok[0], 'Trying to find '+type) + self._pos = 1+self._pos + return tok[3] + + + +def print_error(input, err, scanner): + """This is a really dumb long function to print error messages nicely.""" + p = err.pos + # Figure out the line number + line = count(input[:p], '\n') + print err.msg+" on line "+`line+1`+":" + # Now try printing part of the line + text = input[max(p-80,0):p+80] + p = p - max(p-80,0) + + # Strip to the left + i = rfind(text[:p],'\n') + j = rfind(text[:p],'\r') + if i < 0 or (j < i and j >= 0): i = j + if i >= 0 and i < p: + p = p - i - 1 + text = text[i+1:] + + # Strip to the right + i = find(text,'\n',p) + j = find(text,'\r',p) + if i < 0 or (j < i and j >= 0): i = j + if i >= 0: + text = text[:i] + + # Now shorten the text + while len(text) > 70 and p > 60: + # Cut off 10 chars + text = "..." + text[10:] + p = p - 7 + + # Now print the string, along with an indicator + print '> ',text + print '> ',' '*p + '^' + print 'List of nearby tokens:', scanner + +def wrap_error_reporter(parser, rule): + try: return getattr(parser, rule)() + except SyntaxError, s: + input = parser._scanner.input + try: + print_error(input, s, parser._scanner) + except ImportError: + print 'Syntax Error',s.msg,'on line',1+count(input[:s.pos], '\n') + except NoMoreTokens: + print 'Could not complete parsing; stopped around here:' + print parser._scanner + |