123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436 |
- """Very simple and fast XML parser, used for intra-paragraph text.
- Devised by Aaron Watters in the bad old days before Python had fast
- parsers available. Constructs the lightest possible in-memory
- representation; parses most files we have seen in pure python very
- quickly.
- The output structure is the same as the one produced by pyRXP,
- our validating C-based parser, which was written later. It will
- use pyRXP if available.
- This is used to parse intra-paragraph markup.
- Example parse::
- <this type="xml">text <b>in</b> xml</this>
- ( "this",
- {"type": "xml"},
- [ "text ",
- ("b", None, ["in"], None),
- " xml"
- ]
- None )
- { 0: "this"
- "type": "xml"
- 1: ["text ",
- {0: "b", 1:["in"]},
- " xml"]
- }
- Ie, xml tag translates to a tuple:
- (name, dictofattributes, contentlist, miscellaneousinfo)
- where miscellaneousinfo can be anything, (but defaults to None)
- (with the intention of adding, eg, line number information)
- special cases: name of "" means "top level, no containing tag".
- Top level parse always looks like this::
- ("", list, None, None)
- contained text of None means <simple_tag/>
- In order to support stuff like::
- <this></this><one></one>
- AT THE MOMENT &amp; ETCETERA ARE IGNORED. THEY MUST BE PROCESSED
- IN A POST-PROCESSING STEP.
- PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING.
- """
# Set this to 1 to disable the nonvalidating fallback parser.
RequirePyRXP = 0

try:
    # simpleparse stays 0 only when the pyRXPU import below succeeds;
    # the ImportError handler flips it to 1.
    simpleparse = 0
    import pyRXPU

    def warnCB(s):
        """Warning callback handed to the pyRXPU parser: print the message."""
        print(s)

    pyRXP_parser = pyRXPU.Parser(
        ErrorOnValidityErrors=1,
        NoNoDTDWarning=1,
        ExpandCharacterEntities=1,
        ExpandGeneralEntities=1,
        warnCB=warnCB,
        srcName='string input',
        ReturnUTF8=0,
    )

    def parsexml(xmlText, oneOutermostTag=0, eoCB=None, entityReplacer=None, parseOpts=None):
        """Parse xmlText with the shared module-level pyRXPU parser.

        xmlText          -- the text handed to pyRXP_parser.parse.
        oneOutermostTag  -- if true, return the parse result itself; otherwise
                            wrap it as ('', None, [result], None).
        eoCB             -- assigned to pyRXP_parser.eoCB before parsing.
        entityReplacer   -- accepted for signature compatibility with the
                            fallback parser; not used by this implementation.
        parseOpts        -- optional dict of extra keyword arguments forwarded
                            to pyRXP_parser.parse (None means no extras).
        """
        pyRXP_parser.eoCB = eoCB
        p = pyRXP_parser.parse(xmlText, **(parseOpts or {}))
        # NOTE(review): the previous "and/or" form also wrapped a falsy parse
        # result; presumably pyRXPU never returns one -- confirm.
        return p if oneOutermostTag else ('', None, [p], None)
except ImportError:
    simpleparse = 1
NONAME = ""
NAMEKEY = 0
CONTENTSKEY = 1
CDATAMARKER = "<![CDATA["
LENCDATAMARKER = len(CDATAMARKER)
CDATAENDMARKER = "]]>"
# (entity, replacement) pairs applied in order to text content.
# "&amp;" must be last: replacing it first would turn "&amp;lt;" into "&lt;"
# and then wrongly into "<".
replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]  # amp must be last
#replacelist = []
def unEscapeContentList(contentList):
    """Return a new list with the entities in replacelist expanded.

    Each string entry that contains "&" has every (old, new) pair from the
    module-level replacelist applied in order.  Entries that are not strings
    (nested tag tuples) are copied through unchanged; testing them with
    ``"&" in e`` would be a tuple-membership test and could send a tuple
    into ``.replace``.
    """
    result = []
    for e in contentList:
        if isinstance(e, str) and "&" in e:
            for (old, new) in replacelist:
                e = e.replace(old, new)
        result.append(e)
    return result
def parsexmlSimple(xmltext, oneOutermostTag=0, eoCB=None, entityReplacer=unEscapeContentList):
    """Official interface to the fallback parser.

    Runs parsexml0 on xmltext and drops the cursor it returns.  With
    oneOutermostTag true, the first entry of the top-level content list is
    returned instead of the whole top-level tuple.  eoCB is accepted but not
    used here.  Raises ImportError when RequirePyRXP is set.
    """
    if RequirePyRXP:
        raise ImportError("pyRXP not found, fallback parser disabled")
    tree = parsexml0(xmltext, entityReplacer=entityReplacer)[0]
    return tree[2][0] if oneOutermostTag else tree


# When the pyRXPU import failed, publish the fallback under the public name.
if simpleparse:
    parsexml = parsexmlSimple
def parseFile(filename):
    """Read filename in text mode and return parsexml() of its contents."""
    # Use a context manager so the handle is closed even if read() fails.
    with open(filename, 'r') as f:
        raw = f.read()
    return parsexml(raw)


verbose = 0
def skip_prologue(text, cursor):
    """Skip any prologue found after cursor; return the index of the rest.

    Starting at cursor, repeatedly looks at the next "<".  While it opens a
    DOCTYPE declaration, an "<?xml" declaration or a comment, the cursor is
    moved just past that element.  Stops at the first "<" that opens
    something else, or when there is no "<" left.  If nothing is skipped the
    cursor is returned unchanged.

    Raises ValueError when a prologue element is never terminated.
    """
    ### NOT AT ALL COMPLETE!!! definitely can be confused!!!
    # (text right after "<", terminator).  Comments end at "-->" rather than
    # at the first ">" so that a ">" inside a comment does not end it early.
    prologue_elements = (
        ("!DOCTYPE", ">"),
        ("?xml", ">"),
        ("!--", "-->"),
    )
    while True:
        openbracket = text.find("<", cursor)
        if openbracket < 0:
            break
        past = openbracket + 1
        for start, terminator in prologue_elements:
            if text.startswith(start, past):
                end = text.find(terminator, past)
                if end < 0:
                    raise ValueError("can't close prologue %r" % start)
                cursor = end + len(terminator)
                break
        else:
            # the "<" does not open a prologue element: done skipping
            break
    return cursor
def parsexml0(xmltext, startingat=0, toplevel=1,
              # snarf in some globals
              entityReplacer=unEscapeContentList,
              ):
    """Simple recursive descent xml parser.

    xmltext        -- the text to parse.
    startingat     -- index at which to start looking for the next "<".
    toplevel       -- anything other than None means "top level": the text
                      is stripped, a leading prologue is skipped and the
                      result is the unnamed container tuple.  Recursive
                      calls pass None.
    entityReplacer -- callable applied to a non-empty content list before it
                      is returned; pass a false value to skip this step.

    Returns ((name, attdict, contentlist, extra), cursor) where cursor is
    the index just past what was consumed.  Special cases: a comment returns
    (None, cursor); a CDATA section uses CDATAMARKER as its name; a simple
    tag such as <a/> has a contentlist of None.

    Raises ValueError on malformed input (unclosed tags, comments or CDATA,
    nameless tags, mismatched close tags, badly quoted attribute values).
    """
    # DEFAULTS
    NameString = NONAME
    ContentList = AttDict = ExtraStuff = None
    if toplevel is not None:
        xmltext = xmltext.strip()
    cursor = startingat
    # look for interesting starting points
    firstbracket = xmltext.find("<", cursor)
    afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
    docontents = 1
    if firstbracket < 0:
        # no tags at all
        if toplevel is not None:
            ContentList = [xmltext[cursor:]]
            if entityReplacer:
                ContentList = entityReplacer(ContentList)
            return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
        raise ValueError("no tags at non-toplevel %s" % repr(xmltext[cursor:cursor+20]))
    L = []
    # look for start tag
    # NEED to force always outer level is unnamed!!!
    if toplevel is not None:
        NameString = name = NONAME
        cursor = skip_prologue(xmltext, cursor)
    # special case: CDATA
    elif afterbracket2char == "![" and xmltext[firstbracket:firstbracket+9] == "<![CDATA[":
        # skip straight to the close marker
        startcdata = firstbracket+9
        endcdata = xmltext.find(CDATAENDMARKER, startcdata)
        if endcdata < 0:
            raise ValueError("unclosed CDATA %s" % repr(xmltext[cursor:cursor+20]))
        NameString = CDATAMARKER
        ContentList = [xmltext[startcdata: endcdata]]
        cursor = endcdata+len(CDATAENDMARKER)
        docontents = None
    # special case COMMENT
    elif afterbracket2char == "!-" and xmltext[firstbracket:firstbracket+4] == "<!--":
        endcommentdashes = xmltext.find("--", firstbracket+4)
        if endcommentdashes < firstbracket:
            raise ValueError("unterminated comment %s" % repr(xmltext[cursor:cursor+20]))
        endcomment = endcommentdashes+2
        # slice, not index: "--" at the very end of the text must give a
        # ValueError rather than an IndexError
        if xmltext[endcomment:endcomment+1] != ">":
            raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
        return (None, endcomment+1)  # shortcut exit
    else:
        # get the rest of the tag
        closebracket = xmltext.find(">", firstbracket)
        if closebracket < 0:
            # Checked up front for both branches below: without this an
            # attribute-less unclosed tag reset the cursor to 0 and the
            # parser recursed on the same "<" forever.
            raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
        startsearch = closebracket+1
        pastfirstbracket = firstbracket+1
        tagcontent = xmltext[pastfirstbracket:closebracket]
        # shortcut, no equal means nothing but name in the tag content
        if '=' not in tagcontent:
            if tagcontent.endswith("/"):
                # simple case: <name/>
                tagcontent = tagcontent[:-1]
                docontents = None
            name = tagcontent.strip()
            if not name:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            NameString = name
            cursor = startsearch
        else:
            if '"' in tagcontent:
                # A ">" inside a double-quoted value does not end the tag.
                # Splitting (tagcontent + ".") on '"' gives an odd number of
                # pieces exactly when the quotes are balanced; while they
                # are not, extend the tag to the next ">".
                while len((tagcontent+".").split('"')) % 2 == 0:
                    closebracket = xmltext.find(">", startsearch)
                    if closebracket < 0:
                        raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
                    startsearch = closebracket+1
                    tagcontent = xmltext[pastfirstbracket:closebracket]
            cursor = startsearch
            # handle simple tag /> syntax
            if xmltext[closebracket-1] == "/":
                tagcontent = tagcontent[:-1]
                docontents = None
            tagcontent = tagcontent.strip()
            taglist = tagcontent.split("=")
            taglist0list = taglist[0].split()
            if not taglist0list:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            name = taglist0list[0]
            NameString = name
            # now parse the attributes: each "=" piece holds a value followed
            # by the name of the next attribute
            attributename = taglist0list[-1]
            # put a fake att name at end of last taglist entry for consistent parsing
            taglist[-1] = taglist[-1]+" f"
            AttDict = D = {}
            taglistindex = 1
            lasttaglistindex = len(taglist)
            while taglistindex < lasttaglistindex:
                attentry = taglist[taglistindex]
                taglistindex = taglistindex+1
                attentry = attentry.strip()
                # slice, not index: an empty entry (e.g. "b==") must raise
                # ValueError, not IndexError
                if attentry[:1] != '"':
                    raise ValueError("attribute value must start with double quotes" + repr(attentry))
                while '"' not in attentry[1:]:
                    # must have an = inside the attribute value...
                    if taglistindex >= lasttaglistindex:
                        raise ValueError("unclosed value " + repr(attentry))
                    nextattentry = taglist[taglistindex]
                    taglistindex = taglistindex+1
                    attentry = "%s=%s" % (attentry, nextattentry)
                attentry = attentry.strip()  # only needed for while loop...
                attlist = attentry.split()
                nextattname = attlist[-1]
                attvalue = attentry[:-len(nextattname)]
                attvalue = attvalue.strip()
                try:
                    first = attvalue[0]
                    last = attvalue[-1]
                except IndexError:
                    raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry, attlist)))
                if first == last == '"' or first == last == "'":
                    attvalue = attvalue[1:-1]
                D[attributename] = attvalue
                attributename = nextattname
    # pass over other tags and content looking for end tag
    if docontents is not None:
        ContentList = L
    while docontents is not None:
        nextopenbracket = xmltext.find("<", cursor)
        if nextopenbracket < cursor:
            # no further "<": fine for the unnamed top level, an error inside a tag
            if name == NONAME:
                docontents = None  # done
                remainder = xmltext[cursor:]
                cursor = len(xmltext)
                if remainder:
                    L.append(remainder)
            else:
                raise ValueError("no close bracket for %s found after %s" % (name, repr(xmltext[cursor: cursor+20])))
        # is it a close bracket?  (slice so a trailing "<" cannot IndexError)
        elif xmltext[nextopenbracket+1:nextopenbracket+2] == "/":
            nextclosebracket = xmltext.find(">", nextopenbracket)
            if nextclosebracket < nextopenbracket:
                raise ValueError("unclosed close tag %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
            closetagcontents = xmltext[nextopenbracket+2: nextclosebracket]
            closetaglist = closetagcontents.split()
            if not closetaglist:
                # "</>" has no name to match against
                raise ValueError("bad close tag format %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
            # name should match
            closename = closetaglist[0]
            if name != closename:
                prefix = xmltext[:cursor]
                endlinenum = len(prefix.split("\n"))
                prefix = xmltext[:startingat]
                linenum = len(prefix.split("\n"))
                raise ValueError("at lines %s...%s close tag name doesn't match %s...%s %s" % (
                    linenum, endlinenum, repr(name), repr(closename), repr(xmltext[cursor: cursor+100])))
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            cursor = nextclosebracket+1
            docontents = None  # done
        # otherwise we are looking at a new tag, recursively parse it...
        # first record any intervening content
        else:
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            (parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
            # comments come back as None and are dropped
            if parsetree:
                L.append(parsetree)
    # maybe should check for trailing garbage?
    if ContentList:
        if entityReplacer:
            ContentList = entityReplacer(ContentList)
    t = (NameString, AttDict, ContentList, ExtraStuff)
    return (t, cursor)
- import types
def pprettyprint(parsedxml):
    """Pretty printer mainly for testing.

    parsedxml is either a text leaf, which is returned as is, or a
    (name, attdict, textlist, extra) tuple.  A tuple whose textlist is None
    is rendered as a simple tag; otherwise its children are rendered
    recursively, one per line, and wrapped in open/close tags unless the
    name is empty (the unnamed top level).

    Raises ValueError if an unnamed node carries attributes.
    """
    # Text leaves are str under Python 3; comparing the type against bytes
    # alone sent every str leaf into the tuple unpacking below.
    if isinstance(parsedxml, (str, bytes)):
        return parsedxml
    (name, attdict, textlist, extra) = parsedxml
    if not attdict:
        attdict = {}
    attributes = " ".join("%s=%s" % (k, repr(v)) for k, v in attdict.items())
    if not name and attributes:
        raise ValueError("name missing with attributes???")
    if textlist is not None:
        # with content
        textpprint = "\n".join(map(pprettyprint, textlist))
        if not name:
            return textpprint  # no outer tag
        # indent it
        # NOTE(review): the indent is a single space here; the width may have
        # been collapsed when this file was extracted -- confirm.
        textpprint = " " + "\n ".join(textpprint.split("\n"))
        return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
    # otherwise must be a simple tag
    return "<%s %s/>" % (name, attributes)
dump = 0


def testparse(s):
    """Parse s with the fallback parser and print the elapsed time.

    The module-level dump flag selects extra output: bit 4 pprints the parse
    tree, bit 1 prints the tree re-rendered by pprettyprint.
    """
    from pprint import pprint
    from time import time

    started = time()
    tree = parsexmlSimple(s)
    print("DONE", time() - started)
    if dump & 4:
        pprint(tree)
    if dump & 1:
        print("============== reformatting")
        print(pprettyprint(tree))
def test():
    """Run testparse on a small sample with escaped brackets, a simple tag,
    a comment, a CDATA section and brackets inside an attribute value."""
    # "&lt;&gt;" must stay escaped: a literal "<>" is an empty tag, not text.
    testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
<!-- comment -->
<![CDATA[
<this type="xml">text <b>in</b> xml</this> ]]>
<tag with="<brackets in values>">just testing brackets feature</tag>
""")
# Sample documents parsed when this file is run as a script.
filenames = [  #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
    "samples/hamlet.xml"]
#filenames = ["moa.xml"]

# Bit 1 set: testparse prints the re-rendered tree.
dump = 1

if __name__ == "__main__":
    test()
    from time import time
    now = time()
    for f in filenames:
        # context manager so each sample file is closed after reading
        with open(f) as fh:
            t = fh.read()
        print("parsing", f)
        testparse(t)
    print("elapsed", time()-now)
|