"""Very simple and fast XML parser, used for intra-paragraph text. Devised by Aaron Watters in the bad old days before Python had fast parsers available. Constructs the lightest possible in-memory representation; parses most files we have seen in pure python very quickly. The output structure is the same as the one produced by pyRXP, our validating C-based parser, which was written later. It will use pyRXP if available. This is used to parse intra-paragraph markup. Example parse:: text in xml ( "this", {"type": "xml"}, [ "text ", ("b", None, ["in"], None), " xml" ] None ) { 0: "this" "type": "xml" 1: ["text ", {0: "b", 1:["in"]}, " xml"] } Ie, xml tag translates to a tuple: (name, dictofattributes, contentlist, miscellaneousinfo) where miscellaneousinfo can be anything, (but defaults to None) (with the intention of adding, eg, line number information) special cases: name of "" means "top level, no containing tag". Top level parse always looks like this:: ("", list, None, None) contained text of None means In order to support stuff like:: AT THE MOMENT & ETCETERA ARE IGNORED. THEY MUST BE PROCESSED IN A POST-PROCESSING STEP. PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING. """ RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser. try: #raise ImportError, "dummy error" simpleparse = 0 import pyRXPU def warnCB(s): print(s) pyRXP_parser = pyRXPU.Parser( ErrorOnValidityErrors=1, NoNoDTDWarning=1, ExpandCharacterEntities=1, ExpandGeneralEntities=1, warnCB = warnCB, srcName='string input', ReturnUTF8 = 0, ) def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None,parseOpts={}): pyRXP_parser.eoCB = eoCB p = pyRXP_parser.parse(xmlText,**parseOpts) return oneOutermostTag and p or ('',None,[p],None) except ImportError: simpleparse = 1 NONAME = "" NAMEKEY = 0 CONTENTSKEY = 1 CDATAMARKER = "" replacelist = [("<", "<"), (">", ">"), ("&", "&")] # amp must be last #replacelist = [] def unEscapeContentList(contentList): result = [] for e in contentList: if "&" in e: for (old, new) in replacelist: e = e.replace(old, new) result.append(e) return result def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList): """official interface: discard unused cursor info""" if RequirePyRXP: raise ImportError("pyRXP not found, fallback parser disabled") (result, cursor) = parsexml0(xmltext,entityReplacer=entityReplacer) if oneOutermostTag: return result[2][0] else: return result if simpleparse: parsexml = parsexmlSimple def parseFile(filename): raw = open(filename, 'r').read() return parsexml(raw) verbose = 0 def skip_prologue(text, cursor): """skip any prologue found after cursor, return index of rest of text""" ### NOT AT ALL COMPLETE!!! definitely can be confused!!! prologue_elements = ("!DOCTYPE", "?xml", "!--") done = None while done is None: #print "trying to skip:", repr(text[cursor:cursor+20]) openbracket = text.find("<", cursor) if openbracket<0: break past = openbracket+1 found = None for e in prologue_elements: le = len(e) if text[past:past+le]==e: found = 1 cursor = text.find(">", past) if cursor<0: raise ValueError("can't close prologue %r" % e) cursor = cursor+1 if found is None: done=1 #print "done skipping" return cursor def parsexml0(xmltext, startingat=0, toplevel=1, # snarf in some globals entityReplacer=unEscapeContentList, #len=len, None=None #LENCDATAMARKER=LENCDATAMARKER, CDATAMARKER=CDATAMARKER ): """simple recursive descent xml parser... return (dictionary, endcharacter) special case: comment returns (None, endcharacter)""" #print "parsexml0", repr(xmltext[startingat: startingat+10]) # DEFAULTS NameString = NONAME ContentList = AttDict = ExtraStuff = None if toplevel is not None: #if verbose: print "at top level" #if startingat!=0: # raise ValueError, "have to start at 0 for top level!" xmltext = xmltext.strip() cursor = startingat #look for interesting starting points firstbracket = xmltext.find("<", cursor) afterbracket2char = xmltext[firstbracket+1:firstbracket+3] #print "a", repr(afterbracket2char) #firstampersand = xmltext.find("&", cursor) #if firstampersand>0 and firstampersand0: #afterbracket2char = xmltext[firstbracket:firstbracket+2] if toplevel is not None: #print "toplevel with no outer tag" NameString = name = NONAME cursor = skip_prologue(xmltext, cursor) #break elif firstbracket<0: raise ValueError("non top level entry should be at start tag: %s" % repr(xmltext[:10])) # special case: CDATA elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="": raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20])) return (None, endcomment+1) # shortcut exit else: # get the rest of the tag #if verbose: print "parsing start tag" # make sure the tag isn't in doublequote pairs closebracket = xmltext.find(">", firstbracket) noclose = closebracket<0 startsearch = closebracket+1 pastfirstbracket = firstbracket+1 tagcontent = xmltext[pastfirstbracket:closebracket] # shortcut, no equal means nothing but name in the tag content if '=' not in tagcontent: if tagcontent[-1]=="/": # simple case #print "simple case", tagcontent tagcontent = tagcontent[:-1] docontents = None name = tagcontent.strip() NameString = name cursor = startsearch else: if '"' in tagcontent: # check double quotes stop = None # not inside double quotes! (the split should have odd length) if noclose or len((tagcontent+".").split('"'))% 2: stop=1 while stop is None: closebracket = xmltext.find(">", startsearch) startsearch = closebracket+1 noclose = closebracket<0 tagcontent = xmltext[pastfirstbracket:closebracket] # not inside double quotes! (the split should have odd length) if noclose or len((tagcontent+".").split('"'))% 2: stop=1 if noclose: raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20])) cursor = startsearch #cursor = closebracket+1 # handle simple tag /> syntax if xmltext[closebracket-1]=="/": #if verbose: print "it's a simple tag" closebracket = closebracket-1 tagcontent = tagcontent[:-1] docontents = None #tagcontent = xmltext[firstbracket+1:closebracket] tagcontent = tagcontent.strip() taglist = tagcontent.split("=") #if not taglist: # raise ValueError, "tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]) taglist0 = taglist[0] taglist0list = taglist0.split() #if len(taglist0list)>2: # raise ValueError, "bad tag head %s" % repr(taglist0) name = taglist0list[0] #print "tag name is", name NameString = name # now parse the attributes attributename = taglist0list[-1] # put a fake att name at end of last taglist entry for consistent parsing taglist[-1] = taglist[-1]+" f" AttDict = D = {} taglistindex = 1 lasttaglistindex = len(taglist) #for attentry in taglist[1:]: while taglistindexlasttaglistindex: raise ValueError("unclosed value " + repr(attentry)) nextattentry = taglist[taglistindex] taglistindex = taglistindex+1 attentry = "%s=%s" % (attentry, nextattentry) attentry = attentry.strip() # only needed for while loop... attlist = attentry.split() nextattname = attlist[-1] attvalue = attentry[:-len(nextattname)] attvalue = attvalue.strip() try: first = attvalue[0]; last=attvalue[-1] except: raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry,attlist))) if first==last=='"' or first==last=="'": attvalue = attvalue[1:-1] #print attributename, "=", attvalue D[attributename] = attvalue attributename = nextattname # pass over other tags and content looking for end tag if docontents is not None: #print "now looking for end tag" ContentList = L while docontents is not None: nextopenbracket = xmltext.find("<", cursor) if nextopenbracket", nextopenbracket) if nextclosebracket\n%s\n" % (name, attributes, textpprint, name) # otherwise must be a simple tag return "<%s %s/>" % (name, attributes) dump = 0 def testparse(s): from time import time from pprint import pprint now = time() D = parsexmlSimple(s) print("DONE", time()-now) if dump&4: pprint(D) #pprint(D) if dump&1: print("============== reformatting") p = pprettyprint(D) print(p) def test(): testparse("""text <>in xml text in xml ]]> just testing brackets feature """) filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml", "samples/hamlet.xml"] #filenames = ["moa.xml"] dump=1 if __name__=="__main__": test() from time import time now = time() for f in filenames: t = open(f).read() print("parsing", f) testparse(t) print("elapsed", time()-now)