rparsexml.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. """Very simple and fast XML parser, used for intra-paragraph text.
  2. Devised by Aaron Watters in the bad old days before Python had fast
  3. parsers available. Constructs the lightest possible in-memory
  4. representation; parses most files we have seen in pure python very
  5. quickly.
  6. The output structure is the same as the one produced by pyRXP,
  7. our validating C-based parser, which was written later. It will
  8. use pyRXP if available.
  9. This is used to parse intra-paragraph markup.
  10. Example parse::
  11. <this type="xml">text <b>in</b> xml</this>
  12. ( "this",
  13. {"type": "xml"},
  14. [ "text ",
  15. ("b", None, ["in"], None),
  16. " xml"
  17. ],
  18. None )
  19. { 0: "this"
  20. "type": "xml"
  21. 1: ["text ",
  22. {0: "b", 1:["in"]},
  23. " xml"]
  24. }
  25. Ie, xml tag translates to a tuple:
  26. (name, dictofattributes, contentlist, miscellaneousinfo)
  27. where miscellaneousinfo can be anything, (but defaults to None)
  28. (with the intention of adding, eg, line number information)
  29. special cases: name of "" means "top level, no containing tag".
  30. Top level parse always looks like this::
  31. ("", None, list, None)
  32. contained text of None means <simple_tag/>
  33. In order to support stuff like::
  34. <this></this><one></one>
  35. ONLY &lt; &gt; AND &amp; ARE EXPANDED BY THE FALLBACK PARSER. OTHER ENTITIES
  36. MUST BE PROCESSED IN A POST-PROCESSING STEP.
  37. PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING.
  38. """
  39. RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser.
  40. try:
  41. #raise ImportError, "dummy error"
  42. simpleparse = 0
  43. import pyRXPU
  44. def warnCB(s):
  45. print(s)
  46. pyRXP_parser = pyRXPU.Parser(
  47. ErrorOnValidityErrors=1,
  48. NoNoDTDWarning=1,
  49. ExpandCharacterEntities=1,
  50. ExpandGeneralEntities=1,
  51. warnCB = warnCB,
  52. srcName='string input',
  53. ReturnUTF8 = 0,
  54. )
  55. def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None,parseOpts={}):
  56. pyRXP_parser.eoCB = eoCB
  57. p = pyRXP_parser.parse(xmlText,**parseOpts)
  58. return oneOutermostTag and p or ('',None,[p],None)
  59. except ImportError:
  60. simpleparse = 1
  61. NONAME = ""
  62. NAMEKEY = 0
  63. CONTENTSKEY = 1
  64. CDATAMARKER = "<![CDATA["
  65. LENCDATAMARKER = len(CDATAMARKER)
  66. CDATAENDMARKER = "]]>"
  67. replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")] # amp must be last
  68. #replacelist = []
  69. def unEscapeContentList(contentList):
  70. result = []
  71. for e in contentList:
  72. if "&" in e:
  73. for (old, new) in replacelist:
  74. e = e.replace(old, new)
  75. result.append(e)
  76. return result
  77. def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList):
  78. """official interface: discard unused cursor info"""
  79. if RequirePyRXP:
  80. raise ImportError("pyRXP not found, fallback parser disabled")
  81. (result, cursor) = parsexml0(xmltext,entityReplacer=entityReplacer)
  82. if oneOutermostTag:
  83. return result[2][0]
  84. else:
  85. return result
  86. if simpleparse:
  87. parsexml = parsexmlSimple
  88. def parseFile(filename):
  89. raw = open(filename, 'r').read()
  90. return parsexml(raw)
  91. verbose = 0
  92. def skip_prologue(text, cursor):
  93. """skip any prologue found after cursor, return index of rest of text"""
  94. ### NOT AT ALL COMPLETE!!! definitely can be confused!!!
  95. prologue_elements = ("!DOCTYPE", "?xml", "!--")
  96. done = None
  97. while done is None:
  98. #print "trying to skip:", repr(text[cursor:cursor+20])
  99. openbracket = text.find("<", cursor)
  100. if openbracket<0: break
  101. past = openbracket+1
  102. found = None
  103. for e in prologue_elements:
  104. le = len(e)
  105. if text[past:past+le]==e:
  106. found = 1
  107. cursor = text.find(">", past)
  108. if cursor<0:
  109. raise ValueError("can't close prologue %r" % e)
  110. cursor = cursor+1
  111. if found is None:
  112. done=1
  113. #print "done skipping"
  114. return cursor
  115. def parsexml0(xmltext, startingat=0, toplevel=1,
  116. # snarf in some globals
  117. entityReplacer=unEscapeContentList,
  118. #len=len, None=None
  119. #LENCDATAMARKER=LENCDATAMARKER, CDATAMARKER=CDATAMARKER
  120. ):
  121. """simple recursive descent xml parser...
  122. return (dictionary, endcharacter)
  123. special case: comment returns (None, endcharacter)"""
  124. #print "parsexml0", repr(xmltext[startingat: startingat+10])
  125. # DEFAULTS
  126. NameString = NONAME
  127. ContentList = AttDict = ExtraStuff = None
  128. if toplevel is not None:
  129. #if verbose: print "at top level"
  130. #if startingat!=0:
  131. # raise ValueError, "have to start at 0 for top level!"
  132. xmltext = xmltext.strip()
  133. cursor = startingat
  134. #look for interesting starting points
  135. firstbracket = xmltext.find("<", cursor)
  136. afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
  137. #print "a", repr(afterbracket2char)
  138. #firstampersand = xmltext.find("&", cursor)
  139. #if firstampersand>0 and firstampersand<firstbracket:
  140. # raise ValueError, "I don't handle ampersands yet!!!"
  141. docontents = 1
  142. if firstbracket<0:
  143. # no tags
  144. #if verbose: print "no tags"
  145. if toplevel is not None:
  146. #D = {NAMEKEY: NONAME, CONTENTSKEY: [xmltext[cursor:]]}
  147. ContentList = [xmltext[cursor:]]
  148. if entityReplacer: ContentList = entityReplacer(ContentList)
  149. return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
  150. else:
  151. raise ValueError("no tags at non-toplevel %s" % repr(xmltext[cursor:cursor+20]))
  152. #D = {}
  153. L = []
  154. # look for start tag
  155. # NEED to force always outer level is unnamed!!!
  156. #if toplevel and firstbracket>0:
  157. #afterbracket2char = xmltext[firstbracket:firstbracket+2]
  158. if toplevel is not None:
  159. #print "toplevel with no outer tag"
  160. NameString = name = NONAME
  161. cursor = skip_prologue(xmltext, cursor)
  162. #break
  163. elif firstbracket<0:
  164. raise ValueError("non top level entry should be at start tag: %s" % repr(xmltext[:10]))
  165. # special case: CDATA
  166. elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="<![CDATA[":
  167. #print "in CDATA", cursor
  168. # skip straight to the close marker
  169. startcdata = firstbracket+9
  170. endcdata = xmltext.find(CDATAENDMARKER, startcdata)
  171. if endcdata<0:
  172. raise ValueError("unclosed CDATA %s" % repr(xmltext[cursor:cursor+20]))
  173. NameString = CDATAMARKER
  174. ContentList = [xmltext[startcdata: endcdata]]
  175. cursor = endcdata+len(CDATAENDMARKER)
  176. docontents = None
  177. # special case COMMENT
  178. elif afterbracket2char=="!-" and xmltext[firstbracket:firstbracket+4]=="<!--":
  179. #print "in COMMENT"
  180. endcommentdashes = xmltext.find("--", firstbracket+4)
  181. if endcommentdashes<firstbracket:
  182. raise ValueError("unterminated comment %s" % repr(xmltext[cursor:cursor+20]))
  183. endcomment = endcommentdashes+2
  184. if xmltext[endcomment]!=">":
  185. raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
  186. return (None, endcomment+1) # shortcut exit
  187. else:
  188. # get the rest of the tag
  189. #if verbose: print "parsing start tag"
  190. # make sure the tag isn't in doublequote pairs
  191. closebracket = xmltext.find(">", firstbracket)
  192. noclose = closebracket<0
  193. startsearch = closebracket+1
  194. pastfirstbracket = firstbracket+1
  195. tagcontent = xmltext[pastfirstbracket:closebracket]
  196. # shortcut, no equal means nothing but name in the tag content
  197. if '=' not in tagcontent:
  198. if tagcontent[-1]=="/":
  199. # simple case
  200. #print "simple case", tagcontent
  201. tagcontent = tagcontent[:-1]
  202. docontents = None
  203. name = tagcontent.strip()
  204. NameString = name
  205. cursor = startsearch
  206. else:
  207. if '"' in tagcontent:
  208. # check double quotes
  209. stop = None
  210. # not inside double quotes! (the split should have odd length)
  211. if noclose or len((tagcontent+".").split('"'))% 2:
  212. stop=1
  213. while stop is None:
  214. closebracket = xmltext.find(">", startsearch)
  215. startsearch = closebracket+1
  216. noclose = closebracket<0
  217. tagcontent = xmltext[pastfirstbracket:closebracket]
  218. # not inside double quotes! (the split should have odd length)
  219. if noclose or len((tagcontent+".").split('"'))% 2:
  220. stop=1
  221. if noclose:
  222. raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
  223. cursor = startsearch
  224. #cursor = closebracket+1
  225. # handle simple tag /> syntax
  226. if xmltext[closebracket-1]=="/":
  227. #if verbose: print "it's a simple tag"
  228. closebracket = closebracket-1
  229. tagcontent = tagcontent[:-1]
  230. docontents = None
  231. #tagcontent = xmltext[firstbracket+1:closebracket]
  232. tagcontent = tagcontent.strip()
  233. taglist = tagcontent.split("=")
  234. #if not taglist:
  235. # raise ValueError, "tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20])
  236. taglist0 = taglist[0]
  237. taglist0list = taglist0.split()
  238. #if len(taglist0list)>2:
  239. # raise ValueError, "bad tag head %s" % repr(taglist0)
  240. name = taglist0list[0]
  241. #print "tag name is", name
  242. NameString = name
  243. # now parse the attributes
  244. attributename = taglist0list[-1]
  245. # put a fake att name at end of last taglist entry for consistent parsing
  246. taglist[-1] = taglist[-1]+" f"
  247. AttDict = D = {}
  248. taglistindex = 1
  249. lasttaglistindex = len(taglist)
  250. #for attentry in taglist[1:]:
  251. while taglistindex<lasttaglistindex:
  252. #print "looking for attribute named", attributename
  253. attentry = taglist[taglistindex]
  254. taglistindex = taglistindex+1
  255. attentry = attentry.strip()
  256. if attentry[0]!='"':
  257. raise ValueError("attribute value must start with double quotes" + repr(attentry))
  258. while '"' not in attentry[1:]:
  259. # must have an = inside the attribute value...
  260. if taglistindex>lasttaglistindex:
  261. raise ValueError("unclosed value " + repr(attentry))
  262. nextattentry = taglist[taglistindex]
  263. taglistindex = taglistindex+1
  264. attentry = "%s=%s" % (attentry, nextattentry)
  265. attentry = attentry.strip() # only needed for while loop...
  266. attlist = attentry.split()
  267. nextattname = attlist[-1]
  268. attvalue = attentry[:-len(nextattname)]
  269. attvalue = attvalue.strip()
  270. try:
  271. first = attvalue[0]; last=attvalue[-1]
  272. except:
  273. raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry,attlist)))
  274. if first==last=='"' or first==last=="'":
  275. attvalue = attvalue[1:-1]
  276. #print attributename, "=", attvalue
  277. D[attributename] = attvalue
  278. attributename = nextattname
  279. # pass over other tags and content looking for end tag
  280. if docontents is not None:
  281. #print "now looking for end tag"
  282. ContentList = L
  283. while docontents is not None:
  284. nextopenbracket = xmltext.find("<", cursor)
  285. if nextopenbracket<cursor:
  286. #if verbose: print "no next open bracket found"
  287. if name==NONAME:
  288. #print "no more tags for noname", repr(xmltext[cursor:cursor+10])
  289. docontents=None # done
  290. remainder = xmltext[cursor:]
  291. cursor = len(xmltext)
  292. if remainder:
  293. L.append(remainder)
  294. else:
  295. raise ValueError("no close bracket for %s found after %s" % (name,repr(xmltext[cursor: cursor+20])))
  296. # is it a close bracket?
  297. elif xmltext[nextopenbracket+1]=="/":
  298. #print "found close bracket", repr(xmltext[nextopenbracket:nextopenbracket+20])
  299. nextclosebracket = xmltext.find(">", nextopenbracket)
  300. if nextclosebracket<nextopenbracket:
  301. raise ValueError("unclosed close tag %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
  302. closetagcontents = xmltext[nextopenbracket+2: nextclosebracket]
  303. closetaglist = closetagcontents.split()
  304. #if len(closetaglist)!=1:
  305. #print closetagcontents
  306. #raise ValueError, "bad close tag format %s" % repr(xmltext[nextopenbracket: nextopenbracket+20])
  307. # name should match
  308. closename = closetaglist[0]
  309. #if verbose: print "closetag name is", closename
  310. if name!=closename:
  311. prefix = xmltext[:cursor]
  312. endlinenum = len(prefix.split("\n"))
  313. prefix = xmltext[:startingat]
  314. linenum = len(prefix.split("\n"))
  315. raise ValueError("at lines %s...%s close tag name doesn't match %s...%s %s" %(
  316. linenum, endlinenum, repr(name), repr(closename), repr(xmltext[cursor: cursor+100])))
  317. remainder = xmltext[cursor:nextopenbracket]
  318. if remainder:
  319. #if verbose: print "remainder", repr(remainder)
  320. L.append(remainder)
  321. cursor = nextclosebracket+1
  322. #print "for", name, "found close tag"
  323. docontents = None # done
  324. # otherwise we are looking at a new tag, recursively parse it...
  325. # first record any intervening content
  326. else:
  327. remainder = xmltext[cursor:nextopenbracket]
  328. if remainder:
  329. L.append(remainder)
  330. #if verbose:
  331. # #print "skipping", repr(remainder)
  332. # #print "--- recursively parsing starting at", xmltext[nextopenbracket:nextopenbracket+20]
  333. (parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
  334. if parsetree:
  335. L.append(parsetree)
  336. # maybe should check for trailing garbage?
  337. # toplevel:
  338. # remainder = xmltext[cursor:].strip()
  339. # if remainder:
  340. # raise ValueError, "trailing garbage at top level %s" % repr(remainder[:20])
  341. if ContentList:
  342. if entityReplacer: ContentList = entityReplacer(ContentList)
  343. t = (NameString, AttDict, ContentList, ExtraStuff)
  344. return (t, cursor)
  345. import types
  346. def pprettyprint(parsedxml):
  347. """pretty printer mainly for testing"""
  348. st = bytes
  349. if type(parsedxml) is st:
  350. return parsedxml
  351. (name, attdict, textlist, extra) = parsedxml
  352. if not attdict: attdict={}
  353. attlist = []
  354. for k in attdict.keys():
  355. v = attdict[k]
  356. attlist.append("%s=%s" % (k, repr(v)))
  357. attributes = " ".join(attlist)
  358. if not name and attributes:
  359. raise ValueError("name missing with attributes???")
  360. if textlist is not None:
  361. # with content
  362. textlistpprint = list(map(pprettyprint, textlist))
  363. textpprint = "\n".join(textlistpprint)
  364. if not name:
  365. return textpprint # no outer tag
  366. # indent it
  367. nllist = textpprint.split("\n")
  368. textpprint = " "+ ("\n ".join(nllist))
  369. return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
  370. # otherwise must be a simple tag
  371. return "<%s %s/>" % (name, attributes)
  372. dump = 0
  373. def testparse(s):
  374. from time import time
  375. from pprint import pprint
  376. now = time()
  377. D = parsexmlSimple(s)
  378. print("DONE", time()-now)
  379. if dump&4:
  380. pprint(D)
  381. #pprint(D)
  382. if dump&1:
  383. print("============== reformatting")
  384. p = pprettyprint(D)
  385. print(p)
def test():
    """Run testparse on a small inline sample.

    The sample contains escaped entities, a nested tag, a self-closing tag
    with an attribute, a comment, a CDATA section and an attribute value
    with angle brackets in it.
    """
    testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
<!-- comment -->
<![CDATA[
<this type="xml">text <b>in</b> xml</this> ]]>
<tag with="<brackets in values>">just testing brackets feature</tag>
""")
  393. filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
  394. "samples/hamlet.xml"]
  395. #filenames = ["moa.xml"]
  396. dump=1
  397. if __name__=="__main__":
  398. test()
  399. from time import time
  400. now = time()
  401. for f in filenames:
  402. t = open(f).read()
  403. print("parsing", f)
  404. testparse(t)
  405. print("elapsed", time()-now)