"""Very simple and fast XML parser, used for intra-paragraph text.
Devised by Aaron Watters in the bad old days before Python had fast
parsers available. Constructs the lightest possible in-memory
representation; parses most files we have seen in pure python very
quickly.
The output structure is the same as the one produced by pyRXP,
our validating C-based parser, which was written later. It will
use pyRXP if available.
This is used to parse intra-paragraph markup.
Example parse::
<this type="xml">text <b>in</b> xml</this>
( "this",
{"type": "xml"},
[ "text ",
("b", None, ["in"], None),
" xml"
]
None )
{ 0: "this"
"type": "xml"
1: ["text ",
{0: "b", 1:["in"]},
" xml"]
}
Ie, xml tag translates to a tuple:
(name, dictofattributes, contentlist, miscellaneousinfo)
where miscellaneousinfo can be anything, (but defaults to None)
(with the intention of adding, eg, line number information)
special cases: name of "" means "top level, no containing tag".
Top level parse always looks like this::
("", None, list, None)
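contained text (content list) of None means the tag is a simple tag, eg <simple_tag/>
The nameless top level exists in order to support stuff like::
<first>one</first><second>two</second>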
AT THE MOMENT &amp; ETCETERA (ENTITY REFERENCES) ARE IGNORED. THEY MUST BE PROCESSED
IN A POST-PROCESSING STEP.
PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING.
"""
RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser.
try:
    # Use pyRXPU when it can be imported; simpleparse records which parser
    # ended up being selected (0 = pyRXPU, 1 = pure-Python fallback).
    simpleparse = 0
    import pyRXPU

    def warnCB(s):
        """Warning callback handed to the pyRXPU parser: print the message."""
        print(s)

    pyRXP_parser = pyRXPU.Parser(
        ErrorOnValidityErrors=1,
        NoNoDTDWarning=1,
        ExpandCharacterEntities=1,
        ExpandGeneralEntities=1,
        warnCB=warnCB,
        srcName='string input',
        ReturnUTF8=0,
    )

    def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None,parseOpts=None):
        """Parse xmlText with the shared pyRXPU parser.

        xmlText         -- the XML source to parse.
        oneOutermostTag -- when true, return the parser's result as is;
                           otherwise wrap it as the single child of a nameless
                           top level node ('', None, [result], None).
        eoCB            -- stored on the shared parser as its eoCB attribute
                           before parsing.
        entityReplacer  -- accepted for signature compatibility with the
                           fallback parser; not used here.
        parseOpts       -- optional mapping of extra keyword arguments passed
                           to pyRXP_parser.parse (None means no extras).
        """
        pyRXP_parser.eoCB = eoCB
        # None instead of a shared mutable {} default
        p = pyRXP_parser.parse(xmlText, **(parseOpts or {}))
        # a conditional expression, so a falsy parse result is still returned
        # unwrapped when oneOutermostTag is set ("x and p or y" would wrap it)
        return p if oneOutermostTag else ('', None, [p], None)
except ImportError:
    simpleparse = 1
# Name given to the synthetic top level node ("top level, no containing tag").
NONAME = ""
# NOTE(review): these look like the keys of the dictionary-style
# representation shown in the module docstring (0 -> name, 1 -> contents);
# nothing visible in this file reads them -- confirm before removing.
NAMEKEY = 0
CONTENTSKEY = 1
# Opening marker of a CDATA section (9 characters, the width of the slice the
# CDATA branch of parsexml0 compares).  The empty string found here before
# could never identify a CDATA section.
CDATAMARKER = "<![CDATA["
# (entity reference, replacement) pairs applied in order by
# unEscapeContentList.  The previous pairs mapped each character to itself,
# so nothing was ever unescaped.
replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")] # amp must be last
#replacelist = []
def unEscapeContentList(contentList):
    """Return a new list with the replacelist substitutions applied.

    Every item of contentList that contains "&" has each (old, new) pair of
    the module-level replacelist applied to it in order; all other items are
    copied over untouched.  The input list itself is not modified.
    """
    def _unescape(item):
        # items without an ampersand are passed through as they are
        if "&" not in item:
            return item
        for entity, replacement in replacelist:
            item = item.replace(entity, replacement)
        return item

    return [_unescape(item) for item in contentList]
def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList):
    """Official interface to the fallback parser; the cursor is discarded.

    xmltext         -- the XML source to parse.
    oneOutermostTag -- when true, return only the first item of the top level
                       content list; otherwise return the whole top level node.
    eoCB            -- accepted for signature compatibility; not used here.
    entityReplacer  -- forwarded to parsexml0.

    Raises ImportError when RequirePyRXP is set, because the fallback parser
    is then disabled.
    """
    if RequirePyRXP:
        raise ImportError("pyRXP not found, fallback parser disabled")
    # parsexml0 hands back (parse result, cursor); only the first is wanted
    tree, _cursor = parsexml0(xmltext, entityReplacer=entityReplacer)
    # index 2 of a node is its content list
    return tree[2][0] if oneOutermostTag else tree
# Expose the pure-Python parser under the public name parsexml when
# simpleparse is set (it is set to 1 where the pyRXPU import fails).
if simpleparse:
parsexml = parsexmlSimple
def parseFile(filename):
    """Read filename in text mode and return parsexml() of its contents."""
    # the context manager closes the file even if reading fails; the bare
    # open(...).read() used before left that to the garbage collector
    with open(filename, 'r') as infile:
        raw = infile.read()
    return parsexml(raw)
# Debug flag; in this view it is referenced only by commented-out print statements.
verbose = 0
def skip_prologue(text, cursor):
    """skip any prologue found after cursor, return index of rest of text

    Starting at cursor, the next "<" is located.  If it opens a !DOCTYPE,
    ?xml or comment element, the cursor moves to just past the end of that
    element and the search repeats.  Scanning stops at the first "<" that
    opens anything else, or when no "<" is left.  Any text lying between the
    cursor and a skipped element is skipped along with it.

    Comments are closed at "-->" so that a ">" inside a comment does not end
    it early; the other elements are closed at the first ">".

    Raises ValueError when a recognised element is never closed.
    """
    ### NOT AT ALL COMPLETE!!! definitely can be confused!!!
    # (text expected right after "<", string that terminates the element)
    prologue_elements = (("!DOCTYPE", ">"), ("?xml", ">"), ("!--", "-->"))
    while True:
        openbracket = text.find("<", cursor)
        if openbracket < 0:
            break
        past = openbracket + 1
        for prefix, terminator in prologue_elements:
            if text.startswith(prefix, past):
                end = text.find(terminator, past)
                if end < 0:
                    raise ValueError("can't close prologue %r" % prefix)
                cursor = end + len(terminator)
                break
        else:
            # the "<" does not start a prologue element: nothing more to skip
            break
    return cursor
def parsexml0(xmltext, startingat=0, toplevel=1,
# snarf in some globals
entityReplacer=unEscapeContentList,
#len=len, None=None
#LENCDATAMARKER=LENCDATAMARKER, CDATAMARKER=CDATAMARKER
):
"""simple recursive descent xml parser...
return (dictionary, endcharacter)
special case: comment returns (None, endcharacter)"""
# NOTE(review): the text of this function is damaged in this copy (see the
# notes further down) and the statement that builds the normal return value
# is not visible.  The caller in this file unpacks the result as
# (parse result, cursor) and reads the content list at index 2 of the
# parse result.
#print "parsexml0", repr(xmltext[startingat: startingat+10])
# DEFAULTS
NameString = NONAME
ContentList = AttDict = ExtraStuff = None
if toplevel is not None:
#if verbose: print "at top level"
#if startingat!=0:
# raise ValueError, "have to start at 0 for top level!"
xmltext = xmltext.strip()
cursor = startingat
#look for interesting starting points
firstbracket = xmltext.find("<", cursor)
# the two characters that follow the first "<" (compared with "![" below)
afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
#print "a", repr(afterbracket2char)
#firstampersand = xmltext.find("&", cursor)
# NOTE(review): the next comment line looks truncated and code may be missing
# around it: docontents is tested further down but is only ever assigned
# None in the visible text.
#if firstampersand>0 and firstampersand0:
#afterbracket2char = xmltext[firstbracket:firstbracket+2]
if toplevel is not None:
#print "toplevel with no outer tag"
NameString = name = NONAME
cursor = skip_prologue(xmltext, cursor)
#break
elif firstbracket<0:
raise ValueError("non top level entry should be at start tag: %s" % repr(xmltext[:10]))
# special case: CDATA
# NOTE(review): the next three lines appear to be the fused remains of two
# branches (CDATA handling and comment handling).  A 9-character slice is
# compared with an empty string, and endcomment is not assigned anywhere in
# the visible text.
elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="":
raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
return (None, endcomment+1) # shortcut exit
else:
# get the rest of the tag
#if verbose: print "parsing start tag"
# make sure the tag isn't in doublequote pairs
closebracket = xmltext.find(">", firstbracket)
noclose = closebracket<0
startsearch = closebracket+1
pastfirstbracket = firstbracket+1
tagcontent = xmltext[pastfirstbracket:closebracket]
# shortcut, no equal means nothing but name in the tag content
if '=' not in tagcontent:
# NOTE(review): tagcontent[-1] raises IndexError when tagcontent is empty
# (e.g. "<>"), and noclose does not appear to be checked on this
# path -- confirm.
if tagcontent[-1]=="/":
# simple case
#print "simple case", tagcontent
tagcontent = tagcontent[:-1]
docontents = None
name = tagcontent.strip()
NameString = name
cursor = startsearch
else:
if '"' in tagcontent:
# check double quotes
stop = None
# not inside double quotes! (the split should have odd length)
# an odd number of pieces means an even number of '"' characters, i.e.
# the ">" that was found does not lie inside a quoted value
if noclose or len((tagcontent+".").split('"'))% 2:
stop=1
while stop is None:
# extend the tag to the next ">" until the quotes balance or no ">" is left
closebracket = xmltext.find(">", startsearch)
startsearch = closebracket+1
noclose = closebracket<0
tagcontent = xmltext[pastfirstbracket:closebracket]
# not inside double quotes! (the split should have odd length)
if noclose or len((tagcontent+".").split('"'))% 2:
stop=1
if noclose:
raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
cursor = startsearch
#cursor = closebracket+1
# handle simple tag /> syntax
if xmltext[closebracket-1]=="/":
#if verbose: print "it's a simple tag"
closebracket = closebracket-1
tagcontent = tagcontent[:-1]
docontents = None
#tagcontent = xmltext[firstbracket+1:closebracket]
tagcontent = tagcontent.strip()
# splitting on "=" leaves each piece holding the previous attribute's value
# followed by the next attribute's name
taglist = tagcontent.split("=")
#if not taglist:
# raise ValueError, "tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20])
taglist0 = taglist[0]
taglist0list = taglist0.split()
#if len(taglist0list)>2:
# raise ValueError, "bad tag head %s" % repr(taglist0)
name = taglist0list[0]
#print "tag name is", name
NameString = name
# now parse the attributes
attributename = taglist0list[-1]
# put a fake att name at end of last taglist entry for consistent parsing
taglist[-1] = taglist[-1]+" f"
AttDict = D = {}
taglistindex = 1
lasttaglistindex = len(taglist)
#for attentry in taglist[1:]:
# NOTE(review): the next line is damaged -- taglistindexlasttaglistindex is
# not a name defined anywhere in the visible text, and attentry is used
# below before any visible assignment.  The loop condition and the start of
# the loop body appear to have been lost.
while taglistindexlasttaglistindex:
raise ValueError("unclosed value " + repr(attentry))
nextattentry = taglist[taglistindex]
taglistindex = taglistindex+1
attentry = "%s=%s" % (attentry, nextattentry)
attentry = attentry.strip() # only needed for while loop...
attlist = attentry.split()
# the last whitespace-separated word is the name of the NEXT attribute;
# everything before it is the current attribute's value
nextattname = attlist[-1]
attvalue = attentry[:-len(nextattname)]
attvalue = attvalue.strip()
try:
first = attvalue[0]; last=attvalue[-1]
except:
raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry,attlist)))
# strip one pair of matching surrounding quotes
if first==last=='"' or first==last=="'":
attvalue = attvalue[1:-1]
#print attributename, "=", attvalue
D[attributename] = attvalue
attributename = nextattname
# pass over other tags and content looking for end tag
if docontents is not None:
#print "now looking for end tag"
# NOTE(review): L is not assigned anywhere in the visible text.
ContentList = L
while docontents is not None:
nextopenbracket = xmltext.find("<", cursor)
# NOTE(review): the next two lines are damaged.  The rest of this function
# is missing, and the trailing format expression and the return below use
# names (attributes, textpprint) that this function never sets --
# presumably the tail of the pprettyprint function that testparse calls;
# confirm against an intact copy.
if nextopenbracket", nextopenbracket)
if nextclosebracket\n%s\n%s>" % (name, attributes, textpprint, name)
# otherwise must be a simple tag
return "<%s %s/>" % (name, attributes)
dump = 0
def testparse(s):
    """Parse s with parsexmlSimple, print the time taken, optionally dump it.

    The module-level dump value selects the extra output: when dump & 4 is
    set the parse result is pretty-printed with pprint; when dump & 1 is set
    the result of pprettyprint on it is printed.
    """
    import time
    import pprint
    started = time.time()
    tree = parsexmlSimple(s)
    print("DONE", time.time()-started)
    if dump & 4:
        pprint.pprint(tree)
    if dump & 1:
        print("============== reformatting")
        print(pprettyprint(tree))
def test():
    """Run testparse on a small built-in sample."""
    # NOTE(review): the sample looks as if its markup was lost in this copy
    # of the file; it is reproduced exactly as found.
    sample = """text <>in xml
text in xml ]]>
just testing brackets feature
"""
    testparse(sample)
# Sample files parsed when the module is run as a script.
filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
    "samples/hamlet.xml"]
#filenames = ["moa.xml"]
# Rebinds the dump flag read by testparse, so that dump & 1 is set.
dump=1
if __name__=="__main__":
    test()
    from time import time
    now = time()
    for f in filenames:
        # read through a context manager so each file is closed as soon as
        # it has been read, instead of whenever it is garbage collected
        with open(f) as infile:
            t = infile.read()
        print("parsing", f)
        testparse(t)
    print("elapsed", time()-now)