File indexing completed on 2024-04-06 12:31:48
0001
0002
0003
0004
0005 import re
0006 import os
0007 import xml.sax.handler
0008
class DataNode(object):
    '''Container for one XML element: its attributes/children and its text.

    Attributes and child elements are stored together in one dictionary
    and can be read either as ``node.name`` or ``node['name']``; a name
    that is not present yields None rather than raising.  A name that is
    added more than once holds a list of all the values, in the order
    they were added.  The element text is kept separately and is what
    ``str(node)`` returns.

    Keyword arguments:
        nameChangeDict - optional dictionary mapping a name to the name
                         it should be stored under instead
    '''

    def __init__(self, **kwargs):
        self._attrs = {}
        self._data = None
        self._ncDict = kwargs.get('nameChangeDict', {})

    def __len__(self):
        # A node always reports length 1 so that it can be handled the
        # same way as the lists created for repeated names.
        return 1

    def __getitem__(self, key):
        '''String keys look up a stored name (None when absent); any
        other key indexes the one-element list ``[self]``.'''
        if isinstance(key, str):
            return self._attrs.get(key, None)
        return [self][key]

    def __contains__(self, name):
        return name in self._attrs

    def __nonzero__(self):
        '''A node is true when it has any stored name or any text.'''
        return bool(self._attrs or self._data)

    # Python 3 looks up __bool__, not __nonzero__.  Without this alias
    # the truth value would come from __len__ and always be True.
    __bool__ = __nonzero__

    def __getattr__(self, name):
        '''Return the value stored under 'name', or None when absent.

        Raises AttributeError for double-underscore names, and for any
        name when the node has not been initialised yet.
        '''
        if name.startswith('__'):
            # Let protocol probes for special methods fail normally.
            raise AttributeError(name)
        # Read the storage through __dict__: going through self._attrs
        # would re-enter __getattr__ forever if __init__ has not run.
        attrs = self.__dict__.get('_attrs')
        if attrs is None:
            raise AttributeError(name)
        return attrs.get(name, None)

    def _add_xml_attr(self, name, value):
        '''Store 'value' under 'name' (after applying nameChangeDict).

        The first value for a name is stored as is; further values turn
        the entry into a list and are appended to it.
        '''
        change = self._ncDict.get(name)
        if change:
            name = change
        if name in self._attrs:
            # Multiple entries with the same name: keep them all in a list.
            children = self._attrs[name]
            if not isinstance(children, list):
                children = [children]
                self._attrs[name] = children
            children.append(value)
        else:
            self._attrs[name] = value

    def __str__(self):
        return self._data or ''

    def __repr__(self):
        items = sorted(self._attrs.items())
        if self._data:
            items.append(('data', self._data))
        return u'{%s}' % ', '.join([u'%s:%s' % (k, repr(v)) for k, v in items])

    def attributes(self):
        '''Return the dictionary of stored names (not a copy).'''
        return self._attrs
0063
0064
class TreeBuilder(xml.sax.handler.ContentHandler):
    '''SAX content handler that builds a tree of DataNode objects.

    Each element becomes an entry, under its mangled tag name, in the
    node of its parent.  An element that ends up with attributes or
    child elements is stored as a DataNode; one with neither is stored
    as its stripped text (possibly the empty string).

    Keyword arguments:
        nameChangeDict - optional dictionary of name replacements that is
                         handed to every DataNode created
    '''

    # Matches every character other than '_', a digit or an ASCII letter.
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__(self, **kwargs):
        # Initialise the base handler as well so that the state it
        # maintains exists before parsing starts.
        xml.sax.handler.ContentHandler.__init__(self)
        self._stack = []
        self._text_parts = []
        self._ncDict = kwargs.get('nameChangeDict', {})
        self._root = DataNode(nameChangeDict=self._ncDict)
        self.current = self._root

    def startElement(self, name, attrs):
        '''Open a new node and record the element's XML attributes.'''
        # Save the parent's state; it is restored in endElement.
        self._stack.append((self.current, self._text_parts))
        self.current = DataNode(nameChangeDict=self._ncDict)
        self._text_parts = []
        # XML attributes become entries of the new node.
        for k, v in attrs.items():
            self.current._add_xml_attr(TreeBuilder._name_mangle(k), v)

    def endElement(self, name):
        '''Close the current node and attach it to its parent.'''
        text = ''.join(self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # A text-only element is stored as a plain string.
            obj = text or ''
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr(TreeBuilder._name_mangle(name), obj)

    def characters(self, content):
        '''Collect a piece of character data for the current element.'''
        self._text_parts.append(content)

    def root(self):
        '''Return the artificial root node that holds the document element.'''
        return self._root

    def topLevel(self):
        '''Return the first object stored in the root node.

        Raises IndexError when nothing has been parsed yet.
        '''
        return list(self._root.attributes().values())[0]

    @staticmethod
    def _name_mangle(name):
        '''Replace each character matched by non_id_char with "_".'''
        return TreeBuilder.non_id_char.sub('_', name)
0110
0111
# (pattern, replacement) pairs that escape the characters which are not
# allowed verbatim inside an XML attribute value.  The replacements must
# be the XML entities themselves, not the characters they stand for.
# '&' has to stay first: the later replacements introduce '&' characters
# that must not be escaped a second time.
regexList = [
    (re.compile(r'&'), '&amp;'),
    (re.compile(r'<'), '&lt;'),
    (re.compile(r'>'), '&gt;'),
    (re.compile(r'"'), '&quot;'),
    (re.compile(r"'"), '&apos;'),
]

# Group 1: a word character, '=' and the opening double quote (with
# optional whitespace around the '=').  Group 2: the non-empty text up to
# the next double quote.
quoteRE = re.compile(r'(\w\s*=\s*")([^"]+)"')
0120
def fixQuoteValue(match):
    '''Substitution callback for quoteRE.

    Runs every (pattern, replacement) pair of regexList, in order, over
    the quoted text (group 2) and returns it between the unchanged
    prefix (group 1) and a closing double quote.
    '''
    prefix = match.group(1)
    escaped = match.group(2)
    for pattern, replacement in regexList:
        escaped = pattern.sub(replacement, escaped)
    return ''.join((prefix, escaped, '"'))
0127
0128
def xml2obj(**kwargs):
    '''Convert XML data into a native Python object.

    Input source -- exactly one of the following keyword arguments must
    be given (with a non-empty value):
        filehandle - open file object to read the XML from
        contents   - string holding the XML document
        filename   - name of the file to read the XML from

    Options:
        filtering      - if true, the whole input is read into memory
                         and the characters & < > and ' found inside
                         double-quoted values are replaced by XML
                         entities before parsing.  A 'filehandle' given
                         by the caller is closed after it has been read.
        nameChangeDict - dictionary of names to change in the Python
                         object

    Returns the object built for the top-level element of the document.

    Raises RuntimeError when not exactly one input source is given or
    when 'filename' cannot be opened.  Errors raised by the xml.sax
    parser are passed on unchanged.
    '''
    filehandle = kwargs.get('filehandle')
    contents = kwargs.get('contents')
    filename = kwargs.get('filename')

    # Make sure we have exactly one input source.
    numSources = sum(1 for src in (filehandle, contents, filename) if src)
    if numSources == 0:
        raise RuntimeError("You must provide 'filehandle', 'contents', or 'filename'")
    if numSources > 1:
        raise RuntimeError("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    # Only 'filename' was given: open the file ourselves and remember
    # that we are responsible for closing it again.
    openedHere = False
    if not contents and not filehandle:
        try:
            filehandle = open(filename, 'r')
        except (IOError, OSError):
            raise RuntimeError("Failed to open '%s'" % filename)
        openedHere = True

    try:
        # Decide once where the parser reads from.  Testing 'contents'
        # later would mistake an empty file for "no contents".
        fromString = bool(contents)
        if kwargs.get('filtering'):
            if not fromString:
                # Filtering works on a string, so read everything in.
                try:
                    contents = ''.join(filehandle)
                finally:
                    filehandle.close()
                fromString = True
            contents = quoteRE.sub(fixQuoteValue, contents)

        builder = TreeBuilder(nameChangeDict=kwargs.get('nameChangeDict', {}))
        if fromString:
            xml.sax.parseString(contents, builder)
        else:
            xml.sax.parse(filehandle, builder)
    finally:
        # Closing an already closed file is harmless.
        if openedHere:
            filehandle.close()
    return builder.topLevel()