File indexing completed on 2024-12-01 23:40:20
0001
0002
0003
0004
0005 import re
0006 import os
0007 import xml.sax.handler
0008 import pprint
0009
class DataNode(object):
    '''Generic container for one parsed XML element.

    XML attributes and child elements are stored by name in ``_attrs``;
    the element's text, if any, is stored in ``_data``.  A name that is
    added more than once is collected into a list.  Values can be read
    with ``node['name']`` or ``node.name``; both return None for
    unknown names.'''

    # Extra indentation added after a "name: " label when pretty-printing.
    spaces = 4

    def __init__(self, **kwargs):
        '''Create an empty node.

        Keyword arguments:
          nameChangeDict - mapping of XML names to the names they should
                           be stored under (default: no renaming).'''
        self._attrs = {}
        self._data = None
        self._ncDict = kwargs.get('nameChangeDict', {})

    def __len__(self):
        '''A node always behaves like a sequence holding only itself.'''
        return 1

    def __getitem__(self, key):
        '''Return the attribute named `key` (None if absent) for string
        keys; any other key indexes the one-element list [self], so
        node[0] is the node itself.'''
        if isinstance(key, str):
            return self._attrs.get(key, None)
        return [self][key]

    def __contains__(self, name):
        '''True if an attribute or child called `name` was stored.'''
        return name in self._attrs

    def __bool__(self):
        '''A node is true when it holds attributes or text.'''
        return bool(self._attrs or self._data)

    # Python 2 spelling of the truth hook.  Without __bool__, Python 3
    # falls back to __len__ and every node would be true.
    __nonzero__ = __bool__

    def __getattr__(self, name):
        '''Expose stored attributes as Python attributes (None if absent).

        Dunder names raise AttributeError so protocol probes behave
        normally.  '_attrs' raises too: this hook only runs for it when
        the instance dict lacks it, and looking it up again here would
        recurse without bound.'''
        if name.startswith('__') or name == '_attrs':
            raise AttributeError(name)
        return self._attrs.get(name, None)

    def _add_xml_attr(self, name, value):
        '''Store `value` under `name`, applying the name-change mapping.

        A second value for the same name turns the entry into a list;
        later values are appended to that list.'''
        change = self._ncDict.get(name)
        if change:
            name = change
        if name in self._attrs:
            children = self._attrs[name]
            if not isinstance(children, list):
                children = [children]
                self._attrs[name] = children
            children.append(value)
        else:
            self._attrs[name] = value

    def __str__(self):
        return self.stringify()

    def __repr__(self):
        items = sorted(self._attrs.items())
        if self._data:
            items.append(('data', self._data))
        return u'{%s}' % ', '.join([u'%s:%s' % (k, repr(v)) for k, v in items])

    def attributes(self):
        '''Return the underlying name -> value dictionary (not a copy).'''
        return self._attrs

    @staticmethod
    def isiterable(obj):
        '''True if `obj` has __iter__ and is not a string.

        Strings are excluded so that text values inside a list are
        written bare by _outputValues rather than pretty-printed.'''
        if isinstance(obj, (str, bytes)):
            return False
        return hasattr(obj, '__iter__')

    @staticmethod
    def _outputValues(obj, name, offset):
        '''Format `obj` as text indented by `offset` spaces.

        If `name` is given the output starts with "name: " and nested
        content is indented further.  Lists are written one element per
        line between brackets; anything else goes through
        pprint.pformat.'''
        retval = ' ' * offset
        if name:
            retval += '%s: ' % name
            offset += len(name) + DataNode.spaces
        if isinstance(obj, list):
            pad = ' ' * offset
            for index, value in enumerate(obj):
                retval += ('[\n ' if index == 0 else ',\n ') + pad
                if isinstance(value, DataNode):
                    retval += value.stringify(offset=offset)
                elif DataNode.isiterable(value):
                    retval += DataNode._outputValues(value, '', offset)
                else:
                    retval += "%s" % value
            retval += '\n' + ' ' * (offset - 2) + ']\n'
            return retval
        retval += pprint.pformat(obj, indent=offset, width=1)
        return retval

    def stringify(self, name='', offset=0):
        '''Return a multi-line, human-readable rendering of this node.

        name   - label to print in front of the node ('' for none)
        offset - number of spaces to indent by'''
        # Text only, nothing below it: print just the text.
        if self._data and not self._attrs:
            return DataNode._outputValues(self._data, name, offset)

        retval = ''
        if name:
            retval += '\n' + ' ' * offset
            retval += '%s: ' % name
        tempspace = offset + 3
        if not self._attrs:
            # Keep the braces balanced for a completely empty node.
            retval += '{ '
        first = True
        for key, value in sorted(self._attrs.items()):
            retval += '{ \n' if first else ',\n'
            first = False
            if isinstance(value, DataNode):
                retval += value.stringify(key, tempspace)
            else:
                retval += DataNode._outputValues(value, key, tempspace)
        # Text stored alongside the attributes is listed last.
        if self._data:
            retval += ',\n'
            retval += DataNode._outputValues(self._data, name, tempspace)
        retval += '\n ' + ' ' * offset + '}'
        return retval
0156
0157
0158
class TreeBuilder(xml.sax.handler.ContentHandler):
    '''SAX content handler that builds a tree of DataNode objects.

    Each element becomes an entry on its parent node, stored under the
    element's mangled name.  An element that has XML attributes or
    child elements is stored as a DataNode; one that has neither is
    stored as its stripped text (possibly the empty string).'''

    # Any character that is not legal in a Python identifier.
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__(self, **kwargs):
        '''Keyword arguments:
          nameChangeDict - mapping of XML names to replacement names,
                           handed to every DataNode that is created.'''
        xml.sax.handler.ContentHandler.__init__(self)
        self._stack = []
        self._text_parts = []
        self._ncDict = kwargs.get('nameChangeDict', {})
        self._root = DataNode(nameChangeDict=self._ncDict)
        self.current = self._root

    def startElement(self, name, attrs):
        '''Open a new node, saving the parent node and its pending text.'''
        self._stack.append((self.current, self._text_parts))
        self.current = DataNode(nameChangeDict=self._ncDict)
        self._text_parts = []
        # XML attributes become ordinary entries on the new node.
        for k, v in attrs.items():
            self.current._add_xml_attr(TreeBuilder._name_mangle(k), v)

    def endElement(self, name):
        '''Close the current node and attach it to its parent.'''
        text = ''.join(self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # Nothing but text (or nothing at all): store a plain string.
            obj = text or ''
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr(TreeBuilder._name_mangle(name), obj)

    def characters(self, content):
        '''Collect a chunk of character data for the current element.'''
        self._text_parts.append(content)

    def root(self):
        '''Returns the synthetic root node that holds the top level object'''
        return self._root

    def topLevel(self):
        '''Returns top level object

        Raises IndexError if no element has been parsed yet.'''
        top = list(self._root.attributes().values())
        if not top:
            raise IndexError("no top-level element has been parsed")
        return top[0]

    @staticmethod
    def _name_mangle(name):
        '''Replace every non-identifier character in `name` with "_".'''
        return TreeBuilder.non_id_char.sub('_', name)
0204
0205
# Characters that are illegal inside an XML attribute value, paired with
# the entity that replaces them.  '&' must stay first so the ampersands
# introduced by the later replacements are not escaped a second time.
regexList = [
    (re.compile(r'&'), '&amp;'),
    (re.compile(r'<'), '&lt;'),
    (re.compile(r'>'), '&gt;'),
    (re.compile(r'"'), '&quot;'),
    (re.compile(r"'"), '&apos;'),
]

# Matches  name="value" : group 1 is everything up to and including the
# opening quote, group 2 is the (non-empty) text between the quotes.
quoteRE = re.compile(r'(\w\s*=\s*")([^"]+)"')


def fixQuoteValue(match):
    '''Changes all characters inside of the match

    `match` is a quoteRE match object.  Returns the matched text with
    every character listed in regexList replaced by its entity inside
    the quoted value; the part before the value and the closing quote
    are left as they were.'''
    quote = match.group(2)
    for pattern, replacement in regexList:
        quote = pattern.sub(replacement, quote)
    return match.group(1) + quote + '"'
0221
0222
def xml2obj(**kwargs):
    ''' Converts XML data into native Python object.  Takes a file
    handle, a string, or a file name as input.  Illegal characters are
    only repaired when 'filtering' is requested.

    input source: Exactly one of the three following is needed
    filehandle - input from file handle
    contents   - input from string
    filename   - input from filename

    options:
    filtering      - boolean value telling code whether or not to filter
                     the input to escape illegal XML characters inside
                     double-quoted attribute values
    nameChangeDict - dictionary of names to change in python object

    Returns the object built for the top level XML element.

    Raises RuntimeError if no input source, or more than one, is given,
    or if 'filename' cannot be opened.  Errors raised by xml.sax while
    parsing are not caught here.'''
    filehandle = kwargs.get('filehandle')
    contents = kwargs.get('contents')
    filename = kwargs.get('filename')
    given = [source for source in (filehandle, contents, filename) if source]
    if not given:
        raise RuntimeError("You must provide 'filehandle', 'contents', or 'filename'")
    if len(given) > 1:
        raise RuntimeError("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    opened_here = False
    if filename:
        try:
            filehandle = open(filename, 'r')
        except (IOError, OSError, TypeError, ValueError):
            raise RuntimeError("Failed to open '%s'" % filename)
        opened_here = True

    try:
        builder = TreeBuilder(nameChangeDict=kwargs.get('nameChangeDict', {}))
        if kwargs.get('filtering'):
            # Filtering works on text, so read the whole input first.
            if not contents:
                contents = ''.join(filehandle)
                # A handle supplied by the caller is closed here as
                # well, as this function has always done when filtering.
                filehandle.close()
            contents = quoteRE.sub(fixQuoteValue, contents)
            # Parse even if the filtered text is empty, so that empty
            # input is reported by the parser like on the other paths.
            xml.sax.parseString(contents, builder)
        elif contents:
            xml.sax.parseString(contents, builder)
        else:
            xml.sax.parse(filehandle, builder)
    finally:
        # Only close what this function opened; closing twice is harmless.
        if opened_here:
            filehandle.close()
    return builder.topLevel()