File indexing completed on 2023-03-17 11:03:35
0001 from __future__ import print_function
0002
0003
0004
0005
0006 import re
0007 import os
0008 import xml.sax.handler
0009 import pprint
0010
class DataNode (object):
    """Container for one XML element: its attributes, children and text.

    XML attributes and child elements share one name -> value mapping and
    can be read as ``node.name``, ``node['name']`` or tested with
    ``'name' in node``.  Unknown names yield ``None`` instead of raising.
    A name that is added more than once maps to a list of its values.

    A single node also answers ``len(node) == 1`` and ``node[0] is node``
    so that a lone element can be indexed like a one-element list.
    """

    # Extra columns added after a "name: " label when pretty-printing.
    spaces = 4

    def __init__ (self, **kwargs):
        """Create an empty node.

        Keyword arguments:
          nameChangeDict - optional dict mapping an incoming attribute or
                           element name to the name it is stored under.
        """
        self._attrs = {}      # XML attributes and child elements by name
        self._data = None     # text content of the element, if any
        self._ncDict = kwargs.get ('nameChangeDict', {})


    def __len__ (self):
        # A single element is treated as a list of length one.
        return 1


    def __getitem__ (self, key):
        """Return the named attribute for a ``str`` key (``None`` when
        absent); any other key indexes the one-element list ``[self]``."""
        if isinstance (key, str):
            return self._attrs.get (key, None)
        return [self][key]


    def __contains__ (self, name):
        return name in self._attrs


    def __nonzero__ (self):
        """A node is true when it holds any attribute/child or any text."""
        return bool (self._attrs or self._data)

    # Python 3 looks up __bool__; without it truthiness would fall back to
    # __len__, which always returns 1 and would make empty nodes true.
    __bool__ = __nonzero__


    def __getattr__ (self, name):
        """Expose stored attributes as Python attributes (``None`` when
        the name is unknown)."""
        if name.startswith ('__'):
            # Dunder lookups must fail normally so Python's special-method
            # probing is not answered with None.
            raise AttributeError (name)
        return self._attrs.get (name, None)


    def _add_xml_attr (self, name, value):
        """Store ``value`` under ``name`` (after applying the name-change
        dictionary), collecting repeated names into a list."""
        change = self._ncDict.get (name)
        if change:
            name = change
        if name in self._attrs:
            # Several values with the same name are represented by a list.
            children = self._attrs[name]
            if not isinstance (children, list):
                children = [children]
                self._attrs[name] = children
            children.append (value)
        else:
            self._attrs[name] = value


    def __str__ (self):
        return self.stringify()


    def __repr__ (self):
        """Compact ``{name:repr, ...}`` form, sorted by name, with the
        text content appended last under the label ``data``."""
        items = sorted (self._attrs.items())
        if self._data:
            items.append (('data', self._data))
        return u'{%s}' % ', '.join ([u'%s:%s' % (k, repr (v))
                                     for k, v in items])


    def attributes (self):
        """Return the live name -> value dictionary (not a copy)."""
        return self._attrs


    @staticmethod
    def isiterable (obj):
        """Return a true value (the ``__iter__`` attribute itself) when
        ``obj`` defines ``__iter__``, otherwise ``False``."""
        return getattr (obj, '__iter__', False)


    @staticmethod
    def _outputValues (obj, name, offset):
        """Format ``obj`` as text indented by ``offset`` spaces.

        obj    - value to format; lists are expanded one item per line,
                 anything else is rendered with ``pprint.pformat``.
        name   - optional label emitted as ``name: `` before the value.
        offset - number of leading spaces.

        Returns the formatted string.
        """
        retval = ' ' * offset
        if name:
            retval += '%s: ' % name
            # Continuation lines are aligned past the label.
            offset += len (name) + DataNode.spaces

        if isinstance (obj, list):
            first = True
            for value in obj:
                if first:
                    retval += '[\n ' + ' ' * offset
                    first = False
                else:
                    retval += ',\n ' + ' ' * offset
                if isinstance (value, DataNode):
                    retval += value.stringify (offset=offset)
                elif DataNode.isiterable (value):
                    retval += DataNode._outputValues (value, '', offset)
                else:
                    retval += "%s" % value
            retval += '\n' + ' ' * (offset - 2) + ']\n'
            return retval

        retval += pprint.pformat (obj, indent=offset, width=1)
        return retval


    def stringify (self, name = '', offset = 0):
        """Return a multi-line, human-readable rendering of this node.

        name   - optional label printed before the node.
        offset - indentation (in spaces) of the node.

        A node holding only text is rendered as that text.  Otherwise the
        attributes/children are listed inside ``{ ... }`` sorted by name,
        followed by the text content when there is any.
        """
        # Only text and nothing below it.
        if self._data and not len (self._attrs):
            return DataNode._outputValues (self._data, name, offset)

        # The node has attributes and/or children.
        retval = ''
        if name:
            retval += '\n' + ' ' * offset
            retval += '%s: ' % name
        tempspace = offset + 3
        first = True
        for key, value in sorted (self._attrs.items()):
            if first:
                retval += '{ \n'
                first = False
            else:
                retval += ',\n'
            if isinstance (value, DataNode):
                retval += value.stringify (key, tempspace)
            else:
                retval += DataNode._outputValues (value, key, tempspace)

        # The node carries text in addition to its attributes.
        if self._data:
            retval += ',\n'
            retval += DataNode._outputValues (self._data, name, tempspace)
        retval += '\n ' + ' ' * offset + '}'
        return retval
0157
0158
0159
class TreeBuilder (xml.sax.handler.ContentHandler):
    """SAX content handler that assembles parsed XML into DataNode objects.

    Each element's XML attributes are stored on a fresh DataNode.  When the
    element closes it is attached to its parent under its (mangled) tag
    name: as the DataNode itself if it has any attributes or children,
    otherwise as its stripped text (``''`` when it has no text).
    """

    # Matches every character outside [_0-9a-zA-Z]; such characters are
    # replaced so XML names can be used as Python attribute names.
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__ (self, **kwargs):
        """Create a builder with an empty root node.

        Keyword arguments:
          nameChangeDict - optional dict of name replacements handed to
                           every DataNode that is created.
        """
        # Explicit base call (works for old- and new-style base classes).
        xml.sax.handler.ContentHandler.__init__ (self)
        self._stack = []         # (parent node, parent's text parts) pairs
        self._text_parts = []    # text chunks seen for the open element
        self._ncDict = kwargs.get ('nameChangeDict', {})
        self._root = DataNode (nameChangeDict = self._ncDict)
        self.current = self._root

    def startElement (self, name, attrs):
        """Open a new node, saving the parent's state on the stack."""
        self._stack.append ((self.current, self._text_parts))
        self.current = DataNode (nameChangeDict = self._ncDict)
        self._text_parts = []
        # XML attributes become regular attributes of the node.
        for k, v in attrs.items():
            self.current._add_xml_attr (TreeBuilder._name_mangle (k), v)

    def endElement (self, name):
        """Close the current node and attach it to its parent."""
        text = ''.join (self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # A text-only element is stored as a plain string.
            obj = text or ''
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr (TreeBuilder._name_mangle (name), obj)

    def characters (self, content):
        """Collect a chunk of character data for the open element."""
        self._text_parts.append (content)

    def root (self):
        """Return the synthetic root node that holds the document element."""
        return self._root

    def topLevel (self):
        '''Returns top level object'''
        return list (self._root.attributes().values())[0]


    @staticmethod
    def _name_mangle (name):
        """Replace every character outside [_0-9a-zA-Z] with '_'."""
        return TreeBuilder.non_id_char.sub ('_', name)
0205
0206
# (pattern, replacement) pairs that escape the characters XML reserves.
# '&' must stay first so the ampersands introduced by the later
# replacements are not escaped a second time.
regexList = [ (re.compile (r'&'), '&amp;'  ),
              (re.compile (r'<'), '&lt;'   ),
              (re.compile (r'>'), '&gt;'   ),
              (re.compile (r'"'), '&quot;' ),
              (re.compile (r"'"), '&apos;' )
              ]

# Matches  name = "value" : group 1 is everything up to and including the
# opening double quote, group 2 is the (non-empty) value without quotes.
quoteRE = re.compile (r'(\w\s*=\s*")([^"]+)"')
0215
def fixQuoteValue (match):
    '''Return the matched  name="value"  text with every (pattern,
    replacement) pair from regexList applied, in order, to the value.'''
    escaped = match.group(2)
    for pattern, replacement in regexList:
        escaped = pattern.sub (replacement, escaped)
    # Re-assemble: original prefix (up to the opening quote), the
    # rewritten value, and the closing quote.
    return ''.join ([match.group(1), escaped, '"'])
0222
0223
def xml2obj (**kwargs):
    ''' Converts XML data into native Python object.  Takes a file
    handle, a string or a file name as input.  Illegal characters are
    NOT fixed unless 'filtering' is requested.

    input source:  Exactly one of the three following is needed
       filehandle     - input from file handle
       contents       - input from string
       filename       - input from filename

    options:
       filtering      - boolean value telling code whether or not to filter
                        the input: the text of every  name="value"  pair is
                        passed through fixQuoteValue before parsing
       nameChangeDict - dictionary of names to change in python object

    Returns the object built for the document's top-level element.

    Raises RuntimeError when zero or more than one input source is given,
    or when 'filename' cannot be opened.  Parse errors from xml.sax are
    propagated unchanged.

    A file opened here from 'filename' is always closed before returning.
    A caller-supplied 'filehandle' is closed only when 'filtering' is set.'''

    # Make sure exactly one input source was supplied.
    filehandle = kwargs.get ('filehandle')
    contents   = kwargs.get ('contents')
    filename   = kwargs.get ('filename')
    given = [src for src in (filehandle, contents, filename) if src]
    if not given:
        raise RuntimeError("You must provide 'filehandle', 'contents', or 'filename'")
    if len (given) > 1:
        raise RuntimeError("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    filtering = kwargs.get ('filtering')

    # Open the file ourselves when only a name was given, and remember
    # that we own the handle so it can be closed afterwards.
    ownsHandle = False
    if filename:
        try:
            filehandle = open (filename, 'r')
        except (IOError, OSError, TypeError, ValueError):
            raise RuntimeError("Failed to open '%s'" % filename)
        ownsHandle = True

    try:
        ncDict = kwargs.get ('nameChangeDict', {})
        builder = TreeBuilder (nameChangeDict = ncDict)
        if filehandle and not filtering:
            # Nothing to rewrite: let the parser read the stream directly.
            xml.sax.parse (filehandle, builder)
        else:
            # Filtering works on text, so a file is read in full first.
            text = contents if contents else ''.join (filehandle)
            if filtering:
                text = quoteRE.sub (fixQuoteValue, text)
            xml.sax.parseString (text, builder)
    finally:
        if filehandle and (ownsHandle or filtering):
            filehandle.close()
    return builder.topLevel()
0277 return builder.topLevel()