Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 13:33:14

0001 ## Original version of code heavily based on recipe written by Wai Yip
0002 ## Tung, released under PSF license.
0003 ## http://code.activestate.com/recipes/534109/
0004 
0005 import re
0006 import os
0007 import xml.sax.handler
0008 
class DataNode (object):
    '''Generic container for one XML element.

    XML attributes and child elements are stored together in a single
    dictionary and can be read as Python attributes (node.name) or by
    key (node['name']); names that are not present yield None.  The
    element's text, if any, is kept separately and returned by str().
    A node also behaves like a one-element list of itself, so callers
    can index or take len() without knowing whether an element occurred
    once or several times.
    '''

    def __init__ (self, **kwargs):
        '''Keyword arguments:
        nameChangeDict - optional mapping of stored name -> replacement
                         name, applied whenever an entry is added.
        '''
        self._attrs = {}     # XML attributes and child elements
        self._data  = None   # child text data
        self._ncDict = kwargs.get ('nameChangeDict', {})

    def __len__ (self):
        # treat single element as a list of 1
        return 1

    def __getitem__ (self, key):
        '''String keys look up an attribute/child (None if missing);
        any other key indexes the one-element list [self].'''
        if isinstance (key, str):
            return self._attrs.get(key,None)
        else:
            return [self][key]

    def __contains__ (self, name):
        return name in self._attrs

    def __nonzero__ (self):
        '''A node is true only if it holds attributes/children or text.'''
        return bool (self._attrs or self._data)

    # Python 3 looks for __bool__ rather than __nonzero__.  Without this
    # alias truthiness falls back to __len__, which always returns 1, so
    # even an empty node would be considered true.
    __bool__ = __nonzero__

    def __getattr__ (self, name):
        '''Only called when normal lookup fails: expose stored XML
        attributes/children as Python attributes, None if absent.'''
        if name.startswith('__'):
            # Special-method probes (copy, pickle, ...) must see a real
            # AttributeError instead of a None "attribute".
            raise AttributeError (name)
        return self._attrs.get (name, None)

    def _add_xml_attr (self, name, value):
        '''Store value under name (after applying the name change
        dictionary).  A repeated name turns the entry into a list.'''
        change = self._ncDict.get (name)
        if change:
            name = change
        if name in self._attrs:
            # multiple attribute of the same name are represented by a list
            children = self._attrs[name]
            if not isinstance(children, list):
                children = [children]
                self._attrs[name] = children
            children.append(value)
        else:
            self._attrs[name] = value

    def __str__ (self):
        return self._data or ''

    def __repr__ (self):
        '''Render as {name:value, ...} sorted by name; text data, if
        any, is appended last under the label "data".'''
        items = sorted (self._attrs.items())
        if self._data:
            items.append(('data', self._data))
        return u'{%s}' % ', '.join([u'%s:%s' % (k,repr(v)) for k,v in items])

    def attributes (self):
        '''Return the underlying dictionary of attributes and children.'''
        return self._attrs
0063 
0064 
class TreeBuilder (xml.sax.handler.ContentHandler):
    '''SAX content handler that assembles a tree of DataNode objects.

    Every element becomes a DataNode attached to its parent under the
    (mangled) element name; an element that carries no attributes and
    no child elements is stored as its plain text string instead.
    '''

    # anything that cannot appear in a Python identifier
    non_id_char = re.compile('[^_0-9a-zA-Z]')

    def __init__ (self, **kwargs):
        '''Keyword arguments:
        nameChangeDict - optional mapping of names to replacement names,
                         handed to every DataNode that is created.
        '''
        # Let the base class set up its own state (the document locator).
        xml.sax.handler.ContentHandler.__init__ (self)
        self._stack = []        # (node, text parts) of the open ancestors
        self._text_parts = []   # text chunks seen for the current element
        self._ncDict = kwargs.get ('nameChangeDict', {})
        self._root = DataNode (nameChangeDict = self._ncDict)
        self.current = self._root

    def startElement (self, name, attrs):
        # Park the parent and its pending text until this element closes.
        self._stack.append( (self.current, self._text_parts))
        self.current = DataNode (nameChangeDict = self._ncDict)
        self._text_parts = []
        # xml attributes --> python attributes
        for k, v in attrs.items():
            self.current._add_xml_attr (TreeBuilder._name_mangle(k), v)

    def endElement (self, name):
        text = ''.join (self._text_parts).strip()
        if text:
            self.current._data = text
        if self.current.attributes():
            obj = self.current
        else:
            # a text only node is simply represented by the string
            obj = text or ''
        # Back to the parent, then hang the finished element on it.
        self.current, self._text_parts = self._stack.pop()
        self.current._add_xml_attr (TreeBuilder._name_mangle(name), obj)

    def characters (self, content):
        # The parser may deliver text in pieces; joined in endElement.
        self._text_parts.append(content)

    def root (self):
        '''Returns the synthetic root node that holds the document element'''
        return self._root

    def topLevel (self):
        '''Returns top level object'''
        # dict.values() is a non-indexable view on Python 3, so build a
        # list first.  Raises IndexError if nothing has been parsed yet.
        return list (self._root.attributes().values())[0]

    @staticmethod
    def _name_mangle (name):
        '''Replace every character that is not a letter, digit or
        underscore with "_" so the name works as a Python attribute.'''
        return TreeBuilder.non_id_char.sub('_', name)
0110 
0111 
# Ordered (pattern, replacement) pairs used to escape characters that are
# not allowed verbatim inside an XML attribute value.  The '&' rule must
# stay first so that the ampersands introduced by the later replacements
# are not escaped a second time.  Note that '&' is escaped unconditionally,
# so an entity already present in the input (e.g. '&lt;') is escaped again.
regexList = [ (re.compile (r'&'), '&amp;'  ),
              (re.compile (r'<'), '&lt;'   ),
              (re.compile (r'>'), '&gt;'   ),
              (re.compile (r'"'), '&quot;' ),
              (re.compile (r"'"), '&#39;'  )
              ]

# Matches  name="value" : group 1 is the last character of the name up to
# and including the opening quote, group 2 is the (non-empty) value.
quoteRE = re.compile (r'(\w\s*=\s*")([^"]+)"')

def fixQuoteValue (match):
    '''Substitution callback for quoteRE: returns the matched
    name="value" text with the special characters of the value escaped
    according to regexList.  The name part and the surrounding double
    quotes are left untouched.'''
    quote = match.group(2)
    for pattern, replacement in regexList:
        quote = pattern.sub (replacement, quote)
    return match.group(1) + quote + '"'
0127 
0128 
def xml2obj (**kwargs):
    ''' Converts XML data into native Python object.  Takes a file
    handle, a string or a file name as input.  Illegal characters are
    only fixed when 'filtering' is requested.

    input source:  Exactly one of the three following is needed
    filehandle     - input from file handle
    contents       - input from string
    filename       - input from filename

    options:
    filtering      - boolean value telling code whether or not to filter
                     input selection to remove illegal XML characters
                     (special characters inside double-quoted attribute
                     values are escaped before parsing)
    nameChangeDict - dictionaries of names to change in python object

    Returns the object built for the document's top level element.
    Raises RuntimeError if not exactly one input source is given or if
    'filename' cannot be opened.  Errors from the XML parser itself are
    passed through unchanged.

    A file opened from 'filename' is always closed before returning.  A
    caller-supplied 'filehandle' is closed by this function only when
    filtering is on, because its contents are then read completely.'''

    # make sure we have exactly 1 input source
    filehandle = kwargs.get ('filehandle')
    contents   = kwargs.get ('contents')
    filename   = kwargs.get ('filename')
    numSources = len ([src for src in (filehandle, contents, filename) if src])
    if numSources == 0:
        raise RuntimeError("You must provide 'filehandle', 'contents', or 'filename'")
    if numSources > 1:
        raise RuntimeError("You must provide only ONE of 'filehandle', 'contents', or 'filename'")

    # A named file is opened here, so we are responsible for closing it.
    closeHandle = False
    if filename:
        try:
            filehandle = open (filename, 'r')
        except (IOError, OSError, TypeError, ValueError):
            raise RuntimeError("Failed to open '%s'" % filename)
        closeHandle = True

    filtering = kwargs.get ('filtering')
    try:
        if filtering:
            # if we are filtering, we need to read in the contents to
            # modify them
            if filehandle:
                contents = filehandle.read()
                # the handle is used up, so it is closed even when the
                # caller supplied it
                closeHandle = True
            contents = quoteRE.sub (fixQuoteValue, contents)

        ncDict = kwargs.get ('nameChangeDict', {})
        builder = TreeBuilder (nameChangeDict = ncDict)
        if filehandle and not filtering:
            xml.sax.parse (filehandle, builder)
        else:
            # Filtered input is always parsed from the string, even if it
            # turned out to be empty, so the parser reports the problem.
            xml.sax.parseString (contents, builder)
    finally:
        if closeHandle:
            filehandle.close()
    return builder.topLevel()
0171     ncDict = kwargs.get ('nameChangeDict', {})
0172     builder = TreeBuilder (nameChangeDict = ncDict)
0173     if contents:
0174         xml.sax.parseString(contents, builder)
0175     else:
0176         if not filehandle:
0177             try:
0178                 filehandle = open (filename, 'r')
0179             except:
0180                 raise RuntimeError("Failed to open '%s'" % filename)
0181         xml.sax.parse(filehandle, builder)
0182     return builder.topLevel()