ReferenceManualScripts/python/TableParser.py

0001 from __future__ import print_function
0002 from __future__ import absolute_import
0003 # email: cmsdoxy@cern.ch, ali.mehmet.altundag@cern.ch
0004
0005 # please have a look at the namespaces.html (namespace list) and annotated.html
0006 # (~class list) html files to understand the tags/attributes that we use in
0007 # this script.
0008
0009 from bs4 import BeautifulSoup
0010 import sys, os, copy
0011
# Module-level state shared by the helper functions below. Everything that
# depends on the command line starts out empty and is filled in by the
# script entry point before any helper is called.
htmlFullPath = htmlFilePath = htmlFileName = None

# html file name template; expected to contain a single '%s' placeholder
# that receives the page name
fileNameTemplate = None

# parsed html document the table rows are read from
htmlPage = None

# value of the 'class' attribute of the table that holds the rows
tableClassName = 'directory'
0018
# load rows from the table in [C]lass and [N]amespace list pages and prepare
# pages in the following structure: pages = {'A' : [...], 'B' : [...]}
def extractPages(configFileFlag = False):
    """Group the rows of the class/namespace table by first letter.

    Reads the module-level ``htmlPage`` and looks up the table whose class
    is ``tableClassName``.

    configFileFlag -- when true, only rows whose first link points to a
                      '_cff', '_cfi' or '_cfg' url are kept.

    Returns a dict that maps an upper-case first letter to the list of row
    tags belonging to that page. The page 'A' always exists.
    """
    # initial page, A
    pages = {'A':[]}
    # page used for hidden rows that show up before any visible row; without
    # this default the name would be unbound in that case.
    firstLetter = 'A'
    # find all class/namespace table rows.
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        # please see the related html file (annotated.html) to understand the
        # approach here. you will see that, only hidden rows have style
        # attribute and these hidden rows must be added to pages of their
        # parents. This is why we need to check whether row has a style
        # attribute or not.
        # NOTE: "'style' in row" would search the children of the tag, not
        # its attributes, so the attribute has to be tested with has_attr.
        if not row.has_attr('style'):
            # visible (parent) row: it decides which page we are on
            cells = row.findAll('td')
            label = cells[0].text.strip() if cells else ''
            if not label:
                # no name to sort by (e.g. a header row without td cells)
                continue
            firstLetter = label[0].upper()
        # if pages dict doesn't have the page yet..
        if firstLetter not in pages:
            pages[firstLetter] = []
        # insert the row into the related page
        if configFileFlag:
            link = row.find('a')
            # rows without a link (or without href) can not be config files
            url = link.get('href', '') if link is not None else ''
            if '_cff' in url or '_cfi' in url or '_cfg' in url:
                pages[firstLetter].append(row)
        else:
            pages[firstLetter].append(row)
    return pages
0047
# load rows from the package documentation page. output structure:
# pages = {'PackageA' : [..], 'PackageB' : [...]}
def extractPagesForPackage():
    """Group the rows of the package documentation table by package name.

    Reads the module-level ``htmlPage`` and looks up the table whose class
    is ``tableClassName``. The page name of a row is the text of its first
    cell between the first blank and the first '/', stripped.

    Returns a dict that maps each such name to the list of its row tags.
    """
    pages = {}
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if not cells:
            # nothing to read a name from (e.g. a header row)
            continue
        # first cell contains name of the package...
        name = cells[0].text
        # parse package names --please have a look at the pages.html file.
        # str.find returns -1 when the separator is missing; used directly
        # as a slice bound that would drop the last character (missing '/')
        # or produce an empty name (missing blank), so fall back to the
        # ends of the string instead.
        start = name.find(' ')
        if start < 0:
            start = 0
        end = name.find('/')
        if end < 0:
            end = len(name)
        name = name[start:end].strip()
        # creates the page if the package is not added yet
        pages.setdefault(name, []).append(row)
    return pages
0063
# generate alphabetic tab for html pages that will be generated by this script
def generateTab(items, curr, tabClass = 'tabs3'):
    """Build the html snippet of the index tab.

    items    -- page names, in the order they should appear in the tab
    curr     -- name of the page the tab is generated for; its entry gets
                the 'current' class
    tabClass -- class attribute of the surrounding div

    Returns the tab as a string. Link targets are produced with the
    module-level ``fileNameTemplate``.
    """
    entries = []
    for item in items:
        # generate file name. Generated pages are written with blanks
        # replaced by '_', so the link has to be built the same way.
        fn = fileNameTemplate % item.replace(' ', '_')
        if item == curr:
            entries.append('<li class="current"><a href="%s">%s</a></li>'
                           % (fn, item))
        else:
            entries.append('<li><a href="%s">%s</a></li>' % (fn, item))
    return '<div class="%s"><ul class="tablist">%s</ul></div>' % (
        tabClass, ''.join(entries))
0072
# script entry point: split the table of the html file given as the first
# command line argument into one html file per page plus an 'All' page. The
# generated files are written next to the input file.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("not enough parameter!\n")
        sys.exit(1)

    # initialize variables
    htmlFullPath     = sys.argv[1]
    htmlFilePath, htmlFileName = os.path.split(htmlFullPath)
    fileNameTemplate = htmlFileName.replace('.html', '_%s.html')

    # load the html page
    with open(htmlFullPath) as f:
        htmlPage = BeautifulSoup(f.read())

    # please have a look at the pages.html page. You will see that class name
    # of the related tab, which we will use to put 'index tab' by using this
    # tab, is different for pages.html file. For namespaces.html (namespace
    # list) and annotated.html (~class list) files, class names are the same
    # tabs2. this is why we are setting 'the destination tab class name' up
    # differently depending on the html file name.
    if htmlFileName == 'packageDocumentation.html':
        pages = extractPagesForPackage()
        destTabClassName = 'tabs'
    elif htmlFileName == 'configfiles.html':
        pages = extractPages(configFileFlag = True)
        destTabClassName = 'tabs2'
    else:
        pages = extractPages()
        destTabClassName = 'tabs2'

    # sorted() returns a real list on both python 2 and 3; the result of
    # dict.keys() can not be sorted in place on python 3.
    pageNames = sorted(pages)
    # the 'All' page holds the rows of every page, in page order
    allRows = []
    for page in pageNames:
        allRows.extend(pages[page])
    pages['All'] = allRows
    pageNames.append('All')

    # prepare the template
    table     = htmlPage.find('table', {'class' : tableClassName})
    # generate template (clean whole table content)
    for row in table.findAll('tr'):
        row.extract()

    # generate pages
    for page in pageNames:
        print('generating %s...' % (fileNameTemplate % page))
        # every page starts from a fresh copy of the emptied template
        temp   = BeautifulSoup(str(htmlPage))
        table  = temp.find('table', {'class' : tableClassName})
        oldTab = temp.find('div', {'class' : destTabClassName})
        if oldTab is None:
            sys.stderr.write("tab '%s' not found in %s\n"
                             % (destTabClassName, htmlFullPath))
            sys.exit(1)
        newTab = generateTab(pageNames, page)
        # keep the existing tab and put the index tab right after it
        oldTab.replaceWith(BeautifulSoup(oldTab.prettify() + str(newTab)))
        for row in pages[page]:
            table.append(row)
        # replace blank character with '_'. Please notice that you will not
        # be able to use original page name after this line.
        page = page.replace(' ', '_')
        # os.path.join copes with an empty directory part (input given as a
        # bare file name); '%s/%s' would point to the file system root then.
        outPath = os.path.join(htmlFilePath, fileNameTemplate % page)
        with open(outPath, 'w') as f:
            f.write(str(temp))