1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
# email: cmsdoxy@cern.ch, ali.mehmet.altundag@cern.ch
# please have a look at the namespaces.html (namespace list) and annotated.html
# (~class list) html files to understand the tags/attributes that we use in
# this script.
from bs4 import BeautifulSoup
import sys, os, copy
# Module-level state shared by the helper functions below. The path, template
# and parsed-page values are placeholders here; the __main__ section of this
# script assigns the real values before any helper is called.
htmlFullPath = htmlFilePath = htmlFileName = None
fileNameTemplate = None  # html file name template
htmlPage = None
# value of the class attribute of the table that holds the rows to be split
tableClassName = 'directory'
# load rows from the table in [C]lass and [N]amespace list pages and prepare
# pages in the following structure: pages = {'A' : [...], 'B' : [...]}
def extractPages(configFileFlag = False):
    """Group the rows of the class/namespace list table by initial letter.

    Reads the module-level ``htmlPage`` (the parsed document) and
    ``tableClassName`` (class attribute of the table to read).

    Args:
        configFileFlag: when True, only rows whose first link contains
            ``_cff``, ``_cfi`` or ``_cfg`` are kept.

    Returns:
        dict mapping an upper-cased first character to the list of ``<tr>``
        tags that belong on that page. The key 'A' is always present.
    """
    # initial page, A
    pages = {'A': []}
    # letter of the most recent non-hidden row; None until one has been seen
    firstLetter = None
    # find all class/namespace table rows.
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        # please see the related html file (annotated.html) to understand the
        # approach here. you will see that only hidden rows have a style
        # attribute and these hidden rows must be added to the pages of their
        # parents. This is why we need to check whether the row has a style
        # attribute or not.
        # NOTE: "'style' in row" would test the tag's *children* in bs4, not
        # its attributes, so the attribute has to be queried explicitly.
        hidden = row.has_attr('style')
        # change the first letter if the row is not a hidden (child) one; a
        # hidden row that shows up before any parent is indexed by itself.
        if not hidden or firstLetter is None:
            cells = row.findAll('td')
            label = cells[0].text if cells else ''
            if not label:
                # no data cell / empty name: nothing to index this row by
                continue
            firstLetter = label[0].upper()
        # if pages dict doesn't have the page yet..
        bucket = pages.setdefault(firstLetter, [])
        # insert the row into the related page
        if configFileFlag:
            link = row.find('a')
            url = link.get('href', '') if link is not None else ''
            if '_cff' in url or '_cfi' in url or '_cfg' in url:
                bucket.append(row)
        else:
            bucket.append(row)
    return pages
# load rows from the package documentation page. output structure:
# pages = {'PackageA' : [..], 'PackageB' : [...]}
def extractPagesForPackage():
    """Group the rows of the package documentation table by package name.

    Reads the module-level ``htmlPage`` (the parsed document) and
    ``tableClassName`` (class attribute of the table to read).

    The page name is the part of the first cell's text between the first
    blank and the first '/', stripped of surrounding whitespace. If either
    delimiter is missing, the corresponding end of the text is used instead.

    Returns:
        dict mapping a name to the list of ``<tr>`` tags filed under it.
    """
    # no initial page here: pages are created as names are met
    pages = {}
    table = htmlPage.find('table', {'class' : tableClassName})
    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if not cells:
            # row without data cells (e.g. a header row): no name to read
            continue
        # first cell contains name of the package...
        label = cells[0].text
        # parse package names --please have a look at the pages.html file
        start = label.find(' ')
        end = label.find('/')
        # str.find returns -1 when the delimiter is absent; used as a slice
        # bound that would silently cut the name, so fall back to the ends.
        if start == -1:
            start = 0
        if end == -1:
            end = len(label)
        name = label[start:end].strip()
        # create the page if the package is not added yet, then file the row
        pages.setdefault(name, []).append(row)
    return pages
# generate alphabetic tab for html pages that will be generated by this script
def generateTab(items, curr, tabClass = 'tabs3'):
    """Build the HTML of the index tab that links the generated pages.

    Reads the module-level ``fileNameTemplate`` (a '%s' format string).

    Args:
        items: page names, in the order the tab entries should appear.
        curr: name of the page the tab is generated for; its entry gets
            class="current".
        tabClass: class attribute of the enclosing <div>.

    Returns:
        The tab as an HTML string.
    """
    entries = []
    for item in items:
        # generate file name. Blanks become '_' because that is how the
        # __main__ section names the files it writes; removing them instead
        # would produce links to files that do not exist.
        fn = fileNameTemplate % item.replace(' ', '_')
        if item != curr:
            entries.append('<li><a href="%s">%s</a></li>' % (fn, item))
        else:
            entries.append('<li class="current"><a href="%s">%s</a></li>'
                           % (fn, item))
    return '<div class="%s"><ul class="tablist">%s</ul></div>' % (
        tabClass, ''.join(entries))
if __name__ == "__main__":
    # usage: <script> path/to/page.html
    # Splits the table of the given page into one html file per page name
    # (plus an 'All' page), written next to the input file.
    if len(sys.argv) < 2:
        sys.stderr.write("not enough parameter!\n")
        sys.exit(1)

    # initialize variables (these rebind the module-level names that the
    # helper functions above read)
    htmlFullPath = sys.argv[1]
    htmlFilePath, htmlFileName = os.path.split(htmlFullPath)
    fileNameTemplate = htmlFileName.replace('.html', '_%s.html')

    # load the html page
    with open(htmlFullPath) as f:
        htmlPage = f.read()
    # NOTE(review): no parser is named here, so bs4 uses whichever one is
    # installed -- confirm that this is intended.
    htmlPage = BeautifulSoup(htmlPage)

    # please have a look at the pages.html page. You will see that class name
    # of the related tab, which we will use to put 'index tab' by using this
    # tab, is different for pages.html file. For namespaces.html (namespace
    # list) and annotated.html (~class list) files, class names are the same
    # tabs2. this is why we are setting 'the destination tab class name' up
    # differently depending on the html file name.
    if htmlFileName == 'packageDocumentation.html':
        pages = extractPagesForPackage()
        destTabClassName = 'tabs'
    elif htmlFileName == 'configfiles.html':
        pages = extractPages(configFileFlag = True)
        destTabClassName = 'tabs2'
    else:
        pages = extractPages()
        destTabClassName = 'tabs2'

    # sorted() returns a real list; dict.keys() has no .sort() on Python 3
    pageNames = sorted(pages)
    # the 'All' page holds every row, in page order
    allRows = []
    for page in pageNames:
        allRows.extend(pages[page])
    pages['All'] = allRows
    pageNames.append('All')

    # prepare the template
    table = htmlPage.find('table', {'class' : tableClassName})
    # generate template (clean whole table content)
    for row in table.findAll('tr'):
        row.extract()

    # generate pages
    for page in pageNames:
        print('generating %s...' % (fileNameTemplate % page))
        # work on a fresh copy of the emptied template for every page
        temp = BeautifulSoup(str(htmlPage))
        table = temp.find('table', {'class' : tableClassName})
        oldTab = temp.find('div', {'class' : destTabClassName})
        newTab = generateTab(pageNames, page)
        # keep the existing tab and put the index tab right after it
        oldTab.replaceWith(BeautifulSoup(oldTab.prettify() + str(newTab)))
        for row in pages[page]:
            table.append(row)
        # replace blank character with '_'. Please notice that you will not
        # be able to use original page name after this line.
        page = page.replace(' ', '_')
        # os.path.join keeps the output in the current directory when the
        # input path has no directory part ('%s/%s' would yield '/name.html')
        outPath = os.path.join(htmlFilePath, fileNameTemplate % page)
        with open(outPath, 'w') as f:
            f.write(str(temp))
|