Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-26 02:34:35

0001 #! /usr/bin/env python
0002 '''
0003 Help functions for ValidationMatrix_v2.py.
0004 
0005 Author:  Albertas Gimbutas,  Vilnius University (LT)
0006 e-mail:  albertasgim@gmail.com
0007 '''
0008 from builtins import range
0009 import sys
0010 import re
0011 import time
0012 import sqlite3
0013 from datetime import datetime
0014 from multiprocessing import Pool, Queue, Process
0015 import subprocess
0016 from optparse import OptionParser, OptionGroup
0017 from os import makedirs, listdir
0018 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
0019 from Queue import Empty
0020 from urllib2  import build_opener, Request, HTTPError
0021 from urlparse import urlparse
0022 from httplib import BadStatusLine
0023 
0024 try:
0025     from Utilities.RelMon.authentication import X509CertOpen
0026 except ImportError:
0027     from .authentication import X509CertOpen
0028 
0029 ##-----------------   Make files pairs:  RelValData utils   --------------------
def get_relvaldata_id(file):
    """Return the ``(run ID, run name)`` pair identifying a RelValData file.

    The run ID is the first ``R`` followed by nine digits found in *file*.
    The run name is captured from a ``_RelVal_<name>-v<d>__`` fragment or,
    failing that, from a ``GR_R_<d>_V<d>[C]_<name>-v<d>__`` fragment.
    Returns ``None`` when either part cannot be found.
    """
    run_name_patterns = (
        '_RelVal_([\\w\\d]*)-v\\d__',
        'GR_R_\\d*_V\\d*C?_([\\w\\d]*)-v\\d__',
    )
    id_match = re.search('R\\d{9}', file)

    # Try the patterns in order and keep the first one that matches.
    name_match = None
    for pattern in run_name_patterns:
        name_match = re.search(pattern, file)
        if name_match:
            break

    if id_match is None or name_match is None:
        return None
    return (id_match.group(), name_match.group(1))
0039 
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` for a RelValData file name.

    The release is the first ``CMSSW_x_y_z[_suffix]`` token followed by ``-``.
    The second element is the first ``GR_R_..._V...`` tag following a ``-``;
    if there is none, whatever sits between the release and ``_RelVal_`` is
    used instead.  Returns ``None`` when either part is missing.
    """
    releases = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
    # The fallback pattern is only evaluated when the GR_R pattern finds nothing.
    tags = (re.findall('-(GR_R_\\d*_V\\d*\\w?)(?:_RelVal)?_', file)
            or re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-(\\w*)_RelVal_', file))
    if not (releases and tags):
        return None
    return (releases[0], tags[0])
0048 
def get_relvaldata_version(file):
    """Return ``(DQM file version, run version)`` as integers for *file*.

    The first number comes from the ``DQM_V<digits>_`` prefix, the second from
    the single digit after ``-v`` in a ``_RelVal_<name>-v<d>__`` fragment (or a
    ``GR_R_..._<name>-v<d>__`` fragment when the former is absent).
    Returns ``None`` if either number cannot be extracted.
    """
    dqm_versions = re.findall('DQM_V(\\d*)_', file)
    run_versions = (re.findall('_RelVal_[\\w\\d]*-v(\\d)__', file)
                    or re.findall('GR_R_\\d*_V\\d*C?_[\\w\\d]*-v(\\d)__', file))
    if not (dqm_versions and run_versions):
        return None
    return (int(dqm_versions[0]), int(run_versions[0]))
0057 
def get_relvaldata_max_version(files):
    """Return the file from *files* carrying the highest version.

    Versions are read with ``get_relvaldata_version``.  The run version (the
    ``-vM`` part, e.g. ``_run2012-vM``) is compared first; the DQM version at
    the beginning of the name (``DQM_V000M``) breaks ties.  On a full tie the
    earliest file in *files* wins.
    """
    best_file = files[0]
    best_dqm, best_run = get_relvaldata_version(best_file)
    for candidate in files:
        dqm_v, run_v = get_relvaldata_version(candidate)
        # Tuple comparison: run version first, then DQM version.
        if (run_v, dqm_v) > (best_run, best_dqm):
            best_file = candidate
            best_dqm, best_run = dqm_v, run_v
    return best_file
0069 
0070 ## -------------------   Make files pairs:  RelVal utils   ---------------------
def get_relval_version(file):
    """Return ``(DQM file version, run version)`` as integers for a RelVal file.

    The first number comes from the ``DQM_V<digits>_`` prefix, the second from
    the digits after ``-v`` that follow the ``CMSSW_...-<tag>_V<n>`` part of
    the name.  Returns ``None`` if either number cannot be extracted.
    """
    dqm_versions = re.findall('DQM_V(\\d*)_', file)
    run_versions = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-[\\w\\d]*_V\\d*\\w?(?:_[\\w\\d]*)?-v(\\d*)__', file)
    if not (dqm_versions and run_versions):
        return None
    return (int(dqm_versions[0]), int(run_versions[0]))
0077 
def get_relval_max_version(files):
    """Return the file from *files* carrying the highest version.

    Versions are read with ``get_relval_version``.  The run version (the
    ``-vM`` part) is compared first; the DQM version at the beginning of the
    name (``DQM_V000M``) breaks ties.  On a full tie the earliest file in
    *files* wins.
    """
    best_file = files[0]
    best_dqm, best_run = get_relval_version(best_file)
    for candidate in files:
        dqm_v, run_v = get_relval_version(candidate)
        # Tuple comparison: run version first, then DQM version.
        if (run_v, dqm_v) > (best_run, best_dqm):
            best_file = candidate
            best_dqm, best_run = dqm_v, run_v
    return best_file
0089 
def get_relval_cmssw_version(file):
    """Return ``(CMSSW release, (tag, suffix))`` for a RelVal file name.

    The release is the first ``CMSSW_x_y_z[_suffix]`` token followed by ``-``.
    The second element is the tuple of the two regex groups of the first
    match: the text preceding ``_V<n>`` and the optional ``_<suffix>`` that
    precedes ``-v`` (an empty string when absent).  Returns ``None`` when
    either part is missing.
    """
    releases = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
    tags = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-([\\w\\d]*)_V\\d*\\w?(_[\\w\\d]*)?-v', file)
    if not (releases and tags):
        return None
    return (releases[0], tags[0])
0095 
def get_relval_id(file):
    """Return the dataset name that uniquely identifies a RelVal file.

    The name is the text between ``R<9 digits>__`` and ``__CMSSW_``.
    Raises ``IndexError`` when *file* does not contain such a fragment.
    """
    dataset_re = re.compile('R\\d{9}__([\\w\\d]*)__CMSSW_')
    matches = dataset_re.findall(file)
    return matches[0]
0100 
0101 ## -----------------------  Make file pairs --------------------------
def is_relvaldata(files):
    """Return True if at least one name in *files* contains ``_RelVal_``."""
    marker = re.compile('_RelVal_')
    hits = [marker.search(name) for name in files]
    return any(hits)
0105 
def make_file_pairs(files1, files2):
    """Pair files of two releases that describe the same dataset (and run).

    The files of each release are grouped by the version returned from the
    ``get_*_cmssw_version`` helper (RelValData helpers are used when
    ``is_relvaldata(files1)`` is true, RelVal helpers otherwise).  Groups are
    then tried against each other, largest first, skipping identical versions.
    Within a pair of groups, files sharing the same ID are matched and the
    highest-versioned file on each side is selected.

    Returns the list of ``(file1, file2)`` pairs of the first group
    combination that yields at least one pair.  Terminates the program via
    ``sys.exit()`` when a release has no recognisable files or no pair can be
    formed.

    Changes with respect to the previous implementation:
      * files for which the ID helper returns ``None`` are skipped instead of
        crashing on ``unique_id[0]``;
      * ``sys.exit()`` is used instead of the ``exit()`` helper, which is only
        injected by the ``site`` module and is not guaranteed to exist.
    """
    print('\n#################       Analyzing files       ###################')
    ## Select functions to use
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Divide files into groups
    versions1, versions2 = dict(), dict() # {version1: [file1, file2, ...], version2: [...], ...}
    for files, versions in ((files1, versions1), (files2, versions2)):
        for name in files:
            version = get_cmssw_version(name)
            if version:
                versions.setdefault(version, []).append(name)

    ## Print the division into groups
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print('   %s: %d files' % (str(version),  len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print('   %s: %d files' % (str(version),  len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        sys.exit()

    ## Pair till you find pairs.
    pairs = []
    for v1 in sorted(versions1, key=lambda x: len(versions1[x]), reverse=True):
        for v2 in sorted(versions2, key=lambda x: len(versions2[x]), reverse=True):
            if v1 == v2:
                continue
            ## Print the groups.
            print('\n#################     Pairing the files     ###################')
            print('%s (%d files)   VS   %s (%d files):\n' % (str(v1),
                    len(versions1[v1]), str(v2), len(versions2[v2])))

            ## Pairing two versions
            for unique_id in set([get_id(name) for name in versions1[v1]]):
                if unique_id is None:
                    # The ID helper could not parse this file name: nothing to pair on.
                    continue
                if is_relval_data:
                    # RelValData IDs are (run ID, run name): both must occur in the name.
                    dataset_re = re.compile(unique_id[0] + '_')
                    run_re = re.compile(unique_id[1])
                    c1_files = [name for name in versions1[v1] if dataset_re.search(name) and run_re.search(name)]
                    c2_files = [name for name in versions2[v2] if dataset_re.search(name) and run_re.search(name)]
                else:
                    dataset_re = re.compile(unique_id + '_')
                    c1_files = [name for name in versions1[v1] if dataset_re.search(name)]
                    c2_files = [name for name in versions2[v2] if dataset_re.search(name)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
0182 
0183 ## --------------------   Recursive file downloader -----------------------
def auth_wget(url):
    """Fetch *url* through an ``X509CertOpen`` opener and return the response body.

    On ``HTTPError`` or ``BadStatusLine`` an explanatory message is printed and
    the program is terminated with ``exit()``.
    """
    try:
        cert_opener = build_opener(X509CertOpen())
        response = cert_opener.open(Request(url))
        return response.read()
    except HTTPError as e:
        details = 'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,)
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. ' + details)
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates ' +
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
0196 
0197 
def auth_download_file(url, chunk_size=1048576):
    """Download *url* into ``auth_download_file.work_dir`` in chunks.

    The function relies on two attributes set on itself by the caller before
    use: ``work_dir`` (target directory) and ``q`` (a queue).  After every
    chunk written, ``(1,)`` is put on the queue so that a progress reporter
    can count downloaded chunks (1 MB each with the default *chunk_size*).

    The remote connection is opened before the local file is created, and both
    are closed even if the transfer fails part-way, so no file handles or
    sockets are leaked.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        with open(file_path, 'wb') as out_file:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                auth_download_file.q.put((1,))   # reports that one chunk was downloaded
                chunk = url_file.read(chunk_size)
    finally:
        url_file.close()
    print('\rDownloaded: %s  ' % (filename,))
0212 
0213 
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Search a DQM GUI directory listing recursively for matching files.

    When *url* is empty, both the RelValData and the RelVal roots on
    ``cmsweb.cern.ch`` are searched and their results merged.

    A file name matches a release when it matches every comma-separated
    fragment of ``fragsN`` (a fragment starting with ``!`` must NOT occur),
    the regex ``relN + '-'`` and the regex ``.root``.

    Returns two dicts ``{file name: url}``, one per release.
    """
    if not url:
        relvaldata_root = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        relval_root = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        found1, found2 = recursive_search_online(relvaldata_root, rel1, frags1, rel2, frags2)
        extra1, extra2 = recursive_search_online(relval_root, rel1, frags1, rel2, frags2)
        found1.update(extra1)
        found2.update(extra2)
        return found1, found2

    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def build_filters(rel, frags):
        parts = frags.split(',')
        wanted = [part for part in parts if not part.startswith('!')]
        # The leading '!' of the fragment turns '(?%s)' into a negative lookahead.
        unwanted = ['^((?%s).)*$' % part for part in parts if part.startswith('!')]
        return [re.compile(expr) for expr in wanted + unwanted + [rel + '-', '.root']]

    filters1 = build_filters(rel1, frags1)
    filters2 = build_filters(rel2, frags2)

    ## Walk the listing; sub-directories found on the way are appended to the work list
    pending = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = dict(), dict()
    position = 0
    while position < len(pending):
        name, path = pending[position]
        position += 1
        if splitext(name)[1]: # If file
            if all(expr.search(name) for expr in filters1):
                files_with_urls1[name] = domain + path
            if all(expr.search(name) for expr in filters2):
                files_with_urls2[name] = domain + path
        else:
            print(domain + path)
            listing = href_re.findall(auth_wget(domain + path))[1:]
            pending.extend([(sub_name, sub_path) for sub_path, sub_name in listing])
    return files_with_urls1, files_with_urls2
0253 
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Return the files in *work_path* that match each of the two releases.

    A file name matches a release when it matches every comma-separated
    fragment of ``fragsN`` (a fragment starting with ``!`` must NOT occur),
    the regex ``relN + '-'`` and the regex ``.root``.  Entries without a file
    extension are ignored.  The directory is not searched recursively.

    Returns ``(files1, files2)``, two lists of file names.  Terminates the
    program via ``exit()`` when *work_path* is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +
              'specify working directory. Exiting...')
        exit()

    ## Compile regular expressions
    def build_filters(rel, frags):
        parts = frags.split(',')
        wanted = [part for part in parts if not part.startswith('!')]
        # The leading '!' of the fragment turns '(?%s)' into a negative lookahead.
        unwanted = ['^((?%s).)*$' % part for part in parts if part.startswith('!')]
        return [re.compile(expr) for expr in wanted + unwanted + [rel + '-', '.root']]

    filters1 = build_filters(rel1, frags1)
    filters2 = build_filters(rel2, frags2)

    ## Find files that match the regular expressions
    matched1, matched2 = [], []
    for entry in listdir(work_path):
        if not splitext(entry)[1]:
            continue
        if all(expr.search(entry) for expr in filters1):
            matched1.append(entry)
        if all(expr.search(entry) for expr in filters2):
            matched2.append(entry)
    return matched1, matched2
0280 
0281 
0282 ## Exception definitions
# Maps every known comparison error message to its (negative) error code.
comparison_errors = dict([
    ('Missing histogram', -1),
    ('Histograms have different types', -2),
    ('Object is not a histogram', -3),
    ('Ranges of histograms are different', -4),
])


class ComparisonError(Exception):
    """Raised when two histograms cannot be compared.

    *error_message* must be one of the keys of ``comparison_errors``; the
    matching code is stored in ``error_code``.  An unknown message raises
    ``KeyError`` at construction time.
    """

    def __init__(self, error_message, *args, **kwargs):
        code = comparison_errors[error_message]
        self.error_message = error_message
        self.error_code = code

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
0297 
0298 
0299 ## StatisticalTests
class StatisticalTest(object):
    """Base class of the statistical tests used to compare two histograms.

    Subclasses set ``name`` and extend ``do_test``; the base ``do_test`` only
    performs the sanity checks shared by all tests.
    """
    name = None

    def get_N_bins(self, h):
        """Return the bin count used for scanning *h*.

        Computed as ``(x + 1) * (y + 1) * (z + 1)`` from the histogram's
        ``GetNbinsX/Y/Z`` values, or 0 when the Y or Z count is zero.
        """
        x = h.GetNbinsX()
        y = h.GetNbinsY()
        z = h.GetNbinsZ()
        if not (y and z): # Is this really necessary?
            return 0
        return (x + 1) * (y + 1) * (z + 1)

    def is_empty(self, h):
        """Return True when every scanned bin of *h* has zero content.

        Bins ``1 .. get_N_bins(h) - 1`` are inspected.  Previously the
        ``return True`` sat inside the loop, so only the first bin was ever
        checked and any histogram with an empty first bin was reported as
        empty.  A histogram with no bins to scan is considered empty.
        """
        for i in range(1, self.get_N_bins(h)):
            if h.GetBinContent(i) != 0:
                return False
        return True

    def do_test(self, h1, h2):
        """Run the checks common to all tests.

        Returns a result to be used as the final p-value when the comparison
        can be decided without running a statistical test:
          * ``-104`` if the histograms have different types,
          * ``-105`` if ``h1`` does not inherit from ``TH1``,
          * ``1``    if either histogram is empty,
          * ``-103`` if the bin counts differ.
        Returns ``None`` when the actual test should be performed.

        Raises ``ComparisonError`` when either histogram is missing.
        """
        if not h1 or not h2:
            raise ComparisonError('Missing histogram')
        if not isinstance(h1, type(h2)):
            return -104     # raise ComparisonError('Histograms have different types')
        if not h1.InheritsFrom('TH1'):
            return -105     # raise ComparisonError('Object is not a histogram')
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        h1_bins = self.get_N_bins(h1)
        if h1_bins != self.get_N_bins(h2):
            return -103     # raise ComparisonError('Ranges of histograms are different')
0329 
0330 
class KolmogorovTest(StatisticalTest):
    """Compares two histograms with their ``KolmogorovTest`` method."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Return the early result of the base checks or the KS test result."""
        early_result = super(KolmogorovTest, self).do_test(h1, h2)
        if early_result is None:
            # Call Sumw2() on histograms whose sum-of-weights array is still empty.
            for hist in (h1, h2):
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            return h1.KolmogorovTest(h2)
        return early_result
0343 
0344 
class Chi2Test(StatisticalTest):
    """Compares two histograms with their ``Chi2Test`` method."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Flip negative bin contents of *h* to positive, in place.

        Bins whose error is zero although their original content was not are
        reset to zero.  Bins ``1 .. bin_count - 1`` are processed.
        """
        for index in range(1, bin_count): # Why here is no +1?
            original = h.GetBinContent(index)
            if original < 0:
                h.SetBinContent(index, -1 * original)
            # The error is read after the sign flip above, as in the original order.
            if h.GetBinError(index) == 0 and original != 0:
                h.SetBinContent(index, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Return True once more than *more_than* bins with positive content are seen."""
        seen = 0
        for index in range(1, bin_count):
            if h.GetBinContent(index) > 0:
                seen += 1
            if seen > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the early result of the base checks or the Chi2 test result.

        Returns 1 when either histogram lacks enough filled bins.  The ``'WW'``
        option is used for profiles and for histograms whose entry count
        differs from the sum of weights, ``'UU'`` otherwise.
        """
        early_result = super(Chi2Test, self).do_test(h1, h2)
        if early_result is not None:
            return early_result

        n_bins = self.get_N_bins(h1)

        # Make histograms absolute.
        for hist in (h1, h2):
            self.make_absolute(hist, n_bins)

        # Check if there are enough filled bins in both histograms.
        if not (self.enough_filled_bins(h1, n_bins) and
                self.enough_filled_bins(h2, n_bins)):
            return 1

        weighted = h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights())
        if weighted:
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')
0384 
0385 
# Registry of the available statistical tests, keyed by each test's short name.
tests = dict((test_class.name, test_class) for test_class in (KolmogorovTest, Chi2Test))
0387 
0388 ## Utils
def init_database(db_path):
    """Create the comparison tables in the SQLite database at *db_path*.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created if they do not exist yet, so the
    function can be called repeatedly on the same file.

    The connection is committed and closed before returning (it used to be
    left open), and the FOREIGN KEY constraints of ``Directory`` are now
    separated by commas as standard SQL requires.

    Returns *db_path* unchanged.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        # Release the database file handle even if a statement fails.
        conn.close()

    print('Done.')
    return db_path
0431 
0432 
def _strip_relval_token(elem):
    """Return *elem* without surrounding underscores and ``RelVal`` tokens."""
    elem = elem.strip('_')
    if elem == 'RelVal':
        return ''
    if elem.startswith('RelVal_'):
        elem = elem[len('RelVal_'):]
    if elem.endswith('_RelVal'):
        elem = elem[:-len('_RelVal')]
    return elem.strip('_')


def get_version(filename):
    """Returns CMSSW and GR_R versions for the given filename.

    The version elements found by the ``get_*_cmssw_version`` helper are
    cleaned of surrounding underscores and ``RelVal`` tokens; the non-empty
    ones are joined with ``___``.

    The previous implementation used ``str.strip('RelVal_')``, which removes
    any of the *characters* ``R, e, l, V, a, _`` from both ends rather than
    the token itself, and therefore mangled elements such as ``'data'``
    (turned into ``'dat'``).
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        relval_version = get_relval_cmssw_version(filename)
        # The second element is the (tag, suffix) pair of regex groups: flatten it.
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])
    cleaned = [_strip_relval_token(elem) for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
0442 
0443 
def get_size_to_download(work_path, files_with_urls):
    """Returns total size to download and the list of URLs to download.

    *files_with_urls* must yield ``(filename, url)`` pairs.  For each pair the
    remote ``Content-Length`` is read; the file is skipped when a file of the
    same name already exists in *work_path* with the same size in whole
    kilobytes.

    Floor division is used for the kilobyte comparison so that the check
    behaves the same on Python 2 and Python 3 (with ``/`` Python 3 required
    the sizes to match to the byte).  Each remote response is closed once its
    header has been read.

    Returns ``(size_to_download, files_to_download)``.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the header is needed; do not keep the connection open.
            url_file.close()
        file_path = join(work_path, filename)
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
0459 
def check_disk_for_space(work_path, size_needed):
    '''Checks afs file system for space.

    Currently a no-op: the quota check below is disabled and the function
    always returns None.
    '''
    return None
    # Disabled implementation, kept for reference:
    # try:
    #     fs_proc = subprocess.Popen(['fs', 'listquota', work_path], stdout=subprocess.PIPE)
    # except OSError:
    #     return
    # fs_response = fs_proc.communicate()[0]
    # quota, used = re.findall('([\d]+)', fs_response)[:2]
    # free_space = int(quota) - int(used)
    # if free_space * 1024 < size_needed:
    #     print '\nNot enougth free space on disk.',
    #     print 'Free space: %d MB. Need: %d MB. Exiting...\n' % (free_space / 1024, size_needed /1048576)
    #     exit()
    # elif size_needed:
    #     print 'Free space on disk: %d MB.\n' % (free_space / 1024,)
0476 
0477 
def show_status_bar(total_size):
    """Print download progress until *total_size* bytes are accounted for.

    Progress messages are read from ``show_status_bar.q``, a queue attribute
    the caller must set beforehand; each message counts as one downloaded MB.
    Reporting stops when the expected number of MB has been reached or when
    no message arrives within 20 seconds.
    """
    messages = show_status_bar.q
    total_size = total_size / (1024*1024)
    downloaded = 0
    while downloaded < total_size:
        try:
            messages.get(timeout=20)
        except Empty:
            # Nothing reported for a while: stop showing the status.
            time.sleep(1)
            break
        downloaded += 1
        print('\r      %d/%d MB     %d%%     ' % (downloaded, total_size, 100*downloaded/total_size), end=' ')
        sys.stdout.flush()