RelMon/python/utils_v2.py

0001 #! /usr/bin/env python
0002 '''
0003 Help functions for ValidationMatrix_v2.py.
0004
0005 Author:  Albertas Gimbutas,  Vilnius University (LT)
0006 e-mail:  albertasgim@gmail.com
0007 '''
0008 from __future__ import print_function
0009 from __future__ import absolute_import
0010 from builtins import range
0011 import sys
0012 import re
0013 import time
0014 import sqlite3
0015 from datetime import datetime
0016 from multiprocessing import Pool, Queue, Process
0017 import subprocess
0018 from optparse import OptionParser, OptionGroup
0019 from os import makedirs, listdir
0020 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
0021 from Queue import Empty
0022 from urllib2  import build_opener, Request, HTTPError
0023 from urlparse import urlparse
0024 from httplib import BadStatusLine
0025
0026 try:
0027     from Utilities.RelMon.authentication import X509CertOpen
0028 except ImportError:
0029     from .authentication import X509CertOpen
0030
0031 ##-----------------   Make files pairs:  RelValData utils   --------------------
def get_relvaldata_id(file):
    """Returns unique relvaldata ID for a given file.

    The ID is the tuple ``(run id, run name)``: the run id is the first
    ``R`` followed by nine digits found in the name, the run name is the
    token between ``_RelVal_`` (or, as a fallback, a ``GR_R_*_V*_`` tag)
    and the ``-v<digit>__`` suffix.

    Returns None when either part cannot be found in the file name.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown string escapes through unchanged.
    run_id = re.search(r'R\d{9}', file)
    run = re.search(r'_RelVal_([\w\d]*)-v\d__', file)
    if not run:
        run = re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
    if run_id and run:
        return (run_id.group(), run.group(1))
    return None
0041
def get_relvaldata_cmssw_version(file):
    """Returns tuple (CMSSW release, GR_R version) for specified RelValData file.

    The GR_R version is looked up first as an explicit ``GR_R_*_V*`` tag;
    if that is absent, whatever sits between the CMSSW release and
    ``_RelVal_`` is used instead.

    Returns None when either element cannot be extracted.
    """
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
    if not gr_r_version:
        gr_r_version = re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
0050
def get_relvaldata_version(file):
    """Returns tuple (DQM file version, run version) for specified file.

    The first element is the number following ``DQM_V``; the second is the
    single digit of the ``-v<digit>__`` suffix that follows the run name.
    Both are returned as ints.

    Returns None when either number cannot be found in the file name.
    """
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
    if not run_version:
        run_version = re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
0059
def get_relvaldata_max_version(files):
    """Picks the RelValData file carrying the highest version.

    Versions come from ``get_relvaldata_version``. The run version (second
    element) is compared first and the DQM file version (first element)
    breaks ties. When several files share the highest version, the one that
    appears first in ``files`` wins.
    """
    best_file = files[0]
    best_version = get_relvaldata_version(best_file)
    for candidate in files:
        version = get_relvaldata_version(candidate)
        # Lexicographic comparison on (run version, DQM version).
        if (version[1], version[0]) > (best_version[1], best_version[0]):
            best_file, best_version = candidate, version
    return best_file
0071
0072 ## -------------------   Make files pairs:  RelVal utils   ---------------------
def get_relval_version(file):
    """Returns tuple (DQM file version, run version) for specified file.

    The first element is the number following ``DQM_V``; the second is the
    number in the ``-v<digits>__`` suffix that follows the CMSSW release
    and its conditions tag. Both are returned as ints.

    Returns None when either number cannot be found in the file name.
    """
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
0079
def get_relval_max_version(files):
    """Picks the RelVal file carrying the highest version.

    Versions come from ``get_relval_version``. The run version (second
    element) is compared first and the DQM file version (first element)
    breaks ties. When several files share the highest version, the one that
    appears first in ``files`` wins.
    """
    best_file = files[0]
    best_version = get_relval_version(best_file)
    for candidate in files:
        version = get_relval_version(candidate)
        # Lexicographic comparison on (run version, DQM version).
        if (version[1], version[0]) > (best_version[1], best_version[0]):
            best_file, best_version = candidate, version
    return best_file
0091
def get_relval_cmssw_version(file):
    """Returns (CMSSW release, (conditions tag, optional suffix)) for a RelVal file.

    The second element is itself a 2-tuple because the pattern has two
    capturing groups: the tag preceding ``_V<digits>`` and an optional
    ``_suffix`` (empty string when absent) preceding ``-v``.

    Returns None when either element cannot be extracted.
    """
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
0097
def get_relval_id(file):
    """Returns unique relval ID (dataset name) for a given file.

    The dataset name is the token enclosed in double underscores between
    the ``R<9 digits>`` run id and ``CMSSW_``.

    Raises IndexError when the file name does not contain such a token.
    """
    dataset_name = re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)
    return dataset_name[0]
0102
0103 ## -----------------------  Make file pairs --------------------------
def is_relvaldata(files):
    """Tells whether any of the given file names contains ``_RelVal_``."""
    for filename in files:
        if re.search('_RelVal_', filename):
            return True
    return False
0107
def make_file_pairs(files1, files2):
    """Pairs files of the first release with matching files of the second.

    Files of each list are grouped by the version returned by the selected
    ``get_*_cmssw_version`` helper (RelValData helpers when ``files1`` looks
    like RelValData, RelVal helpers otherwise). Groups are then tried
    largest-first; the first combination of two *different* versions that
    yields at least one pair is returned.

    Within a combination, files are matched by their unique ID and, when
    several files share an ID, the one with the maximum version is used.

    Returns a list of ``(file_from_files1, file_from_files2)`` tuples.
    Terminates the process (SystemExit) when either list produces no
    version groups or when no pairs can be formed.
    """
    print('\n#################       Analyzing files       ###################')
    ## Select functions to use
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Divide files into groups
    versions1, versions2 = dict(), dict() # {version1: [file1, file2, ...], version2: [...], ...}
    for files, versions in ((files1, versions1), (files2, versions2)):
        for filename in files:
            version = get_cmssw_version(filename)
            if version:
                versions.setdefault(version, []).append(filename)

    ## Print the division into groups
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print('   %s: %d files' % (str(version),  len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print('   %s: %d files' % (str(version),  len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        sys.exit()

    ## Pair till you find pairs. Larger groups are tried first.
    ordered1 = sorted(versions1, key=lambda x: len(versions1[x]), reverse=True)
    ordered2 = sorted(versions2, key=lambda x: len(versions2[x]), reverse=True)
    pairs = []
    for v1 in ordered1:
        for v2 in ordered2:
            if v1 == v2:
                continue
            ## Print the groups.
            print('\n#################     Pairing the files     ###################')
            print('%s (%d files)   VS   %s (%d files):\n' % (str(v1),
                    len(versions1[v1]), str(v2), len(versions2[v2])))

            ## Pairing two versions
            for unique_id in set(get_id(filename) for filename in versions1[v1]):
                if unique_id is None:
                    # The ID helper could not parse this file name; there is
                    # nothing to match on, so skip it instead of crashing.
                    continue
                if is_relval_data:
                    dataset_re = re.compile(unique_id[0] + '_')
                    run_re = re.compile(unique_id[1])
                    c1_files = [f for f in versions1[v1] if dataset_re.search(f) and run_re.search(f)]
                    c2_files = [f for f in versions2[v2] if dataset_re.search(f) and run_re.search(f)]
                else:
                    dataset_re = re.compile(unique_id + '_')
                    c1_files = [f for f in versions1[v1] if dataset_re.search(f)]
                    c2_files = [f for f in versions2[v2] if dataset_re.search(f)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
0184
0185 ## --------------------   Recursive file downloader -----------------------
def auth_wget(url):
    """Fetches ``url`` through an X509-certificate opener and returns the body.

    On HTTPError or BadStatusLine an explanatory message is printed and the
    process is terminated via ``exit()``.
    """
    try:
        response = build_opener(X509CertOpen()).open(Request(url))
        return response.read()
    except HTTPError as e:
        message = ('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. '
                   'Please try again later. Original error message: ``%s``. \nExiting...\n')
        print(message % (e,))
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates '
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
0198
0199
def auth_download_file(url, chunk_size=1048576):
    """Downloads ``url`` into ``auth_download_file.work_dir`` chunk by chunk.

    The target file is named after the last path component of ``url``.
    After every chunk written, ``(1,)`` is put on ``auth_download_file.q``
    so that a progress reporter can count the chunks; with the default
    ``chunk_size`` one chunk is 1 MB.

    Both ``work_dir`` and ``q`` are function attributes that the caller
    must set before invoking this function.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        # ``with`` guarantees the local file is closed even if the
        # transfer fails half-way.
        with open(file_path, 'wb') as out_file:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                auth_download_file.q.put((1,))   # reports, that downloaded 1MB
                chunk = url_file.read(chunk_size)
    finally:
        url_file.close()
    print('\rDownloaded: %s  ' % (filename,))
0214
0215
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Walks a DQM GUI directory listing and collects matching ROOT files.

    When ``url`` is empty, both the RelValData and the RelVal trees on
    cmsweb are searched and their results merged.

    ``frags1``/``frags2`` are comma-separated regex fragments; a fragment
    starting with ``!`` must NOT occur in the file name. A file belongs to
    release N when it matches every fragment of ``fragsN``, ``relN + '-'``
    and ``.root``.

    Returns two dicts ``{file name: full url}``, one per release.
    """
    if not url:
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        found1, found2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        extra1, extra2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        found1.update(extra1)
        found2.update(extra2)
        return found1, found2

    # scheme://host of the listing; hrefs in the listing are joined to it.
    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        wanted, unwanted = [], []
        for fragment in frags.split(','):
            if fragment.startswith('!'):
                # '!abc' becomes a negative-lookahead pattern.
                unwanted.append('^((?%s).)*$' % fragment)
            else:
                wanted.append(fragment)
        patterns = wanted + unwanted + [rel + '-', '.root']
        return [re.compile(pattern) for pattern in patterns]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Breadth-first walk: directories found on the way are appended to
    ## the work list. The first href of every listing is skipped.
    work_list = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = dict(), dict()
    position = 0
    while position < len(work_list):
        name, path = work_list[position]
        position += 1
        if splitext(name)[1]: # If file
            if all([r.search(name) for r in res1]):
                files_with_urls1[name] = domain + path
            if all([r.search(name) for r in res2]):
                files_with_urls2[name] = domain + path
        else:
            print(domain + path)
            listing = href_re.findall(auth_wget(domain + path))[1:]
            work_list.extend([(sub_name, sub_path) for sub_path, sub_name in listing])
    return files_with_urls1, files_with_urls2
0255
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Lists files in ``work_path`` that match each release's patterns.

    ``frags1``/``frags2`` are comma-separated regex fragments; a fragment
    starting with ``!`` must NOT occur in the file name. A file belongs to
    release N when it matches every fragment of ``fragsN``, ``relN + '-'``
    and ``.root``. Only directory entries that have an extension are
    considered; sub-directories are not descended into.

    Returns two lists of file names, one per release. Terminates via
    ``exit()`` when ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to '
              'specify working directory. Exiting...')
        exit()

    ## Compile regular expressions
    def compile_res(rel, frags):
        wanted, unwanted = [], []
        for fragment in frags.split(','):
            if fragment.startswith('!'):
                # '!abc' becomes a negative-lookahead pattern.
                unwanted.append('^((?%s).)*$' % fragment)
            else:
                wanted.append(fragment)
        patterns = wanted + unwanted + [rel + '-', '.root']
        return [re.compile(pattern) for pattern in patterns]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Keep only entries with an extension, then filter per release
    candidates = [name for name in listdir(work_path) if splitext(name)[1]]
    files1 = [name for name in candidates if all([r.search(name) for r in res1])]
    files2 = [name for name in candidates if all([r.search(name) for r in res2])]
    return files1, files2
0282
0283
0284 ## Exception definitions
# Maps a human-readable comparison failure to its numeric error code.
comparison_errors = {
        'Missing histogram': -1,
        'Histograms have different types': -2,
        'Object is not a histogram': -3,
        'Ranges of histograms are different': -4
    }

class ComparisonError(Exception):
    """Raised when two histograms cannot be compared.

    ``error_message`` must be one of the keys of ``comparison_errors``;
    the matching code is stored in ``error_code``. An unknown message
    raises KeyError.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Forward the message to Exception so that ``args`` is populated;
        # pickling (e.g. when the exception crosses a multiprocessing
        # boundary) re-creates the instance from ``args``.
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
0299
0300
0301 ## StatisticalTests
class StatisticalTest(object):
    """Base class of the histogram comparison tests.

    Subclasses set ``name`` and extend ``do_test``; the base ``do_test``
    only performs the sanity checks shared by all tests.
    """
    name = None

    def get_N_bins(self, h):
        """Returns ``(x + 1) * (y + 1) * (z + 1)`` for the histogram's bin counts.

        Returns 0 when the Y or the Z bin count is zero.
        """
        x = h.GetNbinsX()
        y = h.GetNbinsY()
        z = h.GetNbinsZ()
        if not (y and z): # Is this really necessary?
            return 0
        return (x + 1) * (y + 1) * (z + 1)

    def is_empty(self, h):
        """Returns True when every bin in ``1 .. get_N_bins(h) - 1`` is zero.

        All bins are inspected; a histogram for which ``get_N_bins``
        returns 0 has no bins to inspect and is reported as empty.
        """
        for i in range(1, self.get_N_bins(h)):
            if h.GetBinContent(i) != 0:
                return False
        return True

    def do_test(self, h1, h2):
        """Runs the checks common to all tests.

        Returns None when the histograms are fit for a real statistical
        test, otherwise the value the test should report:
          -104  histograms have different types,
          -105  object is not a histogram (does not inherit from TH1),
          1     at least one histogram is empty,
          -103  histograms have a different number of bins.

        Raises ComparisonError when either histogram is missing.
        """
        if not h1 or not h2:
            raise ComparisonError('Missing histogram')
        if not isinstance(h1, type(h2)):
            return -104     # raise ComparisonError('Histograms have different types')
        if not h1.InheritsFrom('TH1'):
            return -105     # raise ComparisonError('Object is not a histogram')
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        h1_bins = self.get_N_bins(h1)
        if h1_bins != self.get_N_bins(h2):
            return -103     # raise ComparisonError('Ranges of histograms are different')
        return None
0331
0332
class KolmogorovTest(StatisticalTest):
    """Kolmogorov test, delegated to the histogram's own ``KolmogorovTest``."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Returns the base-class verdict if there is one, else the KS result."""
        verdict = super(KolmogorovTest, self).do_test(h1, h2)
        if verdict is None:
            # Call Sumw2() on histograms whose Sumw2 array is still empty.
            for hist in (h1, h2):
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            return h1.KolmogorovTest(h2)
        return verdict
0345
0346
class Chi2Test(StatisticalTest):
    """Chi-square test, delegated to the histogram's own ``Chi2Test``."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Flips negative bin contents and zeroes non-empty bins that have zero error."""
        for bin_idx in range(1, bin_count): # Why here is no +1?
            value = h.GetBinContent(bin_idx)
            if value < 0:
                h.SetBinContent(bin_idx, -1 * value)
            # The error is queried after the sign flip above.
            error_is_zero = h.GetBinError(bin_idx) == 0
            if error_is_zero and value != 0:
                h.SetBinContent(bin_idx, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Returns True once more than ``more_than`` bins hold positive content."""
        positive_bins = 0
        for bin_idx in range(1, bin_count):
            if h.GetBinContent(bin_idx) > 0:
                positive_bins += 1
            if positive_bins > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Returns the base-class verdict if there is one, else the Chi2 result.

        Returns 1 when either histogram has too few filled bins. The 'WW'
        option is used for TProfile objects and for histograms whose
        entries differ from their sum of weights, 'UU' otherwise.
        """
        verdict = super(Chi2Test, self).do_test(h1, h2)
        if verdict is not None:
            return verdict

        n_bins = self.get_N_bins(h1)

        # Make histograms absolute.
        for hist in (h1, h2):
            self.make_absolute(hist, n_bins)

        # Both histograms need enough filled bins.
        if not (self.enough_filled_bins(h1, n_bins) and
                self.enough_filled_bins(h2, n_bins)):
            return 1

        weighted = h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights())
        if weighted:
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')
0386
0387
# Registry of the available statistical tests, keyed by their short name.
tests = dict((test_class.name, test_class) for test_class in (KolmogorovTest, Chi2Test))
0389
0390 ## Utils
def init_database(db_path):
    """Creates the comparison tables in the SQLite database at ``db_path``.

    The tables ReleaseComparison, Directory, RootFileComparison and
    HistogramComparison are created only if they do not exist yet, so the
    function can be called repeatedly on the same file.

    Returns ``db_path``.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        # NOTE(review): the FOREIGN KEY constraints below are not separated
        # by commas; SQLite appears to tolerate this - confirm before
        # porting the schema to another database.
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id)
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id)
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        # Release the database handle whether or not table creation worked.
        conn.close()

    print('Done.')
    return db_path
0433
0434
def get_version(filename):
    """Returns CMSSW and GR_R versions for the given filename.

    The version elements extracted by the RelValData or RelVal helper are
    cleaned of surrounding underscores and of a leading ``RelVal_`` or
    trailing ``_RelVal`` marker, then the non-empty ones are joined with
    ``___``.
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        relval_version = get_relval_cmssw_version(filename)
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])

    marker = 'RelVal'
    cleaned = []
    for elem in version_elems:
        elem = elem.strip('_')
        # str.strip('RelVal_') would remove any of the characters
        # R, e, l, V, a, _ from both ends (e.g. 'ideal' -> 'id'); remove
        # the marker only as a whole word instead.
        if elem == marker:
            elem = ''
        if elem.startswith(marker + '_'):
            elem = elem[len(marker) + 1:]
        if elem.endswith('_' + marker):
            elem = elem[:-(len(marker) + 1)]
        cleaned.append(elem.strip('_'))
    return '___'.join([elem for elem in cleaned if elem])
0444
0445
def get_size_to_download(work_path, files_with_urls):
    """Returns total size to download and the list of URLs to download.

    ``files_with_urls`` must yield ``(filename, url)`` pairs. For every
    pair the ``Content-Length`` header of ``url`` is read; the file is
    skipped when a file of the same size (compared in whole kilobytes)
    already exists in ``work_path``.

    Returns ``(size_to_download, files_to_download)`` where the size is in
    bytes and the list holds the URLs still to be fetched.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the header is needed; do not keep the response open.
            url_file.close()
        file_path = join(work_path, filename)
        # Floor division keeps the whole-kilobyte comparison independent of
        # the interpreter's ``/`` semantics.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
0461
def check_disk_for_space(work_path, size_needed):
    '''Checks afs file system for space.

    Currently a no-op that always returns None. The former implementation,
    now disabled, ran ``fs listquota`` on ``work_path`` and exited when the
    free quota was smaller than ``size_needed``.
    '''
    return None
0478
0479
def show_status_bar(total_size):
    """Shows download status.

    ``total_size`` is the number of bytes expected. Every item taken from
    ``show_status_bar.q`` (a function attribute the caller must set) is
    counted as one downloaded megabyte and the progress line is redrawn.
    The loop ends when the expected number of megabytes has been counted
    or when no item arrives within 20 seconds.
    """
    q = show_status_bar.q
    # Whole megabytes; floor division keeps the counters integral.
    total_mb = total_size // (1024 * 1024)
    downloaded = 0
    while downloaded < total_mb:
        try:
            q.get(timeout=20)
        except Empty:
            time.sleep(1)
            break
        downloaded += 1
        print('\r      %d/%d MB     %d%%     ' % (downloaded, total_mb, 100 * downloaded // total_mb), end=' ')
        sys.stdout.flush()