File indexing completed on 2023-03-17 11:26:50
0001
0002 '''
0003 Help functions for ValidationMatrix_v2.py.
0004
0005 Author: Albertas Gimbutas, Vilnius University (LT)
0006 e-mail: albertasgim@gmail.com
0007 '''
0008 from __future__ import print_function
0009 from __future__ import absolute_import
0010 from builtins import range
0011 import sys
0012 import re
0013 import time
0014 import sqlite3
0015 from datetime import datetime
0016 from multiprocessing import Pool, Queue, Process
0017 import subprocess
0018 from optparse import OptionParser, OptionGroup
0019 from os import makedirs, listdir
0020 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
0021 from Queue import Empty
0022 from urllib2 import build_opener, Request, HTTPError
0023 from urlparse import urlparse
0024 from httplib import BadStatusLine
0025
0026 try:
0027 from Utilities.RelMon.authentication import X509CertOpen
0028 except ImportError:
0029 from .authentication import X509CertOpen
0030
0031
def get_relvaldata_id(file):
    """Returns unique relvaldata ID for a given file.

    The ID is the tuple ``(run_id, run)`` where ``run_id`` is the first
    ``R`` + nine-digit token in the file name and ``run`` is the name
    captured in front of the ``-v<digit>__`` suffix, searched first after
    ``_RelVal_`` and, failing that, after a ``GR_R_<n>_V<n>[C]_`` tag.

    Returns None when either part cannot be found.
    """
    # Raw strings keep the regex escapes (\d, \w) valid on Python 3.
    run_id = re.search(r'R\d{9}', file)
    run = re.search(r'_RelVal_([\w\d]*)-v\d__', file)
    if not run:
        run = re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
    if run_id and run:
        return (run_id.group(), run.group(1))
    return None
0041
def get_relvaldata_cmssw_version(file):
    """Returns tuple (CMSSW release, GR_R version) for specified RelValData file.

    The release is the ``CMSSW_<n>_<n>_<n>[_<suffix>]`` token that precedes
    a ``-``.  The second element is a ``GR_R_<n>_V<n>`` tag when the name
    contains one; otherwise it is whatever stands between the release and
    ``_RelVal_``.

    Returns None when either element cannot be found.
    """
    # Raw strings keep the regex escapes (\d, \w) valid on Python 3.
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
    if not gr_r_version:
        gr_r_version = re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
0050
def get_relvaldata_version(file):
    """Returns tuple (DQM file version, run version) for specified file.

    The first element is the integer that follows ``DQM_V`` in the file
    name; the second is the single digit of the ``-v<digit>__`` suffix,
    searched first after ``_RelVal_<name>`` and, failing that, after a
    ``GR_R_<n>_V<n>[C]_<name>`` tag.

    Returns None when either number cannot be found.
    """
    # Raw strings keep the regex escapes (\d, \w) valid on Python 3.
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
    if not run_version:
        run_version = re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
0059
def get_relvaldata_max_version(files):
    """Returns the file with the highest version among ``files``.

    Versions come from ``get_relvaldata_version``.  Files are ranked by the
    run version first (e.g. ``_run2012-vM``) and by the number at the
    beginning of the file name second (e.g. ``DQM_V000M``).  On a tie the
    earliest file in ``files`` wins.
    """
    def ranking(file):
        # Swap the tuple so that the run version is the primary sort key.
        version = get_relvaldata_version(file)
        return (version[1], version[0])

    best_file = files[0]
    best_rank = ranking(best_file)
    for candidate in files:
        candidate_rank = ranking(candidate)
        if candidate_rank > best_rank:
            best_file, best_rank = candidate, candidate_rank
    return best_file
0071
0072
def get_relval_version(file):
    """Returns tuple (DQM file version, run version) for specified file.

    The first element is the integer that follows ``DQM_V`` in the file
    name; the second is the integer of the ``-v<digits>__`` suffix that
    follows the ``CMSSW_<release>-<tag>_V<n>`` part of the name.

    Returns None when either number cannot be found.
    """
    # Raw strings keep the regex escapes (\d, \w) valid on Python 3.
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
0079
def get_relval_max_version(files):
    """Returns the file with the highest version among ``files``.

    Versions come from ``get_relval_version``.  The run version (e.g.
    ``_run2012-vM``) is compared first; the number at the beginning of the
    file name (e.g. ``DQM_V000M``) breaks ties.  When several files share
    the maximum, the earliest one in ``files`` is returned.
    """
    winner = files[0]
    winner_dqm, winner_run = get_relval_version(winner)
    for current in files:
        current_dqm, current_run = get_relval_version(current)
        # Lexicographic comparison: run version first, then DQM version.
        if (current_run, current_dqm) > (winner_run, winner_dqm):
            winner = current
            winner_dqm, winner_run = current_dqm, current_run
    return winner
0091
def get_relval_cmssw_version(file):
    """Returns tuple (CMSSW release, (tag, suffix)) for specified RelVal file.

    The release is the ``CMSSW_<n>_<n>_<n>[_<suffix>]`` token that precedes
    a ``-``.  The second element is itself a tuple: the text between the
    release and ``_V<n>``, and the optional ``_<suffix>`` that follows the
    ``_V<n>`` part (an empty string when absent).

    Returns None when either element cannot be found.
    """
    # Raw strings keep the regex escapes (\d, \w) valid on Python 3.
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
0097
def get_relval_id(file):
    """Returns unique relval ID (dataset name) for a given file.

    The dataset name is the text between ``R<nine digits>__`` and
    ``__CMSSW_`` in the file name.

    Raises IndexError when the file name does not contain that pattern.
    """
    # Raw string keeps the regex escapes (\d, \w) valid on Python 3.
    dataset_name = re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)
    return dataset_name[0]
0102
0103
def is_relvaldata(files):
    """Returns True if at least one name in ``files`` contains ``_RelVal_``."""
    return any('_RelVal_' in filename for filename in files)
0107
def make_file_pairs(files1, files2):
    """Pairs up matching files of two releases and returns the pairs.

    Both file lists are grouped by the version tuple extracted from each
    file name.  Every group of ``files1`` is then compared with every
    differently-versioned group of ``files2``: for each dataset ID found in
    the first group, the highest-version file carrying that ID is picked
    from both groups and the two files form a pair.

    Whether the RelValData or the RelVal helper functions are used is
    decided from ``files1`` alone (see ``is_relvaldata``).

    Returns a non-empty list of ``(file_from_files1, file_from_files2)``
    tuples.  Progress is printed to stdout.

    Raises SystemExit when either list yields no version group or when no
    pair could be formed.
    """
    print('\n################# Analyzing files ###################')
    ## Choose the helper functions matching the kind of input files.
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Group the files of each release by version; unparseable names are skipped.
    versions1, versions2 = dict(), dict()
    for files, versions in (files1, versions1), (files2, versions2):
        for file in files:
            version = get_cmssw_version(file)
            if version:
                versions.setdefault(version, []).append(file)

    ## Report the groups that were found.
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print(' %s: %d files' % (str(version), len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print(' %s: %d files' % (str(version), len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        # sys.exit() rather than the site-injected exit() helper, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit()

    ## Pair the files, visiting the largest groups first.
    pairs = []
    for v1 in sorted(versions1, key=lambda x: len(versions1[x]), reverse=True):
        for v2 in sorted(versions2, key=lambda x: len(versions2[x]), reverse=True):
            if v1 == v2:
                continue
            ## Print the groups being compared.
            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (str(v1),
                    len(versions1[v1]), str(v2), len(versions2[v2])))

            unique_ids = set(get_id(file) for file in versions1[v1])
            # get_relvaldata_id() returns None for names it cannot parse;
            # such files cannot be paired, so drop the marker instead of
            # crashing on it below.
            unique_ids.discard(None)

            for unique_id in unique_ids:
                if is_relval_data:
                    dataset_re = re.compile(unique_id[0] + '_')
                    run_re = re.compile(unique_id[1])
                    c1_files = [file for file in versions1[v1]
                                if dataset_re.search(file) and run_re.search(file)]
                    c2_files = [file for file in versions2[v2]
                                if dataset_re.search(file) and run_re.search(file)]
                else:
                    dataset_re = re.compile(unique_id + '_')
                    c1_files = [file for file in versions1[v1] if dataset_re.search(file)]
                    c2_files = [file for file in versions2[v2] if dataset_re.search(file)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

    print("Got %d pairs." % (len(pairs)))
    if pairs:
        return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
0184
0185
def auth_wget(url):
    """Returns the body of ``url`` fetched through an ``X509CertOpen`` opener.

    On ``HTTPError`` or ``BadStatusLine`` an explanatory message is printed
    and the process is terminated via SystemExit.
    """
    try:
        opener = build_opener(X509CertOpen())
        response = opener.open(Request(url))
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails half-way.
            response.close()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. '+\
                'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        sys.exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates '+\
            'in ``~/.globus`` directory are configured correctly. Exitting...')
        sys.exit()
0198
0199
def auth_download_file(url, chunk_size=1048576):
    """Downloads ``url`` into the working directory, chunk by chunk.

    The target directory and a progress queue are read from the function
    attributes ``auth_download_file.work_dir`` and ``auth_download_file.q``,
    which must be assigned before the function is called.  One ``(1,)``
    item is put on the queue for every chunk written.

    url        -- address of the file; its base name is the local file name.
    chunk_size -- number of bytes requested per read (default 1 MiB).
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    # Context manager / finally make sure both the local file and the HTTP
    # response are closed even when the transfer fails half-way.
    with open(file_path, 'wb') as file:
        opener = build_opener(X509CertOpen())
        url_file = opener.open(Request(url))
        try:
            chunk = url_file.read(chunk_size)
            while chunk:
                file.write(chunk)
                auth_download_file.q.put((1,))
                chunk = url_file.read(chunk_size)
        finally:
            url_file.close()
        print('\rDownloaded: %s ' % (filename,))
0214
0215
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Searches the directory tree below ``url`` for files matching the patterns.

    When ``url`` is empty, both default DQM GUI trees (``RelValData`` and
    ``RelVal``) are searched and their results merged.

    url          -- address of the directory listing to start from.
    rel1, rel2   -- release names; a file must contain ``<rel>-``.
    frags1/2     -- comma separated regexps that must all match the file
                    name; a fragment starting with ``!`` must NOT match.

    Returns two dicts ``{file name: file url}``, one per release.
    """
    if not url:
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        g1, g2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        g3, g4 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        g1.update(g3)
        g2.update(g4)
        return g1, g2

    # scheme + host, used to turn the listing's paths into absolute URLs
    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$', i.e. "frag occurs nowhere".
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' is accepted.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Walk the tree; the first link of every listing is skipped.
    hrefs = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = dict(), dict()
    # ``hrefs`` doubles as a work queue: entries appended below are picked
    # up by this very loop.
    for name, path in hrefs:
        if splitext(name)[1]:
            # Entries with an extension are treated as files.
            if all([r.search(name) for r in res1]):
                files_with_urls1[name] = domain + path
            if all([r.search(name) for r in res2]):
                files_with_urls2[name] = domain + path
        else:
            # Entries without an extension are treated as sub-directories.
            print(domain + path)
            new_hrefs = href_re.findall(auth_wget(domain + path))[1:]
            hrefs.extend([(name, path) for path, name in new_hrefs])
    return files_with_urls1, files_with_urls2
0255
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Searches ``work_path`` for files matching the patterns of two releases.

    work_path    -- directory to list (not searched recursively).
    rel1, rel2   -- release names; a file must contain ``<rel>-``.
    frags1/2     -- comma separated regexps that must all match the file
                    name; a fragment starting with ``!`` must NOT match.

    Returns two lists of file names, one per release.

    Raises SystemExit (after printing a hint) when ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +\
              'specify working directory. Exiting...')
        sys.exit()

    ## Compile regular expressions
    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$', i.e. "frag occurs nowhere".
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' is accepted.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Filter the directory listing; names without an extension are ignored.
    files = listdir(work_path)
    files1, files2 = [], []
    for name in files:
        if splitext(name)[1]:
            if all([r.search(name) for r in res1]):
                files1.append(name)
            if all([r.search(name) for r in res2]):
                files2.append(name)
    return files1, files2
0282
0283
0284
# Maps every known comparison failure message to its numeric error code.
comparison_errors = {
    'Missing histogram': -1,
    'Histograms have different types': -2,
    'Object is not a histogram': -3,
    'Ranges of histograms are different': -4
}


class ComparisonError(Exception):
    """Error raised when two objects cannot be compared.

    ``error_message`` must be one of the keys of ``comparison_errors``; the
    matching code is stored in ``error_code``.  Constructing the exception
    with an unknown message raises KeyError.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Initialise the base class so that ``args`` is populated; without
        # it the exception cannot be re-created when it is unpickled (for
        # instance when it crosses a process boundary).
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
0299
0300
0301
class StatisticalTest(object):
    """Base class of the histogram comparison tests.

    Subclasses set ``name`` and extend ``do_test``; this class only provides
    the checks that are shared by all tests.
    """
    name = None

    def get_N_bins(self, h):
        """Returns (x + 1) * (y + 1) * (z + 1) for the bin counts of ``h``.

        Returns 0 when the Y or the Z bin count is zero.
        """
        nbins_x = h.GetNbinsX()
        nbins_y = h.GetNbinsY()
        nbins_z = h.GetNbinsZ()
        if nbins_y and nbins_z:
            return (nbins_x + 1) * (nbins_y + 1) * (nbins_z + 1)
        return 0

    def is_empty(self, h):
        """Returns True if bins 1 .. get_N_bins(h) - 1 all have zero content."""
        return not any(h.GetBinContent(idx) != 0
                       for idx in range(1, self.get_N_bins(h)))

    def do_test(self, h1, h2):
        """Runs the checks common to all tests.

        Returns None when the histograms passed every check and the actual
        test can be run by the subclass.  Otherwise returns:
          -104  when ``h1`` is not an instance of the type of ``h2``,
          -105  when ``h1`` does not inherit from ``TH1``,
             1  when at least one of the histograms is empty,
          -103  when the bin counts differ.

        Raises ComparisonError when ``h1`` or ``h2`` is missing (falsy).
        """
        if not (h1 and h2):
            raise ComparisonError('Missing histogram')
        if not isinstance(h1, type(h2)):
            return -104
        if not h1.InheritsFrom('TH1'):
            return -105
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        if self.get_N_bins(h1) != self.get_N_bins(h2):
            return -103
        return None
0331
0332
class KolmogorovTest(StatisticalTest):
    """Compares two histograms with their ``KolmogorovTest`` method."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Returns the result of the common checks or of ``h1.KolmogorovTest(h2)``.

        When the base-class checks return a value, that value is returned
        unchanged.  Otherwise ``Sumw2()`` is called on every histogram whose
        ``GetSumw2()`` array is empty before the test itself is run.
        """
        early_result = super(KolmogorovTest, self).do_test(h1, h2)
        if early_result is None:
            for hist in (h1, h2):
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            return h1.KolmogorovTest(h2)
        return early_result
0345
0346
class Chi2Test(StatisticalTest):
    """Compares two histograms with their ``Chi2Test`` method."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Modifies bins 1 .. bin_count - 1 of ``h`` in place.

        Negative contents are replaced by their absolute value; a bin whose
        error is zero while its (original) content is non-zero is set to 0.
        """
        for idx in range(1, bin_count):
            value = h.GetBinContent(idx)
            if value < 0:
                h.SetBinContent(idx, -1 * value)
            if h.GetBinError(idx) == 0 and value != 0:
                h.SetBinContent(idx, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Returns True if more than ``more_than`` bins have positive content.

        Only bins 1 .. bin_count - 1 are inspected; counting stops as soon
        as the threshold is exceeded.
        """
        n_filled = 0
        for idx in range(1, bin_count):
            if h.GetBinContent(idx) > 0:
                n_filled += 1
                if n_filled > more_than:
                    return True
        return False

    def do_test(self, h1, h2):
        """Returns the result of the common checks or of ``h1.Chi2Test(h2, ...)``.

        When the base-class checks return a value, that value is returned
        unchanged.  Otherwise both histograms are passed through
        ``make_absolute`` and 1 is returned if either of them does not have
        enough filled bins.  The test runs with option 'WW' when ``h1`` is a
        ``TProfile`` or its entries differ from its sum of weights, and
        with option 'UU' otherwise.
        """
        early_result = super(Chi2Test, self).do_test(h1, h2)
        if early_result is not None:
            return early_result

        n_bins = self.get_N_bins(h1)

        # Make histograms positive
        for hist in (h1, h2):
            self.make_absolute(hist, n_bins)

        # Check if there is enough data in both histograms
        if not (self.enough_filled_bins(h1, n_bins) and
                self.enough_filled_bins(h2, n_bins)):
            return 1

        weighted = h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights())
        if weighted:
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')
0386
0387
# Registry of the available statistical tests, keyed by their ``name``.
tests = {test_class.name: test_class for test_class in (KolmogorovTest, Chi2Test)}
0389
0390
def init_database(db_path):
    """Creates the comparison tables in the SQLite database at ``db_path``.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created unless they already exist, so
    the function may be called repeatedly on the same file.

    Returns ``db_path``.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        # Table constraints are separated by commas as standard SQL
        # requires, instead of relying on SQLite accepting them without.
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        # Only the path is handed back, so the connection is released here.
        conn.close()

    print('Done.')
    return db_path
0433
0434
def get_version(filename):
    """Returns CMSSW and GR_R versions for the given filename.

    The version elements extracted from the file name are cleaned of
    leading/trailing underscores and ``RelVal`` tokens, empty elements are
    dropped and the rest is joined with ``___``.

    Raises TypeError when the version cannot be extracted from ``filename``
    (the helper functions return None in that case).
    """
    def trim(elem):
        # Remove whole ``RelVal`` tokens and stray underscores from both
        # ends.  str.strip('RelVal_') must not be used for this: it strips
        # a *set of characters* and would also eat e.g. a trailing 'al'.
        tokens = elem.split('_')
        while tokens and tokens[0] in ('', 'RelVal'):
            tokens.pop(0)
        while tokens and tokens[-1] in ('', 'RelVal'):
            tokens.pop()
        return '_'.join(tokens)

    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        relval_version = get_relval_cmssw_version(filename)
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])
    version_elems = [trim(elem) for elem in version_elems]
    return '___'.join([elem for elem in version_elems if elem])
0444
0445
def get_size_to_download(work_path, files_with_urls):
    """Returns total size to download and the list of URLs to download.

    work_path       -- directory in which already downloaded files are
                       looked up.
    files_with_urls -- iterable of ``(filename, url)`` pairs.

    A file is skipped when it exists in ``work_path`` and its size equals
    the ``Content-Length`` reported for its URL, compared in whole KiB.

    Returns the tuple ``(size_to_download, files_to_download)``: the summed
    ``Content-Length`` in bytes and the URLs of the files still missing.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the headers are needed; do not keep the connection open.
            url_file.close()
        file_path = join(work_path, filename)
        # Floor division keeps the comparison at KiB granularity on both
        # Python 2 and Python 3.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
0461
def check_disk_for_space(work_path, size_needed):
    """Checks afs file system for space.

    Placeholder: no check is performed and None is always returned.
    """
    return None
0465
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
0478
0479
def show_status_bar(total_size):
    """Shows download status.

    ``total_size`` is given in bytes.  The function reads one item from the
    queue stored in the function attribute ``show_status_bar.q`` per
    megabyte and prints the running total, until the total is reached or no
    item arrives within 20 seconds.
    """
    status_queue = show_status_bar.q
    total_size = total_size / (1024*1024)
    downloaded = 0
    while downloaded < total_size:
        try:
            status_queue.get(timeout=20)
        except Empty:
            # Nothing was reported for 20 seconds: stop showing progress.
            time.sleep(1)
            break
        downloaded += 1
        print('\r %d/%d MB %d%% ' % (downloaded, total_size, 100*downloaded/total_size), end=' ')
        sys.stdout.flush()