File indexing completed on 2024-11-26 02:34:35
0001
0002 '''
0003 Help functions for ValidationMatrix_v2.py.
0004
0005 Author: Albertas Gimbutas, Vilnius University (LT)
0006 e-mail: albertasgim@gmail.com
0007 '''
0008 from builtins import range
0009 import sys
0010 import re
0011 import time
0012 import sqlite3
0013 from datetime import datetime
0014 from multiprocessing import Pool, Queue, Process
0015 import subprocess
0016 from optparse import OptionParser, OptionGroup
0017 from os import makedirs, listdir
0018 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
0019 from Queue import Empty
0020 from urllib2 import build_opener, Request, HTTPError
0021 from urlparse import urlparse
0022 from httplib import BadStatusLine
0023
0024 try:
0025 from Utilities.RelMon.authentication import X509CertOpen
0026 except ImportError:
0027 from .authentication import X509CertOpen
0028
0029
def get_relvaldata_id(file):
    """Return the ID used to match a RelValData file across releases.

    The ID is the tuple ``(run_id, run_name)`` where ``run_id`` is the
    ``R`` + nine digits token of the file name and ``run_name`` is the
    word located between ``_RelVal_`` (or, failing that, after a
    ``GR_R_*_V*`` tag) and the ``-vN__`` version suffix.

    Returns ``None`` when either part cannot be found in ``file``.
    """
    run_number = re.search(r'R\d{9}', file)
    # Try the ``_RelVal_<name>-vN__`` layout first, then the GR_R layout.
    run_name = (re.search(r'_RelVal_([\w\d]*)-v\d__', file)
                or re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file))
    if run_number is None or run_name is None:
        return None
    return (run_number.group(), run_name.group(1))
0039
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` parsed from a RelValData name.

    The release is the ``CMSSW_x_y_z[_suffix]`` token preceding a dash.
    The second element is the ``GR_R_*_V*`` tag following that dash or,
    when no such tag exists, whatever stands between the release and
    ``_RelVal_``.

    Returns ``None`` when one of the two parts is missing.
    """
    releases = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tags = (re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
            or re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file))
    if not releases or not tags:
        return None
    return (releases[0], tags[0])
0048
def get_relvaldata_version(file):
    """Return ``(DQM version, run version)`` parsed from a RelValData name.

    The first element is the number following ``DQM_V``; the second is
    the single digit of the ``-vN__`` suffix that closes the run name.
    Both are returned as ``int``.

    Returns ``None`` when one of the two numbers is missing.
    """
    dqm_versions = re.findall(r'DQM_V(\d*)_', file)
    run_versions = (re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
                    or re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file))
    if not (dqm_versions and run_versions):
        return None
    return (int(dqm_versions[0]), int(run_versions[0]))
0057
def get_relvaldata_max_version(files):
    """Return the RelValData file carrying the highest version.

    Files are ranked by the run version (the ``-vM`` suffix of the run
    name) and, on a tie, by the number at the beginning of the file name
    (``DQM_V000M``).  The first file reaching the maximum wins.

    Names whose version cannot be parsed are ignored instead of raising
    ``TypeError``; if no name can be parsed the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    best_file, best_key = files[0], None
    for candidate in files:
        version = get_relvaldata_version(candidate)
        if version is None:
            # Unparsable name: it cannot outrank anything.
            continue
        # Run version is the primary key, DQM version breaks ties.
        key = (version[1], version[0])
        if best_key is None or key > best_key:
            best_file, best_key = candidate, key
    return best_file
0069
0070
def get_relval_version(file):
    """Return ``(DQM version, run version)`` parsed from a RelVal name.

    The first element is the number following ``DQM_V``; the second is
    the number of the ``-vN__`` suffix that follows the
    ``CMSSW_x_y_z-<tag>_V<n>`` part of the name.  Both are ``int``.

    Returns ``None`` when one of the two numbers is missing.
    """
    dqm_versions = re.findall(r'DQM_V(\d*)_', file)
    run_versions = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__',
        file)
    if not (dqm_versions and run_versions):
        return None
    return (int(dqm_versions[0]), int(run_versions[0]))
0077
def get_relval_max_version(files):
    """Return the RelVal file carrying the highest version.

    Files are ranked by the run version (the ``-vM`` suffix) and, on a
    tie, by the number at the beginning of the file name
    (``DQM_V000M``).  The first file reaching the maximum wins.

    Names whose version cannot be parsed are ignored instead of raising
    ``TypeError``; if no name can be parsed the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    best_file, best_key = files[0], None
    for candidate in files:
        version = get_relval_version(candidate)
        if version is None:
            # Unparsable name: it cannot outrank anything.
            continue
        # Run version is the primary key, DQM version breaks ties.
        key = (version[1], version[0])
        if best_key is None or key > best_key:
            best_file, best_key = candidate, key
    return best_file
0089
def get_relval_cmssw_version(file):
    """Return ``(CMSSW release, (tag name, tag suffix))`` for a RelVal name.

    The release is the ``CMSSW_x_y_z[_suffix]`` token preceding a dash.
    The second element is a 2-tuple: the text between that dash and
    ``_V<n>``, and the optional ``_<suffix>`` standing before ``-v``
    (an empty string when absent).

    Returns ``None`` when one of the two parts is missing.
    """
    releases = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tags = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v',
        file)
    if not releases or not tags:
        return None
    return (releases[0], tags[0])
0095
def get_relval_id(file):
    """Return the dataset name that identifies a RelVal file.

    The dataset name is the text standing between ``R<9 digits>__`` and
    ``__CMSSW_`` in the file name.

    Raises ``IndexError`` when the file name has no such part.
    """
    return re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)[0]
0100
0101
def is_relvaldata(files):
    """Tell whether at least one name in ``files`` contains ``_RelVal_``."""
    marker = re.compile('_RelVal_')
    # Every name is inspected; the names that match are kept.
    matching = list(filter(marker.search, files))
    return bool(matching)
0105
def make_file_pairs(files1, files2):
    """Pair the files of two releases that describe the same sample.

    The files of each release are grouped by the version parsed from
    their names.  Version groups are then tried largest first; within the
    first combination of two different versions that yields a match,
    files sharing the same ID are paired, keeping the highest-versioned
    file on each side.

    Returns a list of ``(file_from_files1, file_from_files2)`` tuples.
    Terminates the program through ``sys.exit()`` when one of the
    releases has no parsable file or when no pair can be made.
    """
    print('\n################# Analyzing files ###################')

    # Pick the file name parsers matching the kind of sample in ``files1``.
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    # Group files by version: {version: [file, ...]}.  Names whose version
    # cannot be parsed are left out.
    versions1, versions2 = {}, {}
    for files, versions in ((files1, versions1), (files2, versions2)):
        for filename in files:
            version = get_cmssw_version(filename)
            if version:
                versions.setdefault(version, []).append(filename)

    # Report the groups that were found.
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print(' %s: %d files' % (str(version), len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print(' %s: %d files' % (str(version), len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        sys.exit()

    # Try the version combinations, biggest groups first, until one of
    # them produces pairs.
    ordered1 = sorted(versions1, key=lambda v: len(versions1[v]), reverse=True)
    ordered2 = sorted(versions2, key=lambda v: len(versions2[v]), reverse=True)
    pairs = []
    for v1 in ordered1:
        for v2 in ordered2:
            if v1 == v2:
                # A version is never compared against itself.
                continue

            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (str(v1),
                  len(versions1[v1]), str(v2), len(versions2[v2])))

            for unique_id in set([get_id(filename) for filename in versions1[v1]]):
                if is_relval_data:
                    # RelValData IDs are (run id, run name): both must match.
                    patterns = [re.compile(unique_id[0] + '_'),
                                re.compile(unique_id[1])]
                else:
                    patterns = [re.compile(unique_id + '_')]
                c1_files = [filename for filename in versions1[v1]
                            if all(p.search(filename) for p in patterns)]
                c2_files = [filename for filename in versions2[v2]
                            if all(p.search(filename) for p in patterns)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
0182
0183
def auth_wget(url):
    """Download ``url`` through an ``X509CertOpen`` opener and return its body.

    The response is always closed once it has been read.

    Terminates the program through ``sys.exit()`` after printing an
    explanation when the request fails with ``HTTPError`` or
    ``BadStatusLine``.
    """
    try:
        opener = build_opener(X509CertOpen())
        response = opener.open(Request(url))
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. ' +
              'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        sys.exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates ' +
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        sys.exit()
0196
0197
def auth_download_file(url, chunk_size=1048576):
    """Download ``url`` into the working directory, chunk by chunk.

    The target directory and the progress queue are read from the
    function attributes ``auth_download_file.work_dir`` and
    ``auth_download_file.q``, which must be set before the call.  One
    ``(1,)`` item is put on the queue for every chunk written.

    ``chunk_size`` is the number of bytes requested per read.
    Both the output file and the remote handle are closed even when the
    transfer fails half-way.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    with open(file_path, 'wb') as out_file:
        url_file = build_opener(X509CertOpen()).open(Request(url))
        try:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                # Report one more chunk to whoever consumes the queue.
                auth_download_file.q.put((1,))
                chunk = url_file.read(chunk_size)
        finally:
            url_file.close()
    print('\rDownloaded: %s ' % (filename,))
0212
0213
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Walk the directory listing at ``url`` and collect matching files.

    ``rel1``/``rel2`` are release names and ``frags1``/``frags2`` are
    comma separated regular expressions; a fragment starting with ``!``
    must NOT match.  A file is selected for a release when its name
    matches every fragment, ``<release>-`` and ``.root``.

    When ``url`` is empty both default DQM GUI trees (RelValData and
    RelVal) are searched and their results merged.

    Returns two dictionaries ``{file name: file URL}``, one per release.
    """
    if not url:
        found1, found2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/',
            rel1, frags1, rel2, frags2)
        more1, more2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/',
            rel1, frags1, rel2, frags2)
        found1.update(more1)
        found2.update(more2)
        return found1, found2

    # Scheme and host of ``url``; links in the listing are appended to it.
    domain = '://'.join(urlparse(url)[:2])

    # Captures (path, displayed name) of every link in a listing page.
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        """Compile the patterns a file name has to satisfy."""
        wanted, unwanted = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # '!frag' becomes a negative look-ahead over the whole name.
                unwanted.append('^((?%s).)*$' % frag)
            else:
                wanted.append(frag)
        sources = wanted + unwanted + [rel + '-', '.root']
        return [re.compile(source) for source in sources]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    # Work list of (name, path) links; sub-directory listings are appended
    # to it while it is being consumed.  The first link of every page is
    # skipped.
    pending = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = {}, {}
    cursor = 0
    while cursor < len(pending):
        name, path = pending[cursor]
        cursor += 1
        if not splitext(name)[1]:
            # No extension: treated as a directory to descend into.
            print(domain + path)
            for sub_path, sub_name in href_re.findall(auth_wget(domain + path))[1:]:
                pending.append((sub_name, sub_path))
            continue
        if all(r.search(name) for r in res1):
            files_with_urls1[name] = domain + path
        if all(r.search(name) for r in res2):
            files_with_urls2[name] = domain + path
    return files_with_urls1, files_with_urls2
0253
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Collect the files of ``work_path`` that belong to each release.

    ``rel1``/``rel2`` are release names and ``frags1``/``frags2`` are
    comma separated regular expressions; a fragment starting with ``!``
    must NOT match.  A file is selected for a release when its name has
    an extension and matches every fragment, ``<release>-`` and ``.root``.

    Returns two lists of file names (sorted alphabetically), one per
    release.  Terminates the program through ``sys.exit()`` when
    ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +
              'specify working directory. Exiting...')
        sys.exit()

    def compile_res(rel, frags):
        """Compile the patterns a file name has to satisfy."""
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes a negative look-ahead over the whole name.
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        regexps += [rel + '-', '.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    files1, files2 = [], []
    # listdir() order is arbitrary; sort so the result is reproducible.
    for name in sorted(listdir(work_path)):
        if not splitext(name)[1]:
            continue
        if all(r.search(name) for r in res1):
            files1.append(name)
        if all(r.search(name) for r in res2):
            files2.append(name)
    return files1, files2
0280
0281
0282
# Error message -> numeric code reported for a failed histogram comparison.
comparison_errors = dict([
    ('Missing histogram', -1),
    ('Histograms have different types', -2),
    ('Object is not a histogram', -3),
    ('Ranges of histograms are different', -4),
])


class ComparisonError(Exception):
    """Failure of a histogram comparison, identified by a known message.

    ``error_message`` must be a key of ``comparison_errors``; the matching
    code is exposed as ``error_code``.  An unknown message raises
    ``KeyError`` at construction time.
    """

    def __init__(self, error_message, *args, **kwargs):
        self.error_code = comparison_errors[error_message]
        self.error_message = error_message

    def __str__(self):
        return 'Comparison Error: {0:d}'.format(self.error_code)
0297
0298
0299
class StatisticalTest(object):
    """Common checks shared by the histogram comparison tests.

    Sub-classes set ``name`` and extend ``do_test``.
    """
    name = None

    def get_N_bins(self, h):
        """Return ``(nx + 1) * (ny + 1) * (nz + 1)`` for histogram ``h``.

        Returns 0 when the Y or the Z bin count is zero.
        """
        nx = h.GetNbinsX()
        ny = h.GetNbinsY()
        nz = h.GetNbinsZ()
        if ny and nz:
            return (nx + 1) * (ny + 1) * (nz + 1)
        return 0

    def is_empty(self, h):
        """Tell whether bins ``1 .. get_N_bins(h) - 1`` all hold zero."""
        return not any(h.GetBinContent(index) != 0
                       for index in range(1, self.get_N_bins(h)))

    def do_test(self, h1, h2):
        """Run the sanity checks that precede an actual comparison.

        Returns a verdict when the comparison can be decided right away:
        -104 if ``h1`` is not an instance of the type of ``h2``, -105 if
        ``h1`` does not inherit from ``TH1``, 1 if either histogram is
        empty and -103 if the bin counts differ.  Returns ``None`` when
        the histograms are fit for a real test.

        Raises ``ComparisonError`` when a histogram is missing.
        """
        if not (h1 and h2):
            raise ComparisonError('Missing histogram')
        if not isinstance(h1, type(h2)):
            return -104
        if not h1.InheritsFrom('TH1'):
            return -105
        if any(self.is_empty(hist) for hist in (h1, h2)):
            return 1
        if self.get_N_bins(h1) != self.get_N_bins(h2):
            return -103
        return None
0329
0330
class KolmogorovTest(StatisticalTest):
    """Histogram comparison based on ``h1.KolmogorovTest(h2)``."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Return the verdict of the base checks or the KS test result."""
        outcome = super(KolmogorovTest, self).do_test(h1, h2)
        if outcome is None:
            for hist in (h1, h2):
                # Call Sumw2() on histograms whose Sumw2 array is still empty.
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            outcome = h1.KolmogorovTest(h2)
        return outcome
0343
0344
class Chi2Test(StatisticalTest):
    """Histogram comparison based on ``h1.Chi2Test(h2, option)``."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Sanitise bins ``1 .. bin_count - 1`` of ``h`` in place.

        Negative contents are replaced by their absolute value; bins with
        a non-zero content but a zero error are reset to zero.
        """
        for index in range(1, bin_count):
            value = h.GetBinContent(index)
            if value < 0:
                h.SetBinContent(index, -1 * value)
            if h.GetBinError(index) == 0 and value != 0:
                h.SetBinContent(index, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Tell whether more than ``more_than`` bins hold a positive content.

        Only bins ``1 .. bin_count - 1`` are inspected and the scan stops
        as soon as the threshold is exceeded.
        """
        positive = 0
        for index in range(1, bin_count):
            positive += 1 if h.GetBinContent(index) > 0 else 0
            if positive > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the verdict of the base checks or the chi2 test result.

        Both histograms are sanitised in place first.  The result is 1
        when one of them has too few filled bins.
        """
        outcome = super(Chi2Test, self).do_test(h1, h2)
        if outcome is not None:
            return outcome

        bin_count = self.get_N_bins(h1)

        # Make the bin contents usable by the test.
        self.make_absolute(h1, bin_count)
        self.make_absolute(h2, bin_count)

        # Too little information to compare: report a pass.
        if not (self.enough_filled_bins(h1, bin_count)
                and self.enough_filled_bins(h2, bin_count)):
            return 1

        # 'WW' is used for profiles and for histograms whose entry count
        # differs from the sum of weights, 'UU' otherwise.
        if h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights()):
            option = 'WW'
        else:
            option = 'UU'
        return h1.Chi2Test(h2, option)
0384
0385
# Registry of the available statistical tests, keyed by their ``name``.
tests = dict((test_class.name, test_class)
             for test_class in (KolmogorovTest, Chi2Test))
0387
0388
def init_database(db_path):
    """Create the comparison tables in the SQLite file at ``db_path``.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created only if they do not exist yet,
    so the call is safe on an already initialised database.  The
    connection is committed and closed before returning.

    Returns ``db_path``.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')

    table_definitions = (
        """CREATE TABLE IF NOT EXISTS ReleaseComparison (
            id INTEGER PRIMARY KEY,
            title TEXT,
            release1 TEXT,
            release2 TEXT,
            statistical_test TEXT
        );""",
        """CREATE TABLE IF NOT EXISTS Directory (
            id INTEGER PRIMARY KEY,
            name TEXT,
            parent_id INTEGER,
            from_histogram_id INTEGER,
            till_histogram_id INTEGER,
            FOREIGN KEY (parent_id) REFERENCES Directory(id)
            FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id)
            FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
        )""",
        """CREATE TABLE IF NOT EXISTS RootFileComparison (
            id INTEGER PRIMARY KEY,
            filename1 TEXT,
            filename2 TEXT,
            release_comparison_id INTEGER,
            directory_id INTEGER,
            FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
            FOREIGN KEY (directory_id) REFERENCES Directory(id)
        )""",
        """CREATE TABLE IF NOT EXISTS HistogramComparison (
            id INTEGER PRIMARY KEY,
            name TEXT,
            p_value REAL,
            directory_id INTEGER,
            FOREIGN KEY (directory_id) REFERENCES Directory(id)
        )""",
    )

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        for definition in table_definitions:
            c.execute(definition)
        conn.commit()
    finally:
        # Never leave the database file open behind us.
        conn.close()

    print('Done.')
    return db_path
0431
0432
def get_version(filename):
    """Return the version label of ``filename``.

    The label joins with ``___`` the non-empty version elements parsed
    from the name: the CMSSW release followed by the GR_R version
    (RelValData files) or by the tag name and tag suffix (RelVal files).
    Surrounding underscores and a leading ``RelVal_`` / trailing
    ``_RelVal`` token are removed from every element.
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        release, (tag, suffix) = get_relval_cmssw_version(filename)
        version_elems = (release, tag, suffix)

    def clean(elem):
        """Drop surrounding underscores and the literal RelVal token."""
        elem = elem.strip('_')
        # str.strip('RelVal_') would remove any of the characters
        # R, e, l, V, a and _ from both ends, damaging names such as
        # '..._special'; remove the whole token instead.
        if elem.startswith('RelVal_'):
            elem = elem[len('RelVal_'):]
        if elem.endswith('_RelVal'):
            elem = elem[:-len('_RelVal')]
        return elem.strip('_')

    cleaned = [clean(elem) for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
0442
0443
def get_size_to_download(work_path, files_with_urls):
    """Work out which files still have to be downloaded.

    ``files_with_urls`` is an iterable of ``(file name, URL)`` pairs.  The
    size of every remote file is read from its ``Content-Length`` header.
    A file is skipped when ``work_path`` already holds a file of that name
    whose size is equal at KiB granularity.

    Returns ``(size_to_download, files_to_download)``: the total number of
    bytes to fetch and the list of URLs to fetch.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the headers are needed; release the connection.
            url_file.close()
        file_path = join(work_path, filename)
        # Floor division keeps the comparison at KiB granularity on both
        # Python 2 and Python 3.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
0459
def check_disk_for_space(work_path, size_needed):
    """Check the AFS file system for space.

    Placeholder: no check is performed; both arguments are ignored and
    ``None`` is returned.
    """
    return None
0463
0464
0465
0466
0467
0468
0469
0470
0471
0472
0473
0474
0475
0476
0477
def show_status_bar(total_size):
    """Print the download progress until ``total_size`` bytes are reported.

    Progress items are read from the queue stored in the function
    attribute ``show_status_bar.q``; every item counts as one MB.  The
    loop ends once the counted MBs reach the total, or when no item
    arrives within 20 seconds.
    """
    progress_queue = show_status_bar.q
    total_mb = total_size / (1024*1024)
    done_mb = 0
    while done_mb < total_mb:
        try:
            progress_queue.get(timeout=20)
        except Empty:
            # Nothing reported for a while: stop waiting.
            time.sleep(1)
            break
        done_mb += 1
        print('\r %d/%d MB %d%% ' % (done_mb, total_mb, 100*done_mb/total_mb), end=' ')
        sys.stdout.flush()