#!/usr/bin/env python3
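"""Pick files from a DAS dataset, keeping only files whose lumisections are
all certified in the latest (widest run range) golden JSON, optionally
stopping once a target number of events is reached.

Illustrative invocation (the dataset is the example from the --dataset help
text; the script name is assumed from context):

    das-up-to-nevents.py -d /DisplacedJet/Run2024C-v1/RAW -e 100000 -o files.txt
"""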
import argparse
import ast
import json
import os
import subprocess
import sys
from io import BytesIO

import numpy as np
import pandas as pd
import pycurl
from bs4 import BeautifulSoup

## Helpers
base_cert_url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/"
base_cert_path = "/eos/user/c/cmsdqm/www/CAF/certification/"

def get_url_clean(url):
    """Fetch a URL with pycurl and return its text with HTML tags stripped."""
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    return BeautifulSoup(buffer.getvalue(), "lxml").text

def das_do_command(cmd):
    """Run a dasgoclient command in a shell and return its output lines."""
    out = subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode('utf8')
    return out.split("\n")

def das_file_site(dataset, site):
    """Return a dataframe with the files of a dataset available at a site."""
    cmd = "dasgoclient --query='file dataset=%s site=%s'" % (dataset, site)
    out = das_do_command(cmd)
    df = pd.DataFrame(out, columns=["file"])

    return df

def das_file_data(dataset, opt=""):
    """Return a dataframe with the file names and event counts of a dataset."""
    cmd = "dasgoclient --query='file dataset=%s %s| grep file.name, file.nevents'" % (dataset, opt)
    out = das_do_command(cmd)
    # each output row is "<file>   <nevents>"; splitting on single spaces
    # leaves empty fields in between, hence the [0, 3] indexing
    out = [np.array(r.split(" "))[[0, 3]] for r in out if len(r) > 0]

    df = pd.DataFrame(out, columns=["file", "events"])
    df.events = df.events.values.astype(int)

    return df

def das_lumi_data(dataset, opt=""):
    """Return a dataframe mapping each file of a dataset to its run and lumis."""
    cmd = "dasgoclient --query='file,lumi,run dataset=%s %s'" % (dataset, opt)

    out = das_do_command(cmd)
    out = [r.split(" ") for r in out if len(r) > 0]

    df = pd.DataFrame(out, columns=["file", "run", "lumis"])

    return df

def das_run_events_data(dataset, run, opt=""):
    """Return the total number of events of a dataset in a given run."""
    cmd = "dasgoclient --query='file dataset=%s run=%s %s | sum(file.nevents) '" % (dataset, run, opt)
    out = das_do_command(cmd)[0]

    out = [o for o in out.split(" ") if "sum" not in o]
    out = int([r.split(" ") for r in out if len(r) > 0][0][0])

    return out

def das_run_data(dataset, opt=""):
    """Return the list of runs of a dataset."""
    cmd = "dasgoclient --query='run dataset=%s %s '" % (dataset, opt)
    out = das_do_command(cmd)

    return out

def no_intersection():
    """Report that the dataset and the golden JSON share no runs, then exit.

    Relies on the module-level `best_json` and `dataset` set in __main__.
    """
    print("No intersection between:")
    print(" - json   : ", best_json)
    print(" - dataset: ", dataset)
    print("Exiting.")
    sys.exit(1)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', '-d', default=None, help="Dataset name (e.g. '/DisplacedJet/Run2024C-v1/RAW')", type=str, required=True)
    parser.add_argument('--threshold', '-t', help="Event threshold per file", type=int, default=-1)
    parser.add_argument('--events', '-e', help="Total number of events targeted", type=int, default=-1)
    parser.add_argument('--outfile', '-o', help='Dump results to file', type=str, default=None)
    parser.add_argument('--pandas', '-pd', action='store_true', help="Store the whole dataset (no event or threshold cut) in a csv")
    parser.add_argument('--proxy', '-p', help='Path to an x509 proxy to use, if needed', type=str, default=None)
    parser.add_argument('--site', '-s', help='Only use data at a specific site', type=str, default=None)
    parser.add_argument('--precheck', '-pc', action='store_true', help='Check run by run before building the dataframes, to avoid huge caching.')
    args = parser.parse_args()

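    # A grid proxy is required; it can come from --proxy or from the
    # environment, e.g. (illustrative path):
    #   export X509_USER_PROXY=/tmp/x509up_u12345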
    if args.proxy is not None:
        os.environ["X509_USER_PROXY"] = args.proxy
    elif "X509_USER_PROXY" not in os.environ:
        print("No X509 proxy set. Exiting.")
        sys.exit(1)

    ## Check if we are in the cms-bot "environment"
    testing = "JENKINS_PREFIX" in os.environ
    dataset   = args.dataset
    events    = args.events
    threshold = args.threshold
    outfile   = args.outfile
    site      = args.site

    ## Get the latest golden JSON
    year = dataset.split("Run")[1][2:4] # from 20XX to XX
    PD = dataset.split("/")[1]
    cert_type = "Collisions" + str(year)
    if "Cosmics" in dataset:
        cert_type = "Cosmics" + str(year)
    elif "Commisioning" in dataset:
        cert_type = "Commisioning2020" # spelling kept as in the certification directory name
    elif "HI" in PD:
        cert_type = "Collisions" + str(year) + "HI"

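    # e.g. for '/DisplacedJet/Run2024C-v1/RAW' (the dataset from the --dataset
    # help text): year = "24", PD = "DisplacedJet", cert_type = "Collisions24"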
    cert_path = base_cert_path + cert_type + "/"
    web_fallback = False

    ## If we have access to eos we get the JSONs from there ...
    if os.path.isdir(cert_path):
        json_list = os.listdir(cert_path)
        if len(json_list) == 0:
            web_fallback = True
        json_list = [c for c in json_list if "Golden" in c and "era" not in c]
        json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
    else:
        web_fallback = True
    ## ... if not we go to the website
    if web_fallback:
        cert_url = base_cert_url + cert_type + "/"
        json_list = get_url_clean(cert_url).split("\n")
        json_list = [c for c in json_list if "Golden" in c and "era" not in c]
        json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]

    # The larger the run range the better, assuming the file naming schema
    # Cert_X_RunStart_RunFinish_Type.json
    run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
    best_json = str(json_list[np.argmax(run_ranges)])
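    # Illustrative (hypothetical run numbers) example of the schema above:
    #   Cert_Collisions2024_378981_381417_Golden.json -> range = 381417 - 378981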
    if not web_fallback:
        with open(cert_path + "/" + best_json) as js:
            golden = json.load(js)
    else:
        golden = get_url_clean(cert_url + best_json)
        golden = ast.literal_eval(golden) # converts the string to a dict

    # Flatten the golden JSON: expand each [first, last] lumi range into
    # the full list of certified lumisections per run
    golden_flat = {}
    for k in golden:
        R = []
        for r in golden[k]:
            R = R + [f for f in range(r[0], r[1] + 1)]
        golden_flat[k] = R

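    # e.g. {"380115": [[1, 3], [5, 5]]} flattens to {"380115": [1, 2, 3, 5]}
    # (run number and ranges above are made up for illustration)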
    # Check that there is at least one run shared by the dataset
    # and the golden JSON
    data_runs = das_run_data(dataset)
    golden_data_runs = [r for r in data_runs if r in golden_flat]

    if len(golden_data_runs) == 0:
        no_intersection()

    # Build the dataframe, cleaning for bad lumis
    golden_data_runs_tocheck = golden_data_runs
    das_opt = ""
    if testing or args.precheck:
        golden_data_runs_tocheck = []
        # Here we check run by run. This implies more dasgoclient queries
        # but smaller outputs, which is useful when running the IB/PR tests
        # so that the query results that have to be cached stay small.

        sum_events = 0

        for r in golden_data_runs:
            sum_events = sum_events + int(das_run_events_data(dataset, r))
            golden_data_runs_tocheck.append(r)
            if events > 0 and sum_events > events:
                break

        das_opt = "run in %s" % (str([int(g) for g in golden_data_runs_tocheck]))

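    # e.g. das_opt = "run in [380115, 380116]" (run numbers made up for illustration)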
    # Merge the file information with the run and lumi information
    df = das_lumi_data(dataset, opt=das_opt).merge(das_file_data(dataset, opt=das_opt), on="file", how="inner")

    df["lumis"] = [[int(ff) for ff in f.replace("[", "").replace("]", "").split(",")] for f in df.lumis.values]
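    # e.g. the DAS string "[1,2,3]" becomes the Python list [1, 2, 3];
    # nested brackets such as "[[1,2],[4,5]]" are flattened by the same parse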
    df_rs = []
    for r in golden_data_runs_tocheck:
        cut = (df["run"] == r)
        if not any(cut):
            continue

        df_r = df[cut]

        # Skip runs with low event content
        if df_r["events"].sum() < threshold:
            continue

        # Keep only the files whose lumis are all in the golden JSON
        good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
        n_lumis = np.array([len(l) for l in df_r.lumis])
        df_rs.append(df_r[good_lumis == n_lumis])

    if len(df_rs) == 0:
        no_intersection()

    df = pd.concat(df_rs)
    df.loc[:, "min_lumi"] = [min(f) for f in df.lumis]
    df.loc[:, "max_lumi"] = [max(f) for f in df.lumis]
    df = df.sort_values(["run", "min_lumi", "max_lumi"])

    if site is not None:
        df = df.merge(das_file_site(dataset, site), on="file", how="inner")

    if args.pandas:
        df.to_csv(dataset.replace("/", "") + ".csv")

    if events > 0:
        df = df[df["events"] <= events] # skip files that alone exceed the events target
        df.loc[:, "sum_evs"] = df.loc[:, "events"].cumsum()
        df = df[df["sum_evs"] < events]

    files = df.file

    if outfile is not None:
        with open(outfile, 'w') as f:
            for line in files:
                f.write(f"{line}\n")
    else:
        print("\n".join(files))

    sys.exit(0)