Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 12:44:58

0001 #!/bin/tcsh
0002 
     # Drives an iterative run on HTCondor: writes the Condor/DAGMan submission
     # files under the output directory, then submits and supervises one DAG run
     # per iteration.
     #
     # Arguments:
     #   $1  final iteration number
     #   $2  output directory; each job*/ sub-directory in it counts as one job
     #   $3  IOV list file, one entry per line
     #
     # Validate the arguments up front. With $2 missing, `cd $2` below would
     # resolve to the home directory and every later write and rm would target it.
     if ( $#argv < 3 ) then
         echo "Usage: $0 <final_iteration> <output_dir> <iov_file>"
         exit 1
     endif
     if ( ! -d "$2" ) then
         echo "Output directory '$2' does not exist."
         exit 1
     endif
     if ( ! -f "$3" ) then
         echo "IOV file '$3' does not exist."
         exit 1
     endif

0003 # get working dir to set environment
0004 #source /afs/cern.ch/cms/caf/setup.csh
     # workdir is the directory two levels above the launch directory; it is passed
     # to every job as its argument and used as the `cd` target in upload.csh.
0005 set workdir = `cd ../../; pwd`
0006 
0007 set iter = $1                          # final iteration number
0008 set odir = `cd $2; pwd`                # get full output path
0009 set name = `basename $odir`            # for job name
0010 set iovfile = `basename $3`
0011 set jobs = `ls -d $odir/job*/ | wc -l` # number of jobs
0012 set queue = tomorrow                   # pick queue to run on
0013 set startdir = `pwd`
0014 
     # Limits written to the DAGMan configuration file (sequence.cfg).
0015 set allowed_n_jobs_submitted = 1000
0016 set allowed_n_jobs_idle = 1000
     # Per-iteration limits on recovery submissions, rescue-DAG submissions and
     # plain resubmissions; reaching any of them aborts the script.
0017 set allowed_n_recover = 2
0018 set allowed_n_rescue = 5
0019 set allowed_n_resub = 2
     # Value used for the "Retry <node> N" lines written into the DAG file.
0020 set allowed_n_retry = 2
0021 
0022 ## Submit jobs and iterate
0023 
     # Keep a copy of the IOV list inside the output directory; everything below
     # reads it from $odir/$iovfile.
0024 cp $3 $odir
0025 set k = 0
0026 
     # Run one initial cmsRun per line of the IOV list. k is the 1-based line
     # counter selecting initial_cfg_<k>.py; the line's own text names the output
     # file. After the loop, k holds the number of lines and lastrun the last line.
0027 foreach line ( "`cat $odir/$iovfile`" )
0028         @ k++
0029         echo $line
0030         cmsRun $odir/main/initial_cfg_$k.py >& $odir/main/initial_$line.out
0031         set lastrun = $line
0032 end
0033 
0034 
     # The first iteration to run is one more than the number read from the
     # IOIteration file of the last IOV.
     # NOTE(review): the file is read with `cat` and used as an integer, so it is
     # presumably plain text despite the .root extension - confirm.
0035 @ i = `cat $odir/main/IOIteration_$lastrun.root` + 1
0036 
0037 #echo "-----------------------------------------------------------------------"
0038 echo Starting iteration = $i, final iteration = $iter, number of jobs = $jobs
0039 echo ""
0040 
0041 echo "Writing Condor and DAGMan files..."
     # -p: the directory already exists when the script is re-run on the same
     # output directory to continue from a later iteration.
0042 mkdir -p $odir/DAG/
0043 #foreach it (`seq 1 1 $itter`)
0044 #CSH
     # upload.csh is the executable of the final "upload" DAG node. It sets up the
     # runtime environment from $workdir, copies the IOIteration and
     # IOAlignedPositions files from main/ into $odir, runs upload_cfg_<kk>.py once
     # per IOV line and then removes the copied .root files.
     # The first write uses >! so a file left by an earlier run is replaced instead
     # of being extended with a second copy of the script.
0045 echo "#"\!"/bin/tcsh"                                >! $odir/upload.csh
0046 echo "cd $workdir"                                   >> $odir/upload.csh
0047 echo "eval "\`"scramv1 runtime -csh"\`               >> $odir/upload.csh
0048 echo "cd $odir"                                      >> $odir/upload.csh
0049 echo "cp $odir/main/IOIteration*.root $odir/"        >> $odir/upload.csh
0050 echo "cp $odir/main/IOAlignedPositions*.root $odir/" >> $odir/upload.csh
0051 echo "set kk = 0"                                    >> $odir/upload.csh
     # Emit the backticks literally (as for the scramv1 line above) so the generated
     # script reads the IOV list itself and loops once per line. Expanding the list
     # here, inside the embedded quotes, would hand foreach a single quoted word and
     # the loop would run only once for a multi-line list.
0052 echo "foreach line ( "\"\`"cat $odir/$iovfile"\`\"" )" >> $odir/upload.csh
0053 echo "    @ kk++"                                    >> $odir/upload.csh
0054 echo "    cmsRun upload_cfg_"\$"kk.py"               >> $odir/upload.csh
0055 echo "end"                                           >> $odir/upload.csh
0056 echo "rm -f $odir/*.root"                            >> $odir/upload.csh
0057 echo "cd -"                                          >> $odir/upload.csh
0058 #JDL
     # Condor submit description for the upload node; stdout and stderr share
     # upload.out. Started fresh with >! for the same reason as upload.csh.
0059 echo "universe = vanilla" >!                           $odir/upload.jdl
0060 echo "executable = $odir/upload.csh" >>                $odir/upload.jdl
0061 echo "output = $odir/upload.out" >>                    $odir/upload.jdl
0062 echo "error  = $odir/upload.out" >>                    $odir/upload.jdl
0063 echo "log    = $odir/upload.log" >>                    $odir/upload.jdl
0064 echo "+JobFlavour = "\""espresso"\" >>                 $odir/upload.jdl
0065 echo "+JobBatchName = "\""upload"\" >>                 $odir/upload.jdl
0066 echo "queue" >>                                        $odir/upload.jdl
0067 #CFG
     # DAGMan configuration file, referenced from the DAG through a CONFIG line.
0068 echo "DAGMAN_MAX_JOBS_SUBMITTED = $allowed_n_jobs_submitted"          >! $odir/DAG/sequence.cfg
0069 echo "DAGMAN_MAX_JOBS_IDLE = $allowed_n_jobs_idle"                    >> $odir/DAG/sequence.cfg
0070 
0071 chmod +x $odir/upload.csh
0072 
0073 # Create the DAG PARENT line
     # These strings are accumulated by the loops below and finally written to the
     # DAG file as its dependency lines ("PARENT ... CHILD ...") and as one
     # "Retry <node> N" entry per node.
0074 set parent_line_jb = "PARENT"
0075 set parent_line_iov = "PARENT"
0076 set child_line_jb = "CHILD"
0077 set retry_line_jb = ""
0078 
     # The DAG description is rebuilt on every invocation. Appending to a file left
     # by an earlier run on the same output directory would define each node twice.
     rm -f $odir/DAG/sequence.dag
0079 # Job loop
0080 foreach jb (`seq 1 1 $jobs`)
0081     # Create the job specific condor files
         # The first write uses >! so a submit file left by an earlier run is
         # replaced; a second appended copy would add a second "queue" statement.
0082     echo "universe = vanilla" >!                        $odir/job$jb/runScript.jdl
0083     echo "executable = $odir/job$jb/runScript.csh" >>   $odir/job$jb/runScript.jdl
0084     echo "arguments = $workdir" >>                      $odir/job$jb/runScript.jdl
0085     echo "output = $odir/job$jb/align.out" >>           $odir/job$jb/runScript.jdl
0086     echo "error  = $odir/job$jb/align.out" >>           $odir/job$jb/runScript.jdl
0087     echo "log    = $odir/job$jb/align.log" >>           $odir/job$jb/runScript.jdl
0088     echo "+JobFlavour = "\""$queue"\" >>                $odir/job$jb/runScript.jdl
0089     echo "+JobBatchName = "\""$name/align_$jb"\" >>     $odir/job$jb/runScript.jdl
0090     echo "queue" >>                                     $odir/job$jb/runScript.jdl
0091 
0092     # Append the job to the DAG file and parent line
0093     echo "JOB job$jb $odir/job$jb/runScript.jdl" >>  $odir/DAG/sequence.dag
0094     set parent_line_jb = "$parent_line_jb job$jb"
0095     # Append the job into the retry option
         # The "\n" is stored as two literal characters here.
         # NOTE(review): it only becomes a line break if the echo that later writes
         # this string expands backslash escapes - confirm the echo_style in use.
0096     set retry_line_jb = "${retry_line_jb}Retry job$jb $allowed_n_retry\n"
0097 
0098 end
0099 
0100 # IOV collection loop
     # One "collect<iov>" DAG node per IOV line (k lines were counted earlier). Each
     # collect node is a child of all job nodes and a parent of the upload node.
0101 foreach iov (`seq 1 1 $k`)
0102     # Create the collection job condor files
         # The first write uses >! so a submit file left by an earlier run is
         # replaced; a second appended copy would add a second "queue" statement.
0103     echo "universe = vanilla" >!                           $odir/main/runScript_$iov.jdl
0104     echo "executable = $odir/main/runScript_$iov.csh" >>   $odir/main/runScript_$iov.jdl
0105     echo "arguments = $workdir" >>                         $odir/main/runScript_$iov.jdl
0106     echo "output = $odir/main/collect_$iov.out" >>         $odir/main/runScript_$iov.jdl
0107     echo "error  = $odir/main/collect_$iov.out" >>         $odir/main/runScript_$iov.jdl
0108     echo "log    = $odir/main/collect_$iov.log" >>         $odir/main/runScript_$iov.jdl
0109     echo "+JobFlavour = "\""espresso"\" >>                 $odir/main/runScript_$iov.jdl
0110     echo "+JobBatchName = "\""$name/collect_$iov"\" >>     $odir/main/runScript_$iov.jdl
0111     echo "queue" >>                                        $odir/main/runScript_$iov.jdl
0112 
0113     # Append the job to the DAG file and child line
0114     echo "JOB collect$iov $odir/main/runScript_$iov.jdl" >>  $odir/DAG/sequence.dag
0115     set child_line_jb = "$child_line_jb collect$iov"
0116     set parent_line_iov = "$parent_line_iov collect$iov"
0117     set retry_line_jb = "${retry_line_jb}Retry collect$iov $allowed_n_retry\n"
0118 end
0119 # Finish the DAG file
     # Remaining DAG content, in order: the upload node, the jobs -> collect
     # dependency, the collect -> upload dependency followed by the Retry entries,
     # and the CONFIG line pointing at sequence.cfg.
0120 echo  "JOB upload $odir/upload.jdl" >> $odir/DAG/sequence.dag
0121 set retry_line_jb = "${retry_line_jb}Retry upload $allowed_n_retry"
0122 
0123 echo "$parent_line_jb $child_line_jb" >>  $odir/DAG/sequence.dag
     # NOTE(review): the "\n" separators in this string only become line breaks if
     # echo expands backslash escapes - confirm the echo_style in use.
0124 echo "$parent_line_iov CHILD upload \n$retry_line_jb" >>  $odir/DAG/sequence.dag
0125 echo "CONFIG $odir/DAG/sequence.cfg" >> $odir/DAG/sequence.dag
0126 
0127 while ($i <= $iter)
         # One pass per iteration: submit the DAG, supervise it until it finishes
         # cleanly or an attempt limit is reached, then archive this iteration's
         # outputs and logs under iteration-numbered names.
0128 
0129     echo "-----------------------------------------------------------------------"
0130     echo "                       Starting iteration $i                           "
0131     echo "-----------------------------------------------------------------------"
0132     #echo Running iteration $i
0133 
0134     echo "Submitting DAGMan file..."
0135     condor_submit_dag $odir/DAG/sequence.dag
0136 
         # Attempt counters are reset for every iteration.
0137     set n_recover = 0
0138     set n_rescue = 0
0139     set n_resub = 0
0140     set iteration_done = 0
         # Keep supervising until the iteration is done or any one of the
         # recover / rescue / resubmit limits has been reached.
0141     while (!($iteration_done) && ($n_recover<$allowed_n_recover) && ($n_rescue<$allowed_n_rescue) && ($n_resub<$allowed_n_resub))
0142 
0143         # metrics file should only be created at the end
0144         if ( -f "$odir/DAG/sequence.dag.metrics" ) then
0145             echo "DAGMAN submission has finished unexpectedly. Restarting submission..."
0146             rm $odir/DAG/sequence.dag.metrics
0147             @ n_resub++
0148             condor_submit_dag $odir/DAG/sequence.dag
0149             continue
0150         endif
0151 
0152         echo "Submission was succesful."
0153         echo "Wait untill jobs stop running..."
             # Give DAGMan time to create its log before condor_wait opens it.
0154         sleep 30
0155         condor_wait $odir/DAG/sequence.dag.dagman.log
0156         if ( $? != 0 ) then
0157             echo "condor_wait finished unexpectedly, returning $?. Terminating..."
0158             exit 1
0159         endif
0160 
0161         @ found_rescue = 1
             # DAGMan writes rescue DAGs (sequence.dag.rescueNNN) next to the DAG
             # file, so they must be counted in $odir/DAG, not in the directory the
             # script was launched from. A count that has not grown beyond n_rescue
             # means the run that just ended produced no new rescue DAG.
0162         if (`ls $odir/DAG | grep rescue | wc -l` < `expr $n_rescue + 1`) then
0163             @ found_rescue = 0
0164         else
0165             @ n_rescue++
0166         endif
0167 
             # log_done: the tail of DAGMan's own output contains its exit line.
0168         @ log_done = 0
0169         if ( -f $odir/DAG/sequence.dag.dagman.out && "`tail $odir/DAG/sequence.dag.dagman.out |grep "\""EXITING WITH STATUS"\""`" != "") then
0170             @ log_done = 1
0171         endif
0172 
0173         # User has manually removed jobs from condor (condor_rm)
0174         if ( `cat $odir/DAG/sequence.dag.dagman.log | grep "Job was aborted by the user" | wc -l` > 0 ) then
0175                 echo "USER interruption detected. Terminating..."
0176                 exit 1
0177         endif
0178 
             # Decide the next step:
             #   no new rescue DAG, DAGMan exited -> iteration finished
             #   no new rescue DAG, no exit line  -> resubmit in recovery mode
             #   new rescue DAG                   -> resubmit (runs the rescue DAG)
0179         if ( !($found_rescue) && $log_done) then
0180             echo "Iteration $i finished."
0181             @ iteration_done = 1
0182         else if (!($found_rescue) && !($log_done)) then
0183             @ n_recover++
0184             echo "Sumbit recovery attempt $n_recover"
0185             condor_submit_dag -DoRecovery $odir/DAG/sequence.dag
0186         else if ($found_rescue) then
0187             echo "Submit rescue attempt $n_rescue"
0188             condor_submit_dag $odir/DAG/sequence.dag
0189         endif
0190     end
         # The supervision loop ended on an attempt limit rather than on success.
0191     if !($iteration_done) then
0192         exit 1
0193     endif
0194     cd $startdir
0195 
0196     # Clean up
         # Keep this iteration's alignment database under an iteration-specific name.
0197     mv $odir/alignments.db $odir/alignments_iter${i}.db
0198 
0199 
0200     rm -rf $odir/job*/IOUserVariables.root
0201     #rm -rf $odir/job*/DONE1
0202     #rm -rf $odir/main/runControl*.txt
         # Rename each job's output and log with the iteration number and compress
         # them, so the next iteration writes fresh align.out / align.log files.
0203     foreach jb (`seq 1 1 $jobs`)
0204         mv $odir/job$jb/align.out $odir/job$jb/align$i.out
0205         mv $odir/job$jb/align.log $odir/job$jb/align$i.log
0206         gzip -f  $odir/job$jb/align$i.out
0207         gzip -f  $odir/job$jb/align$i.log
0208     end
         # Same archiving for the per-IOV collect nodes.
0209     foreach iov (`seq 1 1 $k`)
0210         mv $odir/main/collect_$iov.out $odir/main/collect_"$iov"_it$i.out
0211         mv $odir/main/collect_$iov.log $odir/main/collect_"$iov"_it$i.log
0212         gzip -f $odir/main/collect_"$iov"_it$i.out
0213         gzip -f $odir/main/collect_"$iov"_it$i.log
0214     end
         # ... and for the upload node.
0215     mv $odir/upload.out $odir/upload$i.out
0216     mv $odir/upload.log $odir/upload$i.log
0217     gzip -f $odir/upload$i.out
0218     gzip -f $odir/upload$i.log
         # Keep DAGMan's output under an iteration-specific name, then remove every
         # other sequence.dag.* file so the next submission starts clean.
0219     mv $odir/DAG/sequence.dag.dagman.out $odir/DAG/sequence$i.dag.dagman.out
0220     rm -rf $odir/DAG/sequence.dag.*
0221 
0222   @ i++
0223 end
0224