Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
#!/bin/tcsh
#
# Drives an iterative alignment run on HTCondor: runs the initial cmsRun
# configurations, writes the Condor/DAGMan description files and then submits
# the DAG once per iteration until the final iteration has been reached.
#
# Arguments:
#   $1  final iteration number
#   $2  output directory; must exist and hold the job*/ and main/ subdirectories
#   $3  IOV file, read one line at a time

# Stop early on missing or unusable arguments. Without this check an empty $2
# makes the cd/pwd below resolve to wherever a bare cd goes, and the copy and
# clean-up commands further down would then operate on that directory.
if ( $#argv < 3 ) then
    echo "Usage: $0 <final iteration> <output directory> <IOV file>"
    exit 1
endif
if ( ! -d "$2" ) then
    echo "Output directory $2 does not exist. Terminating..."
    exit 1
endif
if ( ! -r "$3" ) then
    echo "IOV file $3 is not readable. Terminating..."
    exit 1
endif

# get working dir to set environment
#source /afs/cern.ch/cms/caf/setup.csh
set workdir = `cd ../../; pwd`

set iter = $1                          # final iteration number
set odir = `cd $2; pwd`                # get full output path
set name = `basename $odir`            # for job name
set iovfile = `basename $3`
set jobs = `ls -d $odir/job*/ | wc -l` # number of jobs
set queue = tomorrow                   # pick queue to run on
set startdir = `pwd`

# Limits written to the DAGMan configuration file
set allowed_n_jobs_submitted = 1000
set allowed_n_jobs_idle = 1000
# Per-iteration limits on recovery, rescue and resubmission attempts
set allowed_n_recover = 2
set allowed_n_rescue = 5
set allowed_n_resub = 2
# Number of retries requested for every node of the DAG
set allowed_n_retry = 2

## Run the initial configuration for every IOV and find the starting iteration

# Keep a copy of the IOV list inside the output directory
cp $3 $odir
set k = 0

# One initial cmsRun per line of the IOV file; k ends up holding the line count
foreach iovrun ( "`cat $odir/$iovfile`" )
	@ k++
	echo $iovrun
	cmsRun ${odir}/main/initial_cfg_${k}.py >& ${odir}/main/initial_${iovrun}.out
	set lastrun = $iovrun
end


# First iteration to run: the number stored for the last IOV, plus one
@ i = `cat ${odir}/main/IOIteration_${lastrun}.root` + 1

#echo "-----------------------------------------------------------------------"
echo Starting iteration = $i, final iteration = $iter, number of jobs = $jobs
echo ""

echo "Writing Condor and DAGMan files..."
# -p keeps a directory left over from an earlier run from being an error
mkdir -p $odir/DAG/
# Everything below is written with appending redirections, so remove stale
# copies first; otherwise a rerun would duplicate every line of these files.
rm -f $odir/upload.csh $odir/upload.jdl $odir/DAG/sequence.cfg
#CSH: script executed by the upload node of the DAG
echo "#"\!"/bin/tcsh"                                >> $odir/upload.csh
echo "cd $workdir"                                   >> $odir/upload.csh
echo "eval "\`"scramv1 runtime -csh"\`               >> $odir/upload.csh
echo "cd $odir"                                      >> $odir/upload.csh
echo "cp $odir/main/IOIteration*.root $odir/"        >> $odir/upload.csh
echo "cp $odir/main/IOAlignedPositions*.root $odir/" >> $odir/upload.csh
echo "set kk = 0"                                    >> $odir/upload.csh
# The command substitution is written out literally so that it runs inside
# upload.csh. Expanding it here would paste the whole IOV list into a single
# quoted word, and the generated loop would then run only once.
echo "foreach line ( "\"\`"cat $odir/$iovfile"\`\"" )" >> $odir/upload.csh
echo "    @ kk++"                                    >> $odir/upload.csh
echo "    cmsRun upload_cfg_"\$"kk.py"               >> $odir/upload.csh
echo "end"                                           >> $odir/upload.csh
echo "rm -f $odir/*.root"                            >> $odir/upload.csh
echo "cd -"                                          >> $odir/upload.csh
#JDL: Condor submit description for the upload node
echo "universe = vanilla" >>                           $odir/upload.jdl
echo "executable = $odir/upload.csh" >>                $odir/upload.jdl
echo "output = $odir/upload.out" >>                    $odir/upload.jdl
echo "error  = $odir/upload.out" >>                    $odir/upload.jdl
echo "log    = $odir/upload.log" >>                    $odir/upload.jdl
echo "+JobFlavour = "\""espresso"\" >>                 $odir/upload.jdl
echo "+JobBatchName = "\""upload"\" >>                 $odir/upload.jdl
echo "queue" >>                                        $odir/upload.jdl
#CFG: DAGMan limits referenced from the DAG file
echo "DAGMAN_MAX_JOBS_SUBMITTED = $allowed_n_jobs_submitted"          >> $odir/DAG/sequence.cfg
echo "DAGMAN_MAX_JOBS_IDLE = $allowed_n_jobs_idle"                    >> $odir/DAG/sequence.cfg

chmod +x $odir/upload.csh

# Build the DAG description: one node per alignment job, one collection node
# per IOV and a final upload node. All jobs are parents of all collection
# nodes, and all collection nodes are parents of the upload node.

# The DAG file is only ever appended to below, so drop any copy left over
# from a previous run; duplicated JOB lines would otherwise pile up in it.
rm -f $odir/DAG/sequence.dag

# Accumulators for the PARENT/CHILD and Retry lines written at the end
set parent_line_jb = "PARENT"
set parent_line_iov = "PARENT"
set child_line_jb = "CHILD"
set retry_line_jb = ""

# Job loop
foreach jb (`seq 1 1 $jobs`)
    # Create the job specific condor files (from scratch, see above)
    rm -f $odir/job$jb/runScript.jdl
    echo "universe = vanilla" >>                        $odir/job$jb/runScript.jdl
    echo "executable = $odir/job$jb/runScript.csh" >>   $odir/job$jb/runScript.jdl
    echo "arguments = $workdir" >>                      $odir/job$jb/runScript.jdl
    echo "output = $odir/job$jb/align.out" >>           $odir/job$jb/runScript.jdl
    echo "error  = $odir/job$jb/align.out" >>           $odir/job$jb/runScript.jdl
    echo "log    = $odir/job$jb/align.log" >>           $odir/job$jb/runScript.jdl
    echo "+JobFlavour = "\""$queue"\" >>                $odir/job$jb/runScript.jdl
    echo "+JobBatchName = "\""$name/align_$jb"\" >>     $odir/job$jb/runScript.jdl
    echo "queue" >>                                     $odir/job$jb/runScript.jdl

    # Append the job to the DAG file and parent line
    echo "JOB job$jb $odir/job$jb/runScript.jdl" >>  $odir/DAG/sequence.dag
    set parent_line_jb = "$parent_line_jb job$jb"
    # Append the job into the retry option. The \n is stored literally and is
    # expected to be turned into a line break by echo when the line is written.
    set retry_line_jb = "${retry_line_jb}Retry job$jb $allowed_n_retry\n"

end

# IOV collection loop
foreach iov (`seq 1 1 $k`)
    # Create the collection job condor files (from scratch, see above)
    rm -f $odir/main/runScript_$iov.jdl
    echo "universe = vanilla" >>                           $odir/main/runScript_$iov.jdl
    echo "executable = $odir/main/runScript_$iov.csh" >>   $odir/main/runScript_$iov.jdl
    echo "arguments = $workdir" >>                         $odir/main/runScript_$iov.jdl
    echo "output = $odir/main/collect_$iov.out" >>         $odir/main/runScript_$iov.jdl
    echo "error  = $odir/main/collect_$iov.out" >>         $odir/main/runScript_$iov.jdl
    echo "log    = $odir/main/collect_$iov.log" >>         $odir/main/runScript_$iov.jdl
    echo "+JobFlavour = "\""espresso"\" >>                 $odir/main/runScript_$iov.jdl
    echo "+JobBatchName = "\""$name/collect_$iov"\" >>     $odir/main/runScript_$iov.jdl
    echo "queue" >>                                        $odir/main/runScript_$iov.jdl

    # Append the job to the DAG file and child line
    echo "JOB collect$iov $odir/main/runScript_$iov.jdl" >>  $odir/DAG/sequence.dag
    set child_line_jb = "$child_line_jb collect$iov"
    set parent_line_iov = "$parent_line_iov collect$iov"
    set retry_line_jb = "${retry_line_jb}Retry collect$iov $allowed_n_retry\n"
end
# Finish the DAG file
echo  "JOB upload $odir/upload.jdl" >> $odir/DAG/sequence.dag
set retry_line_jb = "${retry_line_jb}Retry upload $allowed_n_retry"

# Dependencies: jobs -> collection nodes -> upload, followed by the retry lines
echo "$parent_line_jb $child_line_jb" >>  $odir/DAG/sequence.dag
echo "$parent_line_iov CHILD upload \n$retry_line_jb" >>  $odir/DAG/sequence.dag
echo "CONFIG $odir/DAG/sequence.cfg" >> $odir/DAG/sequence.dag

# Main loop: one DAG submission per iteration. Inside each iteration the DAG
# is watched and, within the configured limits, resubmitted, recovered or
# rescued until it completes. Afterwards the logs of the iteration are archived.
while ($i <= $iter)

    echo "-----------------------------------------------------------------------"
    echo "                       Starting iteration $i                           "
    echo "-----------------------------------------------------------------------"
    #echo Running iteration $i

    echo "Submitting DAGMan file..."
    condor_submit_dag $odir/DAG/sequence.dag

    # Attempt counters, reset for every iteration
    set n_recover = 0
    set n_rescue = 0
    set n_resub = 0
    set iteration_done = 0
    while (!($iteration_done) && ($n_recover<$allowed_n_recover) && ($n_rescue<$allowed_n_rescue) && ($n_resub<$allowed_n_resub))

        # metrics file should only be created at the end
        if ( -f "$odir/DAG/sequence.dag.metrics" ) then
            echo "DAGMAN submission has finished unexpectedly. Restarting submission..."
            rm $odir/DAG/sequence.dag.metrics
            @ n_resub++
            condor_submit_dag $odir/DAG/sequence.dag
            continue
        endif

        echo "Submission was succesful."
        echo "Wait untill jobs stop running..."
        sleep 30
        condor_wait $odir/DAG/sequence.dag.dagman.log
        # Capture the exit status right away, before any other command can
        # overwrite it, so that the test and the message use the same value.
        set wait_status = $status
        if ( $wait_status != 0 ) then
            echo "condor_wait finished unexpectedly, returning $wait_status. Terminating..."
            exit 1
        endif

        # A new rescue file means the DAG ended with failed nodes. The rescue
        # files are looked for next to the DAG file, not in the directory the
        # script was started from. n_rescue counts the ones already handled.
        @ found_rescue = 1
        if (`ls $odir/DAG/ | grep rescue | wc -l` < `expr $n_rescue + 1`) then
            @ found_rescue = 0
        else
            @ n_rescue++
        endif

        # DAGMan reported its exit status at the end of its output file
        @ log_done = 0
        if ( -f $odir/DAG/sequence.dag.dagman.out && "`tail $odir/DAG/sequence.dag.dagman.out |grep "\""EXITING WITH STATUS"\""`" != "") then
            @ log_done = 1
        endif

        # User has manualy removed jobs from condor (condor_rm)
        if ( `cat $odir/DAG/sequence.dag.dagman.log | grep "Job was aborted by the user" | wc -l` > 0 ) then
                echo "USER interruption detected. Terminating..."
                exit 1
        endif

        if ( !($found_rescue) && $log_done) then
            echo "Iteration $i finished."
            @ iteration_done = 1
        else if (!($found_rescue) && !($log_done)) then
            @ n_recover++
            echo "Sumbit recovery attempt $n_recover"
            # Drop a metrics file left by the previous attempt so that the
            # check at the top of the loop only reacts to the new submission.
            rm -f $odir/DAG/sequence.dag.metrics
            condor_submit_dag -DoRecovery $odir/DAG/sequence.dag
        else if ($found_rescue) then
            echo "Submit rescue attempt $n_rescue"
            # Same as above: the finished attempt has left its metrics behind.
            rm -f $odir/DAG/sequence.dag.metrics
            condor_submit_dag $odir/DAG/sequence.dag
        endif
    end
    if !($iteration_done) then
        echo "Iteration $i did not finish within the allowed number of attempts. Terminating..."
        exit 1
    endif
    cd $startdir

    # Clean up: tag the outputs of this iteration with its number and compress the logs
    mv $odir/alignments.db $odir/alignments_iter${i}.db


    rm -rf $odir/job*/IOUserVariables.root
    #rm -rf $odir/job*/DONE1
    #rm -rf $odir/main/runControl*.txt
    foreach jb (`seq 1 1 $jobs`)
        mv $odir/job$jb/align.out $odir/job$jb/align$i.out
        mv $odir/job$jb/align.log $odir/job$jb/align$i.log
        gzip -f  $odir/job$jb/align$i.out
        gzip -f  $odir/job$jb/align$i.log
    end
    foreach iov (`seq 1 1 $k`)
        mv $odir/main/collect_$iov.out $odir/main/collect_"$iov"_it$i.out
        mv $odir/main/collect_$iov.log $odir/main/collect_"$iov"_it$i.log
        gzip -f $odir/main/collect_"$iov"_it$i.out
        gzip -f $odir/main/collect_"$iov"_it$i.log
    end
    mv $odir/upload.out $odir/upload$i.out
    mv $odir/upload.log $odir/upload$i.log
    gzip -f $odir/upload$i.out
    gzip -f $odir/upload$i.log
    # Keep the DAGMan output of this iteration, then remove the remaining
    # DAGMan by-products so that the next submission starts clean.
    mv $odir/DAG/sequence.dag.dagman.out $odir/DAG/sequence$i.dag.dagman.out
    rm -rf $odir/DAG/sequence.dag.*

  @ i++
end