Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:15:47

#!/bin/bash

# defaults, grouped by purpose

# container runtime, image and executable lookup
USEDOCKER=
IMAGE='fastml/triton-torchgeo:22.07-py3-geometric'
SANDBOX=
EXTRAPATH='/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin'

# GPU usage and user-specified Nvidia compatibility driver directory
GPU=
COMPAT_USR=

# server logging: VERBOSE is empty unless verbose mode copies VERBOSE_ARGS into it
VERBOSE=
VERBOSE_ARGS='--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1'

# server instance name, maximum startup wait (seconds) and number of start attempts
SERVER='triton_server_instance'
WTIME=600
RETRIES=3

# model repositories, individual model directories and number of model instances
declare -a REPOS=()
declare -a MODELS=()
INSTANCES=0
LOCALMODELREPO=

# ports: base port, automatic search flag, and number of ports used per server
BASEPORT=8000
AUTOPORT=
NPORTS=3

# housekeeping flags and locations
FORCE=
CLEANUP=true
TMPDIR=
DRYRUN=
PARENTPID=
0027 
# Print the Apptainer sandbox directory to use.
# Globals: SANDBOX (read) - explicitly requested sandbox directory, may be empty
#          IMAGE (read)   - server image name, used to build the default location
# Outputs: SANDBOX if it is non-empty, otherwise the image path under
#          /cvmfs/unpacked.cern.ch/registry.hub.docker.com
get_sandbox(){
        # quoted and printed with printf so the value is emitted verbatim
        # (no word splitting, globbing, or echo option parsing)
        printf '%s\n' "${SANDBOX:-/cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}}"
}
0035 
# Print the help text and terminate the script.
# Arguments: $1 - exit status to terminate with
# Globals:   EXTRAPATH, IMAGE, INSTANCES, SERVER, BASEPORT, RETRIES, WTIME (read)
#            are shown as the current defaults; get_sandbox supplies the sandbox default
# Outputs:   help text on stdout; "\t" sequences are expanded to tabs
usage() {
        local -a help_lines=(
                "cmsTriton [options] [start|stop|check]"
                ""
                "Options:"
                "-c          \t don't cleanup temporary dir (for debugging)"
                "-C [dir]    \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
                "-D          \t dry run: print container commands rather than executing them"
                "-d          \t use Docker instead of Apptainer"
                "-E [path]   \t include extra path(s) for executables (default: ${EXTRAPATH})"
                "-f          \t force reuse of (possibly) existing container instance"
                "-g          \t use GPU instead of CPU"
                "-i [name]   \t server image name (default: ${IMAGE})"
                "-I [num]    \t number of model instances (default: ${INSTANCES} -> means no local editing of config files)"
                "-M [dir]    \t model repository (can be given more than once)"
                "-m [dir]    \t specific model directory (can be given more than once)"
                "-n [name]   \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
                "-P [port]   \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
                "-p [pid]    \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
                "-r [num]    \t number of retries when starting container (default: ${RETRIES})"
                "-s [dir]    \t Apptainer sandbox directory (default: $(get_sandbox))"
                "-t [dir]    \t non-default hidden temporary dir"
                "-v          \t (verbose) start: activate server debugging info; stop: keep server logs"
                "-w [time]   \t maximum time to wait for server to start (default: ${WTIME} seconds)"
                "-h          \t print this message and exit"
                ""
                "Operations:"
                "start       \t start server"
                "stop        \t stop server"
                "check       \t check if server can run on this system"
        )
        # %b interprets the backslash escapes, one output line per list entry
        printf '%b\n' "${help_lines[@]}"
        exit $1
}
0068 
# choose the shared-memory location: /run/shm when it exists, /dev/shm otherwise
if [ -e /run/shm ]; then
        SHM=/run/shm
else
        SHM=/dev/shm
fi
0074 
# parse command-line options (the usage text describes each one)
# -E is accepted here because the usage text advertises it; without it in the
# option string getopts rejected the flag and EXTRAPATH could not be changed
while getopts "cC:DdE:fgi:I:M:m:n:P:p:r:s:t:vw:h" opt; do
        case "$opt" in
                c) CLEANUP=""
                ;;
                C) COMPAT_USR="$OPTARG"
                ;;
                # DRYRUN is used as a command prefix, so "echo" prints commands instead of running them
                D) DRYRUN=echo
                ;;
                d) USEDOCKER=true
                ;;
                E) EXTRAPATH="$OPTARG"
                ;;
                f) FORCE=true
                ;;
                g) GPU=true
                ;;
                i) IMAGE="$OPTARG"
                ;;
                I) INSTANCES="$OPTARG"
                ;;
                M) REPOS+=("$OPTARG")
                ;;
                m) MODELS+=("$OPTARG")
                ;;
                n) SERVER="$OPTARG"
                ;;
                # -1 requests an automatic search for a free port range
                P) if [ "$OPTARG" -eq -1 ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
                ;;
                # -1 selects the PID of the process that launched this script
                p) if [ "$OPTARG" -eq -1 ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
                ;;
                r) RETRIES="$OPTARG"
                ;;
                s) SANDBOX="$OPTARG"
                ;;
                t) TMPDIR="$OPTARG"
                ;;
                v) VERBOSE="$VERBOSE_ARGS"
                ;;
                w) WTIME="$OPTARG"
                ;;
                h) usage 0
                ;;
                # unknown option or missing argument: getopts has already printed
                # a diagnostic, so show the help and stop instead of carrying on
                *) usage 1
                ;;
        esac
done
0117 
# discard the options consumed by getopts; the next argument is the operation
shift "$((OPTIND - 1))"
OP=$1

# anything other than the three known operations is a usage error
case "$OP" in
        start|stop|check)
                ;;
        *)
                usage 1
                ;;
esac

# a non-positive retry count still results in one start attempt
[ "$RETRIES" -le 0 ] && RETRIES=1
0128 
# remember the launch directory; the default temporary dir is a hidden
# directory there, named after the server instance
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
        TMPDIR="${TOPDIR}/.${SERVER}"
else
        # canonicalize the requested directory; quoted so that a name containing
        # whitespace is passed to readlink as a single argument
        TMPDIR=$(readlink -f "$TMPDIR")
fi

# update path
if [ -n "$EXTRAPATH" ]; then
        export PATH="${EXTRAPATH}:${PATH}"
fi
0140 
# find executables
if [ -n "$USEDOCKER" ]; then
        # a DOCKER value that is already set is kept as is
        DOCKER="${DOCKER:-sudo docker}"
elif [ -z "$APPTAINER" ]; then
        # prefer apptainer, fall back to singularity
        for _cmstriton_runtime in apptainer singularity; do
                if type "$_cmstriton_runtime" >& /dev/null; then
                        APPTAINER="$_cmstriton_runtime"
                        break
                fi
        done
        unset _cmstriton_runtime
        if [ -z "$APPTAINER" ]; then
                echo "Missing apptainer and singularity"
                exit 1
        fi
fi
0158 
# resolve the sandbox location to a canonical path; quoted so that a path
# containing whitespace reaches readlink as a single argument
SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f "${SANDBOX}")
# log file names are derived from the server instance name
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
# local directory bound over /opt/tritonserver/lib by start_apptainer
LIB=lib
# strings searched for in the server log by wait_server
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"
# extra container runtime arguments; filled in later when GPU use is requested
EXTRA=""
# script inside the image that check_drivers copies out and patches
COMPAT_SCRIPT=/etc/shinit_v2

THREADCONTROL=""
# do not apply thread control settings if GPU use is requested
if [ "$INSTANCES" -gt 0 ] && [ -z "$GPU" ]; then
        THREADCONTROL=true
fi
0174 
# Derive the three service ports from BASEPORT and export them.
# Globals: BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (written, exported)
compute_ports(){
        # the ports are consecutive: base, base+1, base+2
        HTTPPORT=$BASEPORT
        GRPCPORT=$((BASEPORT + 1))
        METRPORT=$((BASEPORT + 2))
        export HTTPPORT GRPCPORT METRPORT
}
0181 
# Test whether a TCP port is available.
# Arguments: $1 - port number
# Returns:   1 if opening /dev/tcp/0.0.0.0/<port> succeeds (port in use),
#            0 otherwise
check_port(){
        # a command made only of redirections: its status is that of the
        # attempt to open the /dev/tcp path; error output is discarded
        2>/dev/null >"/dev/tcp/0.0.0.0/$1" && return 1
        return 0
}
0190 
# Check that all three service ports are available.
# Globals: HTTPPORT, GRPCPORT, METRPORT (read)
# Returns: 0 if check_port succeeds for every port, 1 as soon as one fails
check_ports(){
        local candidate
        for candidate in "$HTTPPORT" "$GRPCPORT" "$METRPORT"; do
                check_port "$candidate" || return 1
        done
        return 0
}
0198 
# Advance BASEPORT in steps of NPORTS until check_ports reports a free range,
# then announce the gRPC port that was selected.
# Globals: BASEPORT (read/written), NPORTS (read), GRPCPORT (read)
# Outputs: one line "CMS_TRITON_GRPC_PORT: <port>" on stdout
find_ports(){
        until check_ports; do
                BASEPORT=$((BASEPORT + NPORTS))
                compute_ports
        done
        echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0206 
# Settle on the ports to use and build the matching server arguments.
# Globals: AUTOPORT (read); PORTARGS (written, exported);
#          HTTPPORT, GRPCPORT, METRPORT (set through compute_ports)
# Returns: 1 (after printing an error) when automatic search is off and
#          check_ports fails; PORTARGS is left untouched in that case
handle_ports(){
        compute_ports
        if [ -z "$AUTOPORT" ]; then
                # fixed port requested: it must be usable as is
                if ! check_ports; then
                        echo "Error: requested port in use"
                        return 1
                fi
        else
                find_ports
        fi
        PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
        export PORTARGS
}
0218 
# Start the server in a detached Docker container.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, DRYRUN, DOCKER, SERVER, IMAGE,
#          HTTPPORT, GRPCPORT, METRPORT, EXTRA, PORTARGS, VERBOSE (read)
# Returns: exit status of the docker command (or of the DRYRUN prefix command)
start_docker(){
        local repo
        local -a mount_args=() repo_args=()

        # mount all model repositories; arrays keep each path a single argument
        # even when it contains whitespace
        for repo in "${REPOS[@]}"; do
                mount_args+=("-v${repo}:${repo}")
                repo_args+=("--model-repository=${repo}")
        done

        # compatibility driver environment
        if [ -n "$COMPAT" ]; then
                mount_args+=("-v${COMPAT}")
                if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
                        mount_args+=("-v${COMPAT_SCRIPT_MOUNT}")
                fi
        fi

        # DRYRUN, DOCKER, EXTRA, PORTARGS and VERBOSE may each hold several
        # space-separated words, so they are intentionally left unquoted
        $DRYRUN $DOCKER run -d --name "${SERVER}" \
                --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
                "-p${HTTPPORT}:${HTTPPORT}" "-p${GRPCPORT}:${GRPCPORT}" "-p${METRPORT}:${METRPORT}" $EXTRA "${mount_args[@]}" \
                "${IMAGE}" tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE
}
0241 
# Start an Apptainer instance and launch the server inside it in the background.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, DRYRUN, APPTAINER, SANDBOX, SHM,
#          LIB, SERVER, EXTRA, PORTARGS, VERBOSE, LOG (read);
#          START_EXIT, REDIR, TMPD, PATH (written)
# Returns: the non-zero status of "instance start" if it fails; otherwise the
#          server is left running in the background (dry run: waits for the echo)
start_apptainer(){
        # triton server image may need to modify contents of opt/tritonserver/lib/
        # but cvmfs is read-only
        # -> make a writable local directory with the same contents
        # this is no longer needed as of triton 2.11.0, but kept for compatibility w/ older server versions
        $DRYRUN mkdir "${LIB}"
        $DRYRUN ln -s "${SANDBOX}"/opt/tritonserver/lib/* "${LIB}/"

        # mount all model repositories; arrays keep each path a single argument
        # even when it contains whitespace
        local repo
        local -a mount_args=() repo_args=()
        for repo in "${REPOS[@]}"; do
                mount_args+=(-B "$repo")
                repo_args+=("--model-repository=${repo}")
        done

        # compatibility driver environment
        if [ -n "$COMPAT" ]; then
                mount_args+=(-B "$COMPAT")
                if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
                        mount_args+=(-B "$COMPAT_SCRIPT_MOUNT")
                fi
        fi

        # workaround for nvidia libs w/ singularity-in-singularity
        # from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970
        if [ -d /.singularity.d/libs ]; then
                TMPD=$(mktemp -d)
                # wrapper that makes ldconfig use a cache file inside the temporary dir
                (echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > "$TMPD/ldconfig"
                chmod +x "$TMPD/ldconfig"
                PATH=$TMPD:$PATH
                # this does not work with LD_LIBRARY_PATH from cmsenv
                ldconfig /.singularity.d/libs
        fi

        # start instance
        # need to bind /cvmfs for above symlinks to work inside container
        # (DRYRUN, APPTAINER and EXTRA may hold several words: left unquoted on purpose)
        $DRYRUN $APPTAINER instance start \
                -B "${SHM}:/run/shm" -B "${LIB}:/opt/tritonserver/lib" -B "${SANDBOX}" "${mount_args[@]}" $EXTRA \
                "${SANDBOX}" "${SERVER}"

        START_EXIT=$?
        if [ "$START_EXIT" -ne 0 ]; then
                rm -rf "${LIB}"
                return "$START_EXIT"
        fi

        # run the actual server
        if [ -z "$DRYRUN" ]; then
                REDIR="$LOG"
        else
                REDIR=/dev/stdout
        fi
        # PORTARGS and VERBOSE hold several space-separated arguments
        $DRYRUN $APPTAINER run "instance://${SERVER}" \
                tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE >& "${REDIR}" &
        # in a dry run, wait for the backgrounded command so its output is complete
        [ -z "$DRYRUN" ] || wait
}
0299 
# Stop and remove the Docker container named SERVER.
# Globals: DRYRUN, VERBOSE, DOCKER, SERVER, LOG (read)
# Returns: exit status of the final "rm" command
stop_docker(){
        # keep log: only when actually running (no dry run) in verbose mode
        if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
                $DOCKER logs ${SERVER} >& "$LOG"
        fi

        local action
        for action in stop rm; do
                $DRYRUN $DOCKER "$action" ${SERVER}
        done
}
0309 
# Stop the Apptainer instance named SERVER.
# Globals: DRYRUN, APPTAINER, SERVER (read)
# Returns: exit status of the command that was run
stop_apptainer(){
        # assemble the command first; expansions are left unquoted so that
        # multi-word values split into separate arguments
        local -a stop_cmd=($DRYRUN $APPTAINER instance stop ${SERVER})
        "${stop_cmd[@]}"
}
0313 
# Search the container logs for a pattern.
# Arguments: $1 - pattern handed to grep
# Globals:   DOCKER, SERVER (read)
# Returns:   grep's exit status (0 when a matching line was found)
test_docker(){
        # docker logs print to stderr, so merge it into the pipe
        ${DOCKER} logs ${SERVER} 2>&1 | grep "$1"
}
0318 
# Search the server log file for a pattern.
# Arguments: $1 - pattern handed to grep
# Globals:   LOG (read) - path of the log file
# Returns:   grep's exit status (0 when a matching line was found)
test_apptainer(){
        # LOG is derived from the instance name; quoted so that a name
        # containing whitespace is still treated as one file
        grep "$1" "$LOG"
}
0322 
# Poll until the server reports that it has started.
# Globals: DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME (read);
#          COUNT, PORT_EXIT (written)
# Returns: 0 once the started indicator is found (or immediately in a dry run);
#          1 when a port conflict was seen and handle_ports succeeded, so the
#          caller can retry
# Exits:   with handle_ports' status if it fails, or with 1 on timeout; the
#          server is stopped through "auto_stop true" in both cases
wait_server(){
        # nothing was started in a dry run
        if [ -n "$DRYRUN" ]; then
                return 0
        fi

        COUNT=0
        until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
                if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
                        handle_ports
                        PORT_EXIT=$?
                        # port exit is zero if it found a new, good port; so retry
                        [ "$PORT_EXIT" -ne 0 ] || return 1
                        auto_stop true
                        exit $PORT_EXIT
                fi

                if [ "$COUNT" -gt "$WTIME" ]; then
                        echo "timed out waiting for server to start"
                        auto_stop true
                        exit 1
                fi

                # one polling step per second
                COUNT=$((COUNT + 1))
                sleep 1
        done

        echo "server is ready!"
}
0352 
# Build the list of model repositories to serve.
# Without thread control, the parent directory of every model in MODELS is
# appended to REPOS, and REPOS is then canonicalized, sorted and de-duplicated.
# With thread control, each model is handed to cmsTritonConfigTool, which is
# asked to copy it into a fresh local repository under TMPDIR.
# Globals: MODELS, THREADCONTROL, TMPDIR, INSTANCES, DRYRUN (read);
#          LOCALMODELREPO, REPOS (written)
# Exits:   with the failing command's status if the local repository cannot be
#          created or cmsTritonConfigTool fails
list_models(){
        # make list of model repositories
        LOCALMODELREPO="local_model_repo"
        local local_repo="$TMPDIR/$LOCALMODELREPO"
        local model status r

        if [ -n "$THREADCONTROL" ]; then
                if [ -d "$local_repo" ]; then
                        # want to start with a fresh copy of model files in case this directory already exists with local edits
                        # (goes through DRYRUN like every other command, so a dry run deletes nothing)
                        $DRYRUN rm -rf "$local_repo"
                fi
                $DRYRUN mkdir "$local_repo"
                status=$?
                if [ "$status" -ne 0 ]; then
                        echo "Could not create local_model_repo dir: $local_repo"
                        exit "$status"
                fi
        fi

        # quoted expansion keeps model paths containing whitespace intact
        for model in "${MODELS[@]}"; do
                # check if file was provided rather than directory
                if [ -f "$model" ]; then
                        model="$(dirname "$model")"
                fi
                if [ -n "$THREADCONTROL" ]; then
                        $DRYRUN cmsTritonConfigTool threadcontrol -c "${model}/config.pbtxt" --copy "$local_repo" --nThreads "$INSTANCES"
                        status=$?
                        if [ "$status" -ne 0 ]; then
                                echo "Could not apply threadcontrol to $model"
                                exit "$status"
                        fi
                else
                        # the repository is the directory that contains the model directory
                        REPOS+=("$(dirname "$model")")
                fi
        done

        if [ -n "$THREADCONTROL" ]; then
                # NOTE(review): this replaces only the first entry (same effect as the
                # scalar assignment it stands for); additional -M repositories stay in
                # the list - confirm whether the whole list should be replaced instead
                REPOS[0]="$local_repo"
        else
                for ((r=0; r < ${#REPOS[@]}; r++)); do
                        # avoid issues w/ multiple levels of symlinks
                        REPOS[$r]=$(readlink -f "${REPOS[$r]}")
                done
                # sort and de-duplicate one path per line, so whitespace inside a path is preserved
                if [ "${#REPOS[@]}" -gt 0 ]; then
                        mapfile -t REPOS < <(printf '%s\n' "${REPOS[@]}" | sort -u)
                fi
        fi
}
0395 
# Stop the server, optionally after waiting for another process to end.
# Arguments: $1 - non-empty to force verbose mode for this shutdown
#            $2 - PID to watch; the server is stopped only once that process
#                 has been found missing on more than 5 consecutive checks
#                 (empty/omitted: stop right away)
# Globals:   VERBOSE_ARGS, STOP_FN, DRYRUN, LOG, STOPLOG, TOPDIR, TMPDIR,
#            CLEANUP (read); VERBOSE (written when $1 is non-empty)
auto_stop(){
        # allow enabling verbosity here even if disabled at top level
        # but otherwise use top-level setting
        if [ -n "$1" ]; then VERBOSE="$VERBOSE_ARGS"; fi
        # kept local on purpose: storing "$2" in the global PARENTPID would wipe
        # the PID requested with -p whenever this function is called without a
        # second argument (e.g. for the forced cleanup that precedes a start)
        local watch_pid="$2"
        local misses=0
        local -r max_misses=5

        if [ -n "$watch_pid" ]; then
                if [ -n "$VERBOSE" ]; then
                        echo "watching PID $watch_pid"
                        ps
                fi
                # builtin wait is not used here because it can only monitor a child process, not a parent process
                while [ "$misses" -le "$max_misses" ]; do
                        # kill -0 sends no signal; it only reports whether the process can be signalled
                        if ! kill -0 "$watch_pid" >& /dev/null; then
                                misses=$((misses+1))
                                if [ -n "$VERBOSE" ]; then
                                        echo "trigger $misses:"
                                        ps
                                fi
                        else
                                # must get N in a row, otherwise reset
                                if [ "$misses" -gt 0 ] && [ -n "$VERBOSE" ]; then
                                        echo "reset:"
                                        ps
                                fi
                                misses=0
                        fi
                        sleep 1
                done
        fi
        $STOP_FN

        # move logs out of tmp dir
        if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
                mv "$LOG" "$TOPDIR"
                # only keep non-empty log
                if [ -s "$STOPLOG" ]; then
                        mv "$STOPLOG" "$TOPDIR"
                fi
        fi

        if [ -n "$CLEANUP" ]; then
                # leave the temporary dir before deleting it
                $DRYRUN cd "$TOPDIR"
                $DRYRUN rm -rf "$TMPDIR"
        fi
}
0444 
# Create the temporary directory and make it the working directory.
# Globals: TMPDIR, DRYRUN (read); MKDIR_EXIT (written)
# Exits:   with mkdir's status (after printing a message) if the directory
#          cannot be created
make_tmp(){
        # make sure everything happens in tmp dir
        $DRYRUN mkdir "$TMPDIR"
        MKDIR_EXIT=$?
        case "$MKDIR_EXIT" in
                0)
                        ;;
                *)
                        echo "Could not create temp dir: $TMPDIR"
                        exit "$MKDIR_EXIT"
                        ;;
        esac
        $DRYRUN cd "$TMPDIR"
}
0455 
# Print the value of a tag of a scram tool, queried from within CMSSW_BASE.
# Arguments: $1 - tool name, $2 - tag name
# Globals:   CMSSW_BASE (read)
# Outputs:   whatever "scram tool tag" prints on stdout (its stderr is dropped)
# Returns:   always 0 - the lookup is best-effort, callers test for empty output
scram_tag(){
        local tool="$1"
        local tag="$2"
        # run in a subshell so the caller's working directory never changes
        (
                # quoted for paths with whitespace; if the directory cannot be
                # entered, do not query scram from an unrelated location
                cd "$CMSSW_BASE" || exit 0
                scram tool tag "$tool" "$tag" 2> /dev/null || true
        )
}
0462 
# Print the CUDA driver version declared in the image's environment.
# Globals: DOCKER, IMAGE (read)
# Outputs: the text after "CUDA_DRIVER_VERSION=" of each matching line printed
#          by "env" inside the container, i.e. the bare version, which is the
#          same form that driver_apptainer prints
# Returns: non-zero when no such variable is found
driver_docker(){
        local matches
        matches=$($DOCKER run --rm --entrypoint env "${IMAGE}" | grep "CUDA_DRIVER_VERSION=") || return
        # drop the variable name so the caller receives a plain version string
        # instead of a NAME=value assignment
        sed 's/^.*CUDA_DRIVER_VERSION=//' <<< "$matches"
}
0466 
# Print the CUDA driver version declared by the sandbox's environment script.
# Globals: SANDBOX (read); D2S (written); sourcing the script defines whatever
#          it sets in the current shell, which is why the caller runs this
#          function in a command substitution
# Outputs: the value of CUDA_DRIVER_VERSION after sourcing the script; nothing
#          when the script does not exist
driver_apptainer(){
        D2S="${SANDBOX}/.singularity.d/env/10-docker2singularity.sh"
        if [ -f "$D2S" ]; then
                # quoted so that a sandbox path with whitespace can be sourced
                source "$D2S" && printf '%s\n' "$CUDA_DRIVER_VERSION"
        fi
}
0473 
# Copy COMPAT_SCRIPT out of the image into the current directory.
# Globals: DOCKER, IMAGE, COMPAT_SCRIPT (read)
# Returns: status of "create" if it fails, otherwise status of the copy
compat_docker(){
        local container status
        container=$($DOCKER create --rm "${IMAGE}") || return
        $DOCKER cp "${container}:${COMPAT_SCRIPT}" .
        status=$?
        # the container is never started, so --rm (which acts when a container
        # exits) does not clean it up; remove it explicitly to avoid leaving it behind
        $DOCKER rm "$container" > /dev/null
        return "$status"
}
0477 
# Copy COMPAT_SCRIPT from the sandbox into the current directory.
# Globals: SANDBOX, COMPAT_SCRIPT (read)
# Returns: exit status of cp
compat_apptainer(){
        # quoted so that a sandbox path with whitespace is one source operand
        cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0481 
# Decide whether the system Nvidia driver is recent enough for the image and,
# if not, look for compatibility drivers to mount into the container.
# Globals: DRIVER_FN, COMPAT_FN, COMPAT_USR, CMSSW_BASE, TMPDIR, DRYRUN,
#          COMPAT_SCRIPT (read); CUDA_DRIVER_VERSION, DRIVER_VERSION, COMPAT
#          (exported), CUDA_BASE, COMPAT_CMSSW, COMPAT_SYS, COMPAT_SCRIPT_BASE,
#          COMPAT_SCRIPT_PATH, COMPAT_SCRIPT_MOUNT (written)
# Returns: 0 if the installed driver suffices or compatibility drivers were
#          found, 1 otherwise
check_drivers(){
        # get sandbox env vars in subshell
        CUDA_DRIVER_VERSION=$($DRIVER_FN)
        # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh
        DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
        # only the part before the first dot of each version is compared
        [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]] && return 0

        export COMPAT=""

        if [[ -n "$COMPAT_USR" && -d "$COMPAT_USR" ]]; then
                # 1. check for user-specified compatibility drivers
                COMPAT="$COMPAT_USR"
        elif [[ -n "$CMSSW_BASE" ]]; then
                # 2. check for compatibility drivers from CMSSW if available
                # based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers
                # but need to check drivers, not runtime
                CUDA_BASE=$(scram_tag cuda CUDA_BASE)
                COMPAT_CMSSW=${CUDA_BASE}/drivers
                if [[ -n "$CUDA_BASE" && -d "$COMPAT_CMSSW" ]]; then
                        COMPAT="$COMPAT_CMSSW"
                fi
        fi

        # 3. finally, check expected system location
        COMPAT_SYS=/usr/local/cuda/compat
        if [[ -z "$COMPAT" && -d "$COMPAT_SYS" ]]; then
                COMPAT="$COMPAT_SYS"
        fi

        # no compatibility drivers anywhere
        [[ -n "$COMPAT" ]] || return 1

        # in order to mount drivers where Triton expects them
        export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"
        # workaround for bug in Triton/Nvidia compatibility check
        # (applied only while the working directory is the temporary dir)
        if [[ "$PWD" == "$TMPDIR" ]]; then
                $DRYRUN $COMPAT_FN
                COMPAT_SCRIPT_BASE=$(basename "$COMPAT_SCRIPT")
                # newer containers already include this fix, no need to do it twice
                if ! grep -Fq 'LD_PRELOAD=""' "$COMPAT_SCRIPT_BASE"; then
                        $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' "$COMPAT_SCRIPT_BASE"
                        COMPAT_SCRIPT_PATH=$(readlink -f "$TMPDIR/$COMPAT_SCRIPT_BASE")
                        # host path of the patched copy, mapped onto its location in the container
                        COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_PATH:$COMPAT_SCRIPT"
                fi
        fi
        return 0
}
0535 
# select the backend: the display name, the runtime flag that enables GPU use
# (only set when GPU use was requested) and the backend-specific functions
if [ -n "$USEDOCKER" ]; then
        PROG_NAME=Docker
        [ -z "$GPU" ] || EXTRA="--gpus all"
else
        PROG_NAME=Apptainer
        [ -z "$GPU" ] || EXTRA="--nv"
fi

# the function names share the lower-cased backend name as suffix
_cmstriton_backend=${PROG_NAME,,}
START_FN="start_${_cmstriton_backend}"
TEST_FN="test_${_cmstriton_backend}"
STOP_FN="stop_${_cmstriton_backend}"
DRIVER_FN="driver_${_cmstriton_backend}"
COMPAT_FN="compat_${_cmstriton_backend}"
unset _cmstriton_backend
0557 
# run the requested operation
case "$OP" in
check)
        # the script's exit status is the result of the driver check
        check_drivers
        exit $?
        ;;
start)
        # handle cleaning up
        if [ -n "$FORCE" ]; then
                auto_stop
        elif [ -d "$TMPDIR" ]; then
                echo "Error: this container may already exist (override with -f)"
                exit 1
        fi

        handle_ports
        PORT_EXIT=$?
        [ "$PORT_EXIT" -eq 0 ] || exit $PORT_EXIT

        # if parent PID is provided, automatically stop server when finished
        # do this before actually trying to start the server in case of ctrl+c
        # (the watcher runs in the background with its output sent to STOPLOG)
        if [ -n "$PARENTPID" ]; then
                auto_stop "" "$PARENTPID" >& "$STOPLOG" &
        fi

        START_EXIT=0
        for ((counter=0; counter < ${RETRIES}; counter++)); do
                make_tmp

                # if we plan on editing model configs, must copy files into /tmp/local_model_repo, which is deleted upon retry
                if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then
                        list_models
                fi

                # only need to check drivers if using GPU
                if [ -n "$GPU" ]; then
                        check_drivers
                        DRIVER_EXIT=$?
                        [ "$DRIVER_EXIT" -eq 0 ] || exit $DRIVER_EXIT
                fi

                $START_FN
                START_EXIT=$?
                if [ "$START_EXIT" -eq 0 ]; then
                        wait_server
                        WAIT_EXIT=$?
                        # the server is up: done
                        [ "$WAIT_EXIT" -ne 0 ] || break
                        # allow to keep retrying if there was a port issue
                        # (this attempt is not counted against RETRIES)
                        counter=$((counter-1))
                        START_EXIT="$WAIT_EXIT"
                fi

                if [ "$START_EXIT" -ne 0 ]; then
                        # tear down whatever was set up before the next attempt
                        auto_stop
                        echo "Retrying after container issue..."
                fi
        done
        if [ "$START_EXIT" -ne 0 ]; then
                echo "Error from $PROG_NAME"
                exit "$START_EXIT"
        fi
        ;;
*)
        # stop (the operation was validated earlier)
        # check for tmp dir
        if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
                $DRYRUN cd "$TMPDIR"
        elif [ -z "$FORCE" ]; then
                echo "Error: attempt to stop unknown container $SERVER"
                exit 1
        fi

        auto_stop
        ;;
esac