File indexing completed on 2024-08-07 23:11:45
0001
0002
0003
# Default settings. Most of them can be overridden by the command-line
# options parsed further down (see usage for the option letters).
CONTAINER=apptainer
VERBOSE=""
# Extra server arguments that replace VERBOSE when -v is given.
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
# Maximum time (seconds) to wait for the server to start.
WTIME=600
SERVER=triton_server_instance
RETRIES=3
REPOS=()
MODELS=()
FORCE=""
CLEANUP=true
TMPDIR=""
INSTANCES=0
LOCALMODELREPO=""
# When set (to "echo" by -D) it is prefixed to commands so they are printed
# instead of executed.
DRYRUN=""
PARENTPID=""
BASEPORT=8000
AUTOPORT=""
# Number of consecutive ports used per server (http, grpc, metrics).
NPORTS=3
IMAGE=fastml/triton-torchgeo:22.07-py3-geometric
SANDBOX=""
COMPAT_USR=""
EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin
# The major OS version is the first run of digits in /etc/redhat-release.
# The file is only read when it exists, and the comparison is a plain string
# match, so hosts without that file (or with unexpected content) no longer
# produce "integer expression expected" noise on stderr.
OSVERSION=""
if [ -r /etc/redhat-release ]; then
  OSVERSION=$(sed -nr 's/[^0-9]*([0-9]+).*/\1/p' /etc/redhat-release)
fi
if [ "$OSVERSION" = 7 ]; then
  # Version 7 hosts use a pinned apptainer release instead of "current".
  EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/1.2.5/bin
fi
DEVICE=auto
THREADCONTROL=""
0033
# Print the sandbox directory to use: SANDBOX when it is non-empty, otherwise
# the unpacked.cern.ch location derived from IMAGE.
get_sandbox() {
  if [ -n "$SANDBOX" ]; then
    echo "$SANDBOX"
    return
  fi
  echo /cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}
}
0041
# Print the help text (with the current default values filled in) and exit.
# Arguments: $1 - exit status to terminate the script with
usage() {
  # Each entry is one output line; "\t" is expanded by echo -e below.
  local -a help_lines=(
    "cmsTriton [options] [start|stop|check]"
    ""
    "Options:"
    "-c \t don't cleanup temporary dir (for debugging)"
    "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
    "-D \t dry run: print container commands rather than executing them"
    "-d [exe] \t container choice: apptainer, docker, podman, podman-hpc (default: ${CONTAINER})"
    "-E [path] \t include extra path(s) for executables (default: ${EXTRAPATH})"
    "-f \t force reuse of (possibly) existing container instance"
    "-g [device] \t device choice: auto (try to detect GPU), CPU, GPU (default: ${DEVICE})"
    "-i [name] \t server image name (default: ${IMAGE})"
    "-I [num] \t number of model instances (default: ${INSTANCES} -> means no local editing of config files)"
    "-M [dir] \t model repository (can be given more than once)"
    "-m [dir] \t specific model directory (can be given more than once)"
    "-n [name] \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
    "-P [port] \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
    "-p [pid] \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
    "-r [num] \t number of retries when starting container (default: ${RETRIES})"
    "-s [dir] \t apptainer sandbox directory (default: $(get_sandbox))"
    "-t [dir] \t non-default hidden temporary dir"
    "-v \t (verbose) start: activate server debugging info; stop: keep server logs"
    "-w [time] \t maximum time to wait for server to start (default: ${WTIME} seconds)"
    "-h \t print this message and exit"
    ""
    "Operations:"
    "start \t start server"
    "stop \t stop server"
    "check \t check if server can run on this system"
  )
  local help_line
  for help_line in "${help_lines[@]}"; do
    echo -e "$help_line"
  done
  exit $1
}
0074
0075
# Host shared-memory directory that is bind-mounted into the apptainer
# instance; /run/shm is preferred when it exists.
SHM=/dev/shm
if [ -e /run/shm ]; then
  SHM=/run/shm
fi

# Parse the command-line options into the global settings.
# Arguments: the script's arguments ("$@")
# Globals:   writes the option variables (CLEANUP, COMPAT_USR, DRYRUN, ...)
#            and leaves OPTIND pointing at the first non-option argument.
# Fixes over the previous inline loop:
#   * -E (documented in usage) is now actually accepted and sets EXTRAPATH;
#   * an unknown option or a missing option argument prints the usage text
#     and exits with status 1 instead of being silently skipped.
parse_options() {
  local opt
  OPTIND=1
  while getopts "cC:Dd:E:fg:i:I:M:m:n:P:p:r:s:t:vw:h" opt; do
    case "$opt" in
      c) CLEANUP="" ;;
      C) COMPAT_USR="$OPTARG" ;;
      D) DRYRUN=echo ;;
      d) CONTAINER="$OPTARG" ;;
      E) EXTRAPATH="$OPTARG" ;;
      f) FORCE=true ;;
      g) DEVICE="$OPTARG" ;;
      i) IMAGE="$OPTARG" ;;
      I) INSTANCES="$OPTARG" ;;
      M) REPOS+=("$OPTARG") ;;
      m) MODELS+=("$OPTARG") ;;
      n) SERVER="$OPTARG" ;;
      # -1 requests automatic selection of a free port range
      P) if [ "$OPTARG" -eq -1 ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi ;;
      # -1 means "watch the process that launched this script"
      p) if [ "$OPTARG" -eq -1 ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi ;;
      r) RETRIES="$OPTARG" ;;
      s) SANDBOX="$OPTARG" ;;
      t) TMPDIR="$OPTARG" ;;
      v) VERBOSE="$VERBOSE_ARGS" ;;
      w) WTIME="$OPTARG" ;;
      h) usage 0 ;;
      *) usage 1 ;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND - 1))
OP=$1

# Exactly one of the three operations must follow the options.
if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != check ]; then
  usage 1
fi
0130
0131
# Validate the device choice (case-insensitive). An exact match is required:
# the previous substring test against " auto cpu gpu " also accepted
# multi-word values such as "cpu gpu".
DEVICE="${DEVICE,,}"
case "$DEVICE" in
  auto|cpu|gpu) ;;
  *)
    echo "Unsupported device value: $DEVICE"
    exit 1
    ;;
esac

# Same exact-match validation for the container backend.
CONTAINER="${CONTAINER,,}"
case "$CONTAINER" in
  apptainer|docker|podman|podman-hpc) ;;
  *)
    echo "Unsupported container value: $CONTAINER"
    exit 1
    ;;
esac

# At least one start attempt is always made. A non-numeric value is clamped
# as well: it used to make the retry loop run zero times, so the script
# reported success without ever starting a server.
if ! [[ "$RETRIES" =~ ^-?[0-9]+$ ]] || [ "$RETRIES" -le 0 ]; then
  RETRIES=1
fi
0148
# Remember the launch directory. Without -t, the temporary directory is a
# hidden folder in it named after the server instance.
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
  TMPDIR="${TOPDIR}/.${SERVER}"
else
  # Quoted so that a path containing whitespace is resolved as one argument.
  TMPDIR=$(readlink -f "$TMPDIR")
fi

# Make the extra executable location(s) take precedence in PATH.
if [ -n "$EXTRAPATH" ]; then
  export PATH="${EXTRAPATH}:${PATH}"
fi

# Choose the container executable. A value already present in DOCKER, PODMAN
# or APPTAINER is kept as is.
case "$CONTAINER" in
  docker)
    DOCKER="${DOCKER:-docker}"
    ;;
  podman)
    PODMAN="${PODMAN:-podman}"
    ;;
  podman-hpc)
    PODMAN="${PODMAN:-podman-hpc}"
    ;;
  *)
    if [ -z "$APPTAINER" ]; then
      # Prefer apptainer and fall back to singularity when only that exists.
      if type apptainer >& /dev/null; then
        APPTAINER=apptainer
      elif type singularity >& /dev/null; then
        APPTAINER=singularity
      else
        echo "Missing apptainer and singularity"
        exit 1
      fi
    fi
    ;;
esac
0186
# Resolve the sandbox to a canonical path. The argument is quoted so that a
# sandbox path containing whitespace is not split into several arguments.
SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f "${SANDBOX}")
# Server log and the log written by the background parent-PID watcher.
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
# Local directory of links to the server libraries, bound into the instance.
LIB=lib
# Strings searched for in the server output by wait_server.
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"
EXTRA=""
# Script inside the image that gets copied out and patched by check_drivers.
COMPAT_SCRIPT=/etc/shinit_v2

# A positive instance count turns on local editing of the model configs.
if [ "$INSTANCES" -gt 0 ]; then
  THREADCONTROL=true
fi
0201
# Derive the three service ports from BASEPORT and export them:
# HTTPPORT = BASEPORT, GRPCPORT = BASEPORT+1, METRPORT = BASEPORT+2.
compute_ports() {
  HTTPPORT=$BASEPORT
  GRPCPORT=$((BASEPORT+1))
  METRPORT=$((BASEPORT+2))
  export HTTPPORT GRPCPORT METRPORT
}
0208
# Probe TCP port $1 on 0.0.0.0 through bash's /dev/tcp redirection.
# Returns: 1 when the connection can be opened (something is listening),
#          0 otherwise (the port is treated as free).
check_port() {
  if { >"/dev/tcp/0.0.0.0/$1"; } 2>/dev/null; then
    return 1
  fi
  return 0
}
0217
# Returns 0 only when check_port succeeds for HTTPPORT, GRPCPORT and METRPORT
# (checked in that order, stopping at the first failure); otherwise 1.
check_ports() {
  local port
  for port in $HTTPPORT $GRPCPORT $METRPORT; do
    check_port $port || return 1
  done
  return 0
}
0225
# Advance BASEPORT in steps of NPORTS (recomputing the derived ports each
# time) until check_ports succeeds, then report the chosen GRPC port on stdout.
find_ports() {
  until check_ports; do
    BASEPORT=$((BASEPORT+NPORTS))
    compute_ports
  done
  echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0233
# Settle the ports for the server and export the matching server arguments.
# With AUTOPORT set, a free range is searched via find_ports; otherwise the
# requested range must be free.
# Returns: 1 (after printing an error) when the requested ports are in use.
handle_ports() {
  compute_ports
  if [ -z "$AUTOPORT" ]; then
    if ! check_ports; then
      echo "Error: requested port in use"
      return 1
    fi
  else
    find_ports
  fi
  PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
  export PORTARGS
}
0245
# Start the server as a detached docker container named SERVER.
# Every entry of REPOS is mounted at the same path inside the container and
# passed as --model-repository; the compatibility driver mount (and the
# patched script mount, if any) is added when COMPAT is set.
# The command is prefixed with DRYRUN, so in dry-run mode it is only printed.
start_docker() {
  MOUNTARGS=""
  REPOARGS=""
  for REPO in ${REPOS[@]}; do
    MOUNTARGS+=" -v$REPO:$REPO"
    REPOARGS+=" --model-repository=${REPO}"
  done

  if [ -n "$COMPAT" ]; then
    MOUNTARGS+=" -v$COMPAT"
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      MOUNTARGS+=" -v$COMPAT_SCRIPT_MOUNT"
    fi
  fi

  # Word groups are expanded unquoted on purpose so they split into arguments.
  local limits="--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864"
  local publish="-p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT}"

  $DRYRUN $DOCKER run -d --name ${SERVER} \
    $limits --ipc="host" --cap-add=IPC_OWNER \
    $publish $EXTRA $MOUNTARGS \
    ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
}
0268
# Start the server as a detached podman container named SERVER.
# Every entry of REPOS is mounted at the same path inside the container and
# passed as --model-repository; the compatibility driver mount (and the
# patched script mount, if any) is added when COMPAT is set.
# The command is prefixed with DRYRUN, so in dry-run mode it is only printed.
start_podman() {
  MOUNTARGS=""
  REPOARGS=""
  for REPO in ${REPOS[@]}; do
    MOUNTARGS+=" --volume $REPO:$REPO"
    REPOARGS+=" --model-repository=${REPO}"
  done

  if [ -n "$COMPAT" ]; then
    MOUNTARGS+=" --volume $COMPAT"
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      MOUNTARGS+=" --volume $COMPAT_SCRIPT_MOUNT"
    fi
  fi

  # Word groups are expanded unquoted on purpose so they split into arguments.
  local limits="--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864"
  local publish="-p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT}"

  $DRYRUN $PODMAN run -d --name ${SERVER} \
    $limits \
    $publish $EXTRA $MOUNTARGS \
    ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
}
0291
# Start the server inside an apptainer instance named SERVER.
# Steps: create the local LIB directory with links to the server libraries
# of the sandbox, start the instance with the needed bind mounts, then run
# tritonserver inside it in the background with its output sent to LOG.
# In dry-run mode the commands are printed (to stdout) instead of executed.
# Returns: the exit status of "instance start" when that step fails.
start_apptainer() {
  $DRYRUN mkdir ${LIB}
  $DRYRUN ln -s ${SANDBOX}/opt/tritonserver/lib/* ${LIB}/

  MOUNTARGS=""
  REPOARGS=""
  for REPO in ${REPOS[@]}; do
    MOUNTARGS+=" -B $REPO"
    REPOARGS+=" --model-repository=${REPO}"
  done

  if [ -n "$COMPAT" ]; then
    MOUNTARGS+=" -B $COMPAT"
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      MOUNTARGS+=" -B $COMPAT_SCRIPT_MOUNT"
    fi
  fi

  unset DBUS_SESSION_BUS_ADDRESS

  # Word group is expanded unquoted on purpose so it splits into arguments.
  local binds="-B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX}"
  $DRYRUN $APPTAINER instance start --underlay $binds $MOUNTARGS $EXTRA ${SANDBOX} ${SERVER}

  START_EXIT=$?
  if [ "$START_EXIT" -ne 0 ]; then
    # the instance did not come up: drop the link directory again
    rm -rf ${LIB}
    return "$START_EXIT"
  fi

  # Real runs log to LOG; a dry run prints the command to stdout instead.
  REDIR=/dev/stdout
  if [ -z "$DRYRUN" ]; then
    REDIR="$LOG"
  fi
  $DRYRUN $APPTAINER run instance://${SERVER} \
    tritonserver $PORTARGS $REPOARGS $VERBOSE >& ${REDIR} &
  # In a dry run, wait for the background echo so its output is complete.
  if [ -n "$DRYRUN" ]; then
    wait
  fi
}
0343
# Stop and remove the docker container named SERVER.
# On a real (non dry-run) verbose run, the container logs are saved to LOG
# first.
stop_docker() {
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    $DOCKER logs ${SERVER} >& "$LOG"
  fi

  $DRYRUN $DOCKER stop ${SERVER}
  $DRYRUN $DOCKER rm ${SERVER}
}
0353
# Stop and remove the podman container named SERVER.
# On a real (non dry-run) verbose run, the container logs are saved to LOG
# first.
stop_podman() {
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    $PODMAN logs ${SERVER} >& "$LOG"
  fi

  $DRYRUN $PODMAN stop ${SERVER}
  $DRYRUN $PODMAN rm ${SERVER}
}
0363
# Stop the apptainer instance named SERVER (only printed in dry-run mode).
stop_apptainer() {
  # Built from unquoted expansions so the words split exactly as in a
  # directly written command line.
  local -a stop_cmd=($DRYRUN $APPTAINER instance stop ${SERVER})
  "${stop_cmd[@]}"
}
0367
# Search the docker logs of SERVER (stdout and stderr) for pattern $1.
# Returns grep's status: 0 when the pattern occurs.
test_docker() {
  ${DOCKER} logs ${SERVER} 2>&1 | grep "$1"
}
0372
# Search the podman logs of SERVER (stdout and stderr) for pattern $1.
# Returns grep's status: 0 when the pattern occurs.
test_podman() {
  ${PODMAN} logs ${SERVER} 2>&1 | grep "$1"
}
0377
# Search the server log file LOG for pattern $1.
# Returns grep's status: 0 when the pattern occurs.
# LOG is quoted because it embeds the user-chosen server name, which would
# otherwise be split into several file arguments if it contains whitespace;
# "--" keeps a pattern starting with "-" from being read as an option.
test_apptainer() {
  grep -- "$1" "$LOG"
}
0381
# Poll (once per second) until the server output contains STARTED_INDICATOR.
# Does nothing in dry-run mode.
# While waiting:
#   * if SEGFAULT_INDICATOR shows up, the ports are re-evaluated through
#     handle_ports; on success the function returns 1 so the caller can start
#     again, on failure the server is stopped and the script exits with the
#     handle_ports status;
#   * once the counter exceeds WTIME, the server is stopped and the script
#     exits with status 1.
# Returns: 0 after printing "server is ready!".
wait_server() {
  if [ -n "$DRYRUN" ]; then
    return 0
  fi

  COUNT=0
  until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
    if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
      handle_ports
      PORT_EXIT=$?
      if [ "$PORT_EXIT" -eq 0 ]; then
        return 1
      fi
      auto_stop true
      exit $PORT_EXIT
    fi

    if [ "$COUNT" -gt "$WTIME" ]; then
      echo "timed out waiting for server to start"
      auto_stop true
      exit 1
    fi

    COUNT=$(($COUNT + 1))
    sleep 1
  done

  echo "server is ready!"
}
0411
# Turn the individually requested MODELS into entries of REPOS.
#
# Without THREADCONTROL: the parent directory of every model is appended to
# REPOS, then all repositories are resolved with readlink -f and
# de-duplicated.
# With THREADCONTROL: cmsTritonConfigTool is run for every model with
# --copy into a fresh local repository under TMPDIR, and that local
# repository becomes REPOS[0].
# A model may be given either as its directory or as a file inside it.
# Exits the script when the local repository cannot be created or the config
# tool fails.
#
# Fixes over the previous version:
#   * the repository loop header was truncated (syntax error); it now runs
#     over all indices of REPOS;
#   * removal of a stale local repository honours DRYRUN instead of deleting
#     for real during a dry run;
#   * paths are quoted when passed to commands.
list_models() {
  local r
  LOCALMODELREPO="local_model_repo"
  if [ -n "$THREADCONTROL" ]; then
    if [ -d "$TMPDIR/$LOCALMODELREPO" ]; then
      # start from an empty local repository
      $DRYRUN rm -rf "$TMPDIR/$LOCALMODELREPO"
    fi
    $DRYRUN mkdir "$TMPDIR/$LOCALMODELREPO"
    MKMODELDIR_EXIT=$?
    if [ "$MKMODELDIR_EXIT" -ne 0 ]; then
      echo "Could not create local_model_repo dir: $TMPDIR/$LOCALMODELREPO"
      exit "$MKMODELDIR_EXIT"
    fi
  fi

  for MODEL in "${MODELS[@]}"; do
    # a file inside the model directory stands for that directory
    if [ -f "$MODEL" ]; then
      MODEL="$(dirname "$MODEL")"
    fi
    if [ -n "$THREADCONTROL" ]; then
      $DRYRUN cmsTritonConfigTool threadcontrol -c "${MODEL}/config.pbtxt" --copy "$TMPDIR/$LOCALMODELREPO" --nThreads "$INSTANCES"
      TOOL_EXIT=$?
      if [ "$TOOL_EXIT" -ne 0 ]; then
        echo "Could not apply threadcontrol to $MODEL"
        exit "$TOOL_EXIT"
      fi
    else
      REPOS+=("$(dirname "$MODEL")")
    fi
  done

  if [ -n "$THREADCONTROL" ]; then
    # NOTE(review): this assigns element 0 only; repositories given with -M
    # at higher indices stay in the array. Confirm that this is intended.
    REPOS=$TMPDIR/$LOCALMODELREPO
  else
    for ((r=0; r < ${#REPOS[@]}; r++)); do
      REPOS[$r]=$(readlink -f "${REPOS[$r]}")
    done
    # unique, sorted list (entries are split on whitespace here)
    read -r -a REPOS <<< "$(printf "%s\n" "${REPOS[@]}" | sort -u | tr '\n' ' ')"
  fi
}
0454
# Stop the server and clean up; optionally wait first for a process to end.
# Arguments:
#   $1 - when non-empty, switch VERBOSE on (to VERBOSE_ARGS) even if it was
#        off at top level
#   $2 - optional PID to watch: the stop is delayed until that process has
#        been found missing on six consecutive one-second checks
# After stopping (via STOP_FN): on a real verbose run the server log (and a
# non-empty watcher log) is moved to TOPDIR; with CLEANUP set, TMPDIR is
# removed.
#
# Fix: the watched PID is now kept in a local variable. Previously the global
# PARENTPID was overwritten with "$2", so a plain "auto_stop" call (as done
# for -f before starting) cleared it and the -p watcher was never launched.
auto_stop() {
  if [ -n "$1" ]; then VERBOSE="$VERBOSE_ARGS"; fi
  local watch_pid="$2"
  local misses=0
  local -r max_misses=5

  if [ -n "$watch_pid" ]; then
    if [ -n "$VERBOSE" ]; then
      echo "watching PID $watch_pid"
      ps
    fi

    while [ "$misses" -le "$max_misses" ]; do
      if ! kill -0 "$watch_pid" >& /dev/null; then
        misses=$((misses+1))
        if [ -n "$VERBOSE" ]; then
          echo "trigger $misses:"
          ps
        fi
      else
        # the process is (again) visible: start counting from scratch
        if [ "$misses" -gt 0 ] && [ -n "$VERBOSE" ]; then
          echo "reset:"
          ps
        fi
        misses=0
      fi
      sleep 1
    done
  fi
  $STOP_FN

  # keep the logs of a verbose run next to where the script was launched
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    mv "$LOG" "$TOPDIR"
    # only keep the watcher log when it has content
    if [ -s "$STOPLOG" ]; then
      mv "$STOPLOG" "$TOPDIR"
    fi
  fi

  if [ -n "$CLEANUP" ]; then
    $DRYRUN cd "$TOPDIR"
    $DRYRUN rm -rf "$TMPDIR"
  fi
}
0503
# Create the temporary directory TMPDIR and change into it.
# Exits the script with mkdir's status (after printing a message) when the
# directory cannot be created. Both steps are only printed in dry-run mode.
make_tmp() {
  MKDIR_EXIT=0
  $DRYRUN mkdir "$TMPDIR" || MKDIR_EXIT=$?
  if [ "$MKDIR_EXIT" -ne 0 ]; then
    echo "Could not create temp dir: $TMPDIR"
    exit "$MKDIR_EXIT"
  fi
  $DRYRUN cd "$TMPDIR"
}
0514
# Print the value of tag $2 of scram tool $1, queried from inside CMSSW_BASE.
# Best effort: errors from scram are hidden and the status is always 0.
# Note that this changes the working directory, so callers should use it in
# a command substitution.
#
# Fix: CMSSW_BASE is quoted and the cd is checked, so scram is no longer run
# in whatever the current directory happens to be when the cd fails (for
# example with a path containing whitespace).
scram_tag() {
  cd "$CMSSW_BASE" || return 0
  TOOL="$1"
  TAG="$2"
  scram tool tag "$TOOL" "$TAG" 2> /dev/null || true
}
0521
# Print the CUDA_DRIVER_VERSION=... line(s) from the environment of IMAGE,
# obtained by running "env" as the entrypoint of a throw-away docker
# container. Returns grep's status.
driver_docker() {
  local image_env
  image_env=$($DOCKER run --rm --entrypoint env ${IMAGE})
  grep "CUDA_DRIVER_VERSION=" <<< "$image_env"
}
0525
# Print the CUDA_DRIVER_VERSION=... line(s) from the environment of IMAGE,
# obtained by running "env" as the entrypoint of a throw-away podman
# container. Returns grep's status.
driver_podman() {
  local image_env
  image_env=$($PODMAN run --rm --entrypoint env ${IMAGE})
  grep "CUDA_DRIVER_VERSION=" <<< "$image_env"
}
0529
# Print "CUDA_DRIVER_VERSION=<value>" using the environment script that the
# sandbox carries (.singularity.d/env/10-docker2singularity.sh).
# Prints nothing when that script does not exist.
# The script path is quoted so a sandbox path containing whitespace is
# sourced as a single file name.
driver_apptainer() {
  D2S=${SANDBOX}/.singularity.d/env/10-docker2singularity.sh
  if [ -f "$D2S" ]; then
    source "$D2S" && echo "CUDA_DRIVER_VERSION=$CUDA_DRIVER_VERSION"
  fi
}
0536
# Copy COMPAT_SCRIPT out of IMAGE into the current directory with the
# container tool given as $1 (docker- or podman-style CLI).
# A helper container is created (never started), the file is copied from it,
# and the helper is removed again. Returns the status of the copy step.
_compat_copy_from_image() {
  local tool="$1" cid rc
  # $tool is expanded unquoted so a multi-word command still works
  cid=$($tool create ${IMAGE}) || return
  $tool cp "${cid}:${COMPAT_SCRIPT}" .
  rc=$?
  # "create --rm" left the never-started helper behind; remove it explicitly
  $tool rm "$cid" > /dev/null
  return "$rc"
}

# Fetch COMPAT_SCRIPT from the image with docker.
compat_docker() {
  _compat_copy_from_image "$DOCKER"
}

# Fetch COMPAT_SCRIPT from the image with podman. The backend selection
# refers to this function for podman and podman-hpc, but it was missing.
compat_podman() {
  _compat_copy_from_image "$PODMAN"
}

# Fetch COMPAT_SCRIPT from the apptainer sandbox directory.
compat_apptainer() {
  cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0544
# Decide whether the GPU driver on this host is new enough for the image
# and, if not, locate compatibility drivers to mount into the container.
#
# The selected DRIVER_FN prints a CUDA_DRIVER_VERSION=... assignment that is
# evaluated here; the host driver version is read from
# /proc/driver/nvidia/version. Only the major versions are compared.
# Compatibility driver directories are tried in this order: COMPAT_USR (-C),
# the "drivers" directory below the CUDA_BASE reported by scram (only when
# CMSSW_BASE is set), then /usr/local/cuda/compat.
# When running inside TMPDIR, the image's COMPAT_SCRIPT is fetched via
# COMPAT_FN and, unless it already contains LD_PRELOAD="", patched and
# registered in COMPAT_SCRIPT_MOUNT.
# Returns: 0 when the host driver suffices or a compatibility directory was
#          found (COMPAT is then exported as a mount spec); 1 otherwise.
check_drivers() {
  # NOTE(review): this evaluates text produced by DRIVER_FN (container or
  # sandbox content) as shell code.
  eval "$($DRIVER_FN)"

  # an unreadable version file simply leaves DRIVER_VERSION empty
  DRIVER_VERSION=$(sed -nr 's/^NVRM.*Kernel Module[^.]* ([0-9.]*).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
  if [[ "${CUDA_DRIVER_VERSION%%.*}" -le "${DRIVER_VERSION%%.*}" ]]; then
    return 0
  fi

  export COMPAT=""
  COMPAT_SYS=/usr/local/cuda/compat

  # 1) directory given by the user
  if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
    COMPAT="$COMPAT_USR"
  fi

  # 2) drivers shipped with the CUDA tool of the CMSSW area
  if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then
    CUDA_BASE=$(scram_tag cuda CUDA_BASE)
    COMPAT_CMSSW=${CUDA_BASE}/drivers
    if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
      COMPAT="$COMPAT_CMSSW"
    fi
  fi

  # 3) system-wide location
  if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
    COMPAT="$COMPAT_SYS"
  fi

  if [ -z "$COMPAT" ]; then
    return 1
  fi

  # mount spec: host directory -> lib.real below the system location
  export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"

  if [ "$PWD" == "$TMPDIR" ]; then
    $DRYRUN $COMPAT_FN
    COMPAT_SCRIPT_BASE=$(basename $COMPAT_SCRIPT)
    # the mount is only needed when the script had to be patched
    if ! grep -Fq 'LD_PRELOAD=""' $COMPAT_SCRIPT_BASE; then
      $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE
      COMPAT_SCRIPT_PATH=$(readlink -f $TMPDIR/$COMPAT_SCRIPT_BASE)
      COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_PATH:$COMPAT_SCRIPT"
    fi
  fi
  return 0
}
0599
# Set the additional docker arguments: GPU access when DEVICE is gpu.
extra_docker() {
  case "$DEVICE" in
    gpu) EXTRA="--gpus all" ;;
  esac
}
# Set the additional podman arguments: GPU device access when DEVICE is gpu.
extra_podman() {
  case "$DEVICE" in
    gpu) EXTRA="--device nvidia.com/gpu=all" ;;
  esac
}
# Set the additional podman-hpc arguments: --gpu when DEVICE is gpu, and in
# every case the cvmfs and json-file log-driver flags appended to EXTRA.
extra_podman_hpc() {
  case "$DEVICE" in
    gpu) EXTRA="--gpu" ;;
  esac
  EXTRA+=" --cvmfs --log-driver=json-file"
}
# Set the additional apptainer arguments: --nv when DEVICE is gpu.
extra_apptainer() {
  case "$DEVICE" in
    gpu) EXTRA="--nv" ;;
  esac
}
0621
# Bind the backend-specific functions (start, extra arguments, log test,
# stop, driver query, compat script fetch) and the program name used in
# error messages, according to CONTAINER. Anything that is not docker or a
# podman flavour is handled by apptainer.
case "$CONTAINER" in
  docker)
    START_FN=start_docker
    EXTRA_FN=extra_docker
    TEST_FN=test_docker
    STOP_FN=stop_docker
    DRIVER_FN=driver_docker
    COMPAT_FN=compat_docker
    PROG_NAME=docker
    ;;
  podman|podman-hpc)
    # both podman flavours share everything except the extra arguments
    START_FN=start_podman
    if [ "$CONTAINER" == "podman" ]; then
      EXTRA_FN=extra_podman
    else
      EXTRA_FN=extra_podman_hpc
    fi
    TEST_FN=test_podman
    STOP_FN=stop_podman
    DRIVER_FN=driver_podman
    COMPAT_FN=compat_podman
    PROG_NAME=$CONTAINER
    ;;
  *)
    START_FN=start_apptainer
    EXTRA_FN=extra_apptainer
    TEST_FN=test_apptainer
    STOP_FN=stop_apptainer
    DRIVER_FN=driver_apptainer
    COMPAT_FN=compat_apptainer
    PROG_NAME=apptainer
    ;;
esac
0655
# Run the requested operation.
#   check: exit with the status of check_drivers.
#   start: clean up (-f) or refuse to reuse an existing temporary dir, pick
#          the device, settle the ports, optionally launch the parent-PID
#          watcher in the background, then try up to RETRIES times to start
#          the server and wait for it.
#   stop (anything else): enter the temporary dir if there is one and stop
#          the server.
case "$OP" in
  check)
    check_drivers
    exit $?
    ;;
  start)
    if [ -n "$FORCE" ]; then
      auto_stop
    elif [ -d "$TMPDIR" ]; then
      echo "Error: this container may already exist (override with -f)"
      exit 1
    fi

    # "auto" becomes gpu when nvidia-smi can list a device, cpu otherwise
    if [ "$DEVICE" == auto ]; then
      if nvidia-smi -L >& /dev/null; then
        DEVICE=gpu
      else
        DEVICE=cpu
      fi
    fi
    echo "CMS_TRITON_CHOSEN_DEVICE: $DEVICE"
    $EXTRA_FN

    # local editing of the model configs is switched off on GPU
    if [ "$DEVICE" == gpu ]; then
      THREADCONTROL=""
    fi

    handle_ports
    PORT_EXIT=$?
    if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi

    # the watcher is launched before the server start attempts
    if [ -n "$PARENTPID" ]; then
      auto_stop "" "$PARENTPID" >& "$STOPLOG" &
    fi

    START_EXIT=0
    for ((counter=0; counter < ${RETRIES}; counter++)); do
      make_tmp

      # the model list is rebuilt on later attempts only with THREADCONTROL
      if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then list_models; fi

      if [ "$DEVICE" == gpu ]; then
        check_drivers
        DRIVER_EXIT=$?
        if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi
      fi

      $START_FN
      START_EXIT=$?
      if [ "$START_EXIT" -eq 0 ]; then
        wait_server
        WAIT_EXIT=$?
        if [ "$WAIT_EXIT" -eq 0 ]; then
          break
        fi
        # a failed wait does not use up one of the retries
        counter=$((counter-1))
        START_EXIT="$WAIT_EXIT"
      fi

      if [ "$START_EXIT" -ne 0 ]; then
        auto_stop
        echo "Retrying after container issue..."
      fi
    done
    if [ "$START_EXIT" -ne 0 ]; then
      echo "Error from $PROG_NAME"
      exit "$START_EXIT"
    fi
    ;;
  *)
    # stop: a missing temporary dir is only tolerated in a dry run or with -f
    if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
      $DRYRUN cd "$TMPDIR"
    elif [ -z "$FORCE" ]; then
      echo "Error: attempt to stop unknown container $SERVER"
      exit 1
    fi

    auto_stop
    ;;
esac
0741 fi