Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-08-07 23:11:45

0001 #!/bin/bash
0002 
# defaults
# every value below may be overridden later by a command-line option
CONTAINER=apptainer
VERBOSE=""
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
WTIME=600
SERVER=triton_server_instance
RETRIES=3
REPOS=()
MODELS=()
FORCE=""
CLEANUP=true
TMPDIR=""
INSTANCES=0
LOCALMODELREPO=""
DRYRUN=""
PARENTPID=""
BASEPORT=8000
AUTOPORT=""
NPORTS=3
IMAGE=fastml/triton-torchgeo:22.07-py3-geometric
SANDBOX=""
COMPAT_USR=""
EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin
# first number found in /etc/redhat-release; left empty when that file is
# absent so that other distributions do not trigger sed/test error messages
OSVERSION=""
if [ -r /etc/redhat-release ]; then
        OSVERSION=$(sed -nr 's/[^0-9]*([0-9]+).*/\1/p' /etc/redhat-release)
fi
# string comparison: an empty or unexpected value simply does not match
if [ "$OSVERSION" = 7 ]; then
        # this is the latest version with guaranteed sl7 support
        EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/1.2.5/bin
fi
DEVICE=auto
THREADCONTROL=""
0033 
# Print the sandbox directory to use.
# Globals: SANDBOX (read), IMAGE (read)
# Outputs: $SANDBOX when it is set, otherwise the unpacked.cern.ch path for $IMAGE
get_sandbox(){
        # an explicitly chosen sandbox always wins
        if [ -n "$SANDBOX" ]; then
                echo "$SANDBOX"
                return
        fi
        echo /cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}
}
0041 
# Print the help text and terminate the script.
# Arguments: $1 - exit status to terminate with
# Outputs:   help text on stdout (backslash escapes such as \t are expanded by echo -e)
usage() {
        local line
        local -a text=(
                "cmsTriton [options] [start|stop|check]"
                ""
                "Options:"
                "-c          \t don't cleanup temporary dir (for debugging)"
                "-C [dir]    \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
                "-D          \t dry run: print container commands rather than executing them"
                "-d [exe]    \t container choice: apptainer, docker, podman, podman-hpc (default: ${CONTAINER})"
                "-E [path]   \t include extra path(s) for executables (default: ${EXTRAPATH})"
                "-f          \t force reuse of (possibly) existing container instance"
                "-g [device] \t device choice: auto (try to detect GPU), CPU, GPU (default: ${DEVICE})"
                "-i [name]   \t server image name (default: ${IMAGE})"
                "-I [num]    \t number of model instances (default: ${INSTANCES} -> means no local editing of config files)"
                "-M [dir]    \t model repository (can be given more than once)"
                "-m [dir]    \t specific model directory (can be given more than once)"
                "-n [name]   \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
                "-P [port]   \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
                "-p [pid]    \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
                "-r [num]    \t number of retries when starting container (default: ${RETRIES})"
                "-s [dir]    \t apptainer sandbox directory (default: $(get_sandbox))"
                "-t [dir]    \t non-default hidden temporary dir"
                "-v          \t (verbose) start: activate server debugging info; stop: keep server logs"
                "-w [time]   \t maximum time to wait for server to start (default: ${WTIME} seconds)"
                "-h          \t print this message and exit"
                ""
                "Operations:"
                "start       \t start server"
                "stop        \t stop server"
                "check       \t check if server can run on this system"
        )
        for line in "${text[@]}"; do
                echo -e "$line"
        done
        exit $1
}
0074 
# check shm locations
# /run/shm is used whenever it exists; /dev/shm is the fallback
if [ -e /run/shm ]; then
        SHM=/run/shm
else
        SHM=/dev/shm
fi
0080 
# Parse the command-line options into the corresponding global settings.
# Globals:   one setting per option (written); OPTIND is left pointing at the
#            first non-option word so the caller can shift the options away
# Arguments: the script's command line ("$@")
# Exits:     via usage, with 0 for -h and 1 for an unknown option or a missing argument
parse_options(){
        local opt
        while getopts "cC:Dd:E:fg:i:I:M:m:n:P:p:r:s:t:vw:h" opt; do
                case "$opt" in
                        c) CLEANUP=""
                        ;;
                        C) COMPAT_USR="$OPTARG"
                        ;;
                        D) DRYRUN=echo
                        ;;
                        d) CONTAINER="$OPTARG"
                        ;;
                        # documented in usage but previously not accepted
                        E) EXTRAPATH="$OPTARG"
                        ;;
                        f) FORCE=true
                        ;;
                        g) DEVICE="$OPTARG"
                        ;;
                        i) IMAGE="$OPTARG"
                        ;;
                        I) INSTANCES="$OPTARG"
                        ;;
                        M) REPOS+=("$OPTARG")
                        ;;
                        m) MODELS+=("$OPTARG")
                        ;;
                        n) SERVER="$OPTARG"
                        ;;
                        # -1 requests automatic port selection
                        P) if [ "$OPTARG" -eq -1 ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
                        ;;
                        # -1 means: watch the process that launched this script
                        p) if [ "$OPTARG" -eq -1 ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
                        ;;
                        r) RETRIES="$OPTARG"
                        ;;
                        s) SANDBOX="$OPTARG"
                        ;;
                        t) TMPDIR="$OPTARG"
                        ;;
                        v) VERBOSE="$VERBOSE_ARGS"
                        ;;
                        w) WTIME="$OPTARG"
                        ;;
                        h) usage 0
                        ;;
                        # getopts has already printed the reason; do not carry on
                        # with a half-understood command line
                        *) usage 1
                        ;;
                esac
        done
}
parse_options "$@"
0123 
# discard the parsed options; the first remaining word is the operation
shift $((OPTIND - 1))
OP=$1

# anything other than the three known operations prints the help and exits
case "$OP" in
        start|stop|check) ;;
        *) usage 1 ;;
esac
0130 
# check acceptable values for device choice
# (case-insensitive; the value must appear space-delimited in the allowed list)
DEVICE="${DEVICE,,}"
case " auto cpu gpu " in
        *" $DEVICE "*) ;;
        *)
                echo "Unsupported device value: $DEVICE"
                exit 1
                ;;
esac

# check acceptable values for container choice
CONTAINER="${CONTAINER,,}"
case " apptainer docker podman podman-hpc " in
        *" $CONTAINER "*) ;;
        *)
                echo "Unsupported container value: $CONTAINER"
                exit 1
                ;;
esac

# always make at least one start attempt
if [ "$RETRIES" -le 0 ]; then
        RETRIES=1
fi
0148 
# remember where the script was launched; logs are moved back here on stop
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
        # default: hidden directory named after the server instance
        TMPDIR="${TOPDIR}/.${SERVER}"
else
        # canonicalize the user-supplied directory; quoted so that a path
        # containing whitespace is passed to readlink as a single argument
        TMPDIR=$(readlink -f "$TMPDIR")
fi

# update path
if [ -n "$EXTRAPATH" ]; then
        export PATH="${EXTRAPATH}:${PATH}"
fi
0160 
# find executables
# a value already present in DOCKER / PODMAN / APPTAINER is respected as-is
case "$CONTAINER" in
        docker)
                DOCKER="${DOCKER:-docker}"
                ;;
        podman)
                PODMAN="${PODMAN:-podman}"
                ;;
        podman-hpc)
                PODMAN="${PODMAN:-podman-hpc}"
                ;;
        *)
                if [ -z "$APPTAINER" ]; then
                        # prefer apptainer, fall back to singularity
                        if type apptainer >& /dev/null; then
                                APPTAINER=apptainer
                        elif type singularity >& /dev/null; then
                                APPTAINER=singularity
                        else
                                echo "Missing apptainer and singularity"
                                exit 1
                        fi
                fi
                ;;
esac
0186 
# resolve the sandbox location to a canonical path
SANDBOX=$(readlink -f $(get_sandbox))

# file names, all relative to the working directory
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
LIB=lib

# strings searched for in the server output
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"

EXTRA=""
COMPAT_SCRIPT=/etc/shinit_v2

# this will be reset later if chosen device is gpu
[ "$INSTANCES" -gt 0 ] && THREADCONTROL=true
0201 
# Derive the three service ports from BASEPORT and export them.
# Globals: BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (written, exported)
compute_ports(){
        # the services use consecutive ports starting at the base port
        HTTPPORT=$BASEPORT
        GRPCPORT=$((BASEPORT+1))
        METRPORT=$((BASEPORT+2))
        export HTTPPORT GRPCPORT METRPORT
}
0208 
# Test whether a TCP port is free.
# Arguments: $1 - port number
# Returns:   0 when the port is free, 1 when it is in use
check_port(){
        # opening the /dev/tcp pseudo-file succeeds only if something accepts
        # the connection, i.e. the port is in use
        2>/dev/null >"/dev/tcp/0.0.0.0/$1" || return 0
        return 1
}
0217 
# Test whether all three service ports are free.
# Globals: HTTPPORT, GRPCPORT, METRPORT (read)
# Returns: 0 when every port is free, 1 otherwise
check_ports(){
        local port
        for port in $HTTPPORT $GRPCPORT $METRPORT; do
                if ! check_port $port; then
                        return 1
                fi
        done
        return 0
}
0225 
# Advance BASEPORT in steps of NPORTS until a completely free range is found,
# then report the resulting gRPC port on stdout.
# Globals: BASEPORT (updated), NPORTS (read), GRPCPORT (read)
find_ports(){
        until check_ports; do
                BASEPORT=$((BASEPORT+NPORTS))
                compute_ports
        done
        echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0233 
# Settle the port numbers and export the matching tritonserver arguments.
# Globals: AUTOPORT (read); PORTARGS (written, exported)
# Returns: 1 when fixed ports were requested and at least one is in use
handle_ports(){
        compute_ports
        if [ -z "$AUTOPORT" ]; then
                # fixed ports: they must all be available
                if ! check_ports; then
                        echo "Error: requested port in use"
                        return 1
                fi
        else
                # automatic mode: search for a free range
                find_ports
        fi
        PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
        export PORTARGS
}
0245 
# Launch the server as a detached docker container named $SERVER.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, EXTRA, PORTARGS, VERBOSE (read);
#          MOUNTARGS, REPOARGS (written)
start_docker(){
        # one volume and one --model-repository flag per repository
        MOUNTARGS=""
        REPOARGS=""
        for REPO in ${REPOS[@]}; do
                MOUNTARGS+=" -v$REPO:$REPO"
                REPOARGS+=" --model-repository=${REPO}"
        done

        # compatibility driver environment (the script mount only matters
        # when compatibility drivers are in use)
        if [ -n "$COMPAT" ] && [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
                MOUNTARGS+=" -v$COMPAT -v$COMPAT_SCRIPT_MOUNT"
        elif [ -n "$COMPAT" ]; then
                MOUNTARGS+=" -v$COMPAT"
        fi

        $DRYRUN $DOCKER run -d --name ${SERVER} \
                --shm-size=1g \
                --ulimit memlock=-1 --ulimit stack=67108864 \
                --ipc="host" --cap-add=IPC_OWNER \
                -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} \
                $EXTRA $MOUNTARGS \
                ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
}
0268 
# Launch the server as a detached podman container named $SERVER.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, EXTRA, PORTARGS, VERBOSE (read);
#          MOUNTARGS, REPOARGS (written)
start_podman(){
        # one volume and one --model-repository flag per repository
        MOUNTARGS=""
        REPOARGS=""
        for REPO in ${REPOS[@]}; do
                MOUNTARGS+=" --volume $REPO:$REPO"
                REPOARGS+=" --model-repository=${REPO}"
        done

        # compatibility driver environment (the script mount only matters
        # when compatibility drivers are in use)
        if [ -n "$COMPAT" ] && [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
                MOUNTARGS+=" --volume $COMPAT --volume $COMPAT_SCRIPT_MOUNT"
        elif [ -n "$COMPAT" ]; then
                MOUNTARGS+=" --volume $COMPAT"
        fi

        $DRYRUN $PODMAN run -d --name ${SERVER} \
                --shm-size=1g \
                --ulimit memlock=-1 --ulimit stack=67108864 \
                -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} \
                $EXTRA $MOUNTARGS \
                ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE
}
0291 
# Start an apptainer instance named $SERVER and run tritonserver inside it
# in the background.
# Globals: LIB, SANDBOX, SHM, REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, EXTRA,
#          PORTARGS, VERBOSE, LOG (read);
#          MOUNTARGS, REPOARGS, START_EXIT, REDIR (written)
# Returns: the exit status of "instance start" when that step fails
start_apptainer(){
        # older server images want to modify opt/tritonserver/lib/, which is
        # not possible on read-only cvmfs, so a writable local directory of
        # symlinks stands in for it
        # (no longer needed as of triton 2.11.0; kept for older server versions)
        $DRYRUN mkdir ${LIB}
        $DRYRUN ln -s ${SANDBOX}/opt/tritonserver/lib/* ${LIB}/

        # one bind mount and one --model-repository flag per repository
        MOUNTARGS=""
        REPOARGS=""
        for REPO in ${REPOS[@]}; do
                MOUNTARGS+=" -B $REPO"
                REPOARGS+=" --model-repository=${REPO}"
        done

        # compatibility driver environment (the script mount only matters
        # when compatibility drivers are in use)
        if [ -n "$COMPAT" ] && [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
                MOUNTARGS+=" -B $COMPAT -B $COMPAT_SCRIPT_MOUNT"
        elif [ -n "$COMPAT" ]; then
                MOUNTARGS+=" -B $COMPAT"
        fi

        # workaround for https://github.com/apptainer/apptainer/issues/2164
        unset DBUS_SESSION_BUS_ADDRESS

        # start instance
        # the sandbox itself is bound so the symlinks made above resolve inside
        # --underlay: workaround for https://github.com/apptainer/apptainer/issues/2167
        $DRYRUN $APPTAINER instance start --underlay \
                -B ${SHM}:/run/shm \
                -B ${LIB}:/opt/tritonserver/lib \
                -B ${SANDBOX} \
                $MOUNTARGS $EXTRA \
                ${SANDBOX} ${SERVER}
        START_EXIT=$?
        if [ "$START_EXIT" -ne 0 ]; then
                # do not leave the symlink directory behind after a failed start
                rm -rf ${LIB}
                return "$START_EXIT"
        fi

        # run the actual server; its output goes to the log file, or to the
        # terminal in a dry run
        REDIR=/dev/stdout
        if [ -z "$DRYRUN" ]; then
                REDIR="$LOG"
        fi
        $DRYRUN $APPTAINER run instance://${SERVER} \
                tritonserver $PORTARGS $REPOARGS $VERBOSE >& ${REDIR} &
        # in a dry run the background job is only an echo: let it finish
        [ -z "$DRYRUN" ] || wait
}
0343 
# Stop and remove the docker container named $SERVER.
# Globals: DRYRUN, VERBOSE, DOCKER, SERVER, LOG (read)
stop_docker(){
        # keep log: in a real, verbose run the container output is saved first
        if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
                $DOCKER logs ${SERVER} >& "$LOG"
        fi

        $DRYRUN $DOCKER stop ${SERVER}
        $DRYRUN $DOCKER rm ${SERVER}
}
0353 
# Stop and remove the podman container named $SERVER.
# Globals: DRYRUN, VERBOSE, PODMAN, SERVER, LOG (read)
stop_podman(){
        # keep log: in a real, verbose run the container output is saved first
        if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
                $PODMAN logs ${SERVER} >& "$LOG"
        fi

        $DRYRUN $PODMAN stop ${SERVER}
        $DRYRUN $PODMAN rm ${SERVER}
}
0363 
# Stop the apptainer instance that start_apptainer created under the name $SERVER.
# Globals: DRYRUN, APPTAINER, SERVER (read)
stop_apptainer(){
        $DRYRUN $APPTAINER instance stop $SERVER
}
0367 
# Search the docker container output for a string.
# Arguments: $1 - pattern handed to grep
# Returns:   grep's status (0 when the pattern was found)
test_docker(){
        # docker logs print to stderr, so merge it into the pipe
        ${DOCKER} logs ${SERVER} 2>&1 | grep "$1"
}
0372 
# Search the podman container output for a string.
# Arguments: $1 - pattern handed to grep
# Returns:   grep's status (0 when the pattern was found)
test_podman(){
        # podman logs print to stdout; stderr is searched as well
        ${PODMAN} logs ${SERVER} 2>&1 | grep "$1"
}
0377 
# Search the server log file for a string.
# Arguments: $1 - pattern handed to grep
# Globals:   LOG (read)
# Returns:   grep's status (0 when the pattern was found)
test_apptainer(){
        grep "$1" ${LOG}
}
0381 
# Poll the server output once per second until the start-up message appears.
# Globals: DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME (read);
#          COUNT, PORT_EXIT (written)
# Returns: 0 once the server is ready (or immediately in a dry run);
#          1 when a port conflict was seen and handle_ports succeeded, so the
#          caller may retry
# Exits:   with handle_ports' status when it fails, or 1 on timeout
wait_server(){
        # nothing to poll in a dry run
        if [ -n "$DRYRUN" ]; then
                return
        fi

        COUNT=0
        until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
                if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
                        handle_ports
                        PORT_EXIT=$?
                        # port exit is zero if it found a new, good port; so retry
                        if [ "$PORT_EXIT" -eq 0 ]; then
                                return 1
                        fi
                        auto_stop true
                        exit $PORT_EXIT
                fi

                if [ "$COUNT" -gt "$WTIME" ]; then
                        echo "timed out waiting for server to start"
                        auto_stop true
                        exit 1
                fi

                COUNT=$((COUNT + 1))
                sleep 1
        done

        echo "server is ready!"
}
0411 
# Build the final list of model repositories in REPOS.
#
# With thread control enabled, every model in MODELS is copied (with edited
# config) into $TMPDIR/local_model_repo by cmsTritonConfigTool, and that
# directory becomes the only repository. Otherwise the parent directory of
# each model is added to REPOS, and the list is canonicalized and de-duplicated.
#
# Globals: THREADCONTROL, TMPDIR, MODELS, INSTANCES, DRYRUN (read);
#          LOCALMODELREPO, REPOS, MODEL, MKMODELDIR_EXIT, TOOL_EXIT (written)
# Exits:   with the failing command's status when the local repository cannot
#          be created or the config tool fails
list_models(){
        local repo
        local -a resolved=()

        # make list of model repositories
        LOCALMODELREPO="local_model_repo"
        if [ -n "$THREADCONTROL" ]; then
                if [ -d "$TMPDIR/$LOCALMODELREPO" ]; then
                        # start from a fresh copy of the model files in case this
                        # directory already exists with local edits; honours dry-run
                        $DRYRUN rm -rf "$TMPDIR/$LOCALMODELREPO"
                fi
                $DRYRUN mkdir "$TMPDIR/$LOCALMODELREPO"
                MKMODELDIR_EXIT=$?
                if [ "$MKMODELDIR_EXIT" -ne 0 ]; then
                        echo "Could not create local_model_repo dir: $TMPDIR/$LOCALMODELREPO"
                        exit "$MKMODELDIR_EXIT"
                fi
        fi

        for MODEL in "${MODELS[@]}"; do
                # check if file was provided rather than directory
                if [ -f "$MODEL" ]; then
                        MODEL="$(dirname "$MODEL")"
                fi
                if [ -n "$THREADCONTROL" ]; then
                        $DRYRUN cmsTritonConfigTool threadcontrol -c "${MODEL}/config.pbtxt" --copy "$TMPDIR/$LOCALMODELREPO" --nThreads "$INSTANCES"
                        TOOL_EXIT=$?
                        if [ "$TOOL_EXIT" -ne 0 ]; then
                                echo "Could not apply threadcontrol to $MODEL"
                                exit "$TOOL_EXIT"
                        fi
                else
                        # the repository is the directory that contains the model directory
                        REPOS+=("$(dirname "$MODEL")")
                fi
        done

        if [ -n "$THREADCONTROL" ]; then
                # replace the whole array: a plain scalar assignment would only
                # overwrite element 0 and keep any further -M repositories
                REPOS=("$TMPDIR/$LOCALMODELREPO")
        else
                for repo in "${REPOS[@]}"; do
                        # avoid issues w/ multiple levels of symlinks
                        repo=$(readlink -f "$repo")
                        # skip paths that could not be resolved
                        if [ -n "$repo" ]; then
                                resolved+=("$repo")
                        fi
                done
                if [ "${#resolved[@]}" -gt 0 ]; then
                        # one path per line keeps embedded spaces intact
                        mapfile -t REPOS < <(printf '%s\n' "${resolved[@]}" | sort -u)
                else
                        REPOS=()
                fi
        fi
}
0454 
# Stop the server, optionally after waiting for another process to end, then
# collect the logs and remove the temporary directory.
# Arguments: $1 - non-empty: switch verbosity on, even if disabled at top level
#            $2 - PID to watch; the server is stopped once that process is gone
# Globals:   VERBOSE, PARENTPID, PCOUNTER, PMAX (written);
#            VERBOSE_ARGS, STOP_FN, DRYRUN, LOG, STOPLOG, TOPDIR, CLEANUP, TMPDIR (read)
auto_stop(){
        # otherwise the top-level verbosity setting stays in force
        if [ -n "$1" ]; then VERBOSE="$VERBOSE_ARGS"; fi
        PARENTPID="$2"

        if [ -n "$PARENTPID" ]; then
                if [ -n "$VERBOSE" ]; then
                        echo "watching PID $PARENTPID"
                        ps
                fi
                PCOUNTER=0
                PMAX=5
                # builtin wait is not used here because it can only monitor a child process, not a parent process
                until [ "$PCOUNTER" -gt "$PMAX" ]; do
                        if kill -0 $PARENTPID >& /dev/null; then
                                # process is alive: the misses must come in a row, so start over
                                if [ "$PCOUNTER" -gt 0 ] && [ -n "$VERBOSE" ]; then
                                        echo "reset:"
                                        ps
                                fi
                                PCOUNTER=0
                        else
                                # process not found: count one more consecutive miss
                                PCOUNTER=$((PCOUNTER+1))
                                if [ -n "$VERBOSE" ]; then
                                        echo "trigger $PCOUNTER:"
                                        ps
                                fi
                        fi
                        sleep 1
                done
        fi
        $STOP_FN

        # move logs out of tmp dir
        if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
                mv "$LOG" "$TOPDIR"
                # only keep non-empty log
                [ -s "$STOPLOG" ] && mv "$STOPLOG" "$TOPDIR"
        fi

        # leave the directory before deleting it
        if [ -n "$CLEANUP" ]; then
                $DRYRUN cd "$TOPDIR"
                $DRYRUN rm -rf "$TMPDIR"
        fi
}
0503 
# Create the temporary directory and change into it, so that everything else
# happens there.
# Globals: TMPDIR, DRYRUN (read); MKDIR_EXIT (written)
# Exits:   with mkdir's status when the directory cannot be created
make_tmp(){
        $DRYRUN mkdir "$TMPDIR"
        MKDIR_EXIT=$?
        [ "$MKDIR_EXIT" -eq 0 ] || {
                echo "Could not create temp dir: $TMPDIR"
                exit "$MKDIR_EXIT"
        }
        $DRYRUN cd "$TMPDIR"
}
0514 
# Print the value of one tag of a scram tool, queried from within $CMSSW_BASE.
# Arguments: $1 - tool name, $2 - tag name
# Globals:   CMSSW_BASE (read); TOOL, TAG (written)
# Returns:   always 0; errors from scram are silenced
scram_tag(){
        TOOL="$1"
        TAG="$2"
        cd $CMSSW_BASE
        scram tool tag $TOOL $TAG 2> /dev/null || true
}
0521 
# Print the CUDA_DRIVER_VERSION=... line from the image's environment, using docker.
# Globals: DOCKER, IMAGE (read)
driver_docker(){
        # run "env" as the entrypoint and keep only the driver version entry
        $DOCKER run --rm --entrypoint env ${IMAGE} \
                | grep "CUDA_DRIVER_VERSION="
}
0525 
# Print the CUDA_DRIVER_VERSION=... line from the image's environment, using podman.
# Globals: PODMAN, IMAGE (read)
driver_podman(){
        # run "env" as the entrypoint and keep only the driver version entry
        $PODMAN run --rm --entrypoint env ${IMAGE} \
                | grep "CUDA_DRIVER_VERSION="
}
0529 
# Print CUDA_DRIVER_VERSION=... as defined by the sandbox's
# 10-docker2singularity.sh environment script; prints nothing when that
# script does not exist.
# Globals: SANDBOX (read); D2S (written)
driver_apptainer(){
        D2S=${SANDBOX}/.singularity.d/env/10-docker2singularity.sh
        [ -f "$D2S" ] || return 0
        source $D2S && echo "CUDA_DRIVER_VERSION=$CUDA_DRIVER_VERSION"
}
0536 
# Copy $COMPAT_SCRIPT out of the image into the current directory, using docker.
# A container is created (never started) only to give "cp" something to read
# from, and is removed again afterwards.
# Globals: DOCKER, IMAGE, COMPAT_SCRIPT (read)
# Returns: the status of the create step if it fails, otherwise that of the copy
compat_docker(){
        local cid rc
        cid=$($DOCKER create --rm ${IMAGE}) || return
        $DOCKER cp "${cid}:${COMPAT_SCRIPT}" .
        rc=$?
        # --rm only applies once a container exits; this one never runs
        $DOCKER rm "$cid" > /dev/null
        return "$rc"
}

# Same as compat_docker, using podman. The podman and podman-hpc back-ends
# select this function through COMPAT_FN.
# Globals: PODMAN, IMAGE, COMPAT_SCRIPT (read)
# Returns: the status of the create step if it fails, otherwise that of the copy
compat_podman(){
        local cid rc
        cid=$($PODMAN create --rm ${IMAGE}) || return
        $PODMAN cp "${cid}:${COMPAT_SCRIPT}" .
        rc=$?
        $PODMAN rm "$cid" > /dev/null
        return "$rc"
}
0540 
# Copy $COMPAT_SCRIPT from the sandbox directory into the current directory.
# Globals: SANDBOX, COMPAT_SCRIPT (read)
compat_apptainer(){
        cp $SANDBOX/$COMPAT_SCRIPT .
}
0544 
# Decide whether the GPU driver on this machine can serve the image, and look
# for compatibility drivers when it is too old.
# Globals: DRIVER_FN, COMPAT_USR, CMSSW_BASE, TMPDIR, COMPAT_FN, COMPAT_SCRIPT,
#          DRYRUN (read);
#          DRIVER_VERSION, COMPAT (exported), CUDA_BASE, COMPAT_CMSSW, COMPAT_SYS,
#          COMPAT_SCRIPT_BASE, COMPAT_SCRIPT_PATH, COMPAT_SCRIPT_MOUNT (written)
# Returns: 0 when the installed driver is recent enough or compatibility
#          drivers were found, 1 otherwise
check_drivers(){
        # get sandbox env vars in subshell
        # (DRIVER_FN prints a CUDA_DRIVER_VERSION=... assignment, evaluated here)
        eval "$($DRIVER_FN)"
        # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh
        # regex generalized to handle SUSE
        DRIVER_VERSION=$(sed -nr 's/^NVRM.*Kernel Module[^.]* ([0-9.]*).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
        # only the major version numbers are compared
        if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then
                return 0
        fi

        export COMPAT=""

        if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
                # 1. user-specified compatibility drivers take precedence
                COMPAT="$COMPAT_USR"
        elif [ -n "$CMSSW_BASE" ]; then
                # 2. compatibility drivers from CMSSW if available
                # based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers
                # but need to check drivers, not runtime
                CUDA_BASE=$(scram_tag cuda CUDA_BASE)
                COMPAT_CMSSW=${CUDA_BASE}/drivers

                if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
                        COMPAT="$COMPAT_CMSSW"
                fi
        fi

        # 3. finally, the expected system location
        COMPAT_SYS=/usr/local/cuda/compat
        if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
                COMPAT="$COMPAT_SYS"
        fi

        # no compatibility drivers anywhere
        if [ -z "$COMPAT" ]; then
                return 1
        fi

        # in order to mount drivers where Triton expects them
        export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"
        # workaround for bug in Triton/Nvidia compatibility check
        # (only done when the current directory is the temporary directory)
        if [ "$PWD" == "$TMPDIR" ]; then
                $DRYRUN $COMPAT_FN
                COMPAT_SCRIPT_BASE=$(basename $COMPAT_SCRIPT)
                # newer containers already include this fix, no need to do it twice
                if ! grep -Fq 'LD_PRELOAD=""' $COMPAT_SCRIPT_BASE; then
                        $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE
                        COMPAT_SCRIPT_PATH=$(readlink -f $TMPDIR/$COMPAT_SCRIPT_BASE)
                        COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_PATH:$COMPAT_SCRIPT"
                fi
        fi
        return 0
}
0599 
# Set the extra docker run options for the chosen device.
# Globals: DEVICE (read); EXTRA (written only for gpu)
extra_docker(){
        case "$DEVICE" in
                gpu) EXTRA="--gpus all" ;;
        esac
}
# Set the extra podman run options for the chosen device.
# Globals: DEVICE (read); EXTRA (written only for gpu)
extra_podman(){
        case "$DEVICE" in
                gpu) EXTRA="--device nvidia.com/gpu=all" ;;
        esac
}
# Set the extra podman-hpc run options: the GPU flag for gpu, followed in all
# cases by the cvmfs and log-driver options.
# Globals: DEVICE (read); EXTRA (written)
extra_podman_hpc(){
        case "$DEVICE" in
                gpu) EXTRA="--gpu" ;;
        esac
        EXTRA+=" --cvmfs --log-driver=json-file"
}
# Set the extra apptainer options for the chosen device.
# Globals: DEVICE (read); EXTRA (written only for gpu)
extra_apptainer(){
        case "$DEVICE" in
                gpu) EXTRA="--nv" ;;
        esac
}
0621 
# select the back-end specific implementation of every step
# (podman-hpc shares the podman functions except for the extra options;
# anything that is not docker/podman/podman-hpc is treated as apptainer)
case "$CONTAINER" in
        docker)
                START_FN=start_docker   EXTRA_FN=extra_docker
                TEST_FN=test_docker     STOP_FN=stop_docker
                DRIVER_FN=driver_docker COMPAT_FN=compat_docker
                PROG_NAME=docker
                ;;
        podman)
                START_FN=start_podman   EXTRA_FN=extra_podman
                TEST_FN=test_podman     STOP_FN=stop_podman
                DRIVER_FN=driver_podman COMPAT_FN=compat_podman
                PROG_NAME=podman
                ;;
        podman-hpc)
                START_FN=start_podman   EXTRA_FN=extra_podman_hpc
                TEST_FN=test_podman     STOP_FN=stop_podman
                DRIVER_FN=driver_podman COMPAT_FN=compat_podman
                PROG_NAME=podman-hpc
                ;;
        *)
                START_FN=start_apptainer   EXTRA_FN=extra_apptainer
                TEST_FN=test_apptainer     STOP_FN=stop_apptainer
                DRIVER_FN=driver_apptainer COMPAT_FN=compat_apptainer
                PROG_NAME=apptainer
                ;;
esac
0655 
# carry out the requested operation
case "$OP" in
check)
        # the driver check result is the script's exit status
        check_drivers
        exit $?
        ;;
start)
        # handle cleaning up
        if [ -n "$FORCE" ]; then
                auto_stop
        elif [ -d "$TMPDIR" ]; then
                echo "Error: this container may already exist (override with -f)"
                exit 1
        fi

        # auto GPU check: use the GPU only if nvidia-smi can list one
        if [ "$DEVICE" == auto ]; then
                DEVICE=cpu
                if nvidia-smi -L >& /dev/null; then
                        DEVICE=gpu
                fi
        fi
        echo "CMS_TRITON_CHOSEN_DEVICE: $DEVICE"
        $EXTRA_FN

        # do not apply thread control settings if GPU use is requested
        if [ "$DEVICE" == gpu ]; then
                THREADCONTROL=""
        fi

        handle_ports
        PORT_EXIT=$?
        if [ "$PORT_EXIT" -ne 0 ]; then
                exit $PORT_EXIT
        fi

        # if parent PID is provided, automatically stop server when finished
        # do this before actually trying to start the server in case of ctrl+c
        if [ -n "$PARENTPID" ]; then
                auto_stop "" "$PARENTPID" >& "$STOPLOG" &
        fi

        START_EXIT=0
        for ((counter=0; counter < ${RETRIES}; counter++)); do
                make_tmp

                # if we plan on editing model configs, must copy files into /tmp/local_model_repo, which is deleted upon retry
                if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then
                        list_models
                fi

                # only need to check drivers if using GPU
                if [ "$DEVICE" == gpu ]; then
                        check_drivers
                        DRIVER_EXIT=$?
                        if [ "$DRIVER_EXIT" -ne 0 ]; then
                                exit $DRIVER_EXIT
                        fi
                fi

                $START_FN
                START_EXIT=$?
                if [ "$START_EXIT" -eq 0 ]; then
                        wait_server
                        WAIT_EXIT=$?
                        if [ "$WAIT_EXIT" -eq 0 ]; then
                                break
                        fi
                        # allow to keep retrying if there was a port issue:
                        # this attempt is not counted against RETRIES
                        counter=$((counter-1))
                        START_EXIT="$WAIT_EXIT"
                fi

                if [ "$START_EXIT" -ne 0 ]; then
                        auto_stop
                        echo "Retrying after container issue..."
                fi
        done
        if [ "$START_EXIT" -ne 0 ]; then
                echo "Error from $PROG_NAME"
                exit "$START_EXIT"
        fi
        ;;
*)
        # stop: check for tmp dir
        if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
                $DRYRUN cd "$TMPDIR"
        elif [ -z "$FORCE" ]; then
                echo "Error: attempt to stop unknown container $SERVER"
                exit 1
        fi

        auto_stop
        ;;
esac