File indexing completed on 2023-01-31 03:06:34
0001
0002
0003
# ---- Default settings; each one can be overridden by a command-line option ----

# Boolean-like switches: an empty string means "off".
USEDOCKER=
GPU=
FORCE=
DRYRUN=
AUTOPORT=
CLEANUP=true

# VERBOSE stays empty unless -v copies VERBOSE_ARGS into it.
VERBOSE=
VERBOSE_ARGS='--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1'

# Server instance identity and startup policy.
SERVER='triton_server_instance'
IMAGE='fastml/triton-torchgeo:21.06-py3-geometric'
WTIME=300
RETRIES=3
PARENTPID=

# Port range: find_ports advances BASEPORT in steps of NPORTS.
BASEPORT=8000
NPORTS=3

# Model repositories (-M) and individual model directories (-m).
REPOS=()
MODELS=()

# Filesystem locations; empty values are resolved later in the script.
TMPDIR=
SANDBOX=
COMPAT_USR=
0024
# Print the Apptainer sandbox directory to use.
# Globals:  SANDBOX (read), IMAGE (read)
# Outputs:  $SANDBOX when it is set, otherwise the image's location under
#           /cvmfs/unpacked.cern.ch, one line on stdout
get_sandbox() {
  if [ -n "$SANDBOX" ]; then
    printf '%s\n' "$SANDBOX"
  else
    # Quoted so that an unusual image name is printed verbatim instead of
    # being word-split or glob-expanded.
    printf '%s\n' "/cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}"
  fi
}
0032
# Print the help text to stdout and exit.
# Globals:   IMAGE, SERVER, BASEPORT, RETRIES, WTIME (read, shown as defaults)
# Arguments: $1 - exit status to terminate with (0 when omitted)
# Note:      backslash escapes such as \t in the lines below are expanded by
#            printf's %b, matching the previous "echo -e" output.
usage() {
  local -a help_lines=(
    "cmsTriton [options] [start|stop|check]"
    ""
    "Options:"
    "-c \t don't cleanup temporary dir (for debugging)"
    "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
    "-D \t dry run: print container commands rather than executing them"
    "-d \t use Docker instead of Apptainer"
    "-f \t force reuse of (possibly) existing container instance"
    "-g \t use GPU instead of CPU"
    "-i [name] \t server image name (default: ${IMAGE})"
    "-M [dir] \t model repository (can be given more than once)"
    "-m [dir] \t specific model directory (can be given more than once)"
    "-n [name] \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
    "-P [port] \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
    "-p [pid] \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
    "-r [num] \t number of retries when starting container (default: ${RETRIES})"
    "-s [dir] \t Apptainer sandbox directory (default: $(get_sandbox))"
    "-t [dir] \t non-default hidden temporary dir"
    "-v \t (verbose) start: activate server debugging info; stop: keep server logs"
    "-w [time] \t maximum time to wait for server to start (default: ${WTIME} seconds)"
    "-h \t print this message and exit"
    ""
    "Operations:"
    "start \t start server"
    "stop \t stop server"
    "check \t check if server can run on this system"
  )
  printf '%b\n' "${help_lines[@]}"
  exit "${1:-0}"
}
0063
0064
# Host shared-memory directory: /run/shm when it exists, /dev/shm otherwise.
if [ -e /run/shm ]; then
  SHM=/run/shm
else
  SHM=/dev/shm
fi
0069
# Parse the command-line options into the global settings.
# Globals:   CLEANUP, COMPAT_USR, DRYRUN, USEDOCKER, FORCE, GPU, IMAGE, REPOS,
#            MODELS, SERVER, AUTOPORT, BASEPORT, PARENTPID, RETRIES, SANDBOX,
#            TMPDIR, VERBOSE, WTIME (written); VERBOSE_ARGS, PPID (read);
#            OPTIND is left pointing at the first non-option argument.
# Arguments: the script's command line
# Exits:     via usage: status 0 for -h, status 1 for an unknown option or a
#            missing option argument (previously these were silently skipped).
parse_options() {
  local opt
  while getopts "cC:Ddfgi:M:m:n:P:p:r:s:t:vw:h" opt; do
    case "$opt" in
      c) CLEANUP="" ;;
      C) COMPAT_USR="$OPTARG" ;;
      D) DRYRUN=echo ;;
      d) USEDOCKER=true ;;
      f) FORCE=true ;;
      g) GPU=true ;;
      i) IMAGE="$OPTARG" ;;
      M) REPOS+=("$OPTARG") ;;
      m) MODELS+=("$OPTARG") ;;
      n) SERVER="$OPTARG" ;;
      P)
        # -1 requests automatic port discovery; compared as a string so a
        # non-numeric argument does not trigger a test(1) error.
        if [ "$OPTARG" = "-1" ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
        ;;
      p)
        # -1 means "watch the process that launched this script".
        if [ "$OPTARG" = "-1" ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
        ;;
      r) RETRIES="$OPTARG" ;;
      s) SANDBOX="$OPTARG" ;;
      t) TMPDIR="$OPTARG" ;;
      v) VERBOSE="$VERBOSE_ARGS" ;;
      w) WTIME="$OPTARG" ;;
      h) usage 0 ;;
      *) usage 1 ;;
    esac
  done
}

parse_options "$@"
0110
# Drop the parsed options; the first remaining argument is the operation.
shift $((OPTIND - 1))
OP=$1

# Only start, stop and check are valid operations.
case "$OP" in
  start | stop | check) ;;
  *) usage 1 ;;
esac

# RETRIES is used as an arithmetic loop bound when starting the server. A
# non-integer value would make that loop run zero times and report success
# without starting anything, so reject it here.
if ! [[ "$RETRIES" =~ ^-?[0-9]+$ ]]; then
  echo "Error: number of retries must be an integer (got '$RETRIES')"
  usage 1
fi
# At least one start attempt is always made.
if [ "$RETRIES" -le 0 ]; then
  RETRIES=1
fi

# Working directory: a hidden per-server directory under the current directory
# unless -t supplied one, in which case it is made absolute.
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
  TMPDIR="${TOPDIR}/.${SERVER}"
else
  # Quoted so that a directory name containing spaces is resolved as one path.
  TMPDIR=$(readlink -f "$TMPDIR")
fi
0128
0129
# Choose the container runtime command. A DOCKER or APPTAINER value that is
# already set (e.g. from the environment) is kept as-is.
if [ -n "$USEDOCKER" ]; then
  DOCKER=${DOCKER:-sudo docker}
elif [ -z "$APPTAINER" ]; then
  # Prefer apptainer; fall back to singularity.
  if type apptainer >& /dev/null; then
    APPTAINER=apptainer
  elif type singularity >& /dev/null; then
    APPTAINER=singularity
  else
    echo "Missing apptainer and singularity"
    exit 1
  fi
fi
0146
0147
# Resolve the sandbox location to an absolute, symlink-free path. The
# expansion is quoted so that a path containing spaces survives intact.
SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f "${SANDBOX}")

# Log file names (relative to the working directory).
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"

# Local directory that start_apptainer bind-mounts over /opt/tritonserver/lib.
LIB=lib

# Strings searched for in the server log by wait_server.
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"

# Extra runtime flags (set to the GPU option when -g is given).
EXTRA=""

# Script inside the image that check_drivers copies out and patches.
COMPAT_SCRIPT=/etc/shinit_v2
0157
# Derive the three service ports from BASEPORT and export them.
# Globals: BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (written, exported)
compute_ports() {
  HTTPPORT=$BASEPORT
  GRPCPORT=$((BASEPORT + 1))
  METRPORT=$((BASEPORT + 2))
  export HTTPPORT GRPCPORT METRPORT
}
0164
# Test whether a TCP port is free.
# Arguments: $1 - port number
# Returns:   1 when opening /dev/tcp/0.0.0.0/<port> succeeds (something is
#            listening), 0 when it fails (port treated as free)
check_port() {
  # Redirection-only command: stderr is silenced first so that the failed
  # connection attempt prints nothing.
  2>/dev/null >"/dev/tcp/0.0.0.0/$1" && return 1
  return 0
}
0173
# Check that all three service ports are free.
# Globals: HTTPPORT, GRPCPORT, METRPORT (read)
# Returns: 0 when check_port succeeds for every port, 1 as soon as one fails
check_ports() {
  local port
  for port in "$HTTPPORT" "$GRPCPORT" "$METRPORT"; do
    check_port "$port" || return 1
  done
  return 0
}
0181
# Advance BASEPORT in steps of NPORTS until check_ports reports a free range,
# then announce the gRPC port that was selected.
# Globals: BASEPORT (read/written), NPORTS, GRPCPORT (read)
# Outputs: "CMS_TRITON_GRPC_PORT: <port>" on stdout
find_ports() {
  until check_ports; do
    BASEPORT=$((BASEPORT + NPORTS))
    compute_ports
  done
  echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0189
# Settle on the service ports and build the matching tritonserver arguments.
# Globals: AUTOPORT (read); PORTARGS (written, exported); ports via compute_ports
# Returns: 1 (after printing an error) when a fixed port is requested and
#          check_ports reports it busy; otherwise the status of the export
handle_ports() {
  compute_ports
  if [ -z "$AUTOPORT" ]; then
    # Fixed ports were requested: they must all be free.
    if ! check_ports; then
      echo "Error: requested port in use"
      return 1
    fi
  else
    find_ports
  fi
  export PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
}
0201
# Start the Triton server in a detached Docker container.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, DRYRUN, DOCKER, SERVER, IMAGE,
#          HTTPPORT, GRPCPORT, METRPORT, EXTRA, PORTARGS, VERBOSE (read)
# Returns: status of the "docker run" command (or of the dry-run echo)
start_docker() {
  local repo
  # Arguments are collected in arrays so that repository paths containing
  # spaces reach docker as single arguments.
  local -a mount_args=() repo_args=()

  # Mount every model repository at the same path inside the container.
  for repo in "${REPOS[@]}"; do
    mount_args+=("-v${repo}:${repo}")
    repo_args+=("--model-repository=${repo}")
  done

  # Compatibility drivers (and the patched script, when one was prepared).
  if [ -n "$COMPAT" ]; then
    mount_args+=("-v${COMPAT}")
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      mount_args+=("-v${COMPAT_SCRIPT_MOUNT}")
    fi
  fi

  # DRYRUN, DOCKER, EXTRA, PORTARGS and VERBOSE may each hold several words
  # and are therefore left unquoted on purpose.
  $DRYRUN $DOCKER run -d --name "${SERVER}" \
    --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
    -p"${HTTPPORT}:${HTTPPORT}" -p"${GRPCPORT}:${GRPCPORT}" -p"${METRPORT}:${METRPORT}" \
    $EXTRA "${mount_args[@]}" \
    "${IMAGE}" tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE
}
0224
# Start the Triton server as an Apptainer instance and launch tritonserver in it.
# Globals: LIB, SANDBOX, REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, SHM, EXTRA, SERVER,
#          DRYRUN, APPTAINER, LOG, PORTARGS, VERBOSE (read); PATH (may be modified)
# Returns: the non-zero status of "instance start" when it fails; otherwise the
#          status of the last command (the background launch, or the dry-run wait)
start_apptainer() {
  local repo ldconfig_dir start_exit redir
  # Arguments are collected in arrays so that repository paths containing
  # spaces reach the runtime as single arguments.
  local -a mount_args=() repo_args=()

  # Populate a local directory with symlinks to the image's Triton libraries;
  # it is bind-mounted over /opt/tritonserver/lib below.
  $DRYRUN mkdir "${LIB}"
  $DRYRUN ln -s "${SANDBOX}"/opt/tritonserver/lib/* "${LIB}/"

  # Bind every model repository into the container.
  for repo in "${REPOS[@]}"; do
    mount_args+=(-B "$repo")
    repo_args+=("--model-repository=${repo}")
  done

  # Compatibility drivers (and the patched script, when one was prepared).
  if [ -n "$COMPAT" ]; then
    mount_args+=(-B "$COMPAT")
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      mount_args+=(-B "$COMPAT_SCRIPT_MOUNT")
    fi
  fi

  # When /.singularity.d/libs exists, put a wrapper named ldconfig first in
  # PATH that redirects the cache to a private temporary file, then build that
  # cache from /.singularity.d/libs.
  # NOTE(review): presumably needed when this script itself runs inside a
  # Singularity/Apptainer container - confirm.
  if [ -d /.singularity.d/libs ]; then
    ldconfig_dir=$(mktemp -d)
    printf '%s\n' '#!/bin/bash' \
      "exec /usr/sbin/ldconfig -C ${ldconfig_dir}/ld.so.cache \"\$@\"" \
      > "${ldconfig_dir}/ldconfig"
    chmod +x "${ldconfig_dir}/ldconfig"
    PATH=${ldconfig_dir}:$PATH
    ldconfig /.singularity.d/libs
  fi

  # DRYRUN, APPTAINER, EXTRA, PORTARGS and VERBOSE may each hold several words
  # and are therefore left unquoted on purpose.
  $DRYRUN $APPTAINER instance start \
    -B "${SHM}:/run/shm" -B "${LIB}:/opt/tritonserver/lib" -B "${SANDBOX}" \
    "${mount_args[@]}" $EXTRA \
    "${SANDBOX}" "${SERVER}"
  start_exit=$?
  if [ "$start_exit" -ne 0 ]; then
    # The instance did not come up: remove the symlink directory again.
    rm -rf -- "${LIB}"
    return "$start_exit"
  fi

  # Server output goes to the log file, or to stdout in a dry run.
  if [ -z "$DRYRUN" ]; then
    redir="$LOG"
  else
    redir=/dev/stdout
  fi
  $DRYRUN $APPTAINER run "instance://${SERVER}" \
    tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE >& "${redir}" &
  # In a dry run, wait for the background echo so its output is complete.
  [ -z "$DRYRUN" ] || wait
}
0282
# Stop and remove the Docker container.
# Globals: DRYRUN, VERBOSE, DOCKER, SERVER, LOG (read)
# Returns: status of the final "docker rm" (or of the dry-run echo)
stop_docker() {
  # In verbose mode (and not a dry run) save the container logs first.
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    $DOCKER logs "${SERVER}" >& "$LOG"
  fi

  $DRYRUN $DOCKER stop "${SERVER}"
  $DRYRUN $DOCKER rm "${SERVER}"
}
0292
# Stop the Apptainer instance named by SERVER.
# Globals: DRYRUN, APPTAINER, SERVER (read)
# Returns: status of "instance stop" (or of the dry-run echo)
stop_apptainer() {
  $DRYRUN $APPTAINER instance stop "${SERVER}"
}
0296
# Search the Docker container's logs (stdout and stderr) for a pattern.
# Globals:   DOCKER, SERVER (read)
# Arguments: $1 - grep pattern
# Returns:   grep's status (0 when the pattern is found)
test_docker() {
  ${DOCKER} logs "${SERVER}" 2>&1 | grep "$1"
}
0301
# Search the server log file for a pattern.
# Globals:   LOG (read)
# Arguments: $1 - grep pattern
# Returns:   grep's status (0 when the pattern is found)
test_apptainer() {
  grep "$1" "$LOG"
}
0305
# Poll the server log once per second until the server reports it has started.
# Globals: DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME (read)
# Outputs: "server is ready!" on success, a timeout message on failure
# Returns: 0 when the server is ready (or immediately in a dry run);
#          1 when the log shows SEGFAULT_INDICATOR and handle_ports succeeded,
#          so that the caller can start the server again
# Exits:   with handle_ports' status when it fails after such a log message;
#          with 1 once WTIME seconds have been waited without success.
#          In both cases auto_stop is run first, in verbose mode.
wait_server() {
  local elapsed=0 port_exit

  if [ -n "$DRYRUN" ]; then
    return 0
  fi

  until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
    if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
      handle_ports
      port_exit=$?
      if [ "$port_exit" -ne 0 ]; then
        auto_stop true
        exit "$port_exit"
      fi
      return 1
    fi

    # -ge (not -gt): stop after exactly WTIME one-second waits, as the
    # -w option documents, rather than after WTIME + 1.
    if [ "$elapsed" -ge "$WTIME" ]; then
      echo "timed out waiting for server to start"
      auto_stop true
      exit 1
    fi

    elapsed=$((elapsed + 1))
    sleep 1
  done

  echo "server is ready!"
}
0335
# Turn the -m model directories into model repositories and normalise REPOS.
# For each entry of MODELS: when it is a regular file its directory is taken as
# the model directory; the parent of the model directory is added to REPOS.
# All repositories are then resolved with "readlink -f" and de-duplicated.
# Globals: MODELS (read), REPOS (read/written)
list_models() {
  local model r
  local -a unique=()

  for model in "${MODELS[@]}"; do
    if [ -f "$model" ]; then
      model=$(dirname "$model")
    fi
    REPOS+=("$(dirname "$model")")
  done

  for ((r = 0; r < ${#REPOS[@]}; r++)); do
    REPOS[$r]=$(readlink -f "${REPOS[$r]}")
  done

  # De-duplicate line by line so that paths containing spaces stay whole.
  # Empty lines (an empty REPOS, or a path readlink could not resolve) are
  # dropped.
  mapfile -t unique < <(printf '%s\n' "${REPOS[@]}" | sort -u | sed '/^$/d')
  REPOS=("${unique[@]}")
}
0352
# Stop the server, optionally after waiting for a watched process to end.
# Globals:   VERBOSE, PARENTPID, PCOUNTER, PMAX (written); VERBOSE_ARGS,
#            STOP_FN, DRYRUN, LOG, STOPLOG, TOPDIR, CLEANUP, TMPDIR (read)
# Arguments: $1 - when non-empty, switch verbose mode on
#            $2 - PID to watch; when empty the server is stopped right away
# After stopping: in verbose mode (and not a dry run) the log files are moved to
# TOPDIR; when CLEANUP is set the working directory TMPDIR is removed.
auto_stop() {
  [ -z "$1" ] || VERBOSE="$VERBOSE_ARGS"
  PARENTPID="$2"

  if [ -n "$PARENTPID" ]; then
    if [ -n "$VERBOSE" ]; then
      echo "watching PID $PARENTPID"
      ps
    fi

    # Poll once per second; the loop ends only after more than PMAX
    # consecutive polls found the process gone. A poll that finds it alive
    # resets the counter.
    PCOUNTER=0
    PMAX=5
    until [ "$PCOUNTER" -gt "$PMAX" ]; do
      if kill -0 $PARENTPID >& /dev/null; then
        if [ "$PCOUNTER" -gt 0 ] && [ -n "$VERBOSE" ]; then
          echo "reset:"
          ps
        fi
        PCOUNTER=0
      else
        PCOUNTER=$((PCOUNTER+1))
        if [ -n "$VERBOSE" ]; then
          echo "trigger $PCOUNTER:"
          ps
        fi
      fi
      sleep 1
    done
  fi

  $STOP_FN

  # Keep the logs: move them next to where the script was invoked.
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    mv "$LOG" "$TOPDIR"
    # The stop log is only kept when it has content.
    if [ -s "$STOPLOG" ]; then
      mv "$STOPLOG" "$TOPDIR"
    fi
  fi

  if [ -n "$CLEANUP" ]; then
    $DRYRUN cd "$TOPDIR"
    $DRYRUN rm -rf "$TMPDIR"
  fi
}
0402
# Create the working directory TMPDIR and change into it.
# Globals: DRYRUN, TMPDIR (read); MKDIR_EXIT (written)
# Exits:   with mkdir's status, after printing a message, when the directory
#          cannot be created (e.g. it already exists)
make_tmp() {
  $DRYRUN mkdir "$TMPDIR"
  MKDIR_EXIT=$?
  if [ "$MKDIR_EXIT" -eq 0 ]; then
    $DRYRUN cd "$TMPDIR"
    return
  fi

  echo "Could not create temp dir: $TMPDIR"
  exit "$MKDIR_EXIT"
}
0413
# Query a tag of a scram tool from within the CMSSW area.
# Globals:   CMSSW_BASE (read)
# Arguments: $1 - tool name, $2 - tag name
# Outputs:   whatever "scram tool tag" prints (nothing when it fails or when
#            the CMSSW area cannot be entered); scram's stderr is discarded
# Returns:   always 0 - a failed lookup is reported as empty output
scram_tag() {
  local tool="$1" tag="$2"

  [ -n "$CMSSW_BASE" ] || return 0

  # Run in a subshell so that the directory change never leaks into the
  # caller, and skip scram entirely when the area cannot be entered.
  (
    cd "$CMSSW_BASE" 2> /dev/null || exit 0
    scram tool tag "$tool" "$tag" 2> /dev/null
  ) || true
}
0420
# Print the CUDA driver version the Docker image was built for.
# Runs "env" in a throw-away container and extracts CUDA_DRIVER_VERSION.
# Globals: DOCKER, IMAGE (read)
# Outputs: the bare version string (e.g. 470.57.02), the same form that
#          driver_apptainer prints
# Returns: grep's non-zero status when the image does not define the variable
driver_docker() {
  local line
  # Anchored so that variables merely ending in CUDA_DRIVER_VERSION are ignored.
  line=$($DOCKER run --rm --entrypoint env "${IMAGE}" | grep -m 1 "^CUDA_DRIVER_VERSION=") || return
  printf '%s\n' "${line#CUDA_DRIVER_VERSION=}"
}
0424
# Print the CUDA driver version the Apptainer sandbox was built for.
# Sources the sandbox's 10-docker2singularity.sh environment file and prints
# the CUDA_DRIVER_VERSION it defines.
# Globals: SANDBOX (read)
# Returns: non-zero when the environment file cannot be sourced
driver_apptainer() {
  # Subshell: the image's environment must not leak into this script.
  (
    source "${SANDBOX}/.singularity.d/env/10-docker2singularity.sh" \
      && printf '%s\n' "$CUDA_DRIVER_VERSION"
  )
}
0428
# Copy COMPAT_SCRIPT out of the Docker image into the current directory.
# A container is created (never started) just to copy from it and is removed
# again afterwards; "--rm" alone would not do that, since it only acts when a
# container exits.
# Globals: DOCKER, IMAGE, COMPAT_SCRIPT (read)
# Returns: status of "docker create" when it fails, else status of "docker cp"
compat_docker() {
  local container copy_exit

  container=$($DOCKER create "${IMAGE}") || return

  $DOCKER cp "${container}:${COMPAT_SCRIPT}" .
  copy_exit=$?

  # Remove the helper container whether or not the copy worked.
  $DOCKER rm "$container" > /dev/null
  return "$copy_exit"
}
0432
# Copy COMPAT_SCRIPT out of the Apptainer sandbox into the current directory.
# Globals: SANDBOX, COMPAT_SCRIPT (read)
# Returns: status of cp
compat_apptainer() {
  # Quoted so that a sandbox path containing spaces is treated as one path.
  cp -- "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0436
# Decide whether the host's Nvidia driver is new enough for the image and, if
# it is not, look for compatibility drivers to mount into the container.
# Globals: DRIVER_FN, COMPAT_USR, CMSSW_BASE, TMPDIR, DRYRUN, COMPAT_FN,
#          COMPAT_SCRIPT (read); CUDA_DRIVER_VERSION, DRIVER_VERSION, COMPAT
#          (exported), CUDA_BASE, COMPAT_CMSSW, COMPAT_SYS, COMPAT_SCRIPT_BASE,
#          COMPAT_SCRIPT_MOUNT (written)
# Returns: 0 when the host driver's major version is at least the image's, or
#          when a compatibility directory was found; 1 otherwise
check_drivers() {
  # Version required by the image vs. version of the loaded kernel module
  # (empty when /proc/driver/nvidia/version cannot be read).
  CUDA_DRIVER_VERSION=$($DRIVER_FN)
  DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)

  # Compare major versions only (everything before the first dot).
  [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]] && return 0

  export COMPAT=""

  # Candidate 1: directory given with -C.
  if [[ -n "$COMPAT_USR" && -d "$COMPAT_USR" ]]; then
    COMPAT="$COMPAT_USR"
  fi

  # Candidate 2: the drivers directory of the CUDA tool known to scram.
  if [[ -z "$COMPAT" && -n "$CMSSW_BASE" ]]; then
    CUDA_BASE=$(scram_tag cuda CUDA_BASE)
    COMPAT_CMSSW="${CUDA_BASE}/drivers"
    if [[ -n "$CUDA_BASE" && -d "$COMPAT_CMSSW" ]]; then
      COMPAT="$COMPAT_CMSSW"
    fi
  fi

  # Candidate 3: the system-wide CUDA compatibility directory.
  COMPAT_SYS=/usr/local/cuda/compat
  if [[ -z "$COMPAT" && -d "$COMPAT_SYS" ]]; then
    COMPAT="$COMPAT_SYS"
  fi

  # Nothing usable was found.
  [ -n "$COMPAT" ] || return 1

  # Mount specification: host directory -> lib.real under COMPAT_SYS.
  export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"

  # Only when already inside the working directory: fetch the image's
  # compatibility script, append an LD_PRELOAD reset to its LD_LIBRARY_PATH
  # assignment, and arrange for the patched copy to be mounted over the original.
  if [ "$PWD" == "$TMPDIR" ]; then
    $DRYRUN $COMPAT_FN
    COMPAT_SCRIPT_BASE=${COMPAT_SCRIPT##*/}
    $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE
    COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_BASE:$COMPAT_SCRIPT"
  fi
  return 0
}
0486
# Bind the backend-specific helper functions, the program name used in error
# messages, and (with -g) the backend's GPU flag.
if [ -z "$USEDOCKER" ]; then
  PROG_NAME=Apptainer
  START_FN=start_apptainer
  STOP_FN=stop_apptainer
  TEST_FN=test_apptainer
  DRIVER_FN=driver_apptainer
  COMPAT_FN=compat_apptainer
  if [ -n "$GPU" ]; then
    EXTRA="--nv"
  fi
else
  PROG_NAME=Docker
  START_FN=start_docker
  STOP_FN=stop_docker
  TEST_FN=test_docker
  DRIVER_FN=driver_docker
  COMPAT_FN=compat_docker
  if [ -n "$GPU" ]; then
    EXTRA="--gpus all"
  fi
fi
0508
# ---- Carry out the requested operation ----
case "$OP" in
  check)
    # Report through the exit status whether the server can run here.
    check_drivers
    exit $?
    ;;

  start)
    # An existing working directory suggests the server is already running:
    # refuse, unless -f asks for whatever is there to be stopped first.
    if [ -n "$FORCE" ]; then
      auto_stop
    elif [ -d "$TMPDIR" ]; then
      echo "Error: this container may already exist (override with -f)"
      exit 1
    fi

    handle_ports || exit $?

    list_models

    make_tmp

    # Runs after make_tmp, i.e. from inside the working directory.
    check_drivers || exit $?

    # Background watcher: stops the server once the watched process is gone.
    if [ -n "$PARENTPID" ]; then
      auto_stop "" "$PARENTPID" >& "$STOPLOG" &
    fi

    START_EXIT=0
    counter=0
    while ((counter < RETRIES)); do
      # A failed attempt was cleaned up by auto_stop, so set up again.
      if [ "$START_EXIT" -ne 0 ]; then make_tmp; fi

      $START_FN
      START_EXIT=$?
      if [ "$START_EXIT" -eq 0 ]; then
        wait_server
        WAIT_EXIT=$?
        if [ "$WAIT_EXIT" -eq 0 ]; then
          break
        fi
        # wait_server asked for a restart: this attempt is not counted
        # against the retry budget.
        counter=$((counter-1))
        START_EXIT="$WAIT_EXIT"
      fi

      # Reached only when the attempt failed (success leaves the loop above).
      auto_stop
      echo "Retrying after container issue..."

      counter=$((counter+1))
    done

    if [ "$START_EXIT" -ne 0 ]; then
      echo "Error from $PROG_NAME"
      exit "$START_EXIT"
    fi
    ;;

  *)
    # stop: enter the working directory of the running server, if there is one.
    if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
      $DRYRUN cd "$TMPDIR"
    elif [ -z "$FORCE" ]; then
      echo "Error: attempt to stop unknown container $SERVER"
      exit 1
    fi

    auto_stop
    ;;
esac
0577 fi