diff --git a/DataFormats/PyTorchTest/interface/Device.h b/DataFormats/PyTorchTest/interface/Device.h
index d25ba9c1d9d..5adf6a5b01f 100644
--- a/DataFormats/PyTorchTest/interface/Device.h
+++ b/DataFormats/PyTorchTest/interface/Device.h
@@ -4,17 +4,16 @@
 #include "DataFormats/Portable/interface/PortableDeviceCollection.h"
 #include "DataFormats/PyTorchTest/interface/Layout.h"
-
 namespace torchportable {
 
-template <typename TDev>
-using ParticleCollectionDevice = PortableDeviceCollection<ParticleSoA, TDev>;
+  template <typename TDev>
+  using ParticleCollectionDevice = PortableDeviceCollection<ParticleSoA, TDev>;
 
-template <typename TDev>
-using ClassificationCollectionDevice = PortableDeviceCollection<ClassificationSoA, TDev>;
+  template <typename TDev>
+  using ClassificationCollectionDevice = PortableDeviceCollection<ClassificationSoA, TDev>;
 
-template <typename TDev>
-using RegressionCollectionDevice = PortableDeviceCollection<RegressionSoA, TDev>;
+  template <typename TDev>
+  using RegressionCollectionDevice = PortableDeviceCollection<RegressionSoA, TDev>;
 
 }  // namespace torchportable
diff --git a/DataFormats/PyTorchTest/interface/Host.h b/DataFormats/PyTorchTest/interface/Host.h
index 72a96d5d223..271838576a5 100644
--- a/DataFormats/PyTorchTest/interface/Host.h
+++ b/DataFormats/PyTorchTest/interface/Host.h
@@ -6,9 +6,9 @@
 
 namespace torchportable {
 
-using ParticleCollectionHost = PortableHostCollection<ParticleSoA>;
-using ClassificationCollectionHost = PortableHostCollection<ClassificationSoA>;
-using RegressionCollectionHost = PortableHostCollection<RegressionSoA>;
+  using ParticleCollectionHost = PortableHostCollection<ParticleSoA>;
+  using ClassificationCollectionHost = PortableHostCollection<ClassificationSoA>;
+  using RegressionCollectionHost = PortableHostCollection<RegressionSoA>;
 
 }  // namespace torchportable
diff --git a/DataFormats/PyTorchTest/interface/Layout.h b/DataFormats/PyTorchTest/interface/Layout.h
index 91e35f89927..4d1252a3a89 100644
--- a/DataFormats/PyTorchTest/interface/Layout.h
+++ b/DataFormats/PyTorchTest/interface/Layout.h
@@ -7,24 +7,14 @@
 
 namespace torchportable {
 
-GENERATE_SOA_LAYOUT(ParticleLayout,
-    SOA_COLUMN(float, pt),
-    SOA_COLUMN(float, eta),
-    SOA_COLUMN(float, phi)
-)
-using ParticleSoA = ParticleLayout<>;
+  GENERATE_SOA_LAYOUT(ParticleLayout, SOA_COLUMN(float, pt), SOA_COLUMN(float, eta), SOA_COLUMN(float, phi))
+  using ParticleSoA = ParticleLayout<>;
 
+  GENERATE_SOA_LAYOUT(ClassificationLayout, SOA_COLUMN(float, c1), SOA_COLUMN(float, c2))
+  using ClassificationSoA = ClassificationLayout<>;
 
-GENERATE_SOA_LAYOUT(ClassificationLayout,
-    SOA_COLUMN(float, c1),
-    SOA_COLUMN(float, c2)
-)
-using ClassificationSoA = ClassificationLayout<>;
-
-GENERATE_SOA_LAYOUT(RegressionLayout,
-    SOA_COLUMN(float, reco_pt)
-)
-using RegressionSoA = RegressionLayout<>;
+  GENERATE_SOA_LAYOUT(RegressionLayout, SOA_COLUMN(float, reco_pt))
+  using RegressionSoA = RegressionLayout<>;
 
 }  // namespace torchportable
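For orientation (not part of the patch): with the layouts above, a host-side collection is filled through its SoA view. A minimal sketch, assuming the standard CMSSW `PortableHostCollection` interface; `fillExample` is a hypothetical helper:

```cpp
#include "DataFormats/PyTorchTest/interface/Host.h"

// Hypothetical helper: writes placeholder kinematics through the SoA view.
void fillExample(torchportable::ParticleCollectionHost& particles) {
  auto view = particles.view();
  for (int i = 0; i < view.metadata().size(); ++i) {
    view[i].pt() = 10.f;
    view[i].eta() = 0.5f;
    view[i].phi() = 1.2f;
  }
}
```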
diff --git a/DataFormats/PyTorchTest/interface/alpaka/Collections.h b/DataFormats/PyTorchTest/interface/alpaka/Collections.h
index 62d6e3c49e0..5d05794da12 100644
--- a/DataFormats/PyTorchTest/interface/alpaka/Collections.h
+++ b/DataFormats/PyTorchTest/interface/alpaka/Collections.h
@@ -12,40 +12,34 @@
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE::torchportable {
 
-/**
+  /**
  * make the names from the top-level `torchportable` namespace visible for unqualified lookup
  * inside the `ALPAKA_ACCELERATOR_NAMESPACE::torchportable` namespace
  */
-using namespace ::torchportable;
-using ::torchportable::ParticleCollectionHost;
-using ::torchportable::ClassificationCollectionHost;
-using ::torchportable::RegressionCollectionHost;
-using ::torchportable::ParticleCollectionDevice;
-using ::torchportable::ClassificationCollectionDevice;
-using ::torchportable::RegressionCollectionDevice;
-
-using ParticleCollection =
-    std::conditional_t<
-        std::is_same_v<Device, alpaka::DevCpu>,
-        ParticleCollectionHost,
-        ParticleCollectionDevice<Device>>;
-
-using ClassificationCollection =
-    std::conditional_t<
-        std::is_same_v<Device, alpaka::DevCpu>,
-        ClassificationCollectionHost,
-        ClassificationCollectionDevice<Device>>;
-
-using RegressionCollection =
-    std::conditional_t<
-        std::is_same_v<Device, alpaka::DevCpu>,
-        RegressionCollectionHost,
-        RegressionCollectionDevice<Device>>;
+  using namespace ::torchportable;
+  using ::torchportable::ClassificationCollectionDevice;
+  using ::torchportable::ClassificationCollectionHost;
+  using ::torchportable::ParticleCollectionDevice;
+  using ::torchportable::ParticleCollectionHost;
+  using ::torchportable::RegressionCollectionDevice;
+  using ::torchportable::RegressionCollectionHost;
+
+  using ParticleCollection = std::
+      conditional_t<std::is_same_v<Device, alpaka::DevCpu>, ParticleCollectionHost, ParticleCollectionDevice<Device>>;
+
+  using ClassificationCollection = std::conditional_t<std::is_same_v<Device, alpaka::DevCpu>,
+                                                      ClassificationCollectionHost,
+                                                      ClassificationCollectionDevice<Device>>;
+
+  using RegressionCollection = std::conditional_t<std::is_same_v<Device, alpaka::DevCpu>,
+                                                  RegressionCollectionHost,
+                                                  RegressionCollectionDevice<Device>>;
 
 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::torchportable
 
 ASSERT_DEVICE_MATCHES_HOST_COLLECTION(torchportable::ParticleCollection, torchportable::ParticleCollectionHost);
-ASSERT_DEVICE_MATCHES_HOST_COLLECTION(torchportable::ClassificationCollection, torchportable::ClassificationCollectionHost);
+ASSERT_DEVICE_MATCHES_HOST_COLLECTION(torchportable::ClassificationCollection,
+                                      torchportable::ClassificationCollectionHost);
 ASSERT_DEVICE_MATCHES_HOST_COLLECTION(torchportable::RegressionCollection, torchportable::RegressionCollectionHost);
 
 #endif  // DATA_FORMATS__PYTORCH_TEST__INTERFACE__ALPAKA__COLLECTIONS_H_
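Reading aid, not part of the patch: on a CPU-only backend `Device` is `alpaka::DevCpu`, so the conditional alias collapses to the host collection — which is exactly what the `ASSERT_DEVICE_MATCHES_HOST_COLLECTION` lines enforce. A sketch, assuming the serial backend is enabled:

```cpp
#include <type_traits>

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
static_assert(std::is_same_v<ALPAKA_ACCELERATOR_NAMESPACE::torchportable::ParticleCollection,
                             torchportable::ParticleCollectionHost>,
              "on the host backend the portable alias is the host collection");
#endif
```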
diff --git a/PhysicsTools/PyTorch/interface/AlpakaConfig.h b/PhysicsTools/PyTorch/interface/AlpakaConfig.h
index e7ad2ec3520..946a751c4ad 100644
--- a/PhysicsTools/PyTorch/interface/AlpakaConfig.h
+++ b/PhysicsTools/PyTorch/interface/AlpakaConfig.h
@@ -1,4 +1,4 @@
-// ROCm/HIP backend not yet supported, see: https://github.com/pytorch/pytorch/blob/main/aten/CMakeLists.txt#L75
+// ROCm/HIP backend not yet supported, see: https://github.com/pytorch/pytorch/blob/main/aten/CMakeLists.txt#L75
 #ifndef PHYSICS_TOOLS__PYTORCH__INTERFACE__ALPAKA_CONFIG_H_
 #define PHYSICS_TOOLS__PYTORCH__INTERFACE__ALPAKA_CONFIG_H_
@@ -9,13 +9,12 @@
 // #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 #include <c10/cuda/CUDAStream.h>
-#endif
-
+#endif
 
 namespace cms::torch::alpaka {
 
-template <typename T>
-inline constexpr bool false_value = false;
+  template <typename T>
+  inline constexpr bool false_value = false;
 
 /**
  * @brief Specifies the device type used in the torch integration with Alpaka.
@@ -24,18 +23,18 @@ inline constexpr bool false_value = false;
  * Based on the Alpaka accelerator backend, this constant defines the corresponding
  * PyTorch device type (`c10::DeviceType`) for the system.
  */
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CUDA;
+  constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CUDA;
 // #elif ALPAKA_ACC_GPU_HIP_ENABLED
 // constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::HIP;
 #elif ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CPU;
+  constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CPU;
 #elif ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CPU;
+  constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CPU;
 #else
 #error "Could not define the torch device type."
 #endif
 
-/**
+  /**
  * @brief Converts an Alpaka device or queue object to a PyTorch device.
  *
  * This function extracts the native handle from the Alpaka device or queue and
@@ -47,17 +46,17 @@ constexpr c10::DeviceType kTorchDeviceType = c10::DeviceType::CPU;
  * builds the corresponding PyTorch device from it.
  *
  * @param obj The Alpaka object (Device or Queue) to convert.
  * @return Corresponding PyTorch device.
  */
-template <typename T>
-inline ::torch::Device device(const T& obj) {
-  if constexpr (::alpaka::isDevice<T>)
-    return ::torch::Device(kTorchDeviceType, obj.getNativeHandle());
-  else if constexpr (::alpaka::isQueue<T>)
-    return ::torch::Device(kTorchDeviceType, ::alpaka::getDev(obj).getNativeHandle());
-  else
-    static_assert(false_value<T>, "Unsupported type passed to device()");
-}
+  template <typename T>
+  inline ::torch::Device device(const T &obj) {
+    if constexpr (::alpaka::isDevice<T>)
+      return ::torch::Device(kTorchDeviceType, obj.getNativeHandle());
+    else if constexpr (::alpaka::isQueue<T>)
+      return ::torch::Device(kTorchDeviceType, ::alpaka::getDev(obj).getNativeHandle());
+    else
+      static_assert(false_value<T>, "Unsupported type passed to device()");
+  }
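A minimal usage sketch for `device()` (not part of the patch; `bindModel` is a hypothetical helper):

```cpp
// Translate an Alpaka queue into the torch device that owns its native
// stream, then move a JIT module there.
template <typename TQueue>
void bindModel(const TQueue& queue, ::torch::jit::script::Module& model) {
  ::torch::Device dev = cms::torch::alpaka::device(queue);  // kCPU or CUDA:<ordinal>
  model.to(dev);
}
```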
 
-/**
+  /**
  * @brief Base class for managing guard scopes that ensure thread safety
  * when working with Alpaka queues and PyTorch models.
  *
@@ -66,133 +65,118 @@ inline ::torch::Device device(const T& obj) {
  * Derived classes supply the backend-specific `set_impl()`/`reset_impl()` (CRTP).
  *
  * @tparam TQueue The type of Alpaka queue.
  */
-template <typename TDerived, typename TQueue>
-class GuardScope {
- public:
-  explicit GuardScope(const TQueue &queue) : queue_(queue) { set(); }
-  ~GuardScope() { reset(); };
-
- protected:
-  const TQueue& queue_;
-
- private:
-
-  void set() {
-    /**
+  template <typename TDerived, typename TQueue>
+  class GuardScope {
+  public:
+    explicit GuardScope(const TQueue &queue) : queue_(queue) { set(); }
+    ~GuardScope() { reset(); };
+
+  protected:
+    const TQueue &queue_;
+
+  private:
+    void set() {
+      /**
      * @brief Sets the guard to disable multi-threading and control PyTorch's threading model.
      * @note Global call.
      */
-    [[maybe_unused]] static bool threading_guard = [] {
-      at::set_num_threads(1);
-      at::set_num_interop_threads(1);
-      return true;
-    } ();
-    static_cast<TDerived *>(this)->set_impl();
-  }
+      [[maybe_unused]] static bool threading_guard = [] {
+        at::set_num_threads(1);
+        at::set_num_interop_threads(1);
+        return true;
+      }();
+      static_cast<TDerived *>(this)->set_impl();
+    }
 
-  /**
+    /**
    * @brief Resets the guard state, restoring the previous configuration.
    */
-  void reset() {
-    static_cast<TDerived *>(this)->reset_impl();
-  }
-};
+    void reset() { static_cast<TDerived *>(this)->reset_impl(); }
+  };
 
-/**
+  /**
  * @brief Traits selecting the GuardScope implementation for each Alpaka queue type.
  */
-template <typename TQueue>
-struct GuardTraits;
+  template <typename TQueue>
+  struct GuardTraits;
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-/**
+  /**
  * @brief Guard for CUDA-based operations when using Alpaka with the CUDA backend.
  *
  * This class manages the stream switching required when running Alpaka queues on a CUDA device.
  * It ensures that the correct stream is set before operations and resets it afterward.
  */
-class CudaAsyncGuard : public GuardScope<CudaAsyncGuard, alpaka::QueueCudaRtNonBlocking> {
- public:
-  using Base = GuardScope<CudaAsyncGuard, alpaka::QueueCudaRtNonBlocking>;
-  using Base::Base;
-
-  void set_impl() {
-    prev_stream_ = c10::cuda::getCurrentCUDAStream();
-    auto dev = device(this->queue_);
-    auto stream = c10::cuda::getStreamFromExternal(this->queue_.getNativeHandle(), dev.index());
-    c10::cuda::setCurrentCUDAStream(stream);
-  }
-
-  void reset_impl() {
-    c10::cuda::setCurrentCUDAStream(prev_stream_);
-  }
-
- private:
-  c10::cuda::CUDAStream prev_stream_ = c10::cuda::getDefaultCUDAStream();
-};
-
-template <>
-struct GuardTraits<alpaka::QueueCudaRtNonBlocking> {
-  using type = CudaAsyncGuard;
-};
+  class CudaAsyncGuard : public GuardScope<CudaAsyncGuard, alpaka::QueueCudaRtNonBlocking> {
+  public:
+    using Base = GuardScope<CudaAsyncGuard, alpaka::QueueCudaRtNonBlocking>;
+    using Base::Base;
+
+    void set_impl() {
+      prev_stream_ = c10::cuda::getCurrentCUDAStream();
+      auto dev = device(this->queue_);
+      auto stream = c10::cuda::getStreamFromExternal(this->queue_.getNativeHandle(), dev.index());
+      c10::cuda::setCurrentCUDAStream(stream);
+    }
+
+    void reset_impl() { c10::cuda::setCurrentCUDAStream(prev_stream_); }
+
+  private:
+    c10::cuda::CUDAStream prev_stream_ = c10::cuda::getDefaultCUDAStream();
+  };
+
+  template <>
+  struct GuardTraits<alpaka::QueueCudaRtNonBlocking> {
+    using type = CudaAsyncGuard;
+  };
 
 // #elif ALPAKA_ACC_GPU_HIP_ENABLED
 // Similar structure can be added for HIP support if needed. torch::cuda ns is hip when using ROCm/HIP
 
 #elif ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-/**
+  /**
  * @brief Guard for serial CPU operations using Alpaka with CPU serial backend.
  */
-class CpuSerialSyncGuard : public GuardScope<CpuSerialSyncGuard, alpaka::QueueCpuBlocking> {
- public:
-  using Base = GuardScope<CpuSerialSyncGuard, alpaka::QueueCpuBlocking>;
-  using Base::Base;
-  void set_impl() {
-    /**< nothing to be done, only threading is disabled (in base class) */
-  }
-  void reset_impl() {
-    /**< nothing to be done */
-  }
-};
-
-template <>
-struct GuardTraits<alpaka::QueueCpuBlocking> {
-  using type = CpuSerialSyncGuard;
-};
+  class CpuSerialSyncGuard : public GuardScope<CpuSerialSyncGuard, alpaka::QueueCpuBlocking> {
+  public:
+    using Base = GuardScope<CpuSerialSyncGuard, alpaka::QueueCpuBlocking>;
+    using Base::Base;
+    void set_impl() { /**< nothing to be done, only threading is disabled (in base class) */ }
+    void reset_impl() { /**< nothing to be done */ }
+  };
+
+  template <>
+  struct GuardTraits<alpaka::QueueCpuBlocking> {
+    using type = CpuSerialSyncGuard;
+  };
 
 #elif ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-/**
+  /**
  * @brief Guard for asynchronous CPU operations using Alpaka with TBB backend.
  */
-class CpuTbbAsyncGuard : public GuardScope<CpuTbbAsyncGuard, alpaka::QueueCpuNonBlocking> {
- public:
-  using Base = GuardScope<CpuTbbAsyncGuard, alpaka::QueueCpuNonBlocking>;
-  using Base::Base;
-  void set_impl() {
-    /**< nothing to be done, only threading is disabled (in base class) */
-  }
-  void reset_impl() {
-    /**< nothing to be done */
-  }
-};
-
-template <>
-struct GuardTraits<alpaka::QueueCpuNonBlocking> {
-  using type = CpuTbbAsyncGuard;
-};
+  class CpuTbbAsyncGuard : public GuardScope<CpuTbbAsyncGuard, alpaka::QueueCpuNonBlocking> {
+  public:
+    using Base = GuardScope<CpuTbbAsyncGuard, alpaka::QueueCpuNonBlocking>;
+    using Base::Base;
+    void set_impl() { /**< nothing to be done, only threading is disabled (in base class) */ }
+    void reset_impl() { /**< nothing to be done */ }
+  };
+
+  template <>
+  struct GuardTraits<alpaka::QueueCpuNonBlocking> {
+    using type = CpuTbbAsyncGuard;
+  };
 
 #else
 #error "Torch guard for this backend is not defined."
 #endif
 
-
-/**
+  /**
  * @brief Alias for the appropriate Guard type based on the Alpaka queue.
  */
-template <typename TQueue>
-using Guard = typename GuardTraits<TQueue>::type;
-
+  template <typename TQueue>
+  using Guard = typename GuardTraits<TQueue>::type;
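Sketch of the intended RAII use of `Guard` (mirrors the producers later in this patch; `runTorchWork` is hypothetical, `Queue` stands for the accelerator-namespace queue type):

```cpp
void runTorchWork(Queue& queue) {
  cms::torch::alpaka::Guard<Queue> guard(queue);
  // Inside this scope torch submits work to the Alpaka queue's stream
  // (on CUDA) and runs single-threaded (on all backends).
}  // the guard destructor restores the previous torch stream
```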
 
-/**
+  /**
  * @brief Computes a unique hash representation for the given Alpaka queue.
  *
  * This function generates a hash string that uniquely represents the given Alpaka queue.
@@ -202,23 +186,21 @@ using Guard = typename GuardTraits<TQueue>::type;
  *
  * @param queue The Alpaka queue to generate the hash for.
  * @return A string representing the unique hash for the queue.
  */
-template <typename TQueue, typename = std::enable_if_t<::alpaka::isQueue<TQueue>>>
-inline std::string queue_hash(const TQueue &queue) {
-  std::stringstream repr;
+  template <typename TQueue, typename = std::enable_if_t<::alpaka::isQueue<TQueue>>>
+  inline std::string queue_hash(const TQueue &queue) {
+    std::stringstream repr;
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-  auto stream = c10::cuda::getStreamFromExternal(
-      queue.getNativeHandle(), device(queue).index());
-  repr << "0x" << std::hex << stream.stream();
-  return repr.str();
+    auto stream = c10::cuda::getStreamFromExternal(queue.getNativeHandle(), device(queue).index());
+    repr << "0x" << std::hex << stream.stream();
+    return repr.str();
 // #elif ALPAKA_ACC_GPU_HIP_ENABLED
 //   return "0x0";
 #endif
-  repr << "0x" << std::hex << std::hash<std::thread::id>{}(std::this_thread::get_id());
-  return repr.str();
-}
-
+    repr << "0x" << std::hex << std::hash<std::thread::id>{}(std::this_thread::get_id());
+    return repr.str();
+  }
 
-/**
+  /**
  * @brief Computes a unique hash representation for the current stream associated with the given Alpaka queue.
  *
  * This function generates a hash string representing the current stream of the given Alpaka queue,
@@ -228,20 +210,20 @@ inline std::string queue_hash(const TQueue &queue) {
  *
  * @param queue The Alpaka queue to generate the current stream hash for.
  * @return A string representing the unique hash for the current stream.
  */
-template <typename TQueue, typename = std::enable_if_t<::alpaka::isQueue<TQueue>>>
-inline std::string current_stream_hash(const TQueue &queue) {
-  std::stringstream repr;
+  template <typename TQueue, typename = std::enable_if_t<::alpaka::isQueue<TQueue>>>
+  inline std::string current_stream_hash(const TQueue &queue) {
+    std::stringstream repr;
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-  const auto dev = device(queue);
-  auto stream = c10::cuda::getCurrentCUDAStream(dev.index());
-  repr << "0x" << std::hex << stream.stream();
-  return repr.str();
+    const auto dev = device(queue);
+    auto stream = c10::cuda::getCurrentCUDAStream(dev.index());
+    repr << "0x" << std::hex << stream.stream();
+    return repr.str();
 #elif ALPAKA_ACC_GPU_HIP_ENABLED
-  return "0x0";
+    return "0x0";
 #endif
-  repr << "0x" << std::hex << std::hash<std::thread::id>{}(std::this_thread::get_id());
-  return repr.str();
-}
+    repr << "0x" << std::hex << std::hash<std::thread::id>{}(std::this_thread::get_id());
+    return repr.str();
+  }
 
 }  // namespace cms::torch::alpaka
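The two hashes are a debugging aid: once a `Guard` is installed, the queue's stream and torch's current stream should agree. A fragment (the producers below use exactly this assertion):

```cpp
cms::torch::alpaka::Guard<Queue> guard(queue);
// Both hashes name the same native stream (CUDA) or thread id (CPU backends).
assert(cms::torch::alpaka::queue_hash(queue) == cms::torch::alpaka::current_stream_hash(queue));
```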
diff --git a/PhysicsTools/PyTorch/interface/Config.h b/PhysicsTools/PyTorch/interface/Config.h
index 72230b927cc..8269cac7a54 100644
--- a/PhysicsTools/PyTorch/interface/Config.h
+++ b/PhysicsTools/PyTorch/interface/Config.h
@@ -6,29 +6,27 @@
 
 #include "FWCore/Utilities/interface/Exception.h"
 
-
 namespace cms::torch {
-
-/**
+
+  /**
  * The following `constexpr` constants are aliases for various PyTorch data types.
  *
  * Primarily used for specifying tensor types when working with PyTorch
  * tensors in the CMS environment.
  */
-constexpr auto Byte = ::torch::kByte;     /**< unsigned 8-bit integer type. */
-constexpr auto Char = ::torch::kChar;     /**< signed 8-bit integer type. */
-constexpr auto Short = ::torch::kShort;   /**< signed 16-bit integer type. */
-constexpr auto Int = ::torch::kInt;       /**< signed 32-bit integer type. */
-constexpr auto Long = ::torch::kLong;     /**< signed 64-bit integer type. */
-constexpr auto UInt16 = ::torch::kUInt16; /**< unsigned 16-bit integer type. */
-constexpr auto UInt32 = ::torch::kUInt32; /**< unsigned 32-bit integer type. */
-constexpr auto UInt64 = ::torch::kUInt64; /**< unsigned 64-bit integer type. */
-constexpr auto Half = ::torch::kHalf;     /**< 16-bit floating point type. */
-constexpr auto Float = ::torch::kFloat;   /**< 32-bit floating point type. */
-constexpr auto Double = ::torch::kDouble; /**< 64-bit floating point type. */
-
-
-/**
+  constexpr auto Byte = ::torch::kByte;     /**< unsigned 8-bit integer type. */
+  constexpr auto Char = ::torch::kChar;     /**< signed 8-bit integer type. */
+  constexpr auto Short = ::torch::kShort;   /**< signed 16-bit integer type. */
+  constexpr auto Int = ::torch::kInt;       /**< signed 32-bit integer type. */
+  constexpr auto Long = ::torch::kLong;     /**< signed 64-bit integer type. */
+  constexpr auto UInt16 = ::torch::kUInt16; /**< unsigned 16-bit integer type. */
+  constexpr auto UInt32 = ::torch::kUInt32; /**< unsigned 32-bit integer type. */
+  constexpr auto UInt64 = ::torch::kUInt64; /**< unsigned 64-bit integer type. */
+  constexpr auto Half = ::torch::kHalf;     /**< 16-bit floating point type. */
+  constexpr auto Float = ::torch::kFloat;   /**< 32-bit floating point type. */
+  constexpr auto Double = ::torch::kDouble; /**< 64-bit floating point type. */
+
+  /**
  * @brief Loads a TorchScript model.
  *
  * This function wraps `torch::jit::load` to load a TorchScript model from a specified path.
@@ -42,16 +40,16 @@ constexpr auto Double = ::torch::kDouble; /**< 64-bit floating point type. */
  *
  * @note This function is intended for model loading in CMSSW environments, providing
  * integration with the framework's exception handling and logging facilities.
  */
-inline ::torch::jit::script::Module load(const std::string &model_path) {
-  try {
-    return ::torch::jit::load(model_path);
-  } catch (const c10::Error &e) {
-    cms::Exception ex("ModelLoadingError");
-    ex.addContext("Calling cms::torch::load(const std::string&)");
-    ex.addAdditionalInfo("Error loading the model: " + std::string(e.what()));
-    throw ex;
+  inline ::torch::jit::script::Module load(const std::string &model_path) {
+    try {
+      return ::torch::jit::load(model_path);
+    } catch (const c10::Error &e) {
+      cms::Exception ex("ModelLoadingError");
+      ex.addContext("Calling cms::torch::load(const std::string&)");
+      ex.addAdditionalInfo("Error loading the model: " + std::string(e.what()));
+      throw ex;
+    }
   }
-}
 
 }  // namespace cms::torch
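Usage sketch for the wrapper (not part of the patch; the model path is a hypothetical placeholder):

```cpp
// Failures surface as a cms::Exception ("ModelLoadingError") with call
// context attached, instead of a raw c10::Error.
::torch::jit::script::Module module = cms::torch::load("/path/to/model.pt");
```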
diff --git a/PhysicsTools/PyTorch/interface/Converter.h b/PhysicsTools/PyTorch/interface/Converter.h
index 2c831d5bf7c..8770bdcc4b3 100644
--- a/PhysicsTools/PyTorch/interface/Converter.h
+++ b/PhysicsTools/PyTorch/interface/Converter.h
@@ -6,97 +6,100 @@
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 #include "PhysicsTools/PyTorch/interface/SoAWrapper.h"
 
-
 namespace cms::torch::alpaka {
 
-// Metadata to run model with input SOA and fill output SOA.
-template <typename SOA_Input, typename SOA_Output>
-class ModelMetadata {
- public:
-  SoAWrapper<SOA_Input> input;
-  SoAWrapper<SOA_Output> output;
-
-  // Used in AOT model class to correctly choose multi or single output conversion
-  // Default value true, as single value can be parsed with multi output
-  bool multi_output;
-
-  ModelMetadata(const SoAWrapper<SOA_Input>& input_, const SoAWrapper<SOA_Output>& output_, bool multi_output_=true)
-      : input(input_), output(output_), multi_output(multi_output_) {}
-};
+  // Metadata to run a model with an input SoA and fill an output SoA.
+  template <typename SOA_Input, typename SOA_Output>
+  class ModelMetadata {
+  public:
+    SoAWrapper<SOA_Input> input;
+    SoAWrapper<SOA_Output> output;
+
+    // Used in the AOT model class to choose between multi- and single-output conversion.
+    // Defaults to true, since a single output can also be handled by the multi-output path.
+    bool multi_output;
+
+    ModelMetadata(const SoAWrapper<SOA_Input>& input_, const SoAWrapper<SOA_Output>& output_, bool multi_output_ = true)
+        : input(input_), output(output_), multi_output(multi_output_) {}
+  };
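How a `ModelMetadata` is assembled in practice — a fragment mirroring the `AotRegressionProducer` further down in this patch (`batch_size`, `records`, and `out_records` are assumed to come from the SoA collections):

```cpp
cms::torch::alpaka::SoAWrapper<torchportable::ParticleSoA> in(batch_size);
in.append_block("features", records.pt(), records.eta(), records.phi());

cms::torch::alpaka::SoAWrapper<torchportable::RegressionSoA> out(batch_size);
out.append_block("preds", out_records.reco_pt());

// Single-output model: skip the multi-output conversion path.
cms::torch::alpaka::ModelMetadata<torchportable::ParticleSoA, torchportable::RegressionSoA> metadata(
    in, out, /*multi_output=*/false);
```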
 
-// Static class to wrap raw SOA pointer in tensor object without copying.
-class Converter {
- public:
-  // Calculate size and stride of data store based on InputMetadata and return list of IValue, which is parent class of torch::tensor.
-  template <typename SOA_Input, typename SOA_Output>
-  static std::vector<::torch::IValue> convert_input(const ModelMetadata<SOA_Input, SOA_Output>& metadata,
-                                                    ::torch::Device device) {
-    std::vector<::torch::IValue> tensors(metadata.input.nBlocks);
-    for (int i = 0; i < metadata.input.nBlocks; i++) {
-      assert(reinterpret_cast<uintptr_t>(metadata.input[metadata.input.order[i]].ptr) % SOA_Input::alignment == 0);
-      tensors.at(i) =
-          std::move(Converter::array_to_tensor(device, metadata.input[metadata.input.order[i]]));
-    }
-    return tensors;
-  }
-
-  // AOT specific implementation, as model expects vector of torch::Tensor not torch::IValue
-  template <typename SOA_Input, typename SOA_Output>
-  static std::vector<::torch::Tensor> convert_input_tensor(const ModelMetadata<SOA_Input, SOA_Output>& metadata,
-                                                           ::torch::Device device) {
-    std::vector<::torch::Tensor> tensors(metadata.input.nBlocks);
-    for (int i = 0; i < metadata.input.nBlocks; i++) {
-      assert(reinterpret_cast<uintptr_t>(metadata.input[metadata.input.order[i]].ptr) % SOA_Input::alignment == 0);
-      tensors.at(i) =
-          std::move(Converter::array_to_tensor(device, metadata.input[metadata.input.order[i]]));
-    }
-    return tensors;
-  }
-
-  // Calculate size and stride of data store based on OutputMetadata and return single output tensor
-  template <typename SOA_Input, typename SOA_Output>
-  static ::torch::Tensor convert_output(const ModelMetadata<SOA_Input, SOA_Output>& metadata, ::torch::Device device) {
-    assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[0]].ptr) % SOA_Output::alignment == 0);
-    return Converter::array_to_tensor(device, metadata.output[metadata.output.order[0]]);
-  }
-
-  // Calculate size and stride of data store based on OutputMetadata and fill SoA with tensor values
-  template <typename SOA_Input, typename SOA_Output>
-  static void convert_output(const std::vector<::torch::IValue>& tensors,
-                             const ModelMetadata<SOA_Input, SOA_Output>& metadata,
-                             ::torch::Device device) {
-    for (int i = 0; i < metadata.output.nBlocks; i++) {
-      // Only tensors are currenlty supported for conversion
-      if(tensors.at(i).isTensor()) {
-        assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[i]].ptr) % SOA_Output::alignment == 0);
-        Converter::array_to_tensor(device, metadata.output[metadata.output.order[i]]) = tensors.at(i).toTensor();
-      }
-    }
-  }
-
-  // AOT specific implementation, as return type is torch::Tensor not torch::IValue
-  template <typename SOA_Input, typename SOA_Output>
-  static void convert_output(const std::vector<::torch::Tensor>& tensors,
-                             const ModelMetadata<SOA_Input, SOA_Output>& metadata,
-                             ::torch::Device device) {
-    for (int i = 0; i < metadata.output.nBlocks; i++) {
-      assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[i]].ptr) % SOA_Output::alignment == 0);
-      Converter::array_to_tensor(device, metadata.output[metadata.output.order[i]]) = tensors.at(i);
-    }
-  }
-
- private:
-  // Wrap raw pointer by torch::Tensor based on type, size and stride.
-  template <typename SOA_Layout>
-  static ::torch::Tensor array_to_tensor(::torch::Device device, const Block<SOA_Layout>& block) {
-    auto options = ::torch::TensorOptions().dtype(block.type)
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                       .device(device)
 #endif
-                       .pinned_memory(true);
-    return ::torch::from_blob(block.ptr, block.size, block.stride, options);
-  }
-};
+  // Static class to wrap a raw SoA pointer in a tensor object without copying.
+  class Converter {
+  public:
+    // Calculate size and stride of the data store based on the input metadata and return a list of IValue,
+    // the parent class of torch::Tensor.
+    template <typename SOA_Input, typename SOA_Output>
+    static std::vector<::torch::IValue> convert_input(const ModelMetadata<SOA_Input, SOA_Output>& metadata,
+                                                      ::torch::Device device) {
+      std::vector<::torch::IValue> tensors(metadata.input.nBlocks);
+      for (int i = 0; i < metadata.input.nBlocks; i++) {
+        assert(reinterpret_cast<uintptr_t>(metadata.input[metadata.input.order[i]].ptr) % SOA_Input::alignment == 0);
+        tensors.at(i) =
+            std::move(Converter::array_to_tensor(device, metadata.input[metadata.input.order[i]]));
+      }
+      return tensors;
+    }
+
+    // AOT specific implementation, as the model expects a vector of torch::Tensor, not torch::IValue
+    template <typename SOA_Input, typename SOA_Output>
+    static std::vector<::torch::Tensor> convert_input_tensor(const ModelMetadata<SOA_Input, SOA_Output>& metadata,
+                                                             ::torch::Device device) {
+      std::vector<::torch::Tensor> tensors(metadata.input.nBlocks);
+      for (int i = 0; i < metadata.input.nBlocks; i++) {
+        assert(reinterpret_cast<uintptr_t>(metadata.input[metadata.input.order[i]].ptr) % SOA_Input::alignment == 0);
+        tensors.at(i) =
+            std::move(Converter::array_to_tensor(device, metadata.input[metadata.input.order[i]]));
+      }
+      return tensors;
+    }
+
+    // Calculate size and stride of the data store based on the output metadata and return a single output tensor
+    template <typename SOA_Input, typename SOA_Output>
+    static ::torch::Tensor convert_output(const ModelMetadata<SOA_Input, SOA_Output>& metadata,
+                                          ::torch::Device device) {
+      assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[0]].ptr) % SOA_Output::alignment == 0);
+      return Converter::array_to_tensor(device, metadata.output[metadata.output.order[0]]);
+    }
+
+    // Calculate size and stride of the data store based on the output metadata and fill the SoA with tensor values
+    template <typename SOA_Input, typename SOA_Output>
+    static void convert_output(const std::vector<::torch::IValue>& tensors,
+                               const ModelMetadata<SOA_Input, SOA_Output>& metadata,
+                               ::torch::Device device) {
+      for (int i = 0; i < metadata.output.nBlocks; i++) {
+        // Only tensors are currently supported for conversion
+        if (tensors.at(i).isTensor()) {
+          assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[i]].ptr) % SOA_Output::alignment ==
+                 0);
+          Converter::array_to_tensor(device, metadata.output[metadata.output.order[i]]) =
+              tensors.at(i).toTensor();
+        }
+      }
+    }
+
+    // AOT specific implementation, as the return type is torch::Tensor, not torch::IValue
+    template <typename SOA_Input, typename SOA_Output>
+    static void convert_output(const std::vector<::torch::Tensor>& tensors,
+                               const ModelMetadata<SOA_Input, SOA_Output>& metadata,
+                               ::torch::Device device) {
+      for (int i = 0; i < metadata.output.nBlocks; i++) {
+        assert(reinterpret_cast<uintptr_t>(metadata.output[metadata.output.order[i]].ptr) % SOA_Output::alignment == 0);
+        Converter::array_to_tensor(device, metadata.output[metadata.output.order[i]]) = tensors.at(i);
+      }
+    }
+
+  private:
+    // Wrap a raw pointer in a torch::Tensor based on type, size and stride.
+    template <typename SOA_Layout>
+    static ::torch::Tensor array_to_tensor(::torch::Device device, const Block<SOA_Layout>& block) {
+      auto options = ::torch::TensorOptions()
+                         .dtype(block.type)
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                         .device(device)
 #endif
+                         .pinned_memory(true);
+      return ::torch::from_blob(block.ptr, block.size, block.stride, options);
+    }
+  };
 
 }  // namespace cms::torch::alpaka
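A standalone illustration of the `torch::from_blob` mechanism `Converter` builds on — wrapping an existing buffer as a tensor without copying (plain libtorch, runnable outside CMSSW):

```cpp
#include <torch/torch.h>
#include <vector>

int main() {
  std::vector<float> soa(8 * 3);  // 8 elements, 3 columns, column-major (SoA)
  auto t = torch::from_blob(soa.data(),
                            {8, 3},  // logical size: rows x columns
                            {1, 8},  // stride: rows adjacent, columns 8 apart
                            torch::TensorOptions().dtype(torch::kFloat));
  t.fill_(1.f);  // writes land directly in `soa`, no copy involved
}
```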
diff --git a/PhysicsTools/PyTorch/interface/Model.h b/PhysicsTools/PyTorch/interface/Model.h
index d2b70cca59b..a1272cae87e 100644
--- a/PhysicsTools/PyTorch/interface/Model.h
+++ b/PhysicsTools/PyTorch/interface/Model.h
@@ -5,65 +5,59 @@
 #include "PhysicsTools/PyTorch/interface/AlpakaConfig.h"
 #include "PhysicsTools/PyTorch/interface/Converter.h"
 
-
 namespace cms::torch::alpaka {
 
-/**
+  /**
  * @class CompilationType
 * @brief Specifies the type of compilation used for the model.
  */
-enum class CompilationType {
-  kJustInTime, /**< JIT compilation, load and compile at runtime from TorchScript */
-  kAheadOfTime /**< AOT compilation, load precompiled shared library at runtime */
-};
+  enum class CompilationType {
+    kJustInTime, /**< JIT compilation, load and compile at runtime from TorchScript */
+    kAheadOfTime /**< AOT compilation, load precompiled shared library at runtime */
+  };
 
-/**
+  /**
  * @class Model
  * @brief Model base class.
  */
-template <CompilationType T>
-class Model;
+  template <CompilationType T>
+  class Model;
 
-/**
+  /**
  * @class Model
  * @brief AOT Model specific implementation.
  *
 * Interface for loading and running models built with AOT compilation.
  */
-template <>
-class Model<CompilationType::kAheadOfTime> {
- public:
-  explicit Model(const std::string &model_path)
-      : loader_(model_path), runner_(loader_.get_runner()) {}
+  template <>
+  class Model<CompilationType::kAheadOfTime> {
+  public:
+    explicit Model(const std::string &model_path) : loader_(model_path), runner_(loader_.get_runner()) {}
 
-  /**
+    /**
    * @brief Torch fallback for testing purposes.
    * @param inputs input tensors
    * @return output tensors
    */
-  auto forward(std::vector<::torch::Tensor> &inputs) const {
-    return runner_->run(inputs);
-  }
+    auto forward(std::vector<::torch::Tensor> &inputs) const { return runner_->run(inputs); }
 
-  /**
+    /**
    * @brief Torch portable inference with SoA buffers without explicit copies.
   * @param metadata Metadata specifying how the memory blob is organized and accessed.
    */
-  template <typename SOA_Input, typename SOA_Output>
-  void forward(const ModelMetadata<SOA_Input, SOA_Output> &metadata) const {
-    std::vector<::torch::Tensor> inputs = Converter::convert_input_tensor(metadata, device_);
-
-    if (metadata.multi_output) {
-      auto out = runner_->run(inputs);
-      Converter::convert_output(out, metadata, device_);
-    }
-    else {
-      Converter::convert_output(metadata, device_) = runner_->run(inputs)[0];
-    }
-
-  }
+    template <typename SOA_Input, typename SOA_Output>
+    void forward(const ModelMetadata<SOA_Input, SOA_Output> &metadata) const {
+      std::vector<::torch::Tensor> inputs = Converter::convert_input_tensor(metadata, device_);
+
+      if (metadata.multi_output) {
+        auto out = runner_->run(inputs);
+        Converter::convert_output(out, metadata, device_);
+      } else {
+        Converter::convert_output(metadata, device_) = runner_->run(inputs)[0];
+      }
+    }
 
-  /**
+    /**
  * @brief Changes the model's `device_` metadata to the specified device.
   *
  * Utility function that updates the model's `device_` metadata, using Alpaka-aware
@@ -81,48 +75,46 @@ class Model {
   * @note If the device is already set to the specified device, the function does nothing.
   * @throws A static assertion failure at compile-time if an unsupported type is passed.
   */
-  template <typename T>
-  void to(const T &obj) const {
-    ::torch::Device device = ::torch::kCPU;
-    if constexpr (::alpaka::isDevice<T> || ::alpaka::isQueue<T>) {
-      device = cms::torch::alpaka::device(obj);
-    } else if constexpr (std::is_same_v<T, ::torch::Device>) {
-      device = obj;
-    } else {
-      static_assert(false_value<T>, "Unsupported type passed -> to(const T&)");
-    }
-
-    if (device == device_)
-      return;
-    device_ = device;
-  }
+    template <typename T>
+    void to(const T &obj) const {
+      ::torch::Device device = ::torch::kCPU;
+      if constexpr (::alpaka::isDevice<T> || ::alpaka::isQueue<T>) {
+        device = cms::torch::alpaka::device(obj);
+      } else if constexpr (std::is_same_v<T, ::torch::Device>) {
+        device = obj;
+      } else {
+        static_assert(false_value<T>, "Unsupported type passed -> to(const T&)");
+      }
+
+      if (device == device_)
+        return;
+      device_ = device;
+    }
 
-  /**
+    /**
  * @brief Accessor for the device currently bound to the model.
  * @return Current device bound to the model.
   */
-  ::torch::Device device() const { return device_; }
-
- private:
-  mutable ::torch::Device device_ = ::torch::kCPU; /**< Device metadata of the model */
-  mutable ::torch::inductor::AOTIModelPackageLoader loader_; /**< AOT model package loader */
-  mutable ::torch::inductor::AOTIModelContainerRunner* runner_ = nullptr; /**< AOT model container runner */
-};
+    ::torch::Device device() const { return device_; }
+
+  private:
+    mutable ::torch::Device device_ = ::torch::kCPU;           /**< Device metadata of the model */
+    mutable ::torch::inductor::AOTIModelPackageLoader loader_; /**< AOT model package loader */
+    mutable ::torch::inductor::AOTIModelContainerRunner *runner_ = nullptr; /**< AOT model container runner */
+  };
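Sketch of the AOT path end to end (the package path is hypothetical; `queue` and `metadata` are assumed to exist as sketched earlier):

```cpp
using AotModel = cms::torch::alpaka::Model<cms::torch::alpaka::CompilationType::kAheadOfTime>;

AotModel model("/path/to/model.pt2");  // AOT-compiled package
model.to(queue);                       // bind to the Alpaka queue's device
model.forward(metadata);               // zero-copy SoA in, SoA out
```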
 
-/**
+  /**
  * @class Model
  * @brief JIT Model specific implementation.
  *
 * Interface for loading and running models built with JIT compilation.
  */
-template <>
-class Model<CompilationType::kJustInTime> {
- public:
-  Model(const std::string &model_path)
-      : model_(std::move(cms::torch::load(model_path))) {}
+  template <>
+  class Model<CompilationType::kJustInTime> {
+  public:
+    Model(const std::string &model_path) : model_(std::move(cms::torch::load(model_path))) {}
 
-  /**
+    /**
    * @brief Moves the model to a specified device.
    *
  * Utility function that updates the internal device of the model, using Alpaka-aware
@@ -138,54 +130,52 @@ class Model {
   * @note If the device is already set to the specified device, the function does nothing.
   * @throws A static assertion failure at compile-time if an unsupported type is passed.
   */
-  template <typename T>
-  void to(const T &obj) const {
-    ::torch::Device device = ::torch::kCPU;
-    if constexpr (::alpaka::isDevice<T> || ::alpaka::isQueue<T>) {
-      device = cms::torch::alpaka::device(obj);
-    } else if constexpr (std::is_same_v<T, ::torch::Device>) {
-      device = obj;
-    } else {
-      static_assert(false_value<T>, "Unsupported type passed -> to(const T&)");
-    }
-
-    if (device == device_)
-      return;
-
-    device_ = device;
-    model_.to(device_, true);
-  }
+    template <typename T>
+    void to(const T &obj) const {
+      ::torch::Device device = ::torch::kCPU;
+      if constexpr (::alpaka::isDevice<T> || ::alpaka::isQueue<T>) {
+        device = cms::torch::alpaka::device(obj);
+      } else if constexpr (std::is_same_v<T, ::torch::Device>) {
+        device = obj;
+      } else {
+        static_assert(false_value<T>, "Unsupported type passed -> to(const T&)");
+      }
+
+      if (device == device_)
+        return;
+
+      device_ = device;
+      model_.to(device_, true);
+    }
 
-  /**
+    /**
    * @brief Torch fallback for testing purposes.
    * @param inputs input tensors
    * @return output tensors
    */
-  auto forward(std::vector<::torch::IValue> &inputs) const {
-    return model_.forward(inputs);
-  }
+    auto forward(std::vector<::torch::IValue> &inputs) const { return model_.forward(inputs); }
 
-  /**
+    /**
    * @brief Torch portable inference with SoA buffers without explicit copies.
   * @param metadata Metadata specifying how the memory blob is organized and accessed.
    */
-  template <typename SOA_Input, typename SOA_Output>
-  void forward(const ModelMetadata<SOA_Input, SOA_Output> &metadata) const {
-    auto input_tensor = Converter::convert_input(metadata, device_);
-    // TODO: think about support for multi-output models (without temporary mem copy)
-    Converter::convert_output(metadata, device_) = model_.forward(input_tensor).toTensor();
-  };
+    template <typename SOA_Input, typename SOA_Output>
+    void forward(const ModelMetadata<SOA_Input, SOA_Output> &metadata) const {
+      auto input_tensor = Converter::convert_input(metadata, device_);
+      // TODO: think about support for multi-output models (without temporary mem copy)
+      Converter::convert_output(metadata, device_) = model_.forward(input_tensor).toTensor();
+    };
 
-  /**
- * @brief Accessor for the device currently bound to the model.
- * @return Current device bound to the model.
-  */
-  ::torch::Device device() const { return device_; }
-
- private:
-  mutable ::torch::jit::script::Module model_; /**< JIT model */
-  mutable ::torch::Device device_ = ::torch::kCPU; /**< Device bound to the model */
-};
+    /**
+   * @brief Accessor for the device currently bound to the model.
+   * @return Current device bound to the model.
+   */
+    ::torch::Device device() const { return device_; }
+
+  private:
+    mutable ::torch::jit::script::Module model_;     /**< JIT model */
+    mutable ::torch::Device device_ = ::torch::kCPU; /**< Device bound to the model */
+  };
 
 }  // namespace cms::torch::alpaka
diff --git a/PhysicsTools/PyTorch/interface/Nvtx.h b/PhysicsTools/PyTorch/interface/Nvtx.h
index bca01b62441..8c2e6880c5a 100644
--- a/PhysicsTools/PyTorch/interface/Nvtx.h
+++ b/PhysicsTools/PyTorch/interface/Nvtx.h
@@ -1,7 +1,8 @@
 #ifndef PHYSICS_TOOLS__PYTORCH__INTERFACE__NVTX_H_
 #define PHYSICS_TOOLS__PYTORCH__INTERFACE__NVTX_H_
 
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
 #include <nvtx3/nvToolsExt.h>
 #endif
@@ -15,28 +16,30 @@ class NvtxScopedRange {
  public:
   NvtxScopedRange(const char* msg) {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
     id_ = nvtxRangeStartA(msg);
 #endif
-  }
-
+  }
+
   void end() {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
     if (active_) {
       active_ = false;
       nvtxRangeEnd(id_);
     }
 #endif
   }
-
+
   ~NvtxScopedRange() { end(); }
-
+
  private:
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
   nvtxRangeId_t id_;
   bool active_ = true;
 #endif
 };
 
-
 #endif  // PHYSICS_TOOLS__PYTORCH__INTERFACE__NVTX_H_
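Usage sketch for `NvtxScopedRange` (range names are illustrative): `end()` closes a range early; the destructor covers early returns.

```cpp
{
  NvtxScopedRange produce_range("MyModule::produce");
  NvtxScopedRange kernel_range("MyModule::kernel");
  // ... launch kernels ...
  kernel_range.end();  // close the inner range explicitly
}  // produce_range closed by the destructor
```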
diff --git a/PhysicsTools/PyTorch/interface/SoAWrapper.h b/PhysicsTools/PyTorch/interface/SoAWrapper.h
index 720d91f774f..62ea79e3684 100644
--- a/PhysicsTools/PyTorch/interface/SoAWrapper.h
+++ b/PhysicsTools/PyTorch/interface/SoAWrapper.h
@@ -9,192 +9,199 @@
 #include "DataFormats/SoATemplate/interface/SoALayout.h"
 
-
 using namespace cms::soa;
 
 namespace cms::torch::alpaka {
 
-template <typename T, typename... Others>
-concept SameTypes = (std::same_as<T, Others> && ...);
-
-// Wrapper struct to merge info about scalar columns and multidimensional eigen columns
-struct Columns {
-  std::vector<int> columns;
-
-  // Constructor for scalar columns
-  Columns(int columns_) { columns.push_back(columns_); }
-
-  // Constructor for multidimensional eigen columns
-  Columns(const std::vector<int>& columns_) : columns(columns_) {}
-  Columns(std::vector<int>&& columns_) : columns(std::move(columns_)) {}
-
-  size_t size() const { return columns.size(); }
-  int operator[](int i) const { return columns[i]; }
-  void push(int i) { columns.push_back(i); }
-};
-
-// Block of SoA Columns with same type and element size.
-// Calculates size and stride and stores torch type.
-template <typename SOA_Layout>
-struct Block {
-  std::vector<int64_t> stride;
-  std::vector<int64_t> size;
-
-  void* ptr;
-  ::torch::ScalarType type;
-  size_t bytes;
-  bool is_scalar = false;
-
-  Block() : ptr(nullptr) {}
-  // Constructor for columns and eigen columns
-  Block(int nElements, void* ptr_, const Columns& columns_, ::torch::ScalarType type_, size_t bytes_)
-      : ptr(ptr_), type(type_), bytes(bytes_) {
-    stride = std::move(create_stride(nElements, columns_, bytes_));
-    size = std::move(create_size(nElements, columns_));
-  };
-
-  // Constructor for scalar columns
-  Block(int nElements, void* ptr_, ::torch::ScalarType type_, size_t bytes_) : ptr(ptr_), type(type_), bytes(bytes_) {
-    stride = std::move(create_stride(nElements, 1, bytes_, true));
-    size = std::move(create_size(nElements, 1));
-  };
-
-  static int get_elems_per_column(int nElements, size_t bytes) {
-    int per_bunch = SOA_Layout::alignment / bytes;
-    int bunches = std::ceil(1.0 * nElements / per_bunch);
-    return bunches * per_bunch;
-  }
-
- private:
-  static std::vector<int64_t> create_size(int nElements, const Columns& columns) {
-    std::vector<int64_t> size(columns.size() + 1);
-    size[0] = nElements;
-    std::copy(columns.columns.begin(), columns.columns.end(), size.begin() + 1);
-
-    return size;
-  }
-
-  static std::vector<int64_t> create_stride(int nElements,
-                                            const Columns& columns,
-                                            size_t bytes,
-                                            bool is_scalar = false) {
-    int N = columns.size() + 1;
-    std::vector<int64_t> stride(N);
-
-    int per_bunch = SOA_Layout::alignment / bytes;
-    int bunches = std::ceil(1.0 * nElements / per_bunch);
-
-    if (!is_scalar)
-      stride[0] = 1;
-    else {
-      // Jump no element per row, to fill with scalar value
-      stride[0] = 0;
-      bunches = 1;
-    }
-    stride[std::min(2, N - 1)] = bunches * per_bunch;
-
-    // eigen are stored in column major, but still for every column.
-    if (N > 2) {
-      for (int i = 3; i < N; i++) {
-        stride[i] = stride[i - 1] * columns[i - 2];
-      }
-      stride[1] = stride[N - 1] * columns[N - 2];
-    }
-    return stride;
-  }
-};
-
-// Metadata for SOA split into multiple blocks.
-// An order for the resulting tensors can be defined.
-template <typename SOA_Layout>
-struct SoAWrapper {
- private:
-  std::map<std::string, Block<SOA_Layout>> blocks;
-
-  template <typename T>
-  inline static ::torch::ScalarType get_type() {
-    return ::torch::CppTypeToScalarType<T>();
-  }
-
-  inline static std::vector<int> standard_order(int size) {
-    std::vector<int> order(size);
-    for (int i = 0; i < size; i++) {
-      order[i] = i;
-    }
-    return order;
-  }
-
-  template <typename T, typename... Others>
-  bool check_location(int elements, T* column, T* other_column, Others... others) {
-    return check_location(elements, other_column, others...) && (column + elements) == other_column;
-  }
-
-  template <typename T>
-  bool check_location(int elements, T* column, T* other_column) {
-    return (column + elements) == other_column;
-  }
-
-  template <typename T>
-  bool check_location(int elements, T* column) {
-    return true;
-  }
-
- public:
-  // Order of resulting tensor list
-  std::vector<std::string> order;
-  int nElements;
-  int nBlocks;
-
-  SoAWrapper(int nElements_) : nElements(nElements_), nBlocks(0) {}
-
-  // TODO: Check columns are contiguous
-  template <typename T, typename... Others>
-  requires (SameTypes<T, Others...> && T::columnType == SoAColumnType::eigen)
-  void append_block(const std::string& name, T column, Others... others) {
-    const auto [ptr, stride] = column.tupleOrPointer();
-
-    int elems = Block<SOA_Layout>::get_elems_per_column(nElements, sizeof(typename T::ScalarType));
-    assert(check_location(elems * T::ValueType::RowsAtCompileTime * T::ValueType::ColsAtCompileTime, ptr, std::get<0>(others.tupleOrPointer())...));
-
-    Columns col({sizeof...(others) + 1, T::ValueType::RowsAtCompileTime});
-    if (T::ValueType::ColsAtCompileTime > 1)
-      col.push(T::ValueType::ColsAtCompileTime);
-
-    blocks.try_emplace(name, nElements, ptr, col, get_type<typename T::ScalarType>(), sizeof(typename T::ScalarType));
-    order.push_back(name);
-    nBlocks += 1;
-  }
-
-  // TODO: Check columns are contiguous
-  // Append a block based on a typed pointer and a column object.
-  // Can be normal column or eigen column.
-  template <typename T, typename... Others>
-  requires (SameTypes<T, Others...> && T::columnType == SoAColumnType::column)
-  void append_block(const std::string& name, T column, Others... others) {
-    int elems = Block<SOA_Layout>::get_elems_per_column(nElements, sizeof(typename T::ScalarType));
-    assert(check_location(elems, column.tupleOrPointer(), others.tupleOrPointer()...));
-
-    blocks.try_emplace(name, nElements, column.tupleOrPointer(), sizeof...(others) + 1, get_type<typename T::ScalarType>(), sizeof(typename T::ScalarType));
-    order.push_back(name);
-    nBlocks += 1;
-  }
-
-  template <typename T, SoAColumnType col_type>
-  requires (std::is_arithmetic_v<T> && col_type == SoAColumnType::scalar)
-  void append_block(const std::string& name, SoAParametersImpl<col_type, T> column) {
-    blocks.try_emplace(name, nElements, column.tupleOrPointer(), get_type<T>(), sizeof(T));
-    order.push_back(name);
-    nBlocks += 1;
-  }
-
-  // The order is defined by the order append_block is called.
-  // It can be changed by passing a vector of the block names afterwards.
-  // All blocks have to be mentioned.
-  void change_order(const std::vector<std::string>& new_order) { order = new_order; }
-  void change_order(std::vector<std::string>&& new_order) { order = std::move(new_order); }
-
-  inline Block<SOA_Layout> operator[](const std::string& key) const { return blocks.at(key); }
-};
+  template <typename T, typename... Others>
+  concept SameTypes = (std::same_as<T, Others> && ...);
+
+  // Wrapper struct to merge info about scalar columns and multidimensional eigen columns
+  struct Columns {
+    std::vector<int> columns;
+
+    // Constructor for scalar columns
+    Columns(int columns_) { columns.push_back(columns_); }
+
+    // Constructor for multidimensional eigen columns
+    Columns(const std::vector<int>& columns_) : columns(columns_) {}
+    Columns(std::vector<int>&& columns_) : columns(std::move(columns_)) {}
+
+    size_t size() const { return columns.size(); }
+    int operator[](int i) const { return columns[i]; }
+    void push(int i) { columns.push_back(i); }
+  };
+
+  // Block of SoA columns with the same type and element size.
+  // Calculates size and stride and stores the torch type.
+  template <typename SOA_Layout>
+  struct Block {
+    std::vector<int64_t> stride;
+    std::vector<int64_t> size;
+
+    void* ptr;
+    ::torch::ScalarType type;
+    size_t bytes;
+    bool is_scalar = false;
+
+    Block() : ptr(nullptr) {}
+    // Constructor for columns and eigen columns
+    Block(int nElements, void* ptr_, const Columns& columns_, ::torch::ScalarType type_, size_t bytes_)
+        : ptr(ptr_), type(type_), bytes(bytes_) {
+      stride = std::move(create_stride(nElements, columns_, bytes_));
+      size = std::move(create_size(nElements, columns_));
+    };
+
+    // Constructor for scalar columns
+    Block(int nElements, void* ptr_, ::torch::ScalarType type_, size_t bytes_) : ptr(ptr_), type(type_), bytes(bytes_) {
+      stride = std::move(create_stride(nElements, 1, bytes_, true));
+      size = std::move(create_size(nElements, 1));
+    };
+
+    static int get_elems_per_column(int nElements, size_t bytes) {
+      int per_bunch = SOA_Layout::alignment / bytes;
+      int bunches = std::ceil(1.0 * nElements / per_bunch);
+      return bunches * per_bunch;
+    }
+
+  private:
+    static std::vector<int64_t> create_size(int nElements, const Columns& columns) {
+      std::vector<int64_t> size(columns.size() + 1);
+      size[0] = nElements;
+      std::copy(columns.columns.begin(), columns.columns.end(), size.begin() + 1);
+
+      return size;
+    }
+
+    static std::vector<int64_t> create_stride(int nElements,
+                                              const Columns& columns,
+                                              size_t bytes,
+                                              bool is_scalar = false) {
+      int N = columns.size() + 1;
+      std::vector<int64_t> stride(N);
+
+      int per_bunch = SOA_Layout::alignment / bytes;
+      int bunches = std::ceil(1.0 * nElements / per_bunch);
+
+      if (!is_scalar)
+        stride[0] = 1;
+      else {
+        // Jump no element per row, to fill with the scalar value
+        stride[0] = 0;
+        bunches = 1;
+      }
+      stride[std::min(2, N - 1)] = bunches * per_bunch;
+
+      // Eigen data is stored column-major, but still per column.
+      if (N > 2) {
+        for (int i = 3; i < N; i++) {
+          stride[i] = stride[i - 1] * columns[i - 2];
+        }
+        stride[1] = stride[N - 1] * columns[N - 2];
+      }
+      return stride;
+    }
+  };
+
+  // Metadata for an SoA split into multiple blocks.
+  // An order for the resulting tensors can be defined.
+  template <typename SOA_Layout>
+  struct SoAWrapper {
+  private:
+    std::map<std::string, Block<SOA_Layout>> blocks;
+
+    template <typename T>
+    inline static ::torch::ScalarType get_type() {
+      return ::torch::CppTypeToScalarType<T>();
+    }
+
+    inline static std::vector<int> standard_order(int size) {
+      std::vector<int> order(size);
+      for (int i = 0; i < size; i++) {
+        order[i] = i;
+      }
+      return order;
+    }
+
+    template <typename T, typename... Others>
+    bool check_location(int elements, T* column, T* other_column, Others... others) {
+      return check_location(elements, other_column, others...) && (column + elements) == other_column;
+    }
+
+    template <typename T>
+    bool check_location(int elements, T* column, T* other_column) {
+      return (column + elements) == other_column;
+    }
+
+    template <typename T>
+    bool check_location(int elements, T* column) {
+      return true;
+    }
+
+  public:
+    // Order of the resulting tensor list
+    std::vector<std::string> order;
+    int nElements;
+    int nBlocks;
+
+    SoAWrapper(int nElements_) : nElements(nElements_), nBlocks(0) {}
+
+    // TODO: Check columns are contiguous
+    template <typename T, typename... Others>
+      requires(SameTypes<T, Others...> && T::columnType == SoAColumnType::eigen)
+    void append_block(const std::string& name, T column, Others... others) {
+      const auto [ptr, stride] = column.tupleOrPointer();
+
+      int elems = Block<SOA_Layout>::get_elems_per_column(nElements, sizeof(typename T::ScalarType));
+      assert(check_location(elems * T::ValueType::RowsAtCompileTime * T::ValueType::ColsAtCompileTime,
+                            ptr,
+                            std::get<0>(others.tupleOrPointer())...));
+
+      Columns col({sizeof...(others) + 1, T::ValueType::RowsAtCompileTime});
+      if (T::ValueType::ColsAtCompileTime > 1)
+        col.push(T::ValueType::ColsAtCompileTime);
+
+      blocks.try_emplace(name, nElements, ptr, col, get_type<typename T::ScalarType>(), sizeof(typename T::ScalarType));
+      order.push_back(name);
+      nBlocks += 1;
+    }
+
+    // TODO: Check columns are contiguous
+    // Append a block based on a typed pointer and a column object.
+    // Can be a normal column or an eigen column.
+    template <typename T, typename... Others>
+      requires(SameTypes<T, Others...> &&
+               T::columnType == SoAColumnType::column)
+    void append_block(const std::string& name, T column, Others... others) {
+      int elems = Block<SOA_Layout>::get_elems_per_column(nElements, sizeof(typename T::ScalarType));
+      assert(check_location(elems, column.tupleOrPointer(), others.tupleOrPointer()...));
+
+      blocks.try_emplace(name,
+                         nElements,
+                         column.tupleOrPointer(),
+                         sizeof...(others) + 1,
+                         get_type<typename T::ScalarType>(),
+                         sizeof(typename T::ScalarType));
+      order.push_back(name);
+      nBlocks += 1;
+    }
+
+    template <typename T, SoAColumnType col_type>
+      requires(std::is_arithmetic_v<T> && col_type == SoAColumnType::scalar)
+    void append_block(const std::string& name, SoAParametersImpl<col_type, T> column) {
+      blocks.try_emplace(name, nElements, column.tupleOrPointer(), get_type<T>(), sizeof(T));
+      order.push_back(name);
+      nBlocks += 1;
+    }
+
+    // The order is defined by the order in which append_block is called.
+    // It can be changed by passing a vector of the block names afterwards.
+    // All blocks have to be mentioned.
+    void change_order(const std::vector<std::string>& new_order) { order = new_order; }
+    void change_order(std::vector<std::string>&& new_order) { order = std::move(new_order); }
+
+    inline Block<SOA_Layout> operator[](const std::string& key) const { return blocks.at(key); }
+  };
 
 }  // namespace cms::torch::alpaka
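A worked example of the size/stride arithmetic in `Block` (numbers only; assumes a 128-byte SoA alignment, which is a typical but not guaranteed value):

```cpp
// float columns: per_bunch = 128 / sizeof(float) = 32 elements per bunch.
// nElements = 100  ->  bunches = ceil(100 / 32) = 4  ->  128 padded rows.
// A 3-column block over 100 elements therefore gets
//   size   = {100, 3}
//   stride = {1, 128}   // consecutive rows adjacent, columns 128 rows apart
// which is exactly the {size, stride} pair handed to torch::from_blob.
```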
diff --git a/PhysicsTools/PyTorch/plugins/alpaka/AotRegressionProducer.cc b/PhysicsTools/PyTorch/plugins/alpaka/AotRegressionProducer.cc
index 140733d93f2..c1c89e4c015 100644
--- a/PhysicsTools/PyTorch/plugins/alpaka/AotRegressionProducer.cc
+++ b/PhysicsTools/PyTorch/plugins/alpaka/AotRegressionProducer.cc
@@ -19,12 +19,11 @@
 #include "PhysicsTools/PyTorch/interface/Nvtx.h"
 #include "PhysicsTools/PyTorch/plugins/alpaka/Kernels.h"
 
-
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
-using AotModel = cms::torch::alpaka::Model<cms::torch::alpaka::CompilationType::kAheadOfTime>;
+  using AotModel = cms::torch::alpaka::Model<cms::torch::alpaka::CompilationType::kAheadOfTime>;
 
-/**
+  /**
  * @class AotRegressionProducer
  * @brief EDProducer that runs a regression model on input particles with Alpaka backend.
  *
@@ -33,103 +32,106 @@
  */
-class AotRegressionProducer : public stream::EDProducer<edm::GlobalCache<AotModel>> {
- public:
-  AotRegressionProducer(const edm::ParameterSet &params, const AotModel *cache);
-
-  static std::unique_ptr<AotModel> initializeGlobalCache(const edm::ParameterSet &params);
-  static void globalEndJob(const AotModel *cache);
-
-  void produce(device::Event &event, const device::EventSetup &event_setup) override;
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
- private:
-  const device::EDGetToken<torchportable::ParticleCollection> inputs_token_; /**< Token to get input data. */
-  const device::EDPutToken<torchportable::RegressionCollection> outputs_token_; /**< Token to store output data. */
-  std::unique_ptr<Kernels> kernels_ = nullptr; /**< Kernel utilities for post-inference validation. */
-};
+  class AotRegressionProducer : public stream::EDProducer<edm::GlobalCache<AotModel>> {
+  public:
+    AotRegressionProducer(const edm::ParameterSet &params, const AotModel *cache);
+
+    static std::unique_ptr<AotModel> initializeGlobalCache(const edm::ParameterSet &params);
+    static void globalEndJob(const AotModel *cache);
+
+    void produce(device::Event &event, const device::EventSetup &event_setup) override;
+    static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+  private:
+    const device::EDGetToken<torchportable::ParticleCollection> inputs_token_;    /**< Token to get input data. */
+    const device::EDPutToken<torchportable::RegressionCollection> outputs_token_; /**< Token to store output data. */
+    std::unique_ptr<Kernels> kernels_ = nullptr; /**< Kernel utilities for post-inference validation. */
+  };
 
-AotRegressionProducer::AotRegressionProducer(edm::ParameterSet const& params, const AotModel *cache)
-    : EDProducer<edm::GlobalCache<AotModel>>(params),
-      inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
-      outputs_token_{produces()},
-      kernels_{std::make_unique<Kernels>()} {}
+  AotRegressionProducer::AotRegressionProducer(edm::ParameterSet const &params, const AotModel *cache)
+      : EDProducer<edm::GlobalCache<AotModel>>(params),
+        inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
+        outputs_token_{produces()},
+        kernels_{std::make_unique<Kernels>()} {}
 
-/**
+  /**
  * @brief Initializes the global cache by loading the model.
  * @param params Configuration parameters.
  * @return A unique pointer to the loaded model.
  */
-std::unique_ptr<AotModel> AotRegressionProducer::initializeGlobalCache(const edm::ParameterSet &param) {
-  auto model_path = param.getParameter<edm::FileInPath>("modelPath").fullPath();
-  return std::make_unique<AotModel>(model_path);
-}
+  std::unique_ptr<AotModel> AotRegressionProducer::initializeGlobalCache(const edm::ParameterSet &param) {
+    auto model_path = param.getParameter<edm::FileInPath>("modelPath").fullPath();
+    return std::make_unique<AotModel>(model_path);
+  }
 
-/**
+  /**
  * @brief Called at the end of the job to clean up the global cache.
 * @param cache Pointer to the global model cache.
  */
-void AotRegressionProducer::globalEndJob(const AotModel *cache) {}
+  void AotRegressionProducer::globalEndJob(const AotModel *cache) {}
 
-/**
+  /**
  * @brief Performs inference on the event data.
  * @param event The event to process.
  * @param event_setup Event setup information.
  */
-void AotRegressionProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
-  auto t1 = std::chrono::high_resolution_clock::now();
-  NvtxScopedRange produce_range("Regression::produce");
-
-  // guard torch internal operations to not conflict with cmssw fw scheme
-  cms::torch::alpaka::Guard<Queue> guard(event.queue());
-  // sanity check for debug
-  assert(cms::torch::alpaka::queue_hash(event.queue()) == cms::torch::alpaka::current_stream_hash(event.queue()));
-
-  // get data
-  // TODO: const_cast should not be done by user
-  //       in principle should not be done by anyone
-  // @see: torch::from_blob(void*)
-  auto& inputs = const_cast<torchportable::ParticleCollection&>(event.get(inputs_token_));;
-  const size_t batch_size = inputs.const_view().metadata().size();
-  auto outputs = torchportable::RegressionCollection(batch_size, event.queue());
-
-  // metadata for automatic tensor conversion
-  auto input_records = inputs.view().records();
-  auto output_records = outputs.view().records();
-  cms::torch::alpaka::SoAWrapper<torchportable::ParticleSoA> inputs_metadata(batch_size);
-  inputs_metadata.append_block("features", input_records.pt(), input_records.eta(), input_records.phi());
-  cms::torch::alpaka::SoAWrapper<torchportable::RegressionSoA> outputs_metadata(batch_size);
-  outputs_metadata.append_block("preds", output_records.reco_pt());
-  cms::torch::alpaka::ModelMetadata<torchportable::ParticleSoA, torchportable::RegressionSoA> metadata(inputs_metadata, outputs_metadata);
-
-  // inference
-  NvtxScopedRange move_to_device("Regression::move_to_device");
-  if (cms::torch::alpaka::device(event.queue()) != globalCache()->device())
-    globalCache()->to(event.queue());
-  assert(cms::torch::alpaka::device(event.queue()) == globalCache()->device());
-  move_to_device.end();
-  NvtxScopedRange infer_range("Regression::inference");
-  globalCache()->forward(metadata);
-  infer_range.end();
-
-  // assert output match expected
-  kernels_->AssertRegression(event.queue(), outputs);
-  event.emplace(outputs_token_, std::move(outputs));
-  auto t2 = std::chrono::high_resolution_clock::now();
-  std::cout << "(Regression) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us" << std::endl;
-  produce_range.end();
-}
+  void AotRegressionProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
+    auto t1 = std::chrono::high_resolution_clock::now();
+    NvtxScopedRange produce_range("Regression::produce");
+
+    // guard torch internal operations to not conflict with the cmssw fw scheme
+    cms::torch::alpaka::Guard<Queue> guard(event.queue());
+    // sanity check for debug
+    assert(cms::torch::alpaka::queue_hash(event.queue()) == cms::torch::alpaka::current_stream_hash(event.queue()));
+
+    // get data
+    // TODO: const_cast should not be done by the user
+    //       in principle should not be done by anyone
+    // @see: torch::from_blob(void*)
+    auto &inputs = const_cast<torchportable::ParticleCollection &>(event.get(inputs_token_));
+    const size_t batch_size = inputs.const_view().metadata().size();
+    auto outputs = torchportable::RegressionCollection(batch_size, event.queue());
+
+    // metadata for automatic tensor conversion
+    auto input_records = inputs.view().records();
+    auto output_records = outputs.view().records();
+    cms::torch::alpaka::SoAWrapper<torchportable::ParticleSoA> inputs_metadata(batch_size);
+    inputs_metadata.append_block("features", input_records.pt(), input_records.eta(), input_records.phi());
+    cms::torch::alpaka::SoAWrapper<torchportable::RegressionSoA> outputs_metadata(batch_size);
+    outputs_metadata.append_block("preds", output_records.reco_pt());
+    cms::torch::alpaka::ModelMetadata<torchportable::ParticleSoA, torchportable::RegressionSoA> metadata(
+        inputs_metadata, outputs_metadata);
+
+    // inference
+    NvtxScopedRange move_to_device("Regression::move_to_device");
+    if (cms::torch::alpaka::device(event.queue()) != globalCache()->device())
+      globalCache()->to(event.queue());
+    assert(cms::torch::alpaka::device(event.queue()) == globalCache()->device());
+    move_to_device.end();
+    NvtxScopedRange infer_range("Regression::inference");
+    globalCache()->forward(metadata);
+    infer_range.end();
+
+    // assert outputs match expectations
+    kernels_->AssertRegression(event.queue(), outputs);
+    event.emplace(outputs_token_, std::move(outputs));
+    auto t2 = std::chrono::high_resolution_clock::now();
+    std::cout << "(Regression) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us"
+              << std::endl;
+    produce_range.end();
+  }
 
-/**
+  /**
  * @brief Fills the parameter descriptions for this module.
  * @param descriptions Configuration descriptions object to fill.
  */
-void AotRegressionProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("inputs");
-  desc.add<edm::FileInPath>("modelPath");
-  descriptions.addWithDefaultLabel(desc);
-}
+  void AotRegressionProducer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("inputs");
+    desc.add<edm::FileInPath>("modelPath");
+    descriptions.addWithDefaultLabel(desc);
+  }
 
 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/PhysicsTools/PyTorch/plugins/alpaka/CombinatoricsProducer.cc b/PhysicsTools/PyTorch/plugins/alpaka/CombinatoricsProducer.cc
index d3a209584c5..bfc2ed620f6 100644
--- a/PhysicsTools/PyTorch/plugins/alpaka/CombinatoricsProducer.cc
+++ b/PhysicsTools/PyTorch/plugins/alpaka/CombinatoricsProducer.cc
@@ -17,72 +17,72 @@
 #include "PhysicsTools/PyTorch/interface/Nvtx.h"
 #include "PhysicsTools/PyTorch/plugins/alpaka/Kernels.h"

-
 namespace ALPAKA_ACCELERATOR_NAMESPACE {

-/**
+  /**
   * @class CombinatoricsProducer
   * @brief A dummy Alpaka EDProducer that fills a particle collection.
   *
   * This producer simulates combinatorics logic by filling a new ParticleCollection with
   * placeholder values via a kernel call. Intended primarily for testing and validation.
   */
-class CombinatoricsProducer : public stream::EDProducer<> {
- public:
-  CombinatoricsProducer(const edm::ParameterSet &params);
+  class CombinatoricsProducer : public stream::EDProducer<> {
+  public:
+    CombinatoricsProducer(const edm::ParameterSet &params);

-  void produce(device::Event &event, const device::EventSetup &event_setup) override;
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+    void produce(device::Event &event, const device::EventSetup &event_setup) override;
+    static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);

- private:
-  const device::EDGetToken<torchportable::ParticleCollection> inputs_token_;  /**< Token to get input data. */
-  const device::EDPutToken<torchportable::ParticleCollection> outputs_token_; /**< Token to store output data. */
-  std::unique_ptr<Kernels> kernels_ = nullptr;                                /**< Kernel helper object. */
-};
+  private:
+    const device::EDGetToken<torchportable::ParticleCollection> inputs_token_;  /**< Token to get input data. */
+    const device::EDPutToken<torchportable::ParticleCollection> outputs_token_; /**< Token to store output data. */
+    std::unique_ptr<Kernels> kernels_ = nullptr;                                /**< Kernel helper object. */
+  };

-CombinatoricsProducer::CombinatoricsProducer(edm::ParameterSet const& params)
-    : EDProducer<>(params),
-      inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
-      outputs_token_{produces()},
-      kernels_(std::make_unique<Kernels>()) {}
+  CombinatoricsProducer::CombinatoricsProducer(edm::ParameterSet const &params)
+      : EDProducer<>(params),
+        inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
+        outputs_token_{produces()},
+        kernels_(std::make_unique<Kernels>()) {}

-/**
+  /**
   * @brief Processes the event and fills output with mock data using a kernel.
   * @param event The current event.
   * @param event_setup Setup information for the event.
   */
-void CombinatoricsProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
-  auto t1 = std::chrono::high_resolution_clock::now();
-  // debug stream usage in concurrently scheduled modules
-  NvtxScopedRange produce_range("Combinatorics::produce");
-
-  // get data
-  const auto& inputs = event.get(inputs_token_);
-  const size_t batch_size = inputs.const_view().metadata().size();
-  auto outputs = torchportable::ParticleCollection(batch_size, event.queue());
-
-  // dummy kernel emulation
-  NvtxScopedRange kernel_range("Combinatorics::kernel");
-  kernels_->FillParticleCollection(event.queue(), outputs, 0.32f);
-  kernel_range.end();
-
-  // assert output matches expected
-  kernels_->AssertCombinatorics(event.queue(), outputs, 0.32f);
-  event.emplace(outputs_token_, std::move(outputs));
-  auto t2 = std::chrono::high_resolution_clock::now();
-  std::cout << "(Combinatorics) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us" << std::endl;
-  produce_range.end();
-}
+  void CombinatoricsProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
+    auto t1 = std::chrono::high_resolution_clock::now();
+    // debug stream usage in concurrently scheduled modules
+    NvtxScopedRange produce_range("Combinatorics::produce");
+
+    // get data
+    const auto &inputs = event.get(inputs_token_);
+    const size_t batch_size = inputs.const_view().metadata().size();
+    auto outputs = torchportable::ParticleCollection(batch_size, event.queue());
+
+    // dummy kernel emulation
+    NvtxScopedRange kernel_range("Combinatorics::kernel");
+    kernels_->FillParticleCollection(event.queue(), outputs, 0.32f);
+    kernel_range.end();
+
+    // assert output matches expected
+    kernels_->AssertCombinatorics(event.queue(), outputs, 0.32f);
+    event.emplace(outputs_token_, std::move(outputs));
+    auto t2 = std::chrono::high_resolution_clock::now();
+    std::cout << "(Combinatorics) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()
+              << " us" << std::endl;
+    produce_range.end();
+  }

-/**
+  /**
   * @brief Defines configuration parameters for the module.
   * @param descriptions Object to be populated with parameter descriptions.
   */
-void CombinatoricsProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("inputs");
-  descriptions.addWithDefaultLabel(desc);
-}
+  void CombinatoricsProducer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("inputs");
+    descriptions.addWithDefaultLabel(desc);
+  }

 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
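NvtxScopedRange brackets a code region with NVTX markers so that the concurrent, per-stream scheduling of these modules is visible in profilers such as Nsight Systems. A plausible sketch of the helper, assuming it wraps the plain NVTX push/pop API (an illustration, not the contents of Nvtx.h):

  #include <nvtx3/nvToolsExt.h>

  class ScopedRangeSketch {
  public:
    explicit ScopedRangeSketch(const char* label) { nvtxRangePushA(label); }
    void end() {  // close early, e.g. right after a kernel launch
      if (open_) {
        nvtxRangePop();
        open_ = false;
      }
    }
    ~ScopedRangeSketch() { end(); }  // otherwise closed at scope exit

  private:
    bool open_ = true;
  };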
diff --git a/PhysicsTools/PyTorch/plugins/alpaka/DataProducer.cc b/PhysicsTools/PyTorch/plugins/alpaka/DataProducer.cc
index ea2cb42063d..f623d61f910 100644
--- a/PhysicsTools/PyTorch/plugins/alpaka/DataProducer.cc
+++ b/PhysicsTools/PyTorch/plugins/alpaka/DataProducer.cc
@@ -9,57 +9,55 @@
 #include "HeterogeneousCore/AlpakaCore/interface/alpaka/stream/EDProducer.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"

-
 namespace ALPAKA_ACCELERATOR_NAMESPACE {

-/**
+  /**
   * @class DataProducer
   * @brief A minimal Alpaka EDProducer that generates a dummy ParticleCollection.
   *
   * This module produces a particle collection of configurable batch size,
   * zero-initialized, for testing or as placeholder data in processing chains.
   */
-class DataProducer : public stream::EDProducer<> {
- public:
-  DataProducer(const edm::ParameterSet &params);
+  class DataProducer : public stream::EDProducer<> {
+  public:
+    DataProducer(const edm::ParameterSet &params);

-  void produce(device::Event &event, const device::EventSetup &event_setup) override;
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+    void produce(device::Event &event, const device::EventSetup &event_setup) override;
+    static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);

- private:
-  const device::EDPutToken<torchportable::ParticleCollection> sic_put_token_; /**< Token to store output data. */
-  const uint32_t batch_size_;                                                 /**< Size of the batch to be produced. */
-};
+  private:
+    const device::EDPutToken<torchportable::ParticleCollection> sic_put_token_; /**< Token to store output data. */
+    const uint32_t batch_size_;                                                 /**< Size of the batch to be produced. */
+  };

-DataProducer::DataProducer(edm::ParameterSet const& params)
-    : EDProducer<>(params),
-      sic_put_token_{produces()},
-      batch_size_(params.getParameter<uint32_t>("batchSize")) {}
+  DataProducer::DataProducer(edm::ParameterSet const &params)
+      : EDProducer<>(params), sic_put_token_{produces()}, batch_size_(params.getParameter<uint32_t>("batchSize")) {}

-/**
+  /**
   * @brief Produces a ParticleCollection of fixed size with zero-initialized data.
   * @param event The current Alpaka event.
   * @param event_setup Event setup context (not used).
   */
-void DataProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
-  auto t1 = std::chrono::high_resolution_clock::now();
-  // create dummy data
-  auto collection = torchportable::ParticleCollection(batch_size_, event.queue());
-  collection.zeroInitialise(event.queue());
-  event.emplace(sic_put_token_, std::move(collection));
-  auto t2 = std::chrono::high_resolution_clock::now();
-  std::cout << "(Data) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us" << std::endl;
-}
+  void DataProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
+    auto t1 = std::chrono::high_resolution_clock::now();
+    // create dummy data
+    auto collection = torchportable::ParticleCollection(batch_size_, event.queue());
+    collection.zeroInitialise(event.queue());
+    event.emplace(sic_put_token_, std::move(collection));
+    auto t2 = std::chrono::high_resolution_clock::now();
+    std::cout << "(Data) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us"
+              << std::endl;
+  }

-/**
+  /**
   * @brief Describes the allowed configuration parameters for this module.
   * @param descriptions Configuration description object to populate.
   */
-void DataProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<uint32_t>("batchSize");
-  descriptions.addWithDefaultLabel(desc);
-}
+  void DataProducer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<uint32_t>("batchSize");
+    descriptions.addWithDefaultLabel(desc);
+  }

 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
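The zeroInitialise() call above amounts to a stream-ordered clear of the collection's device buffer: it returns immediately, and correctness relies on all later work being enqueued on the same queue. A sketch under that assumption (not the actual PortableCollection implementation):

  // Zero a portable collection asynchronously; the memset is ordered on
  // `queue` before any operation submitted to it afterwards.
  template <typename TCollection>
  void zero_initialise(Queue& queue, TCollection& collection) {
    alpaka::memset(queue, collection.buffer(), 0);
  }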
diff --git a/PhysicsTools/PyTorch/plugins/alpaka/JitClassificationProducer.cc b/PhysicsTools/PyTorch/plugins/alpaka/JitClassificationProducer.cc
index 90d791308ac..58eaf883c80 100644
--- a/PhysicsTools/PyTorch/plugins/alpaka/JitClassificationProducer.cc
+++ b/PhysicsTools/PyTorch/plugins/alpaka/JitClassificationProducer.cc
@@ -19,12 +19,11 @@
 #include "PhysicsTools/PyTorch/interface/Nvtx.h"
 #include "PhysicsTools/PyTorch/plugins/alpaka/Kernels.h"

-
 namespace ALPAKA_ACCELERATOR_NAMESPACE {

-using JitModel = cms::torch::alpaka::Model;
+  using JitModel = cms::torch::alpaka::Model;

-/**
+  /**
   * @class JitClassificationProducer
   * @brief EDProducer that runs a classification model on input particles with Alpaka backend.
   *
@@ -33,103 +32,106 @@ using JitModel = cms::torch::alpaka::Model
-class JitClassificationProducer : public stream::EDProducer<edm::GlobalCache<JitModel>> {
- public:
-  JitClassificationProducer(const edm::ParameterSet &params, const JitModel *cache);
+  class JitClassificationProducer : public stream::EDProducer<edm::GlobalCache<JitModel>> {
+  public:
+    JitClassificationProducer(const edm::ParameterSet &params, const JitModel *cache);

-  static std::unique_ptr<JitModel> initializeGlobalCache(const edm::ParameterSet &params);
-  static void globalEndJob(const JitModel *cache);
+    static std::unique_ptr<JitModel> initializeGlobalCache(const edm::ParameterSet &params);
+    static void globalEndJob(const JitModel *cache);

-  void produce(device::Event &event, const device::EventSetup &event_setup) override;
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+    void produce(device::Event &event, const device::EventSetup &event_setup) override;
+    static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);

- private:
-  const device::EDGetToken<torchportable::ParticleCollection> inputs_token_;        /**< Token to get input data. */
-  const device::EDPutToken<torchportable::ClassificationCollection> outputs_token_; /**< Token to store output data. */
-  std::unique_ptr<Kernels> kernels_ = nullptr; /**< Kernel utilities for post-inference validation. */
-};
+  private:
+    const device::EDGetToken<torchportable::ParticleCollection> inputs_token_;        /**< Token to get input data. */
+    const device::EDPutToken<torchportable::ClassificationCollection> outputs_token_; /**< Token to store output data. */
+    std::unique_ptr<Kernels> kernels_ = nullptr; /**< Kernel utilities for post-inference validation. */
+  };

-JitClassificationProducer::JitClassificationProducer(edm::ParameterSet const& params, const JitModel *cache)
-    : EDProducer<edm::GlobalCache<JitModel>>(params),
-      inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
-      outputs_token_{produces()},
-      kernels_{std::make_unique<Kernels>()} {}
+  JitClassificationProducer::JitClassificationProducer(edm::ParameterSet const &params, const JitModel *cache)
+      : EDProducer<edm::GlobalCache<JitModel>>(params),
+        inputs_token_{consumes(params.getParameter<edm::InputTag>("inputs"))},
+        outputs_token_{produces()},
+        kernels_{std::make_unique<Kernels>()} {}

-/**
+  /**
   * @brief Initializes the global model cache by loading a TorchScript JIT model.
   * @param params Configuration parameters.
   * @return A unique pointer to the model.
   */
-std::unique_ptr<JitModel> JitClassificationProducer::initializeGlobalCache(const edm::ParameterSet &param) {
-  auto model_path = param.getParameter<edm::FileInPath>("modelPath").fullPath();
-  return std::make_unique<JitModel>(model_path);
-}
+  std::unique_ptr<JitModel> JitClassificationProducer::initializeGlobalCache(const edm::ParameterSet &param) {
+    auto model_path = param.getParameter<edm::FileInPath>("modelPath").fullPath();
+    return std::make_unique<JitModel>(model_path);
+  }

-/**
+  /**
   * @brief Called once at global job end. No-op here.
   * @param cache The global model cache instance.
   */
-void JitClassificationProducer::globalEndJob(const JitModel *cache) {}
+  void JitClassificationProducer::globalEndJob(const JitModel *cache) {}
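The const_cast flagged by the TODO in the produce() body below (and in the AOT producer above) is forced by the zero-copy tensor conversion: wrapping existing memory ultimately goes through torch::from_blob, which takes a non-const void* even for read-only use. A minimal sketch for a single SoA column; the pointer, shape, and CUDA device here are illustrative:

  #include <torch/torch.h>

  // Wrap a device-resident column without copying. from_blob does not take
  // ownership, so the SoA buffer must outlive the returned tensor.
  torch::Tensor wrap_column(const float* pt, int64_t batch_size) {
    auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
    return torch::from_blob(const_cast<float*>(pt), {batch_size, 1}, options);
  }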
-/**
+  /**
   * @brief Executes model inference on the particle collection.
   * @param event The current Alpaka event.
   * @param event_setup Event setup context (unused).
   */
-void JitClassificationProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
-  auto t1 = std::chrono::high_resolution_clock::now();
-  NvtxScopedRange produce_range("Classifier::produce");
-
-  // guard torch internal operations to not conflict with fw execution scheme
-  cms::torch::alpaka::Guard guard(event.queue());
-  // sanity check
-  assert(cms::torch::alpaka::queue_hash(event.queue()) == cms::torch::alpaka::current_stream_hash(event.queue()));
-
-  // get data
-  // TODO: const_cast should not be done by user
-  //       in principle should not be done by anyone
-  // @see: torch::from_blob(void*)
-  auto& inputs = const_cast<torchportable::ParticleCollection&>(event.get(inputs_token_));;
-  const size_t batch_size = inputs.const_view().metadata().size();
-  auto outputs = torchportable::ClassificationCollection(batch_size, event.queue());
-
-  // metadata for automatic tensor conversion
-  auto input_records = inputs.view().records();
-  auto output_records = outputs.view().records();
-  cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size);
-  inputs_metadata.append_block("features", input_records.pt(), input_records.eta(), input_records.phi());
-  cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size);
-  outputs_metadata.append_block("preds", output_records.c1(), output_records.c2());
-  cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata);
-
-  // inference
-  NvtxScopedRange move_to_device("Classifier::move_to_device");
-  if (cms::torch::alpaka::device(event.queue()) != globalCache()->device())
-    globalCache()->to(event.queue());
-  assert(cms::torch::alpaka::device(event.queue()) == globalCache()->device());
-  move_to_device.end();
-  NvtxScopedRange infer_range("Classifier::inference");
-  globalCache()->forward(metadata);
-  infer_range.end();
-
-  // assert output matches expected
-  kernels_->AssertClassification(event.queue(), outputs);
-  event.emplace(outputs_token_, std::move(outputs));
-  auto t2 = std::chrono::high_resolution_clock::now();
-  std::cout << "(Classification) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << " us" << std::endl;
-  produce_range.end();
-}
-
-/**
+  void JitClassificationProducer::produce(device::Event &event, const device::EventSetup &event_setup) {
+    auto t1 = std::chrono::high_resolution_clock::now();
+    NvtxScopedRange produce_range("Classifier::produce");
+
+    // guard torch internal operations to not conflict with fw execution scheme
+    cms::torch::alpaka::Guard guard(event.queue());
+    // sanity check
+    assert(cms::torch::alpaka::queue_hash(event.queue()) == cms::torch::alpaka::current_stream_hash(event.queue()));
+
+    // get data
+    // TODO: const_cast should not be done by user
+    //       in principle should not be done by anyone
+    // @see: torch::from_blob(void*)
+    auto &inputs = const_cast<torchportable::ParticleCollection &>(event.get(inputs_token_));
+    const size_t batch_size = inputs.const_view().metadata().size();
+    auto outputs = torchportable::ClassificationCollection(batch_size, event.queue());
+
+    // metadata for automatic tensor conversion
+    auto input_records = inputs.view().records();
+    auto output_records = outputs.view().records();
+    cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size);
+    inputs_metadata.append_block("features", input_records.pt(), input_records.eta(), input_records.phi());
+    cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size);
+    outputs_metadata.append_block("preds", output_records.c1(), output_records.c2());
+    cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata);
+
+    // inference
+    NvtxScopedRange move_to_device("Classifier::move_to_device");
+    if (cms::torch::alpaka::device(event.queue()) != globalCache()->device())
+      globalCache()->to(event.queue());
+    assert(cms::torch::alpaka::device(event.queue()) == globalCache()->device());
+    move_to_device.end();
+    NvtxScopedRange infer_range("Classifier::inference");
+    globalCache()->forward(metadata);
+    infer_range.end();
+
+    // assert output matches expected
+    kernels_->AssertClassification(event.queue(), outputs);
+    event.emplace(outputs_token_, std::move(outputs));
+    auto t2 = std::chrono::high_resolution_clock::now();
+    std::cout << "(Classification) OK - " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()
+              << " us" << std::endl;
+    produce_range.end();
+  }
+
+  /**
   * @brief Describes module configuration parameters.
   * @param descriptions Configuration description object.
   */
-void JitClassificationProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("inputs");
-  desc.add<edm::FileInPath>("modelPath");
-  descriptions.addWithDefaultLabel(desc);
-}
+  void JitClassificationProducer::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+    edm::ParameterSetDescription desc;
+    desc.add<edm::InputTag>("inputs");
+    desc.add<edm::FileInPath>("modelPath");
+    descriptions.addWithDefaultLabel(desc);
+  }

 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/PhysicsTools/PyTorch/plugins/alpaka/Kernels.dev.cc b/PhysicsTools/PyTorch/plugins/alpaka/Kernels.dev.cc
index 476812a11be..f4bf0cc35ea 100644
--- a/PhysicsTools/PyTorch/plugins/alpaka/Kernels.dev.cc
+++ b/PhysicsTools/PyTorch/plugins/alpaka/Kernels.dev.cc
@@ -6,25 +6,23 @@
 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 #include "PhysicsTools/PyTorch/plugins/alpaka/Kernels.h"

-
 namespace ALPAKA_ACCELERATOR_NAMESPACE {

-using namespace cms::alpakatools;
-
+  using namespace cms::alpakatools;

-class FillParticleCollectionKernel {
-public:
-  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-  ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ParticleCollection::View data, float value) const {
-    for (auto tid : uniform_elements(acc, data.metadata().size())) {
-      data.pt()[tid] = value;
-      data.phi()[tid] = value;
-      data.eta()[tid] = value;
+  class FillParticleCollectionKernel {
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ParticleCollection::View data, float value) const {
+      for (auto tid : uniform_elements(acc, data.metadata().size())) {
+        data.pt()[tid] = value;
+        data.phi()[tid] = value;
+        data.eta()[tid] = value;
+      }
     }
-  }
-};
+  };

-/**
+  /**
   * @brief Fill all values in a particle collection with a specified constant.
   *
   * For debugging and unit testing.
@@ -33,26 +31,26 @@ public:
   * @param data Particle collection to be modified.
   * @param value Constant value to fill the collection with.
*/ -void Kernels::FillParticleCollection(Queue &queue, torchportable::ParticleCollection &data, float value) { - uint32_t threads_per_block = 512; - uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); - auto grid = make_workdiv(blocks_per_grid, threads_per_block); - alpaka::exec(queue, grid, FillParticleCollectionKernel{}, data.view(), value); -} + void Kernels::FillParticleCollection(Queue &queue, torchportable::ParticleCollection &data, float value) { + uint32_t threads_per_block = 512; + uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); + auto grid = make_workdiv(blocks_per_grid, threads_per_block); + alpaka::exec(queue, grid, FillParticleCollectionKernel{}, data.view(), value); + } -class AssertCombinatoricsKernel { + class AssertCombinatoricsKernel { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ParticleCollection::View data, float value) const { - for (auto tid : uniform_elements(acc, data.metadata().size())) { - ALPAKA_ASSERT_ACC(data.pt()[tid] == value); - ALPAKA_ASSERT_ACC(data.phi()[tid] == value); - ALPAKA_ASSERT_ACC(data.eta()[tid] == value); - } - } - }; - -/** + template >> + ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ParticleCollection::View data, float value) const { + for (auto tid : uniform_elements(acc, data.metadata().size())) { + ALPAKA_ASSERT_ACC(data.pt()[tid] == value); + ALPAKA_ASSERT_ACC(data.phi()[tid] == value); + ALPAKA_ASSERT_ACC(data.eta()[tid] == value); + } + } + }; + + /** * @brief Assert that the particle collection obeys certain combinatoric relationships. * * Used in test scenarios to verify data layout or transformation logic. @@ -61,25 +59,25 @@ class AssertCombinatoricsKernel { * @param data Particle collection to check. * @param value Reference value for validation. */ - void Kernels::AssertCombinatorics(Queue &queue, torchportable::ParticleCollection &data, float value) { - uint32_t threads_per_block = 512; - uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); - auto grid = make_workdiv(blocks_per_grid, threads_per_block); - alpaka::exec(queue, grid, AssertCombinatoricsKernel{}, data.view(), value); - } + void Kernels::AssertCombinatorics(Queue &queue, torchportable::ParticleCollection &data, float value) { + uint32_t threads_per_block = 512; + uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); + auto grid = make_workdiv(blocks_per_grid, threads_per_block); + alpaka::exec(queue, grid, AssertCombinatoricsKernel{}, data.view(), value); + } -class AssertClassificationKernel { - public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ClassificationCollection::View data) const { - for (auto tid : uniform_elements(acc, data.metadata().size())) { - ALPAKA_ASSERT_ACC(data.c1()[tid] == 0.5f); - ALPAKA_ASSERT_ACC(data.c2()[tid] == 0.5f); + class AssertClassificationKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::ClassificationCollection::View data) const { + for (auto tid : uniform_elements(acc, data.metadata().size())) { + ALPAKA_ASSERT_ACC(data.c1()[tid] == 0.5f); + ALPAKA_ASSERT_ACC(data.c2()[tid] == 0.5f); + } } - } -}; + }; -/** + /** * @brief Validate classification model outputs. * * Checks whether the classification outputs match expected format or values. @@ -88,24 +86,24 @@ class AssertClassificationKernel { * @param queue Alpaka execution queue. 
* @param data Classification output collection. */ -void Kernels::AssertClassification(Queue &queue, torchportable::ClassificationCollection &data) { - uint32_t threads_per_block = 512; - uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); - auto grid = make_workdiv(blocks_per_grid, threads_per_block); - alpaka::exec(queue, grid, AssertClassificationKernel{}, data.view()); -} + void Kernels::AssertClassification(Queue &queue, torchportable::ClassificationCollection &data) { + uint32_t threads_per_block = 512; + uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); + auto grid = make_workdiv(blocks_per_grid, threads_per_block); + alpaka::exec(queue, grid, AssertClassificationKernel{}, data.view()); + } -class AssertRegressionKernel { - public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::RegressionCollection::View data) const { - for (auto tid : uniform_elements(acc, data.metadata().size())) { - ALPAKA_ASSERT_ACC(data.reco_pt()[tid] == 0.5f); + class AssertRegressionKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc &acc, torchportable::RegressionCollection::View data) const { + for (auto tid : uniform_elements(acc, data.metadata().size())) { + ALPAKA_ASSERT_ACC(data.reco_pt()[tid] == 0.5f); + } } - } -}; + }; -/** + /** * @brief Validate regression model outputs. * * Similar to classification checks, this is used for asserting output correctness in regression tasks. @@ -113,11 +111,11 @@ class AssertRegressionKernel { * @param queue Alpaka execution queue. * @param data Regression output collection. */ -void Kernels::AssertRegression(Queue &queue, torchportable::RegressionCollection &data) { - uint32_t threads_per_block = 512; - uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); - auto grid = make_workdiv(blocks_per_grid, threads_per_block); - alpaka::exec(queue, grid, AssertRegressionKernel{}, data.view()); -} + void Kernels::AssertRegression(Queue &queue, torchportable::RegressionCollection &data) { + uint32_t threads_per_block = 512; + uint32_t blocks_per_grid = divide_up_by(data.view().metadata().size(), threads_per_block); + auto grid = make_workdiv(blocks_per_grid, threads_per_block); + alpaka::exec(queue, grid, AssertRegressionKernel{}, data.view()); + } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/plugins/alpaka/Kernels.h b/PhysicsTools/PyTorch/plugins/alpaka/Kernels.h index 8a62a509aa2..58e6dfd1651 100644 --- a/PhysicsTools/PyTorch/plugins/alpaka/Kernels.h +++ b/PhysicsTools/PyTorch/plugins/alpaka/Kernels.h @@ -8,20 +8,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { -/** + /** * @class Kernels * @brief Utility class containing helper functions to run simple Alpaka kernels for testing or validation. * * This class provides simple device-side functionality for modifying and verifying * collections of structured SoA data, such as particles, classification outputs, and regressions. 
*/ -class Kernels { - public: - void FillParticleCollection(Queue &queue, torchportable::ParticleCollection &data, float value); - void AssertCombinatorics(Queue &queue, torchportable::ParticleCollection &data, float value); - void AssertClassification(Queue &queue, torchportable::ClassificationCollection &data); - void AssertRegression(Queue &queue, torchportable::RegressionCollection &data); -}; + class Kernels { + public: + void FillParticleCollection(Queue &queue, torchportable::ParticleCollection &data, float value); + void AssertCombinatorics(Queue &queue, torchportable::ParticleCollection &data, float value); + void AssertClassification(Queue &queue, torchportable::ClassificationCollection &data); + void AssertRegression(Queue &queue, torchportable::RegressionCollection &data); + }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testModelAOT.cc b/PhysicsTools/PyTorch/test/alpaka/testModelAOT.cc index 03a27610783..83fe9bc86c6 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testModelAOT.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testModelAOT.cc @@ -6,64 +6,65 @@ #include "PhysicsTools/PyTorch/interface/Model.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -using namespace ::cms::torch::alpaka; + using namespace ::cms::torch::alpaka; -class TestModelAOT : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestModelAOT); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); + class TestModelAOT : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(TestModelAOT); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); - public: - void test(); - std::string shared_lib(); -}; + public: + void test(); + std::string shared_lib(); + }; -CPPUNIT_TEST_SUITE_REGISTRATION(TestModelAOT); + CPPUNIT_TEST_SUITE_REGISTRATION(TestModelAOT); -std::string TestModelAOT::shared_lib() { + std::string TestModelAOT::shared_lib() { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cuda_el9_amd64_gcc12.pt2"); + return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cuda_el9_amd64_gcc12.pt2"); #elif ALPAKA_ACC_GPU_HIP_ENABLED - std::cerr << "FAILED: ROCm backend not yet supported, see: https://github.com/pytorch/pytorch/blob/main/aten/CMakeLists.txt#L75" << std::endl; - return ""; + std::cerr << "FAILED: ROCm backend not yet supported, see: " + "https://github.com/pytorch/pytorch/blob/main/aten/CMakeLists.txt#L75" + << std::endl; + return ""; #elif ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cpu_el9_amd64_gcc12.pt2"); + return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cpu_el9_amd64_gcc12.pt2"); #else - std::cerr << "FAILED: Unable to detect backend type." << std::endl; - return ""; + std::cerr << "FAILED: Unable to detect backend type." 
<< std::endl; + return ""; #endif -} + } + + void TestModelAOT::test() { + // alpaka setup + Platform platform; + std::vector devices = ::alpaka::getDevs(platform); + CPPUNIT_ASSERT(!devices.empty()); + const auto& device = devices[0]; + Queue queue{device}; -void TestModelAOT::test() { - // alpaka setup - Platform platform; - std::vector devices = ::alpaka::getDevs(platform); - CPPUNIT_ASSERT(!devices.empty()); - const auto& device = devices[0]; - Queue queue{device}; + const std::size_t batch_size = 2 << 10; + std::vector<::torch::IValue> inputs; + inputs.push_back(torch::ones({batch_size, 3}, cms::torch::alpaka::device(queue))); - const std::size_t batch_size = 2 << 10; - std::vector<::torch::IValue> inputs; - inputs.push_back(torch::ones({batch_size, 3}, cms::torch::alpaka::device(queue))); + std::vector inputs_tensor; + for (const auto& val : inputs) + inputs_tensor.push_back(val.toTensor()); - std::vector inputs_tensor; - for (const auto& val : inputs) - inputs_tensor.push_back(val.toTensor()); - - auto lib_path = shared_lib(); - CPPUNIT_ASSERT_MESSAGE("FAILED: Architecture compiled shared library missing.", !lib_path.empty()); + auto lib_path = shared_lib(); + CPPUNIT_ASSERT_MESSAGE("FAILED: Architecture compiled shared library missing.", !lib_path.empty()); - Model aot_model(lib_path); - aot_model.to(queue); - std::cout << "Device: " << aot_model.device() << std::endl; - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == aot_model.device()); - auto outputs = aot_model.forward(inputs_tensor); - for (const auto& val : outputs) { - CPPUNIT_ASSERT(::torch::allclose(val, ::torch::full_like(val, 0.5f))); + Model aot_model(lib_path); + aot_model.to(queue); + std::cout << "Device: " << aot_model.device() << std::endl; + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == aot_model.device()); + auto outputs = aot_model.forward(inputs_tensor); + for (const auto& val : outputs) { + CPPUNIT_ASSERT(::torch::allclose(val, ::torch::full_like(val, 0.5f))); + } } -} } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testModelAlpakaNamespace.cc b/PhysicsTools/PyTorch/test/alpaka/testModelAlpakaNamespace.cc index 1ec8b98a1bc..7c4dd6755c8 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testModelAlpakaNamespace.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testModelAlpakaNamespace.cc @@ -6,39 +6,38 @@ #include "PhysicsTools/PyTorch/interface/AlpakaConfig.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -using namespace ::cms::torch::alpaka; - -class TestModelAlpakaNamespace : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestModelAlpakaNamespace); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); - -public: - void test(); - const int64_t batch_size_ = 2 << 10; -}; - -CPPUNIT_TEST_SUITE_REGISTRATION(TestModelAlpakaNamespace); - -void TestModelAlpakaNamespace::test() { - // alpaka setup - Platform platform; - std::vector devices = ::alpaka::getDevs(platform); - CPPUNIT_ASSERT(!devices.empty()); - const auto& alpaka_device = devices[0]; - Queue queue{alpaka_device}; - - auto device = cms::torch::alpaka::device(queue); - std::cout << "Device: " << device << std::endl; - auto inputs = ::torch::ones({batch_size_, 3}, device=device); - - ClassifierModel model; - model.to(device); - auto outputs = model.forward(inputs); - CPPUNIT_ASSERT(::torch::allclose(outputs, ::torch::full_like(outputs, 0.5f))); -} + using namespace ::cms::torch::alpaka; + + class TestModelAlpakaNamespace : public CppUnit::TestFixture { + 
CPPUNIT_TEST_SUITE(TestModelAlpakaNamespace); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); + + public: + void test(); + const int64_t batch_size_ = 2 << 10; + }; + + CPPUNIT_TEST_SUITE_REGISTRATION(TestModelAlpakaNamespace); + + void TestModelAlpakaNamespace::test() { + // alpaka setup + Platform platform; + std::vector devices = ::alpaka::getDevs(platform); + CPPUNIT_ASSERT(!devices.empty()); + const auto& alpaka_device = devices[0]; + Queue queue{alpaka_device}; + + auto device = cms::torch::alpaka::device(queue); + std::cout << "Device: " << device << std::endl; + auto inputs = ::torch::ones({batch_size_, 3}, device = device); + + ClassifierModel model; + model.to(device); + auto outputs = model.forward(inputs); + CPPUNIT_ASSERT(::torch::allclose(outputs, ::torch::full_like(outputs, 0.5f))); + } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testModelJIT.cc b/PhysicsTools/PyTorch/test/alpaka/testModelJIT.cc index d2168838c52..5d616f2933d 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testModelJIT.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testModelJIT.cc @@ -6,69 +6,68 @@ #include "PhysicsTools/PyTorch/interface/Model.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -using namespace ::cms::torch::alpaka; - -class TestModelJIT : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestModelJIT); - CPPUNIT_TEST(test); - CPPUNIT_TEST(testMultiBranch); - CPPUNIT_TEST_SUITE_END(); - - public: - void test(); - void testMultiBranch(); -}; - -CPPUNIT_TEST_SUITE_REGISTRATION(TestModelJIT); - -void TestModelJIT::test() { - // alpaka setup - Platform platform; - std::vector devices = ::alpaka::getDevs(platform); - CPPUNIT_ASSERT(!devices.empty()); - const auto& device = devices[0]; - Queue queue{device}; - - const std::size_t batch_size = 2 << 10; - std::vector<::torch::IValue> inputs; - inputs.push_back(::torch::ones({batch_size, 3}, cms::torch::alpaka::device(queue))); - - auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); - Model jit_model(m_path); - jit_model.to(queue); - std::cout << "Device: " << jit_model.device() << std::endl; - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); - auto outputs = jit_model.forward(inputs); - CPPUNIT_ASSERT(::torch::allclose(outputs.toTensor(), ::torch::full_like(outputs.toTensor(), 0.5f))); -} - -void TestModelJIT::testMultiBranch() { - // alpaka setup - Platform platform; - std::vector devices = ::alpaka::getDevs(platform); - CPPUNIT_ASSERT(!devices.empty()); - const auto& device = devices[0]; - Queue queue{device}; - - const std::size_t batch_size = 2 << 10; - std::vector<::torch::IValue> inputs; - inputs.push_back(::torch::ones({batch_size, 5}, cms::torch::alpaka::device(queue))); - - auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_multi_branch_model.pt"); - Model jit_model(m_path); - jit_model.to(queue); - std::cout << "Device: " << jit_model.device() << std::endl; - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); - auto outputs = jit_model.forward(inputs).toTuple(); - - auto class_probs = outputs->elements()[0].toTensor(); - auto reg_value = outputs->elements()[1].toTensor(); - - CPPUNIT_ASSERT(::torch::allclose(class_probs, ::torch::full_like(class_probs, 0.2f))); - CPPUNIT_ASSERT(::torch::allclose(reg_value, ::torch::full_like(reg_value, 15.7286f))); -} + using namespace ::cms::torch::alpaka; + + class TestModelJIT : public CppUnit::TestFixture { + 
CPPUNIT_TEST_SUITE(TestModelJIT); + CPPUNIT_TEST(test); + CPPUNIT_TEST(testMultiBranch); + CPPUNIT_TEST_SUITE_END(); + + public: + void test(); + void testMultiBranch(); + }; + + CPPUNIT_TEST_SUITE_REGISTRATION(TestModelJIT); + + void TestModelJIT::test() { + // alpaka setup + Platform platform; + std::vector devices = ::alpaka::getDevs(platform); + CPPUNIT_ASSERT(!devices.empty()); + const auto& device = devices[0]; + Queue queue{device}; + + const std::size_t batch_size = 2 << 10; + std::vector<::torch::IValue> inputs; + inputs.push_back(::torch::ones({batch_size, 3}, cms::torch::alpaka::device(queue))); + + auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); + Model jit_model(m_path); + jit_model.to(queue); + std::cout << "Device: " << jit_model.device() << std::endl; + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); + auto outputs = jit_model.forward(inputs); + CPPUNIT_ASSERT(::torch::allclose(outputs.toTensor(), ::torch::full_like(outputs.toTensor(), 0.5f))); + } + + void TestModelJIT::testMultiBranch() { + // alpaka setup + Platform platform; + std::vector devices = ::alpaka::getDevs(platform); + CPPUNIT_ASSERT(!devices.empty()); + const auto& device = devices[0]; + Queue queue{device}; + + const std::size_t batch_size = 2 << 10; + std::vector<::torch::IValue> inputs; + inputs.push_back(::torch::ones({batch_size, 5}, cms::torch::alpaka::device(queue))); + + auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_multi_branch_model.pt"); + Model jit_model(m_path); + jit_model.to(queue); + std::cout << "Device: " << jit_model.device() << std::endl; + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); + auto outputs = jit_model.forward(inputs).toTuple(); + + auto class_probs = outputs->elements()[0].toTensor(); + auto reg_value = outputs->elements()[1].toTensor(); + + CPPUNIT_ASSERT(::torch::allclose(class_probs, ::torch::full_like(class_probs, 0.2f))); + CPPUNIT_ASSERT(::torch::allclose(reg_value, ::torch::full_like(reg_value, 15.7286f))); + } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceAOT.dev.cc b/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceAOT.dev.cc index a7d9a07d52d..7c407fc25d9 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceAOT.dev.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceAOT.dev.cc @@ -3,7 +3,6 @@ #include #include - #include "DataFormats/Portable/interface/PortableCollection.h" #include "DataFormats/Portable/interface/PortableHostCollection.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" @@ -15,102 +14,94 @@ #include "PhysicsTools/PyTorch/interface/SoAWrapper.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -GENERATE_SOA_LAYOUT(SoAInputsTemplate, - SOA_COLUMN(float, x), - SOA_COLUMN(float, y), - SOA_COLUMN(float, z) -) + GENERATE_SOA_LAYOUT(SoAInputsTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y), SOA_COLUMN(float, z)) -GENERATE_SOA_LAYOUT(SoAOutputsTemplate, - SOA_COLUMN(float, prob) -) + GENERATE_SOA_LAYOUT(SoAOutputsTemplate, SOA_COLUMN(float, prob)) -using SoAInputs = SoAInputsTemplate<>; -using SoAOutputs = SoAOutputsTemplate<>; + using SoAInputs = SoAInputsTemplate<>; + using SoAOutputs = SoAOutputsTemplate<>; + class TestPortableInferenceAOT : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(TestPortableInferenceAOT); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); -class TestPortableInferenceAOT : 
public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestPortableInferenceAOT); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); + public: + void test(); + std::string shared_lib(); + }; - public: - void test(); - std::string shared_lib(); -}; + CPPUNIT_TEST_SUITE_REGISTRATION(TestPortableInferenceAOT); -CPPUNIT_TEST_SUITE_REGISTRATION(TestPortableInferenceAOT); - -std::string TestPortableInferenceAOT::shared_lib() { + std::string TestPortableInferenceAOT::shared_lib() { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cuda_el9_amd64_gcc12.pt2"); + return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cuda_el9_amd64_gcc12.pt2"); #elif ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cpu_el9_amd64_gcc12.pt2"); + return get_path("/src/PhysicsTools/PyTorch/models/aot_regression_model_cpu_el9_amd64_gcc12.pt2"); #else - std::cerr << "FAILED: Unable to detect backend type." << std::endl; - return ""; + std::cerr << "FAILED: Unable to detect backend type." << std::endl; + return ""; #endif -} - -void TestPortableInferenceAOT::test() { - // alpaka setup - Platform platform; - std::vector alpaka_devices = alpaka::getDevs(platform); - const auto& alpaka_host = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); - CPPUNIT_ASSERT(alpaka_devices.size()); - const auto& alpaka_device = alpaka_devices[0]; - Queue queue{alpaka_device}; - - const std::size_t batch_size = 2 << 10; - - // host structs - PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); - PortableHostCollection outputs_host(batch_size, cms::alpakatools::host()); - // device structs - PortableCollection inputs_device(batch_size, alpaka_device); - PortableCollection outputs_device(batch_size, alpaka_device); - - // prepare inputs - for (size_t i = 0; i < batch_size; i++) { - inputs_host.view().x()[i] = 0.0f; - inputs_host.view().y()[i] = 0.0f; - inputs_host.view().z()[i] = 0.0f; } - alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); - alpaka::wait(queue); - - { - // guard scope - cms::torch::alpaka::Guard guard(queue); - - // instantiate model - auto lib_path = shared_lib(); - CPPUNIT_ASSERT_MESSAGE("FAILED: Architecture compiled shared library is missing.", !lib_path.empty()); - auto model = cms::torch::alpaka::Model(lib_path); - model.to(queue); - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == model.device()); - std::cout << "Device: " << model.device() << std::endl; - - // metadata for automatic tensor conversion - auto input_records = inputs_device.view().records(); - auto output_records = outputs_device.view().records(); - cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); - inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); - cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); - outputs_metadata.append_block("prob", output_records.prob()); - cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); - // inference - model.forward(metadata); - // check outputs - alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); - alpaka::wait(queue); + + void TestPortableInferenceAOT::test() { + // alpaka setup + Platform platform; + std::vector alpaka_devices = alpaka::getDevs(platform); + const auto& alpaka_host = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); + CPPUNIT_ASSERT(alpaka_devices.size()); + const auto& 
alpaka_device = alpaka_devices[0]; + Queue queue{alpaka_device}; + + const std::size_t batch_size = 2 << 10; + + // host structs + PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); + PortableHostCollection outputs_host(batch_size, cms::alpakatools::host()); + // device structs + PortableCollection inputs_device(batch_size, alpaka_device); + PortableCollection outputs_device(batch_size, alpaka_device); + + // prepare inputs for (size_t i = 0; i < batch_size; i++) { - CPPUNIT_ASSERT(outputs_host.const_view().prob()[i] == 0.5f); + inputs_host.view().x()[i] = 0.0f; + inputs_host.view().y()[i] = 0.0f; + inputs_host.view().z()[i] = 0.0f; + } + alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); + alpaka::wait(queue); + + { + // guard scope + cms::torch::alpaka::Guard guard(queue); + + // instantiate model + auto lib_path = shared_lib(); + CPPUNIT_ASSERT_MESSAGE("FAILED: Architecture compiled shared library is missing.", !lib_path.empty()); + auto model = cms::torch::alpaka::Model(lib_path); + model.to(queue); + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == model.device()); + std::cout << "Device: " << model.device() << std::endl; + + // metadata for automatic tensor conversion + auto input_records = inputs_device.view().records(); + auto output_records = outputs_device.view().records(); + cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); + inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); + cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); + outputs_metadata.append_block("prob", output_records.prob()); + cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); + // inference + model.forward(metadata); + // check outputs + alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); + alpaka::wait(queue); + for (size_t i = 0; i < batch_size; i++) { + CPPUNIT_ASSERT(outputs_host.const_view().prob()[i] == 0.5f); + } } } -} } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceJIT.cc b/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceJIT.cc index 3abe82769a3..b56350bbdc9 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceJIT.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testPortableInferenceJIT.cc @@ -3,7 +3,6 @@ #include #include - #include "DataFormats/Portable/interface/PortableCollection.h" #include "DataFormats/Portable/interface/PortableHostCollection.h" #include "DataFormats/SoATemplate/interface/SoALayout.h" @@ -15,91 +14,83 @@ #include "PhysicsTools/PyTorch/interface/SoAWrapper.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -GENERATE_SOA_LAYOUT(SoAInputsTemplate, - SOA_COLUMN(float, x), - SOA_COLUMN(float, y), - SOA_COLUMN(float, z) -) - -GENERATE_SOA_LAYOUT(SoAOutputsTemplate, - SOA_COLUMN(float, m), - SOA_COLUMN(float, n) -) - -using SoAInputs = SoAInputsTemplate<>; -using SoAOutputs = SoAOutputsTemplate<>; - -class TestPortableInferenceJIT : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestPortableInferenceJIT); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); - - public: - void test(); -}; - -CPPUNIT_TEST_SUITE_REGISTRATION(TestPortableInferenceJIT); - -void TestPortableInferenceJIT::test() { - // alpaka setup - Platform platform; - std::vector alpaka_devices = alpaka::getDevs(platform); - const auto& alpaka_host = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); - CPPUNIT_ASSERT(alpaka_devices.size()); 
- const auto& alpaka_device = alpaka_devices[0]; - Queue queue{alpaka_device}; - - const std::size_t batch_size = 32; - - // host structs - PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); - PortableHostCollection outputs_host(batch_size, cms::alpakatools::host()); - // device structs - PortableCollection inputs_device(batch_size, alpaka_device); - PortableCollection outputs_device(batch_size, alpaka_device); - - // prepare inputs - for (size_t i = 0; i < batch_size; i++) { - inputs_host.view().x()[i] = 0.0f; - inputs_host.view().y()[i] = 0.0f; - inputs_host.view().z()[i] = 0.0f; - } - alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); - alpaka::wait(queue); - - { - // guard scope - cms::torch::alpaka::Guard guard(queue); - - // instantiate model - auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); - auto model = cms::torch::alpaka::Model(m_path); - model.to(queue); - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == model.device()); - std::cout << "Device: " << model.device() << std::endl; - - // metadata for automatic tensor conversion - auto input_records = inputs_device.view().records(); - auto output_records = outputs_device.view().records(); - cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); - inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); - cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); - outputs_metadata.append_block("preds", output_records.m(), output_records.n()); - cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); - // inference - model.forward(metadata); - // check outputs - alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); - alpaka::wait(queue); + GENERATE_SOA_LAYOUT(SoAInputsTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y), SOA_COLUMN(float, z)) + + GENERATE_SOA_LAYOUT(SoAOutputsTemplate, SOA_COLUMN(float, m), SOA_COLUMN(float, n)) + + using SoAInputs = SoAInputsTemplate<>; + using SoAOutputs = SoAOutputsTemplate<>; + + class TestPortableInferenceJIT : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(TestPortableInferenceJIT); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); + + public: + void test(); + }; + CPPUNIT_TEST_SUITE_REGISTRATION(TestPortableInferenceJIT); + + void TestPortableInferenceJIT::test() { + // alpaka setup + Platform platform; + std::vector alpaka_devices = alpaka::getDevs(platform); + const auto& alpaka_host = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); + CPPUNIT_ASSERT(alpaka_devices.size()); + const auto& alpaka_device = alpaka_devices[0]; + Queue queue{alpaka_device}; + + const std::size_t batch_size = 32; + + // host structs + PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); + PortableHostCollection outputs_host(batch_size, cms::alpakatools::host()); + // device structs + PortableCollection inputs_device(batch_size, alpaka_device); + PortableCollection outputs_device(batch_size, alpaka_device); + + // prepare inputs for (size_t i = 0; i < batch_size; i++) { - CPPUNIT_ASSERT(outputs_host.const_view().m()[i] == 0.5f); - CPPUNIT_ASSERT(outputs_host.const_view().n()[i] == 0.5f); + inputs_host.view().x()[i] = 0.0f; + inputs_host.view().y()[i] = 0.0f; + inputs_host.view().z()[i] = 0.0f; + } + alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); + alpaka::wait(queue); + + { + // guard scope + cms::torch::alpaka::Guard guard(queue); + + // instantiate model + auto m_path = 
get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); + auto model = cms::torch::alpaka::Model(m_path); + model.to(queue); + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == model.device()); + std::cout << "Device: " << model.device() << std::endl; + + // metadata for automatic tensor conversion + auto input_records = inputs_device.view().records(); + auto output_records = outputs_device.view().records(); + cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); + inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); + cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); + outputs_metadata.append_block("preds", output_records.m(), output_records.n()); + cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); + // inference + model.forward(metadata); + // check outputs + alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); + alpaka::wait(queue); + + for (size_t i = 0; i < batch_size; i++) { + CPPUNIT_ASSERT(outputs_host.const_view().m()[i] == 0.5f); + CPPUNIT_ASSERT(outputs_host.const_view().n()[i] == 0.5f); + } } } -} } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testSOADataTypes.dev.cc b/PhysicsTools/PyTorch/test/alpaka/testSOADataTypes.dev.cc index a99891fcc0f..dfe6dde6ffd 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testSOADataTypes.dev.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testSOADataTypes.dev.cc @@ -23,153 +23,151 @@ #include "PhysicsTools/PyTorch/interface/AlpakaConfig.h" #include "PhysicsTools/PyTorch/interface/Converter.h" +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace ::cms::torch::alpaka; + + class TestSOADataTypes : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(TestSOADataTypes); + CPPUNIT_TEST(testInterfaceVerbose); + CPPUNIT_TEST(testMultiOutput); + CPPUNIT_TEST(testSingleElement); + CPPUNIT_TEST(testNoElement); + CPPUNIT_TEST(testEmptyMetadata); + CPPUNIT_TEST_SUITE_END(); + + public: + void testInterfaceVerbose(); + void testIncorrectMetadata(); + void testMultiOutput(); + void testSingleElement(); + void testNoElement(); + void testEmptyMetadata(); + }; -namespace ALPAKA_ACCELERATOR_NAMESPACE { - -using namespace ::cms::torch::alpaka; - -class TestSOADataTypes : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestSOADataTypes); - CPPUNIT_TEST(testInterfaceVerbose); - CPPUNIT_TEST(testMultiOutput); - CPPUNIT_TEST(testSingleElement); - CPPUNIT_TEST(testNoElement); - CPPUNIT_TEST(testEmptyMetadata); - CPPUNIT_TEST_SUITE_END(); - - public: - void testInterfaceVerbose(); - void testIncorrectMetadata(); - void testMultiOutput(); - void testSingleElement(); - void testNoElement(); - void testEmptyMetadata(); -}; - -CPPUNIT_TEST_SUITE_REGISTRATION(TestSOADataTypes); - -GENERATE_SOA_LAYOUT(SoATemplate, - SOA_EIGEN_COLUMN(Eigen::Vector3d, a), - SOA_EIGEN_COLUMN(Eigen::Vector3d, b), - - SOA_EIGEN_COLUMN(Eigen::Matrix2f, c), - - SOA_COLUMN(double, x), - SOA_COLUMN(double, y), - SOA_COLUMN(double, z), - - SOA_SCALAR(float, type), - SOA_SCALAR(int, someNumber), - - SOA_COLUMN(double, v), - SOA_COLUMN(double, w)); - -using SoA = SoATemplate<>; -using SoAView = SoA::View; -using SoAMetaRecords = SoA::View::Metarecords; - -constexpr auto tol = 1.0e-5; - -class FillKernel { - public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, PortableCollection::View view) const { - if (cms::alpakatools::once_per_grid(acc)) { - view.type() = 4; - view.someNumber() = 5; - } + 
CPPUNIT_TEST_SUITE_REGISTRATION(TestSOADataTypes); - for (int32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - view[i].a()(0) = 1 + i; - view[i].a()(1) = 2 + i; - view[i].a()(2) = 3 + i; + GENERATE_SOA_LAYOUT(SoATemplate, + SOA_EIGEN_COLUMN(Eigen::Vector3d, a), + SOA_EIGEN_COLUMN(Eigen::Vector3d, b), - view[i].b()(0) = 4 + i; - view[i].b()(1) = 5 + i; - view[i].b()(2) = 6 + i; + SOA_EIGEN_COLUMN(Eigen::Matrix2f, c), - view[i].c()(0, 0) = 4 + i; - view[i].c()(0, 1) = 6 + i; - view[i].c()(1, 0) = 8 + i; - view[i].c()(1, 1) = 10 + i; + SOA_COLUMN(double, x), + SOA_COLUMN(double, y), + SOA_COLUMN(double, z), - view.x()[i] = 12 + i; - view.y()[i] = 1 + 2.5 * i; - view.z()[i] = 36 * i; - } - } -}; - -class InputVerifyKernel { - public: - ALPAKA_FN_ACC void operator()(Acc1D const& acc, PortableCollection::View view) const { - if (cms::alpakatools::once_per_grid(acc)) { - ALPAKA_ASSERT_ACC(view.type() == 4); - ALPAKA_ASSERT_ACC(view.someNumber() == 5); + SOA_SCALAR(float, type), + SOA_SCALAR(int, someNumber), + + SOA_COLUMN(double, v), + SOA_COLUMN(double, w)); + + using SoA = SoATemplate<>; + using SoAView = SoA::View; + using SoAMetaRecords = SoA::View::Metarecords; + + constexpr auto tol = 1.0e-5; + + class FillKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, PortableCollection::View view) const { + if (cms::alpakatools::once_per_grid(acc)) { + view.type() = 4; + view.someNumber() = 5; + } + + for (int32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + view[i].a()(0) = 1 + i; + view[i].a()(1) = 2 + i; + view[i].a()(2) = 3 + i; + + view[i].b()(0) = 4 + i; + view[i].b()(1) = 5 + i; + view[i].b()(2) = 6 + i; + + view[i].c()(0, 0) = 4 + i; + view[i].c()(0, 1) = 6 + i; + view[i].c()(1, 0) = 8 + i; + view[i].c()(1, 1) = 10 + i; + + view.x()[i] = 12 + i; + view.y()[i] = 1 + 2.5 * i; + view.z()[i] = 36 * i; + } } + }; - for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - ALPAKA_ASSERT_ACC(view[i].a()(0) == 1 + i); - ALPAKA_ASSERT_ACC(view[i].a()(1) == 2 + i); - ALPAKA_ASSERT_ACC(view[i].a()(2) == 3 + i); + class InputVerifyKernel { + public: + ALPAKA_FN_ACC void operator()(Acc1D const& acc, PortableCollection::View view) const { + if (cms::alpakatools::once_per_grid(acc)) { + ALPAKA_ASSERT_ACC(view.type() == 4); + ALPAKA_ASSERT_ACC(view.someNumber() == 5); + } + + for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + ALPAKA_ASSERT_ACC(view[i].a()(0) == 1 + i); + ALPAKA_ASSERT_ACC(view[i].a()(1) == 2 + i); + ALPAKA_ASSERT_ACC(view[i].a()(2) == 3 + i); - ALPAKA_ASSERT_ACC(view[i].b()(0) == 4 + i); - ALPAKA_ASSERT_ACC(view[i].b()(1) == 5 + i); - ALPAKA_ASSERT_ACC(view[i].b()(2) == 6 + i); + ALPAKA_ASSERT_ACC(view[i].b()(0) == 4 + i); + ALPAKA_ASSERT_ACC(view[i].b()(1) == 5 + i); + ALPAKA_ASSERT_ACC(view[i].b()(2) == 6 + i); - ALPAKA_ASSERT_ACC(view[i].c()(0, 0) == 4 + i); - ALPAKA_ASSERT_ACC(view[i].c()(0, 1) == 6 + i); - ALPAKA_ASSERT_ACC(view[i].c()(1, 0) == 8 + i); - ALPAKA_ASSERT_ACC(view[i].c()(1, 1) == 10 + i); + ALPAKA_ASSERT_ACC(view[i].c()(0, 0) == 4 + i); + ALPAKA_ASSERT_ACC(view[i].c()(0, 1) == 6 + i); + ALPAKA_ASSERT_ACC(view[i].c()(1, 0) == 8 + i); + ALPAKA_ASSERT_ACC(view[i].c()(1, 1) == 10 + i); - ALPAKA_ASSERT_ACC(view.x()[i] == 12 + i); - ALPAKA_ASSERT_ACC(view.y()[i] == 1 + 2.5 * i); - ALPAKA_ASSERT_ACC(view.z()[i] == 36 * i); + ALPAKA_ASSERT_ACC(view.x()[i] == 12 + i); + ALPAKA_ASSERT_ACC(view.y()[i] == 1 + 2.5 * i); + 
ALPAKA_ASSERT_ACC(view.z()[i] == 36 * i); + } } - } -}; - -class TestVerifyKernel { - public: - ALPAKA_FN_ACC void operator()(Acc1D const& acc, - PortableCollection::View view, - torch::PackedTensorAccessor64 tensor_vector, - torch::PackedTensorAccessor64 tensor_matrix, - torch::PackedTensorAccessor64 tensor_column, - torch::PackedTensorAccessor64 tensor_scalar) const { - for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - ALPAKA_ASSERT_ACC(view[i].a()(0) - tensor_vector[i][0][0] < tol); - ALPAKA_ASSERT_ACC(view[i].a()(1) - tensor_vector[i][0][1] < tol); - ALPAKA_ASSERT_ACC(view[i].a()(2) - tensor_vector[i][0][2] < tol); - ALPAKA_ASSERT_ACC(view[i].a()(0) - tensor_vector[i][0][0] > -tol); - ALPAKA_ASSERT_ACC(view[i].a()(1) - tensor_vector[i][0][1] > -tol); - ALPAKA_ASSERT_ACC(view[i].a()(2) - tensor_vector[i][0][2] > -tol); - - ALPAKA_ASSERT_ACC(view[i].b()(0) - tensor_vector[i][1][0] < tol); - ALPAKA_ASSERT_ACC(view[i].b()(1) - tensor_vector[i][1][1] < tol); - ALPAKA_ASSERT_ACC(view[i].b()(2) - tensor_vector[i][1][2] < tol); - ALPAKA_ASSERT_ACC(view[i].b()(0) - tensor_vector[i][1][0] > -tol); - ALPAKA_ASSERT_ACC(view[i].b()(1) - tensor_vector[i][1][1] > -tol); - ALPAKA_ASSERT_ACC(view[i].b()(2) - tensor_vector[i][1][2] > -tol); - - ALPAKA_ASSERT_ACC(view[i].c()(0, 0) - tensor_matrix[i][0][0][0] < tol); - ALPAKA_ASSERT_ACC(view[i].c()(0, 0) - tensor_matrix[i][0][0][0] > -tol); - ALPAKA_ASSERT_ACC(view[i].c()(0, 1) - tensor_matrix[i][0][0][1] < tol); - ALPAKA_ASSERT_ACC(view[i].c()(0, 1) - tensor_matrix[i][0][0][1] > -tol); - ALPAKA_ASSERT_ACC(view[i].c()(1, 0) - tensor_matrix[i][0][1][0] < tol); - ALPAKA_ASSERT_ACC(view[i].c()(1, 0) - tensor_matrix[i][0][1][0] > -tol); - ALPAKA_ASSERT_ACC(view[i].c()(1, 1) - tensor_matrix[i][0][1][1] < tol); - ALPAKA_ASSERT_ACC(view[i].c()(1, 1) - tensor_matrix[i][0][1][1] > -tol); - - ALPAKA_ASSERT_ACC(view.x()[i] - tensor_column[i][0] < tol); - ALPAKA_ASSERT_ACC(view.x()[i] - tensor_column[i][0] > -tol); - - ALPAKA_ASSERT_ACC(view.y()[i] - tensor_column[i][1] < tol); - ALPAKA_ASSERT_ACC(view.y()[i] - tensor_column[i][1] > -tol); - - ALPAKA_ASSERT_ACC(view.z()[i] - tensor_column[i][2] < tol); - ALPAKA_ASSERT_ACC(view.z()[i] - tensor_column[i][2] > -tol); + }; + + class TestVerifyKernel { + public: + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + PortableCollection::View view, + torch::PackedTensorAccessor64 tensor_vector, + torch::PackedTensorAccessor64 tensor_matrix, + torch::PackedTensorAccessor64 tensor_column, + torch::PackedTensorAccessor64 tensor_scalar) const { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + ALPAKA_ASSERT_ACC(view[i].a()(0) - tensor_vector[i][0][0] < tol); + ALPAKA_ASSERT_ACC(view[i].a()(1) - tensor_vector[i][0][1] < tol); + ALPAKA_ASSERT_ACC(view[i].a()(2) - tensor_vector[i][0][2] < tol); + ALPAKA_ASSERT_ACC(view[i].a()(0) - tensor_vector[i][0][0] > -tol); + ALPAKA_ASSERT_ACC(view[i].a()(1) - tensor_vector[i][0][1] > -tol); + ALPAKA_ASSERT_ACC(view[i].a()(2) - tensor_vector[i][0][2] > -tol); + + ALPAKA_ASSERT_ACC(view[i].b()(0) - tensor_vector[i][1][0] < tol); + ALPAKA_ASSERT_ACC(view[i].b()(1) - tensor_vector[i][1][1] < tol); + ALPAKA_ASSERT_ACC(view[i].b()(2) - tensor_vector[i][1][2] < tol); + ALPAKA_ASSERT_ACC(view[i].b()(0) - tensor_vector[i][1][0] > -tol); + ALPAKA_ASSERT_ACC(view[i].b()(1) - tensor_vector[i][1][1] > -tol); + ALPAKA_ASSERT_ACC(view[i].b()(2) - tensor_vector[i][1][2] > -tol); + + ALPAKA_ASSERT_ACC(view[i].c()(0, 0) - 
tensor_matrix[i][0][0][0] < tol); + ALPAKA_ASSERT_ACC(view[i].c()(0, 0) - tensor_matrix[i][0][0][0] > -tol); + ALPAKA_ASSERT_ACC(view[i].c()(0, 1) - tensor_matrix[i][0][0][1] < tol); + ALPAKA_ASSERT_ACC(view[i].c()(0, 1) - tensor_matrix[i][0][0][1] > -tol); + ALPAKA_ASSERT_ACC(view[i].c()(1, 0) - tensor_matrix[i][0][1][0] < tol); + ALPAKA_ASSERT_ACC(view[i].c()(1, 0) - tensor_matrix[i][0][1][0] > -tol); + ALPAKA_ASSERT_ACC(view[i].c()(1, 1) - tensor_matrix[i][0][1][1] < tol); + ALPAKA_ASSERT_ACC(view[i].c()(1, 1) - tensor_matrix[i][0][1][1] > -tol); + + ALPAKA_ASSERT_ACC(view.x()[i] - tensor_column[i][0] < tol); + ALPAKA_ASSERT_ACC(view.x()[i] - tensor_column[i][0] > -tol); + + ALPAKA_ASSERT_ACC(view.y()[i] - tensor_column[i][1] < tol); + ALPAKA_ASSERT_ACC(view.y()[i] - tensor_column[i][1] > -tol); + + ALPAKA_ASSERT_ACC(view.z()[i] - tensor_column[i][2] < tol); + ALPAKA_ASSERT_ACC(view.z()[i] - tensor_column[i][2] > -tol); ALPAKA_ASSERT_ACC(view.type() - tensor_scalar[i][0] < tol); ALPAKA_ASSERT_ACC(view.type() - tensor_scalar[i][0] > -tol); @@ -178,230 +176,224 @@ class TestVerifyKernel { }; class TestOutputVerifyKernel { - public: - ALPAKA_FN_ACC void operator()(Acc1D const& acc, - PortableCollection::View view) const { - for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - ALPAKA_ASSERT_ACC(view.x()[i] - view.v()[i] < tol); - ALPAKA_ASSERT_ACC(view.x()[i] - view.v()[i] > -tol); - - ALPAKA_ASSERT_ACC(view.y()[i] - view.w()[i] < tol); - ALPAKA_ASSERT_ACC(view.y()[i] - view.w()[i] > -tol); - } + public: + ALPAKA_FN_ACC void operator()(Acc1D const& acc, PortableCollection::View view) const { + for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + ALPAKA_ASSERT_ACC(view.x()[i] - view.v()[i] < tol); + ALPAKA_ASSERT_ACC(view.x()[i] - view.v()[i] > -tol); + + ALPAKA_ASSERT_ACC(view.y()[i] - view.w()[i] < tol); + ALPAKA_ASSERT_ACC(view.y()[i] - view.w()[i] > -tol); } - }; - -void fill(Queue& queue, PortableCollection& collection) { - uint32_t items = 64; - uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, workDiv, FillKernel{}, collection.view()); - alpaka::exec(queue, workDiv, InputVerifyKernel{}, collection.view()); -} - -void check(Queue& queue, PortableCollection& collection, std::vector tensors) { - uint32_t items = 64; - uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, - workDiv, - TestVerifyKernel{}, - collection.view(), - tensors[3].toTensor().packed_accessor64(), - tensors[2].toTensor().packed_accessor64(), - tensors[0].toTensor().packed_accessor64(), - tensors[1].toTensor().packed_accessor64()); -} - -void check_not_ordered(Queue& queue, PortableCollection& collection, std::vector tensors) { - uint32_t items = 64; - uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, - workDiv, - TestVerifyKernel{}, - collection.view(), - tensors[0].toTensor().packed_accessor64(), - tensors[1].toTensor().packed_accessor64(), - tensors[2].toTensor().packed_accessor64(), - tensors[3].toTensor().packed_accessor64()); -} - -void check_output(Queue& queue, PortableCollection& collection) { - uint32_t items = 64; - uint32_t groups = 
cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, - workDiv, - TestOutputVerifyKernel{}, - collection.view()); -} - -void TestSOADataTypes::testInterfaceVerbose() { - Platform platform; - std::vector alpakaDevices = alpaka::getDevs(platform); - const auto& alpakaHost = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue{alpakaDevice}; - torch::Device torchDevice(kTorchDeviceType); - - // Large batch size, so multiple bunches needed - const std::size_t batch_size = 325; - - // Create and fill needed portable collections - PortableCollection deviceCollection(batch_size, queue); - fill(queue, deviceCollection); - SoAMetaRecords records = deviceCollection.view().records(); - - SoAWrapper input(batch_size); - input.append_block("vector", records.a(), records.b()); - input.append_block("matrix", records.c()); - input.append_block("column", records.x(), records.y(), records.z()); - input.append_block("scalar", records.type()); - input.change_order({"column", "scalar", "matrix", "vector"}); - - SoAWrapper output(batch_size); - output.append_block("result", records.v()); - ModelMetadata metadata(input, output); - - alpaka::wait(queue); - std::vector tensors = Converter::convert_input(metadata, torchDevice); - - // Check if tensor list built correctly - check(queue, deviceCollection, tensors); -}; - -void TestSOADataTypes::testMultiOutput() { - Platform platform; - std::vector alpakaDevices = alpaka::getDevs(platform); - const auto& alpakaHost = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue{alpakaDevice}; - torch::Device torchDevice(kTorchDeviceType); - - // Large batch size, so multiple bunches needed - const std::size_t batch_size = 325; - - // Create and fill needed portable collections - PortableCollection deviceCollection(batch_size, queue); - fill(queue, deviceCollection); - - auto records = deviceCollection.view().records(); - SoAWrapper input(batch_size); - input.append_block("x", records.x()); - input.append_block("y", records.y()); - - SoAWrapper output(batch_size); - output.append_block("v", records.v()); - output.append_block("w", records.w()); - ModelMetadata metadata(input, output); - - alpaka::wait(queue); - std::vector tensors = Converter::convert_input(metadata, torchDevice); - Converter::convert_output(tensors, metadata, torchDevice); - - // Check if tensor list built correctly - check_output(queue, deviceCollection); -}; - -void TestSOADataTypes::testSingleElement() { - Platform platform; - std::vector alpakaDevices = alpaka::getDevs(platform); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue(alpakaDevice); - torch::Device torchDevice(kTorchDeviceType); - - - // Create and fill portable collections - const std::size_t batch_size = 1; - PortableCollection deviceCollection(batch_size, queue); - fill(queue, deviceCollection); - SoAMetaRecords records = deviceCollection.view().records(); - - // Run Converter for single tensor - SoAWrapper input(batch_size); - input.append_block("vector", records.a(), records.b()); - input.append_block("matrix", records.c()); - input.append_block("column", records.x(), records.y(), records.z()); - input.append_block("scalar", records.type()); - input.change_order({"column", "scalar", 
"matrix", "vector"}); - - SoAWrapper output(batch_size); - output.append_block("result", records.v()); - ModelMetadata metadata(input, output); - - alpaka::wait(queue); - std::vector tensors = - Converter::convert_input(metadata, torchDevice); - - // Check if tensor list built correctly - check(queue, deviceCollection, tensors); -}; - -void TestSOADataTypes::testNoElement() { - Platform platform; - std::vector alpakaDevices = alpaka::getDevs(platform); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue(alpakaDevice); - torch::Device torchDevice(kTorchDeviceType); - - //Create empty portable collection - const std::size_t batch_size = 0; - PortableCollection deviceCollection(batch_size, queue); - SoAMetaRecords records = deviceCollection.view().records(); - - // Run Converter - SoAWrapper input(batch_size); - input.append_block("vector", records.a(), records.b()); - input.append_block("matrix", records.c()); - input.append_block("column", records.x(), records.y(), records.z()); - input.append_block("scalar", records.type()); - input.change_order({"column", "scalar", "matrix", "vector"}); - - SoAWrapper output(batch_size); - output.append_block("result", records.v()); - ModelMetadata metadata(input, output); - - alpaka::wait(queue); - std::vector tensors = Converter::convert_input(metadata, torchDevice); - - // Check if tensor list has empty tensors - CPPUNIT_ASSERT(tensors[0].toTensor().size(0) == 0); - CPPUNIT_ASSERT(tensors[1].toTensor().size(0) == 0); - CPPUNIT_ASSERT(tensors[2].toTensor().size(0) == 0); - CPPUNIT_ASSERT(tensors[3].toTensor().size(0) == 0); -}; - -void TestSOADataTypes::testEmptyMetadata() { - // alpaka setup - Platform platform; - std::vector alpakaDevices = alpaka::getDevs(platform); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue(alpakaDevice); - torch::Device torchDevice(kTorchDeviceType); - - - // Create and fill portable collections - const std::size_t batch_size = 12; - PortableCollection deviceCollection(batch_size, queue); - fill(queue, deviceCollection); - - // Run Converter for empty metadata - SoAWrapper input(batch_size); - SoAWrapper output(batch_size); - ModelMetadata metadata(input, output); - - alpaka::wait(queue); - std::vector tensors = - Converter::convert_input(metadata, torchDevice); - - // Check if tensor list is empty - CPPUNIT_ASSERT(tensors.size() == 0); -}; + } + }; + + void fill(Queue& queue, PortableCollection& collection) { + uint32_t items = 64; + uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, workDiv, FillKernel{}, collection.view()); + alpaka::exec(queue, workDiv, InputVerifyKernel{}, collection.view()); + } + + void check(Queue& queue, PortableCollection& collection, std::vector tensors) { + uint32_t items = 64; + uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, + workDiv, + TestVerifyKernel{}, + collection.view(), + tensors[3].toTensor().packed_accessor64(), + tensors[2].toTensor().packed_accessor64(), + tensors[0].toTensor().packed_accessor64(), + tensors[1].toTensor().packed_accessor64()); + } + + void check_not_ordered(Queue& queue, + PortableCollection& collection, + std::vector tensors) { + uint32_t items = 64; + uint32_t groups = 
cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, + workDiv, + TestVerifyKernel{}, + collection.view(), + tensors[0].toTensor().packed_accessor64(), + tensors[1].toTensor().packed_accessor64(), + tensors[2].toTensor().packed_accessor64(), + tensors[3].toTensor().packed_accessor64()); + } + + void check_output(Queue& queue, PortableCollection& collection) { + uint32_t items = 64; + uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, workDiv, TestOutputVerifyKernel{}, collection.view()); + } + + void TestSOADataTypes::testInterfaceVerbose() { + Platform platform; + std::vector alpakaDevices = alpaka::getDevs(platform); + const auto& alpakaHost = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue{alpakaDevice}; + torch::Device torchDevice(kTorchDeviceType); + + // Large batch size, so multiple bunches needed + const std::size_t batch_size = 325; + + // Create and fill needed portable collections + PortableCollection deviceCollection(batch_size, queue); + fill(queue, deviceCollection); + SoAMetaRecords records = deviceCollection.view().records(); + + SoAWrapper input(batch_size); + input.append_block("vector", records.a(), records.b()); + input.append_block("matrix", records.c()); + input.append_block("column", records.x(), records.y(), records.z()); + input.append_block("scalar", records.type()); + input.change_order({"column", "scalar", "matrix", "vector"}); + + SoAWrapper output(batch_size); + output.append_block("result", records.v()); + ModelMetadata metadata(input, output); + + alpaka::wait(queue); + std::vector tensors = Converter::convert_input(metadata, torchDevice); + + // Check if tensor list built correctly + check(queue, deviceCollection, tensors); + }; + + void TestSOADataTypes::testMultiOutput() { + Platform platform; + std::vector alpakaDevices = alpaka::getDevs(platform); + const auto& alpakaHost = alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue{alpakaDevice}; + torch::Device torchDevice(kTorchDeviceType); + + // Large batch size, so multiple bunches needed + const std::size_t batch_size = 325; + + // Create and fill needed portable collections + PortableCollection deviceCollection(batch_size, queue); + fill(queue, deviceCollection); + + auto records = deviceCollection.view().records(); + SoAWrapper input(batch_size); + input.append_block("x", records.x()); + input.append_block("y", records.y()); + + SoAWrapper output(batch_size); + output.append_block("v", records.v()); + output.append_block("w", records.w()); + ModelMetadata metadata(input, output); + + alpaka::wait(queue); + std::vector tensors = Converter::convert_input(metadata, torchDevice); + Converter::convert_output(tensors, metadata, torchDevice); + + // Check if tensor list built correctly + check_output(queue, deviceCollection); + }; + + void TestSOADataTypes::testSingleElement() { + Platform platform; + std::vector alpakaDevices = alpaka::getDevs(platform); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue(alpakaDevice); + torch::Device torchDevice(kTorchDeviceType); + + // Create and fill portable collections + const std::size_t 
batch_size = 1; + PortableCollection deviceCollection(batch_size, queue); + fill(queue, deviceCollection); + SoAMetaRecords records = deviceCollection.view().records(); + + // Run Converter for single tensor + SoAWrapper input(batch_size); + input.append_block("vector", records.a(), records.b()); + input.append_block("matrix", records.c()); + input.append_block("column", records.x(), records.y(), records.z()); + input.append_block("scalar", records.type()); + input.change_order({"column", "scalar", "matrix", "vector"}); + + SoAWrapper output(batch_size); + output.append_block("result", records.v()); + ModelMetadata metadata(input, output); + + alpaka::wait(queue); + std::vector tensors = Converter::convert_input(metadata, torchDevice); + + // Check if tensor list built correctly + check(queue, deviceCollection, tensors); + }; + + void TestSOADataTypes::testNoElement() { + Platform platform; + std::vector alpakaDevices = alpaka::getDevs(platform); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue(alpakaDevice); + torch::Device torchDevice(kTorchDeviceType); + + //Create empty portable collection + const std::size_t batch_size = 0; + PortableCollection deviceCollection(batch_size, queue); + SoAMetaRecords records = deviceCollection.view().records(); + + // Run Converter + SoAWrapper input(batch_size); + input.append_block("vector", records.a(), records.b()); + input.append_block("matrix", records.c()); + input.append_block("column", records.x(), records.y(), records.z()); + input.append_block("scalar", records.type()); + input.change_order({"column", "scalar", "matrix", "vector"}); + + SoAWrapper output(batch_size); + output.append_block("result", records.v()); + ModelMetadata metadata(input, output); + + alpaka::wait(queue); + std::vector tensors = Converter::convert_input(metadata, torchDevice); + + // Check if tensor list has empty tensors + CPPUNIT_ASSERT(tensors[0].toTensor().size(0) == 0); + CPPUNIT_ASSERT(tensors[1].toTensor().size(0) == 0); + CPPUNIT_ASSERT(tensors[2].toTensor().size(0) == 0); + CPPUNIT_ASSERT(tensors[3].toTensor().size(0) == 0); + }; + + void TestSOADataTypes::testEmptyMetadata() { + // alpaka setup + Platform platform; + std::vector alpakaDevices = alpaka::getDevs(platform); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue(alpakaDevice); + torch::Device torchDevice(kTorchDeviceType); + + // Create and fill portable collections + const std::size_t batch_size = 12; + PortableCollection deviceCollection(batch_size, queue); + fill(queue, deviceCollection); + + // Run Converter for empty metadata + SoAWrapper input(batch_size); + SoAWrapper output(batch_size); + ModelMetadata metadata(input, output); + + alpaka::wait(queue); + std::vector tensors = Converter::convert_input(metadata, torchDevice); + + // Check if tensor list is empty + CPPUNIT_ASSERT(tensors.size() == 0); + }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testSOAtoTorch.dev.cc b/PhysicsTools/PyTorch/test/alpaka/testSOAtoTorch.dev.cc index 90f5c2c583a..3566a7ac309 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testSOAtoTorch.dev.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testSOAtoTorch.dev.cc @@ -17,128 +17,129 @@ #include "PhysicsTools/PyTorch/interface/Converter.h" #include "PhysicsTools/PyTorch/test/testBase.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -// Input SOA -GENERATE_SOA_LAYOUT(SoAPositionTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y), 
SOA_COLUMN(float, z)) - -using SoAPosition = SoAPositionTemplate<>; -using SoAPositionView = SoAPosition::View; -using SoAPositionConstView = SoAPosition::ConstView; + // Input SOA + GENERATE_SOA_LAYOUT(SoAPositionTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y), SOA_COLUMN(float, z)) -// Output SOA -GENERATE_SOA_LAYOUT(SoAResultTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y)) + using SoAPosition = SoAPositionTemplate<>; + using SoAPositionView = SoAPosition::View; + using SoAPositionConstView = SoAPosition::ConstView; -using SoAResult = SoAResultTemplate<>; -using SoAResultView = SoAResult::View; + // Output SOA + GENERATE_SOA_LAYOUT(SoAResultTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y)) -class testSOAToTorch : public testBasePyTorch { - CPPUNIT_TEST_SUITE(testSOAToTorch); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); + using SoAResult = SoAResultTemplate<>; + using SoAResultView = SoAResult::View; -public: - std::string pyScript() const override; - void test() override; -}; + class testSOAToTorch : public testBasePyTorch { + CPPUNIT_TEST_SUITE(testSOAToTorch); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); -CPPUNIT_TEST_SUITE_REGISTRATION(testSOAToTorch); + public: + std::string pyScript() const override; + void test() override; + }; -std::string testSOAToTorch::pyScript() const { return "create_linear_dnn.py"; } + CPPUNIT_TEST_SUITE_REGISTRATION(testSOAToTorch); -// Build Tensor, run model and fill output pointer with result -template -void run(torch::Device device, torch::jit::Module& model, const cms::torch::alpaka::ModelMetadata& metadata) { - std::vector input_tensor = cms::torch::alpaka::Converter::convert_input(metadata, device); - cms::torch::alpaka::Converter::convert_output(metadata, device) = model.forward(input_tensor).toTensor(); -} + std::string testSOAToTorch::pyScript() const { return "create_linear_dnn.py"; } -class FillKernel { -public: - template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, PortableCollection::View view) const { - float input[4][3] = {{1, 2, 1}, {2, 4, 3}, {3, 4, 1}, {2, 3, 2}}; + // Build Tensor, run model and fill output pointer with result + template + void run(torch::Device device, + torch::jit::Module& model, + const cms::torch::alpaka::ModelMetadata& metadata) { + std::vector input_tensor = cms::torch::alpaka::Converter::convert_input(metadata, device); + cms::torch::alpaka::Converter::convert_output(metadata, device) = model.forward(input_tensor).toTensor(); + } - for (int32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - view.x()[i] = input[i][0]; - view.y()[i] = input[i][1]; - view.z()[i] = input[i][2]; + class FillKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, PortableCollection::View view) const { + float input[4][3] = {{1, 2, 1}, {2, 4, 3}, {3, 4, 1}, {2, 3, 2}}; + + for (int32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + view.x()[i] = input[i][0]; + view.y()[i] = input[i][1]; + view.z()[i] = input[i][2]; + } } - } -}; - -class TestVerifyKernel { -public: - ALPAKA_FN_ACC void operator()(Acc1D const& acc, PortableCollection::View view) const { - float result_check[4][2] = {{2.3, -0.5}, {6.6, 3.0}, {2.5, -4.9}, {4.4, 1.3}}; - for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { - ALPAKA_ASSERT_ACC(view.x()[i] - result_check[i][0] < 1.0e-05); - ALPAKA_ASSERT_ACC(view.x()[i] - result_check[i][0] > -1.0e-05); - ALPAKA_ASSERT_ACC(view.y()[i] - result_check[i][1] < 1.0e-05); - 
ALPAKA_ASSERT_ACC(view.y()[i] - result_check[i][1] > -1.0e-05); + }; + + class TestVerifyKernel { + public: + ALPAKA_FN_ACC void operator()(Acc1D const& acc, PortableCollection::View view) const { + float result_check[4][2] = {{2.3, -0.5}, {6.6, 3.0}, {2.5, -4.9}, {4.4, 1.3}}; + for (uint32_t i : cms::alpakatools::uniform_elements(acc, view.metadata().size())) { + ALPAKA_ASSERT_ACC(view.x()[i] - result_check[i][0] < 1.0e-05); + ALPAKA_ASSERT_ACC(view.x()[i] - result_check[i][0] > -1.0e-05); + ALPAKA_ASSERT_ACC(view.y()[i] - result_check[i][1] < 1.0e-05); + ALPAKA_ASSERT_ACC(view.y()[i] - result_check[i][1] > -1.0e-05); + } } + }; + + void fill(Queue& queue, PortableCollection& collection) { + uint32_t items = 64; + uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, workDiv, FillKernel{}, collection.view()); } -}; - -void fill(Queue& queue, PortableCollection& collection) { - uint32_t items = 64; - uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, workDiv, FillKernel{}, collection.view()); -} - -void check(Queue& queue, PortableCollection& collection) { - uint32_t items = 64; - uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); - auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, workDiv, TestVerifyKernel{}, collection.view()); -} - -void testSOAToTorch::test() { - Platform platform; - std::vector alpakaDevices = ::alpaka::getDevs(platform); - const auto& alpakaHost = ::alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); - CPPUNIT_ASSERT(alpakaDevices.size()); - const auto& alpakaDevice = alpakaDevices[0]; - Queue queue{alpakaDevice}; - torch::Device torchDevice(cms::torch::alpaka::kTorchDeviceType); - - // Number of elements - const std::size_t batch_size = 4; - - // Create and fill needed portable collections - PortableCollection positionCollection(batch_size, alpakaDevice); - PortableCollection resultCollection(batch_size, alpakaDevice); - fill(queue, positionCollection); - alpaka::wait(queue); - - torch::jit::script::Module model; - try { - // Deserialize the ScriptModule from a file using torch::jit::load(). 
- std::string model_path = dataPath_ + "/linear_dnn.pt"; - model = torch::jit::load(model_path); - model.to(torchDevice); - - } catch (const c10::Error& e) { - std::cerr << "error loading the model\n" << e.what() << std::endl; + + void check(Queue& queue, PortableCollection& collection) { + uint32_t items = 64; + uint32_t groups = cms::alpakatools::divide_up_by(collection->metadata().size(), items); + auto workDiv = cms::alpakatools::make_workdiv(groups, items); + alpaka::exec(queue, workDiv, TestVerifyKernel{}, collection.view()); } - // Create SoA Metadata - cms::torch::alpaka::SoAWrapper input(batch_size); - auto posview = positionCollection.view().records(); - input.append_block("main", posview.x(), posview.y(), posview.z()); + void testSOAToTorch::test() { + Platform platform; + std::vector alpakaDevices = ::alpaka::getDevs(platform); + const auto& alpakaHost = ::alpaka::getDevByIdx(alpaka_common::PlatformHost(), 0u); + CPPUNIT_ASSERT(alpakaDevices.size()); + const auto& alpakaDevice = alpakaDevices[0]; + Queue queue{alpakaDevice}; + torch::Device torchDevice(cms::torch::alpaka::kTorchDeviceType); + + // Number of elements + const std::size_t batch_size = 4; + + // Create and fill needed portable collections + PortableCollection positionCollection(batch_size, alpakaDevice); + PortableCollection resultCollection(batch_size, alpakaDevice); + fill(queue, positionCollection); + alpaka::wait(queue); + + torch::jit::script::Module model; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + std::string model_path = dataPath_ + "/linear_dnn.pt"; + model = torch::jit::load(model_path); + model.to(torchDevice); + + } catch (const c10::Error& e) { + std::cerr << "error loading the model\n" << e.what() << std::endl; + } + + // Create SoA Metadata + cms::torch::alpaka::SoAWrapper input(batch_size); + auto posview = positionCollection.view().records(); + input.append_block("main", posview.x(), posview.y(), posview.z()); - cms::torch::alpaka::SoAWrapper output(batch_size); - auto view = resultCollection.view().records(); - output.append_block("result", view.x(), view.y()); - cms::torch::alpaka::ModelMetadata metadata(input, output); + cms::torch::alpaka::SoAWrapper output(batch_size); + auto view = resultCollection.view().records(); + output.append_block("result", view.x(), view.y()); + cms::torch::alpaka::ModelMetadata metadata(input, output); - // Call function to build tensor and run model - run(torchDevice, model, metadata); - alpaka::wait(queue); + // Call function to build tensor and run model + run(torchDevice, model, metadata); + alpaka::wait(queue); - check(queue, resultCollection); -} + check(queue, resultCollection); + } -} // namespace ALPAKA_ACCELERATOR_NAMESPACE::torch_alpaka \ No newline at end of file +} // namespace ALPAKA_ACCELERATOR_NAMESPACE \ No newline at end of file diff --git a/PhysicsTools/PyTorch/test/alpaka/testTorchDeviceMatchAlpaka.cc b/PhysicsTools/PyTorch/test/alpaka/testTorchDeviceMatchAlpaka.cc index 100e0c007bc..f966926d046 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testTorchDeviceMatchAlpaka.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testTorchDeviceMatchAlpaka.cc @@ -4,31 +4,30 @@ #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "PhysicsTools/PyTorch/interface/AlpakaConfig.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -class TestTorchDeviceMatchAlpaka : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(TestTorchDeviceMatchAlpaka); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); + class 
TestTorchDeviceMatchAlpaka : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(TestTorchDeviceMatchAlpaka); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); - public: - void test(); -}; + public: + void test(); + }; -CPPUNIT_TEST_SUITE_REGISTRATION(TestTorchDeviceMatchAlpaka); + CPPUNIT_TEST_SUITE_REGISTRATION(TestTorchDeviceMatchAlpaka); -void TestTorchDeviceMatchAlpaka::test() { - // alpaka setup - Platform platform; - const auto& devices = alpaka::getDevs(platform); + void TestTorchDeviceMatchAlpaka::test() { + // alpaka setup + Platform platform; + const auto& devices = alpaka::getDevs(platform); - std::cout << "Devices:" << std::endl; - for (auto& device : devices) { - std::cout << "- " << alpaka::getName(device) << std::endl; - auto torch_device = cms::torch::alpaka::device(device); - CPPUNIT_ASSERT(torch_device.type() == cms::torch::alpaka::kTorchDeviceType); + std::cout << "Devices:" << std::endl; + for (auto& device : devices) { + std::cout << "- " << alpaka::getName(device) << std::endl; + auto torch_device = cms::torch::alpaka::device(device); + CPPUNIT_ASSERT(torch_device.type() == cms::torch::alpaka::kTorchDeviceType); + } } -} } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/alpaka/testTorchExecutionControlGuard.cc b/PhysicsTools/PyTorch/test/alpaka/testTorchExecutionControlGuard.cc index 858f5bc2570..8efae73552f 100644 --- a/PhysicsTools/PyTorch/test/alpaka/testTorchExecutionControlGuard.cc +++ b/PhysicsTools/PyTorch/test/alpaka/testTorchExecutionControlGuard.cc @@ -11,117 +11,109 @@ #include "PhysicsTools/PyTorch/interface/Nvtx.h" #include "PhysicsTools/PyTorch/test/testUtilities.h" - namespace ALPAKA_ACCELERATOR_NAMESPACE { -GENERATE_SOA_LAYOUT(SoAInputsTemplate, - SOA_COLUMN(float, x), - SOA_COLUMN(float, y), - SOA_COLUMN(float, z) -) - -GENERATE_SOA_LAYOUT(SoAOutputsTemplate, - SOA_COLUMN(float, m), - SOA_COLUMN(float, n) -) - -using SoAInputs = SoAInputsTemplate<>; -using SoAOutputs = SoAOutputsTemplate<>; - -class testTorchExecutionControlGuard : public CppUnit::TestFixture { - CPPUNIT_TEST_SUITE(testTorchExecutionControlGuard); - CPPUNIT_TEST(test); - CPPUNIT_TEST_SUITE_END(); - - public: - void test(); -}; - -CPPUNIT_TEST_SUITE_REGISTRATION(testTorchExecutionControlGuard); - -void testTorchExecutionControlGuard::test() { - if (prctl(PR_SET_NAME, "test::Main", 0, 0, 0)) - printf("Warning: Could not set thread name: %s\n", strerror(errno)); - - // alpaka setup - Platform platform; - const auto& devices = alpaka::getDevs(platform); - assert(!devices.empty()); - const auto& device = devices[0]; - - uint32_t batch_size = 32; - - // host structs - PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); - // prepare inputs - for (size_t i = 0; i < batch_size; i++) { - inputs_host.view().x()[i] = 0.0f; - inputs_host.view().y()[i] = 0.0f; - inputs_host.view().z()[i] = 0.0f; - } - - size_t num_threads = 8; - std::vector threads; - for (size_t t = 1; t <= num_threads; ++t) { - threads.emplace_back([&, t] { - std::cout << "T " << t << " Starting" << std::endl; - Queue queue{device}; - cms::torch::alpaka::Guard guard(queue); - - char threadName[15]; - snprintf(threadName, 15, "test::%ld", t); - if (prctl(PR_SET_NAME, threadName, 0, 0, 0)) - printf("Warning: Could not set thread name: %s\n", strerror(errno)); - - for (size_t i = 0; i < 10; i++) { - NvtxScopedRange malloc_range((std::string("Malloc thread ") + std::to_string(t)).c_str()); - // host structs - PortableHostCollection outputs_host(batch_size, 
cms::alpakatools::host()); - // device structs - std::cout << "T" << t << " I" << i << std::endl; - PortableCollection inputs_device(batch_size, queue); - PortableCollection outputs_device(batch_size, queue); - alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); - malloc_range.end(); - - NvtxScopedRange minit_range((std::string("Model instantiation thread ") + std::to_string(t)).c_str()); - auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); - auto jit_model = cms::torch::alpaka::Model(m_path); - jit_model.to(queue); - CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); - minit_range.end(); - - NvtxScopedRange meta_range((std::string("Metarecords thread ") + std::to_string(t)).c_str()); - auto input_records = inputs_device.view().records(); - auto output_records = outputs_device.view().records(); - cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); - inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); - cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); - outputs_metadata.append_block("preds", output_records.m(), output_records.n()); - cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); - meta_range.end(); - // inference - NvtxScopedRange infer_range((std::string("Inference thread ") + std::to_string(t)).c_str()); - jit_model.forward(metadata); - infer_range.end(); - // check outputs - NvtxScopedRange assert_range((std::string("Assert thread ") + std::to_string(t)).c_str()); - alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); - alpaka::wait(queue); - - for (size_t i = 0; i < batch_size; i++) { - CPPUNIT_ASSERT(outputs_host.const_view().m()[i] == 0.5f); - CPPUNIT_ASSERT(outputs_host.const_view().n()[i] == 0.5f); + GENERATE_SOA_LAYOUT(SoAInputsTemplate, SOA_COLUMN(float, x), SOA_COLUMN(float, y), SOA_COLUMN(float, z)) + + GENERATE_SOA_LAYOUT(SoAOutputsTemplate, SOA_COLUMN(float, m), SOA_COLUMN(float, n)) + + using SoAInputs = SoAInputsTemplate<>; + using SoAOutputs = SoAOutputsTemplate<>; + + class testTorchExecutionControlGuard : public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(testTorchExecutionControlGuard); + CPPUNIT_TEST(test); + CPPUNIT_TEST_SUITE_END(); + + public: + void test(); + }; + + CPPUNIT_TEST_SUITE_REGISTRATION(testTorchExecutionControlGuard); + + void testTorchExecutionControlGuard::test() { + if (prctl(PR_SET_NAME, "test::Main", 0, 0, 0)) + printf("Warning: Could not set thread name: %s\n", strerror(errno)); + + // alpaka setup + Platform platform; + const auto& devices = alpaka::getDevs(platform); + assert(!devices.empty()); + const auto& device = devices[0]; + + uint32_t batch_size = 32; + + // host structs + PortableHostCollection inputs_host(batch_size, cms::alpakatools::host()); + // prepare inputs + for (size_t i = 0; i < batch_size; i++) { + inputs_host.view().x()[i] = 0.0f; + inputs_host.view().y()[i] = 0.0f; + inputs_host.view().z()[i] = 0.0f; + } + + size_t num_threads = 8; + std::vector threads; + for (size_t t = 1; t <= num_threads; ++t) { + threads.emplace_back([&, t] { + std::cout << "T " << t << " Starting" << std::endl; + Queue queue{device}; + cms::torch::alpaka::Guard guard(queue); + + char threadName[15]; + snprintf(threadName, 15, "test::%ld", t); + if (prctl(PR_SET_NAME, threadName, 0, 0, 0)) + printf("Warning: Could not set thread name: %s\n", strerror(errno)); + + for (size_t i = 0; i < 10; i++) { + NvtxScopedRange malloc_range((std::string("Malloc thread ") + 
std::to_string(t)).c_str()); + // host structs + PortableHostCollection outputs_host(batch_size, cms::alpakatools::host()); + // device structs + std::cout << "T" << t << " I" << i << std::endl; + PortableCollection inputs_device(batch_size, queue); + PortableCollection outputs_device(batch_size, queue); + alpaka::memcpy(queue, inputs_device.buffer(), inputs_host.buffer()); + malloc_range.end(); + + NvtxScopedRange minit_range((std::string("Model instantiation thread ") + std::to_string(t)).c_str()); + auto m_path = get_path("/src/PhysicsTools/PyTorch/models/jit_classification_model.pt"); + auto jit_model = cms::torch::alpaka::Model(m_path); + jit_model.to(queue); + CPPUNIT_ASSERT(cms::torch::alpaka::device(queue) == jit_model.device()); + minit_range.end(); + + NvtxScopedRange meta_range((std::string("Metarecords thread ") + std::to_string(t)).c_str()); + auto input_records = inputs_device.view().records(); + auto output_records = outputs_device.view().records(); + cms::torch::alpaka::SoAWrapper inputs_metadata(batch_size); + inputs_metadata.append_block("features", input_records.x(), input_records.y(), input_records.z()); + cms::torch::alpaka::SoAWrapper outputs_metadata(batch_size); + outputs_metadata.append_block("preds", output_records.m(), output_records.n()); + cms::torch::alpaka::ModelMetadata metadata(inputs_metadata, outputs_metadata); + meta_range.end(); + // inference + NvtxScopedRange infer_range((std::string("Inference thread ") + std::to_string(t)).c_str()); + jit_model.forward(metadata); + infer_range.end(); + // check outputs + NvtxScopedRange assert_range((std::string("Assert thread ") + std::to_string(t)).c_str()); + alpaka::memcpy(queue, outputs_host.buffer(), outputs_device.buffer()); + alpaka::wait(queue); + + for (size_t i = 0; i < batch_size; i++) { + CPPUNIT_ASSERT(outputs_host.const_view().m()[i] == 0.5f); + CPPUNIT_ASSERT(outputs_host.const_view().n()[i] == 0.5f); + } + assert_range.end(); } - assert_range.end(); - } - - std::cout << "T " << t << " OK." << std::endl; - }); - } - for (auto& t : threads) - t.join(); -} + std::cout << "T " << t << " OK." 
<< std::endl; + }); + } + + for (auto& t : threads) + t.join(); + } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/PhysicsTools/PyTorch/test/testModel.cc b/PhysicsTools/PyTorch/test/testModel.cc index 55c30141d4f..bca55acdeab 100644 --- a/PhysicsTools/PyTorch/test/testModel.cc +++ b/PhysicsTools/PyTorch/test/testModel.cc @@ -4,7 +4,6 @@ #include #include "PhysicsTools/PyTorch/test/testUtilities.h" - class TestModel : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(TestModel); CPPUNIT_TEST(testClassificationCpu); @@ -30,8 +29,8 @@ CPPUNIT_TEST_SUITE_REGISTRATION(TestModel); void TestModel::testClassificationCpu() { auto device = torch::Device(torch::kCPU, 0); - auto inputs = torch::ones({batch_size_, 3}, device=device); - + auto inputs = torch::ones({batch_size_, 3}, device = device); + ClassifierModel model; auto outputs = model.forward(inputs); CPPUNIT_ASSERT(torch::allclose(outputs, torch::full_like(outputs, 0.5f))); @@ -39,7 +38,7 @@ void TestModel::testClassificationCpu() { void TestModel::testClassificationCuda() { auto device = torch::Device(torch::kCUDA, 0); - auto inputs = torch::ones({batch_size_, 3}, device=device); + auto inputs = torch::ones({batch_size_, 3}, device = device); ClassifierModel model; model.to(device); @@ -49,8 +48,8 @@ void TestModel::testClassificationCuda() { void TestModel::testRegressionCpu() { auto device = torch::Device(torch::kCPU, 0); - auto inputs = torch::ones({batch_size_, 3}, device=device); - + auto inputs = torch::ones({batch_size_, 3}, device = device); + RegressionModel model; auto outputs = model.forward(inputs); CPPUNIT_ASSERT(torch::allclose(outputs, torch::full_like(outputs, 0.5f))); @@ -58,7 +57,7 @@ void TestModel::testRegressionCpu() { void TestModel::testRegressionCuda() { auto device = torch::Device(torch::kCUDA, 0); - auto inputs = torch::ones({batch_size_, 3}, device=device); + auto inputs = torch::ones({batch_size_, 3}, device = device); RegressionModel model; model.to(device); @@ -68,8 +67,8 @@ void TestModel::testRegressionCuda() { void TestModel::testMultiTaskModelCpu() { auto device = torch::Device(torch::kCPU, 0); - auto inputs = torch::ones({batch_size_, 5}, device=device); - + auto inputs = torch::ones({batch_size_, 5}, device = device); + MultiTaskModel model; auto [class_probs, reg_output] = model.forward(inputs); CPPUNIT_ASSERT(torch::allclose(class_probs, torch::full_like(class_probs, 0.2f))); @@ -78,7 +77,7 @@ void TestModel::testMultiTaskModelCpu() { void TestModel::testMultiTaskModelCuda() { auto device = torch::Device(torch::kCUDA, 0); - auto inputs = torch::ones({batch_size_, 5}, device=device); + auto inputs = torch::ones({batch_size_, 5}, device = device); MultiTaskModel model; model.to(device); diff --git a/PhysicsTools/PyTorch/test/testTensorStride.cu b/PhysicsTools/PyTorch/test/testTensorStride.cu index 32943ffca6e..85837dfdc16 100644 --- a/PhysicsTools/PyTorch/test/testTensorStride.cu +++ b/PhysicsTools/PyTorch/test/testTensorStride.cu @@ -12,7 +12,6 @@ #include "PhysicsTools/PyTorch/test/testUtilities.h" - class TestTensorStride : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(TestTensorStride); CPPUNIT_TEST(test); @@ -24,17 +23,16 @@ public: CPPUNIT_TEST_SUITE_REGISTRATION(TestTensorStride); - template torch::Tensor array_to_tensor(torch::Device device, T* arr, const long int* size) { long int arr_size[N]; long int arr_stride[N]; - std::copy(size, size+N, arr_size); - std::copy(size, size+N, arr_stride); + std::copy(size, size + N, arr_size); + std::copy(size, size + N, arr_stride); 
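+  // shift the sizes right by one, set the first stride to 1 and scale the last:
+  // this yields the column-major strides {1, size[0], size[0] * size[1]}
+  // (the shortcut is only valid for ranks N <= 3)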
  std::shift_right(std::begin(arr_stride), std::end(arr_stride), 1);
  arr_stride[0] = 1;
-  arr_stride[N-1] *= arr_stride[N-2];
+  arr_stride[N - 1] *= arr_stride[N - 2];

  auto options = torch::TensorOptions().dtype(torch::CppTypeToScalarType<T>()).device(device).pinned_memory(true);
  return torch::from_blob(arr, arr_size, arr_stride, options);
@@ -45,46 +43,49 @@ void print_column_major(T* arr, const long int* size) {
  if (N == 2) {
    for (int i = 0; i < size[0]; i++) {
      for (int j = 0; j < size[1]; j++) {
-        std::cout << arr[i + j*size[0]] << " ";
+        std::cout << arr[i + j * size[0]] << " ";
      }
      std::cout << std::endl;
-    }
+    }
  } else if (N == 3) {
    for (int i = 0; i < size[0]; i++) {
      std::cout << "(" << i << ", .., ..)" << std::endl;
      for (int j = 0; j < size[1]; j++) {
        for (int k = 0; k < size[2]; k++) {
-          std::cout << arr[i + j*size[0] + k*size[0]*size[1]] << " ";
+          std::cout << arr[i + j * size[0] + k * size[0] * size[1]] << " ";
        }
        std::cout << std::endl;
-      }
+      }
      std::cout << std::endl;
    }
  }
  std::cout << std::endl;
}

-
 template <typename T, std::size_t N, std::size_t M>
-void run(torch::Device device, LinearModel model, T* input, const long int* input_shape, T* output, const long int* output_shape) {
+void run(torch::Device device,
+         LinearModel model,
+         T* input,
+         const long int* input_shape,
+         T* output,
+         const long int* output_shape) {
  torch::Tensor input_tensor = array_to_tensor<T, N>(device, input, input_shape);  // from_blob doesn't work if we use the array from the parameter list directly

  long int res_shape[M];
-  std::copy(output_shape, output_shape+M, res_shape);
+  std::copy(output_shape, output_shape + M, res_shape);

  array_to_tensor<T, M>(device, output, output_shape) = model.forward(input_tensor);
}

-
 void TestTensorStride::test() {
  torch::Device device(torch::kCUDA);
-
+
  float input_cpu[] = {1, 2, 3, 2, 2, 4, 4, 3, 1, 3, 1, 2};
  const long int shape[] = {4, 3};
  const long int result_shape[] = {4, 2};
-  float result_cpu[result_shape[0]*result_shape[1]];
+  float result_cpu[result_shape[0] * result_shape[1]];
  float result_check[4][2] = {{2.3, -0.5}, {6.6, 3.0}, {2.5, -4.9}, {4.4, 1.3}};

  // Prints array in correct form.
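
For reference, `array_to_tensor` above avoids any copy: `torch::from_blob` merely wraps the existing buffer with explicit column-major strides. A minimal, self-contained CPU sketch of the same idea (the shapes and names below are illustrative only, not taken from the patch):

    #include <torch/torch.h>

    #include <iostream>

    int main() {
      // a 4x3 float matrix stored column-major: element (i, j) lives at data[i + 4 * j]
      float data[12];
      for (int i = 0; i < 12; ++i)
        data[i] = static_cast<float>(i);

      auto options = torch::TensorOptions().dtype(torch::kFloat);
      // wrap the buffer with explicit column-major strides {1, 4}; no copy is made
      torch::Tensor strided = torch::from_blob(data, {4, 3}, {1, 4}, options);
      // the equivalent "transpose trick": read as row-major 3x4, then swap the axes
      torch::Tensor transposed = torch::from_blob(data, {3, 4}, options).transpose(0, 1);

      std::cout << torch::equal(strided, transposed) << std::endl;  // prints 1
    }
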
@@ -93,11 +94,11 @@ void TestTensorStride::test() { float *input_gpu, *result_gpu; cudaMalloc(&input_gpu, sizeof(input_cpu)); cudaMalloc(&result_gpu, sizeof(result_cpu)); - cudaMemcpy(input_gpu, input_cpu, sizeof(input_cpu), cudaMemcpyHostToDevice); + cudaMemcpy(input_gpu, input_cpu, sizeof(input_cpu), cudaMemcpyHostToDevice); LinearModel model; model.to(device); - + // Call function to build tensor and run model run(device, model, input_gpu, shape, result_gpu, result_shape); @@ -105,8 +106,7 @@ void TestTensorStride::test() { cudaMemcpy(result_cpu, result_gpu, sizeof(result_cpu), cudaMemcpyDeviceToHost); for (int i = 0; i < result_shape[0]; i++) { for (int j = 0; j < result_shape[1]; j++) { - CPPUNIT_ASSERT(std::abs(result_cpu[i + j*result_shape[0]] - result_check[i][j]) <= 1.0e-05); + CPPUNIT_ASSERT(std::abs(result_cpu[i + j * result_shape[0]] - result_check[i][j]) <= 1.0e-05); } } - } diff --git a/PhysicsTools/PyTorch/test/testTorchFromBlobStride.cu b/PhysicsTools/PyTorch/test/testTorchFromBlobStride.cu index d9088bc48e8..fe147e914e3 100644 --- a/PhysicsTools/PyTorch/test/testTorchFromBlobStride.cu +++ b/PhysicsTools/PyTorch/test/testTorchFromBlobStride.cu @@ -9,10 +9,9 @@ #include #include - -using std::chrono::high_resolution_clock; -using std::chrono::duration_cast; using std::chrono::duration; +using std::chrono::duration_cast; +using std::chrono::high_resolution_clock; using std::chrono::milliseconds; /* @@ -22,12 +21,12 @@ template torch::Tensor array_to_tensor(torch::Device device, T* arr, const long int* size) { long int arr_size[N]; long int arr_stride[N]; - std::copy(size, size+N, arr_size); - std::copy(size, size+N, arr_stride); + std::copy(size, size + N, arr_size); + std::copy(size, size + N, arr_stride); std::shift_right(std::begin(arr_stride), std::end(arr_stride), 1); arr_stride[0] = 1; - arr_stride[N-1] *= arr_stride[N-2]; + arr_stride[N - 1] *= arr_stride[N - 2]; // Create Torch DType based on https://discuss.pytorch.org/t/mapping-a-template-type-to-a-scalartype/53174 auto options = torch::TensorOptions().dtype(torch::CppTypeToScalarType()).device(device).pinned_memory(true); @@ -41,26 +40,25 @@ void print_column_major(T* arr, const long int* size) { if (N == 2) { for (int i = 0; i < size[0]; i++) { for (int j = 0; j < size[1]; j++) { - std::cout << arr[i + j*size[0]] << " "; + std::cout << arr[i + j * size[0]] << " "; } std::cout << std::endl; - } + } } else if (N == 3) { for (int i = 0; i < size[0]; i++) { std::cout << "(" << i << ", .., ..)" << std::endl; for (int j = 0; j < size[1]; j++) { for (int k = 0; k < size[2]; k++) { - std::cout << arr[i + j*size[0] + k*size[0]*size[1]] << " "; + std::cout << arr[i + j * size[0] + k * size[0] * size[1]] << " "; } std::cout << std::endl; - } + } std::cout << std::endl; } } } - -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) { torch::Device device(torch::kCUDA); int a_cpu[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; @@ -75,7 +73,7 @@ int main(int argc, char* argv[]) { // Prints array in correct form. 
  print_column_major(a_cpu, a_shape);

-  int *a_gpu;
+  int* a_gpu;
  cudaMalloc(&a_gpu, sizeof(a_cpu));
  cudaMemcpy(a_gpu, a_cpu, sizeof(a_cpu), cudaMemcpyHostToDevice);
@@ -83,16 +81,16 @@ int main(int argc, char* argv[]) {
  auto options = torch::TensorOptions().dtype(torch::kInt).device(device).pinned_memory(true);
  std::cout << "Converting array to Torch tensor without stride" << std::endl;
  torch::Tensor tensor = torch::from_blob(a_gpu, a_shape, options);
-  std::cout << tensor << std::endl; 
+  std::cout << tensor << std::endl;

  // Correct transposition to get the same dimensions as column-major.
  std::cout << "Correct Tensor with Transpose" << std::endl;
  long int a_size[dims];
-  std::copy(a_shape, a_shape+dims, a_size);
+  std::copy(a_shape, a_shape + dims, a_size);
  std::reverse(std::begin(a_size), std::end(a_size));

  tensor = torch::from_blob(a_gpu, a_size, options);
-  tensor = torch::transpose(tensor, 0, dims-1);
+  tensor = torch::transpose(tensor, 0, dims - 1);
  std::cout << tensor << std::endl;

  // Use stride to read correctly.
@@ -106,9 +104,9 @@ int main(int argc, char* argv[]) {
    for (int j = 0; j < b_shape[1]; j++) {
      b[i][j] = rand();
    }
-  } 
+  }

-  int *b_gpu;
+  int* b_gpu;
  cudaMalloc(&b_gpu, b_shape[0] * b_shape[1] * sizeof(int));
  cudaMemcpy(b_gpu, b, b_shape[0] * b_shape[1] * sizeof(int), cudaMemcpyHostToDevice);
@@ -124,12 +122,12 @@ int main(int argc, char* argv[]) {
  t1 = high_resolution_clock::now();
  const size_t dim_b2 = sizeof(b_shape) / sizeof(long int);
  long int b_size[dim_b2];
-  std::copy(b_shape, b_shape+dim_b2, b_size);
+  std::copy(b_shape, b_shape + dim_b2, b_size);
-  std::reverse(std::begin(a_size), std::end(a_size));
+  std::reverse(std::begin(b_size), std::end(b_size));  // reverse the sizes that are actually passed to from_blob below
  std::reverse(std::begin(b_shape), std::end(b_shape));

  torch::Tensor tensor_transp = torch::from_blob(b_gpu, b_size, options);
-  tensor_transp = torch::transpose(tensor_transp, 0, dim_b2-1);
+  tensor_transp = torch::transpose(tensor_transp, 0, dim_b2 - 1);
  t2 = high_resolution_clock::now();
  ms_double = t2 - t1;
  std::cout << "Transpose:" << ms_double.count() << "ms\n";
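
A note on the timing comparison above: `torch::transpose` is metadata-only (it swaps sizes and strides on the same storage), so neither the stride path nor the transpose path copies the data; the measured difference is bookkeeping only. A small illustrative sketch, not part of the patch:

    #include <torch/torch.h>

    #include <cassert>

    int main() {
      auto t = torch::arange(24).reshape({4, 6});
      auto v = t.transpose(0, 1);  // metadata-only: same storage, swapped strides

      assert(v.data_ptr() == t.data_ptr());  // no copy has happened
      assert(!v.is_contiguous());            // but the view is no longer row-major

      auto c = v.contiguous();               // an actual copy happens only here
      assert(c.data_ptr() != t.data_ptr());
    }
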
diff --git a/PhysicsTools/PyTorch/test/testUtilities.h b/PhysicsTools/PyTorch/test/testUtilities.h
index 5e01d2f00ef..4cbbd084ba9 100644
--- a/PhysicsTools/PyTorch/test/testUtilities.h
+++ b/PhysicsTools/PyTorch/test/testUtilities.h
@@ -14,13 +14,10 @@ std::string get_path(std::string path) {
  return (std::filesystem::exists(base.c_str()) ? base : rel) + path;
}

-
 class ClassifierModel : public torch::nn::Module {
public:
-  ClassifierModel(int in_dim = 3, int out_dim = 2)
-      : fc(in_dim, out_dim), softmax(torch::nn::SoftmaxOptions(1)) {
-    auto weight =
-        torch::tensor({{1.0, 1.0, 0.0}, {1.0, 1.0, 0.0}}, torch::kFloat);
+  ClassifierModel(int in_dim = 3, int out_dim = 2) : fc(in_dim, out_dim), softmax(torch::nn::SoftmaxOptions(1)) {
+    auto weight = torch::tensor({{1.0, 1.0, 0.0}, {1.0, 1.0, 0.0}}, torch::kFloat);
    fc->weight.set_data(weight);
    fc->weight.requires_grad_(false);
@@ -42,13 +39,11 @@ private:
  torch::nn::Softmax softmax;
};

-
 class RegressionModel : public torch::nn::Module {
public:
-  RegressionModel(int in_dim = 3, int out_dim = 1)
-      : fc(in_dim, out_dim) {
+  RegressionModel(int in_dim = 3, int out_dim = 1) : fc(in_dim, out_dim) {
    fc->weight.set_data(torch::zeros_like(fc->weight));
-    fc->weight.requires_grad_(false); 
+    fc->weight.requires_grad_(false);

    fc->bias.set_data(torch::full_like(fc->bias, 0.5f));
    fc->bias.requires_grad_(false);
@@ -56,24 +51,16 @@ public:
    fc = register_module("fc", fc);
  }

-  torch::Tensor forward(torch::Tensor input) {
-    return fc(input);
-  }
+  torch::Tensor forward(torch::Tensor input) { return fc(input); }

private:
  torch::nn::Linear fc{nullptr};
};

-
 class LinearModel : public torch::nn::Module {
public:
-  LinearModel(int in_dim = 3, int out_dim = 2)
-      : fc(in_dim, out_dim) {
-
-    auto weights = torch::tensor({
-        {-0.1f, 0.2f, 2.0f},
-        {0.1f, -2.3f, 4.0f}
-    });
+  LinearModel(int in_dim = 3, int out_dim = 2) : fc(in_dim, out_dim) {
+    auto weights = torch::tensor({{-0.1f, 0.2f, 2.0f}, {0.1f, -2.3f, 4.0f}});
    fc->weight.set_data(weights);
    fc->weight.requires_grad_(false);
@@ -83,24 +70,16 @@ public:
    fc = register_module("fc", fc);
  }

-  torch::Tensor forward(torch::Tensor x) {
-    return fc(x);
-  }
+  torch::Tensor forward(torch::Tensor x) { return fc(x); }

private:
  torch::nn::Linear fc{nullptr};
};

-
 class MultiTaskModel : public torch::nn::Module {
public:
  MultiTaskModel(int input_dim = 5)
-      : fc1(input_dim, 128),
-        fc2(128, 128),
-        class_fc1(128, 64),
-        class_fc2(64, 5),
-        reg_fc1(128, 64),
-        reg_fc2(64, 1) {
+      : fc1(input_dim, 128), fc2(128, 128), class_fc1(128, 64), class_fc2(64, 5), reg_fc1(128, 64), reg_fc2(64, 1) {
    fc1 = register_module("fc1", fc1);
    fc2 = register_module("fc2", fc2);
    class_fc1 = register_module("class_fc1", class_fc1);
@@ -108,7 +87,6 @@ public:
    reg_fc1 = register_module("reg_fc1", reg_fc1);
    reg_fc2 = register_module("reg_fc2", reg_fc2);

-    // set weights and biases
    fc1->weight.set_data(torch::full_like(fc1->weight, 0.1f));
    fc1->bias.set_data(torch::full_like(fc1->bias, 0.0f));
@@ -145,7 +123,7 @@ public:
    // Regression head
    auto reg = reg_fc1->forward(x);
-    reg = reg_fc2->forward(reg); 
+    reg = reg_fc2->forward(reg);

    return std::make_tuple(cls, reg);
  }
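
As a cross-check of the fixtures above: the `result_check` constants used by the tensor-stride tests follow directly from `LinearModel`'s frozen weights (the expected values are consistent with a zero bias). A minimal CPU sketch that reproduces them:

    #include <torch/torch.h>

    #include <iostream>

    int main() {
      // frozen LinearModel weights from testUtilities.h; zero bias assumed
      auto W = torch::tensor({{-0.1f, 0.2f, 2.0f}, {0.1f, -2.3f, 4.0f}});
      auto X = torch::tensor({{1.f, 2.f, 1.f}, {2.f, 4.f, 3.f}, {3.f, 4.f, 1.f}, {2.f, 3.f, 2.f}});

      // X * W^T yields {{2.3, -0.5}, {6.6, 3.0}, {2.5, -4.9}, {4.4, 1.3}},
      // i.e. exactly the result_check table used in the tests
      std::cout << X.matmul(W.t()) << std::endl;
    }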