Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-20 02:31:58

0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
0003 
0004 #include <algorithm>
0005 #include <cstddef>
0006 #include <type_traits>
0007 
0008 #include <alpaka/alpaka.hpp>
0009 
0010 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0011 
0012 namespace cms::alpakatools {
0013 
0014   using namespace alpaka_common;
0015 
0016   // If the first argument is not a multiple of the second argument, round it up to the next multiple
0017   inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }
0018 
0019   // Return the integer division of the first argument by the second argument, rounded up to the next integer
0020   inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
0021 
  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
  // The primary template defaults to "true": a backend is assumed to run a single thread per block unless one of the
  // specialisations below marks it otherwise.
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  struct requires_single_thread_per_block : public std::true_type {};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  // The CUDA GPU backend supports multiple threads per block.
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  // The HIP GPU backend supports multiple threads per block.
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
  // The CPU threads backend supports multiple threads per block.
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

  // Variable-template shorthand for requires_single_thread_per_block<TAcc>::value:
  // whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;
0044 
0045   // Create an accelerator-dependent work division for 1-dimensional kernels
0046   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
0047   inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
0048     if constexpr (not requires_single_thread_per_block_v<TAcc>) {
0049       // On GPU backends, each thread is looking at a single element:
0050       //   - the number of threads per block is "elements";
0051       //   - the number of elements per thread is always 1.
0052       return WorkDiv<Dim1D>(blocks, elements, Idx{1});
0053     } else {
0054       // On CPU backends, run serially with a single thread per block:
0055       //   - the number of threads per block is always 1;
0056       //   - the number of elements per thread is "elements".
0057       return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
0058     }
0059   }
0060 
0061   // Create the accelerator-dependent workdiv for N-dimensional kernels
0062   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0063   inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
0064                                                  const Vec<alpaka::Dim<TAcc>>& elements) {
0065     using Dim = alpaka::Dim<TAcc>;
0066     if constexpr (not requires_single_thread_per_block_v<TAcc>) {
0067       // On GPU backends, each thread is looking at a single element:
0068       //   - the number of threads per block is "elements";
0069       //   - the number of elements per thread is always 1.
0070       return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
0071     } else {
0072       // On CPU backends, run serially with a single thread per block:
0073       //   - the number of threads per block is always 1;
0074       //   - the number of elements per thread is "elements".
0075       return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
0076     }
0077   }
0078 
  /* ElementIndex
   *
   * An aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the objects
   * returned by `uniform_group_elements` and similar functions.
   */

  struct ElementIndex {
    Idx global;  // index of the element across the whole problem space
    Idx local;   // index of the element within its group
  };
0089 
0090   namespace detail {
0091 
0092     /* UniformElementsAlong
0093    *
   * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that spans the
   * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension.
0096    * If `first` is not specified, it defaults to 0.
0097    * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
0098    *
0099    * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
0100    * infer the accelerator type from the argument.
0101    *
0102    * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, 0>(acc, ...)`.
0103    *
0104    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
0105    * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
0106    * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
0107    * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
0108    *
0109    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
0110    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
0111    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
0112    * loop over each group's elements, and synchronise only in the outer loop:
0113    *
0114    *  for (auto group : uniform_groups_along<Dim>(acc, extent)) {
0115    *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
0116    *       // first part of the computation
0117    *       // no synchronisations here
0118    *       ...
0119    *    }
0120    *    // wait for all threads to complete the first part
0121    *    alpaka::syncBlockThreads();
0122    *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
0123    *       // second part of the computation
0124    *       // no synchronisations here
0125    *       ...
0126    *    }
0127    *    // wait for all threads to complete the second part
0128    *    alpaka::syncBlockThreads();
0129    *    ...
0130    *  }
0131    *
0132    * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
0133    * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
0134    * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
0135    * explicitly inside the loop:
0136    *
0137    *  for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
0138    *    bool flag = false;
0139    *    if (element < extent) {
0140    *      // do some work and compute a result flag only for the valid elements
0141    *      flag = do_some_work();
0142    *    }
0143    *    // check if any valid element had a positive result
0144    *    if (alpaka::warp::any(acc, flag)) {
0145    *      // ...
0146    *    }
0147    *  }
0148    *
0149    * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
0150    */
0151 
    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformElementsAlong {
    public:
      // Cover the whole kernel grid along `Dim`: the default extent is one full grid stride.
      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{stride_} {}

      // Cover the element indices from 0 (inclusive) to `extent` (exclusive) along `Dim`.
      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      // Cover the element indices from `first` (inclusive) to `extent` (exclusive) along `Dim`.
      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

      class const_iterator {
        friend class UniformElementsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
            : elements_{elements},
              stride_{stride},
              extent_{extent},
              first_{std::min(first, extent)},  // clamp to the extent, so threads past the end start equal to end()
              index_{first_},
              range_{std::min(first + elements, extent)} {}

      public:
        // dereference: the current element index
        ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // increment the index along the elements processed by the current thread
            ++index_;
            if (index_ < range_)
              return *this;
          }

          // increment the thread index with the grid stride
          first_ += stride_;
          index_ = first_;
          range_ = std::min(first_ + elements_, extent_);
          if (index_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          index_ = extent_;
          range_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
          return (index_ == other.index_) and (first_ == other.first_);
        }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx elements_;  // number of elements per thread
        Idx stride_;    // grid stride, in elements
        Idx extent_;    // upper bound (exclusive) of the element indices
        // modified by the pre/post-increment operator
        Idx first_;  // first element of the chunk currently assigned to this thread
        Idx index_;  // current element index, returned by operator*()
        Idx range_;  // end (one past the last) of the chunk currently assigned to this thread
      };

    private:
      const Idx elements_;  // number of elements per thread along `Dim`
      const Idx first_;     // first element index assigned to this thread
      const Idx stride_;    // total number of elements covered by the grid along `Dim` in one stride
      const Idx extent_;    // upper bound (exclusive) of the element indices
    };
0249 
0250   }  // namespace detail
0251 
0252   /* uniform_elements
0253    *
   * `uniform_elements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element indices
   * from `first` (inclusive) to `extent` (exclusive).
0256    * If `first` is not specified, it defaults to 0.
0257    * If `extent` is not specified, it defaults to the kernel grid size.
0258    *
0259    * `uniform_elements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
0260    *
0261    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
0262    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
0263    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
0264    * loop over each group's elements, and synchronise only in the outer loop:
0265    *
0266    *  for (auto group : uniform_groups(acc, extent)) {
0267    *    for (auto element : uniform_group_elements(acc, group, extent)) {
0268    *       // first part of the computation
0269    *       // no synchronisations here
0270    *       ...
0271    *    }
0272    *    // wait for all threads to complete the first part
0273    *    alpaka::syncBlockThreads();
0274    *    for (auto element : uniform_group_elements(acc, group, extent)) {
0275    *       // second part of the computation
0276    *       // no synchronisations here
0277    *       ...
0278    *    }
0279    *    // wait for all threads to complete the second part
0280    *    alpaka::syncBlockThreads();
0281    *    ...
0282    *  }
0283    *
0284    * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
0285    * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
0286    * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
0287    * explicitly inside the loop:
0288    *
0289    *  for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
0290    *    bool flag = false;
0291    *    if (element < extent) {
0292    *      // do some work and compute a result flag only for elements up to extent
0293    *      flag = do_some_work();
0294    *    }
0295    *    // check if any valid element had a positive result
0296    *    if (alpaka::warp::any(acc, flag)) {
0297    *      // ...
0298    *    }
0299    *  }
0300    *
0301    * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
0302    *   - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop;
0303    *   - `uniform_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
0304    *   - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop
0305    *     along the fastest, second-fastest, or third-fastest dimension.
0306    */
0307 
0308   template <typename TAcc,
0309             typename... TArgs,
0310             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
0311   ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) {
0312     return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
0313   }
0314 
0315   /* uniform_elements_along<Dim>
0316    *
0317    * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
0318    * infer the accelerator type from the argument.
0319    */
0320 
0321   template <typename TAcc,
0322             std::size_t Dim,
0323             typename... TArgs,
0324             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
0325   ALPAKA_FN_ACC inline auto uniform_elements_along(TAcc const& acc, TArgs... args) {
0326     return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
0327   }
0328 
0329   /* uniform_elements_x, _y, _z
0330    *
0331    * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
0332    */
0333 
0334   template <typename TAcc,
0335             typename... TArgs,
0336             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
0337   ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) {
0338     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
0339   }
0340 
0341   template <typename TAcc,
0342             typename... TArgs,
0343             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
0344   ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) {
0345     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
0346   }
0347 
0348   template <typename TAcc,
0349             typename... TArgs,
0350             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
0351   ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) {
0352     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
0353   }
0354 
0355   namespace detail {
0356 
0357     /* UniformElementsND
0358    *
   * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
0360    * required to cover the given problem size, indicated by `extent`.
0361    *
0362    * `uniform_elements_nd(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
0363    *
0364    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
0365    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
0366    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
0367    * loop over each group's elements, and synchronise only in the outer loop:
0368    *
0369    *  for (auto group0 : uniform_groups_along<0>(acc, extent[0])) {
0370    *    for (auto group1 : uniform_groups_along<1>(acc, extent[1])) {
0371    *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
0372    *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
0373    *           // first part of the computation
0374    *           // no synchronisations here
0375    *           ...
0376    *        }
0377    *      }
0378    *      // wait for all threads to complete the first part
0379    *      alpaka::syncBlockThreads();
0380    *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
0381    *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
0382    *           // second part of the computation
0383    *           // no synchronisations here
0384    *           ...
0385    *        }
0386    *      }
0387    *      // wait for all threads to complete the second part
0388    *      alpaka::syncBlockThreads();
0389    *      ...
0390    *    }
0391    *  }
0392    *
0393    * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
0394    */
0395 
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    class UniformElementsND {
    public:
      using Dim = alpaka::Dim<TAcc>;
      using Vec = alpaka::Vec<Dim, Idx>;

      // Cover the whole kernel grid: the default extent is one full grid stride in every dimension.
      ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
            thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            extent_{stride_} {}

      // Cover the element indices from zero (inclusive) to `extent` (exclusive) in every dimension.
      ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
            thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            extent_{extent} {}

      // tag used to construct an end iterator
      struct at_end_t {};

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const {
        // check that all dimensions of the current thread index are within the extent
        if ((thread_ < extent_).all()) {
          // construct an iterator pointing to the first element to be processed by the current thread
          return const_iterator{this, thread_};
        } else {
          // construct an end iterator, pointing post the end of the extent
          return const_iterator{this, at_end_t{}};
        }
      }

      ALPAKA_FN_ACC inline const_iterator end() const {
        // construct an end iterator, pointing post the end of the extent
        return const_iterator{this, at_end_t{}};
      }

      class const_iterator {
        friend class UniformElementsND;

      public:
        // dereference: the current N-dimensional element index
        ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC constexpr inline const_iterator operator++() {
          increment();
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) {
          const_iterator old = *this;
          increment();
          return old;
        }

        ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const {
          return (index_ == other.index_);
        }

        ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const {
          return not(*this == other);
        }

      private:
        // construct an iterator pointing to the first element to be processed by the current thread
        ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
            : loop_{loop},
              first_{alpaka::elementwise_min(first, loop->extent_)},
              range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
              index_{first_} {}

        // construct an end iterator, pointing post the end of the extent
        ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
            : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

        // increment dimension I of the per-thread element index;
        // returns true if it overflowed and wrapped back to first_[I]
        template <size_t I>
        ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
          bool overflow = false;
          ++index_[I];
          if (index_[I] >= range_[I]) {
            index_[I] = first_[I];
            overflow = true;
          }
          return overflow;
        }

        // increment the element index starting from the fastest dimension (N-1), carrying into the
        // slower dimensions on overflow; returns true if every dimension overflowed
        template <size_t N>
        ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
          if constexpr (N == 0) {
            // overflow
            return true;
          } else {
            if (not nth_elements_loop<N - 1>()) {
              return false;
            } else {
              return do_elements_loops<N - 1>();
            }
          }
        }

        // advance dimension I of the per-thread first element by one grid stride, resetting
        // index_[I] and range_[I] accordingly; returns true if it moved past the extent and
        // wrapped back to the thread's starting position
        template <size_t I>
        ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
          bool overflow = false;
          first_[I] += loop_->stride_[I];
          if (first_[I] >= loop_->extent_[I]) {
            first_[I] = loop_->thread_[I];
            overflow = true;
          }
          index_[I] = first_[I];
          range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
          return overflow;
        }

        // apply the grid stride starting from the fastest dimension (N-1), carrying into the
        // slower dimensions on overflow; returns true if every dimension overflowed
        template <size_t N>
        ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
          if constexpr (N == 0) {
            // overflow
            return true;
          } else {
            if (not nth_strided_loop<N - 1>()) {
              return false;
            } else {
              return do_strided_loops<N - 1>();
            }
          }
        }

        // increment the iterator
        ALPAKA_FN_ACC inline constexpr void increment() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // linear N-dimensional loops over the elements associated to the thread;
            // do_elements_loops<>() returns true if any of those loops overflows
            if (not do_elements_loops<Dim::value>()) {
              // the elements loops did not overflow, return the next index
              return;
            }
          }

          // strided N-dimensional loop over the threads in the kernel launch grid;
          // do_strided_loops<>() returns true if any of those loops overflows
          if (not do_strided_loops<Dim::value>()) {
            // the strided loops did not overflow, return the next index
            return;
          }

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = loop_->extent_;
          range_ = loop_->extent_;
          index_ = loop_->extent_;
        }

        // const pointer to the UniformElementsND that the iterator refers to
        const UniformElementsND* loop_;

        // modified by the pre/post-increment operator
        Vec first_;  // first element of the chunk currently assigned to this thread
        Vec range_;  // end (one past the last) of the chunk currently assigned to this thread
        Vec index_;  // current element processed by this thread
      };

    private:
      const Vec elements_;  // number of elements per thread, in each dimension
      const Vec thread_;    // first element index assigned to this thread
      const Vec stride_;    // number of elements covered by the whole grid in one stride, per dimension
      const Vec extent_;    // upper bound (exclusive) of the element indices, per dimension
    };
0566 
0567   }  // namespace detail
0568 
0569   /* uniform_elements_nd
0570    *
0571    * `uniform_elements_nd(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
0572    */
0573 
0574   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
0575   ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc) {
0576     return detail::UniformElementsND<TAcc>(acc);
0577   }
0578 
0579   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
0580   ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
0581     return detail::UniformElementsND<TAcc>(acc, extent);
0582   }
0583 
0584   namespace detail {
0585 
0586     /* UniformGroupsAlong
0587    *
   * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group indices
0589    * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
0590    * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size
0591    * along the `Dim` dimension.
0592    *
0593    * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
0594    * the accelerator type from the argument.
0595    *
   * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc, ...)`.
0597    *
0598    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
0599    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
0600    * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for
0601    * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
0602    *
0603    * `uniform_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads in a
0604    * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
0605    * If the work division has more blocks than the required number of groups, the first blocks will perform one
0606    * iteration of the loop, while the other blocks will exit the loop immediately.
0607    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
0609    *
0610    * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
0611    * than the block size. However, also in this case all threads in the block will execute the same number of iterations
0612    * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
0613    * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
0614    * `uniform_group_elements_along<Dim>(acc, group, elements)`.
0615    *
0616    * For example, if the block size is 64 and there are 400 elements
0617    *
0618    *   for (auto group: uniform_groups_along<Dim>(acc, 400)
0619    *
0620    * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
0621    * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
0622    * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
0623    * the inner loop to not process the non-existing elements after 399.
0624    *
0625    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
0626    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
0628    *
0629    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
0632    * group 3.
0633    *
0634    * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
0635    * `uniform_group_elements_along<Dim>`.
0636    */
0637 
    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformGroupsAlong {
    public:
      // Default extent: one group per block, i.e. every block performs exactly one iteration.
      ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{stride_} {}

      // extent is the total number of elements (not blocks)
      ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            // number of groups needed to cover `extent` elements, in units of the block size
            extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])} {}

      class const_iterator;
      using iterator = const_iterator;

      // The first group seen by this block is its own block index; subsequent groups
      // are reached by advancing with a stride equal to the number of blocks.
      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

      class const_iterator {
        friend class UniformGroupsAlong;

        // Clamp the starting group to the extent, so a block past the end compares equal to end().
        ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
            : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

      public:
        // Return the current group index.
        ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          // increment the first-element-in-block index by the grid stride
          first_ += stride_;
          if (first_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;
      };

    private:
      const Idx first_;   // index of this block along dimension Dim
      const Idx stride_;  // number of blocks along dimension Dim
      const Idx extent_;  // number of groups to iterate over
    };
0706 
0707   }  // namespace detail
0708 
0709   /* uniform_groups
0710    *
0711    * `uniform_groups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required to
0712    * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across
0713    * all groups; if not specified, it defaults to the kernel grid size.
0714    *
0715    * `uniform_groups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
0716    *
0717    * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see
0718    * the same loop iterations, while threads in different blocks may see a different number of iterations.
0719    * If the work division has more blocks than the required number of groups, the first blocks will perform one
0720    * iteration of the loop, while the other blocks will exit the loop immediately.
0721    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
0723    *
0724    * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
0725    * than the block size. However, also in this case all threads in the block will execute the same number of iterations
0726    * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
0727    * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
0728    * `uniform_group_elements(acc, group, elements)`.
0729    *
0730    * For example, if the block size is 64 and there are 400 elements
0731    *
   *   for (auto group: uniform_groups(acc, 400))
0733    *
0734    * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
0735    * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
0736    * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
0737    * the inner loop to not process the non-existing elements after 399.
0738    *
0739    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
0740    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
0742    *
0743    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
0746    * group 3.
0747    *
0748    * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
0749    *
0750    * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
0751    *   - `uniform_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
0752    *   - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop
0753    *     along the fastest, second-fastest, or third-fastest dimension.
0754    */
0755 
0756   template <typename TAcc,
0757             typename... TArgs,
0758             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
0759   ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
0760     return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
0761   }
0762 
0763   /* uniform_groups_along<Dim>
0764    *
0765    * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
0766    * the accelerator type from the argument.
0767    */
0768 
0769   template <typename TAcc,
0770             std::size_t Dim,
0771             typename... TArgs,
0772             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
0773   ALPAKA_FN_ACC inline auto uniform_groups_along(TAcc const& acc, TArgs... args) {
0774     return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
0775   }
0776 
0777   /* uniform_groups_x, _y, _z
0778    *
0779    * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
0780    */
0781 
0782   template <typename TAcc,
0783             typename... TArgs,
0784             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
0785   ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
0786     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
0787   }
0788 
0789   template <typename TAcc,
0790             typename... TArgs,
0791             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
0792   ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
0793     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
0794   }
0795 
0796   template <typename TAcc,
0797             typename... TArgs,
0798             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
0799   ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
0800     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
0801   }
0802 
0803   namespace detail {
0804 
0805     /* UniformGroupElementsAlong
0806    *
0807    * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iteratable range that spans
0808    * all the elements within the given `group` along dimension `Dim`, as obtained from `UniformGroupsAlong<Dim>`, up to
0809    * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it
0810    * defaults to the kernel grid size.
0811    *
0812    * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
0813    * that can infer the accelerator type from the argument.
0814    *
0815    * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
0816    * `UniformGroupElementsAlong<0>(acc, ...)`.
0817    *
0818    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
0819    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
0820    * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
0821    * shorthands for `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
0822    *
0823    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
0824    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
0825    * local index spans the range from 0 to the block size (excluded).
0826    *
0827    * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
0828    * global element index reaches `elements`.
0829    *
0830    * If the problem size is not a multiple of the block size, different threads may execute a different number of
0831    * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
0832    * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
0833    * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
0834    * `uniform_group_elements_along<Dim>`.
0835    *
0836    * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
0837    * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
0838    * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
0839    * index explicitly inside the loop:
0840    *
0841    *  for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
0842    *    bool flag = false;
   *    if (element.global < elements) {
0844    *      // do some work and compute a result flag only for the valid elements
0845    *      flag = do_some_work();
0846    *    }
0847    *    // check if any valid element had a positive result
0848    *    if (alpaka::warp::any(acc, flag)) {
0849    *      // ...
0850    *    }
0851    *  }
0852    *
0853    * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
0854    */
0855 
    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformGroupElementsAlong {
    public:
      // Span all the elements of the given group, with no upper bound on the global element index.
      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
            local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                   alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}

      // Span the elements of the given group, clamping the global element index to `extent` (exclusive).
      // NOTE(review): assumes `block` is a valid group index, so that first_ <= extent — confirm at call sites.
      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
            local_{std::min(extent - first_,
                            alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                                alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
            range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}

      class const_iterator;
      using iterator = const_iterator;

      // Iterate from this thread's first in-block element up to its per-thread range.
      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }

      class const_iterator {
        friend class UniformGroupElementsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
            : index_{local}, first_{first}, range_{range} {}

      public:
        // Return the global and in-block (local) indices of the current element.
        ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // increment the index along the elements processed by the current thread
            ++index_;
            if (index_ < range_)
              return *this;
          }
          // on backends where the trait is false (see the specialisations at the top of this file)
          // each thread handles a single element, so the first increment always reaches the end

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          index_ = range_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // modified by the pre/post-increment operator
        Idx index_;
        // non-const to support iterator copy and assignment
        Idx first_;
        Idx range_;
      };

    private:
      const Idx first_;  // global index of the first element of this group
      const Idx local_;  // in-block index of the first element handled by this thread
      const Idx range_;  // in-block index one past the last element handled by this thread
    };
0928 
0929   }  // namespace detail
0930 
0931   /* uniform_group_elements
0932    *
0933    * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iteratable range that spans all the
0934    * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
0935    * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
0936    *
0937    * `uniform_group_elements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
0938    *
0939    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
0940    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
0941    * local index spans the range from 0 to the block size (excluded).
0942    *
0943    * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
0944    * global element index reaches `elements`.
0945    *
0946    * If the problem size is not a multiple of the block size, different threads may execute a different number of
0947    * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
0948    * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
0949    * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
0950    *
0951    * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
0952    * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
0953    * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
0954    * index explicitly inside the loop:
0955    *
0956    *  for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
0957    *    bool flag = false;
   *    if (element.global < elements) {
0959    *      // do some work and compute a result flag only for the valid elements
0960    *      flag = do_some_work();
0961    *    }
0962    *    // check if any valid element had a positive result
0963    *    if (alpaka::warp::any(acc, flag)) {
0964    *      // ...
0965    *    }
0966    *  }
0967    *
0968    * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
0969    * kernels, use
0970    *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
0971    *   - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
0972    *     `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
0973    */
0974 
0975   template <typename TAcc,
0976             typename... TArgs,
0977             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
0978   ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
0979     return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
0980   }
0981 
0982   /* uniform_group_elements_along<Dim>
0983    *
0984    * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
0985    * that can infer the accelerator type from the argument.
0986    */
0987 
0988   template <typename TAcc,
0989             std::size_t Dim,
0990             typename... TArgs,
0991             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
0992   ALPAKA_FN_ACC inline auto uniform_group_elements_along(TAcc const& acc, TArgs... args) {
0993     return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
0994   }
0995 
0996   /* uniform_group_elements_x, _y, _z
0997    *
0998    * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
0999    * dimensions.
1000    */
1001 
1002   template <typename TAcc,
1003             typename... TArgs,
1004             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1005   ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
1006     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1007   }
1008 
1009   template <typename TAcc,
1010             typename... TArgs,
1011             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1012   ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
1013     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1014   }
1015 
1016   template <typename TAcc,
1017             typename... TArgs,
1018             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1019   ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
1020     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1021   }
1022 
1023   namespace detail {
1024 
1025     /* IndependentGroupsAlong
1026    *
1027    * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the group
1028    * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
1029    * specified, it defaults to the number of blocks along the `Dim` dimension.
1030    *
1031    * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
1032    * infer the accelerator type from the argument.
1033    *
1034    * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
1035    * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
1036    *
1037    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
1038    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
1039    * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
1040    * for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
1041    *
1042    * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
1043    * in a block see the same loop iterations, while threads in different blocks may see a different number of iterations.
1044    * If the work division has more blocks than the required number of groups, the first blocks will perform one
1045    * iteration of the loop, while the other blocks will exit the loop immediately.
1046    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
1048    *
1049    * For example,
1050    *
1051    *   for (auto group: independent_groups_along<Dim>(acc, 7))
1052    *
1053    * will return the group range from 0 to 6, distributed across all blocks in the work division.
1054    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
1055    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
1057    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
1060    * group 3.
1061    */
1062 
    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class IndependentGroupsAlong {
    public:
      // Default extent: one group per block, i.e. every block performs exactly one iteration.
      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{stride_} {}

      // `groups` is the number of independent groups to iterate over, used directly as the extent.
      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{groups} {}

      class const_iterator;
      using iterator = const_iterator;

      // The first group seen by this block is its own block index; subsequent groups
      // are reached by advancing with a stride equal to the number of blocks.
      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

      class const_iterator {
        friend class IndependentGroupsAlong;

        // Clamp the starting group to the extent, so a block past the end compares equal to end().
        ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
            : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

      public:
        // Return the current group index.
        ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          // increment the first-element-in-block index by the grid stride
          first_ += stride_;
          if (first_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;
      };

    private:
      const Idx first_;   // index of this block along dimension Dim
      const Idx stride_;  // number of blocks along dimension Dim
      const Idx extent_;  // number of groups to iterate over
    };
1130 
1131   }  // namespace detail
1132 
1133   /* independent_groups
1134    *
1135    * `independent_groups(acc, groups)` returns a one-dimensional iteratable range than spans the group indices from 0 to
1136    * `groups`. If `groups` is not specified, it defaults to the number of blocks.
1137    *
1138    * `independent_groups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
1139    *
1140    * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
1141    * see the same loop iterations, while threads in different blocks may see a different number of iterations.
1142    * If the work division has more blocks than the required number of groups, the first blocks will perform one
1143    * iteration of the loop, while the other blocks will exit the loop immediately.
1144    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
1146    *
1147    * For example,
1148    *
1149    *   for (auto group: independent_groups(acc, 7))
1150    *
1151    * will return the group range from 0 to 6, distributed across all blocks in the work division.
1152    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
1153    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
1155    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
1158    * group 3.
1159    *
1160    * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
1161    * use
1162    *   - `independent_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
1163    *   - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to loop
1164    *     along the fastest, second-fastest, or third-fastest dimension.
1165    */
1166 
1167   template <typename TAcc,
1168             typename... TArgs,
1169             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
1170   ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
1171     return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
1172   }
1173 
1174   /* independent_groups_along<Dim>
1175    *
1176    * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
1177    * infer the accelerator type from the argument.
1178    */
1179 
1180   template <typename TAcc,
1181             std::size_t Dim,
1182             typename... TArgs,
1183             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1184   ALPAKA_FN_ACC inline auto independent_groups_along(TAcc const& acc, TArgs... args) {
1185     return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
1186   }
1187 
1188   /* independent_groups_x, _y, _z
1189    *
1190    * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1191    * dimensions.
1192    */
1193 
1194   template <typename TAcc,
1195             typename... TArgs,
1196             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1197   ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
1198     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1199   }
1200 
1201   template <typename TAcc,
1202             typename... TArgs,
1203             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1204   ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
1205     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1206   }
1207 
1208   template <typename TAcc,
1209             typename... TArgs,
1210             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1211   ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
1212     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1213   }
1214 
  namespace detail {

    /* IndependentGroupElementsAlong
     *
     * Range object that lets each group iterate over its own elements along dimension `Dim`,
     * independently of the other groups. It is normally constructed through the
     * `independent_group_elements[_along|_x|_y|_z]` helper functions, which infer the
     * accelerator type from the argument.
     *
     * Each thread starts at its own position within the block (scaled by the number of
     * per-thread elements) and advances with a stride equal to the whole block's span
     * (threads per block times elements per thread), until `extent` is reached.
     */

    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class IndependentGroupElementsAlong {
    public:
      // Cover exactly one block-wide pass: the extent defaults to the stride,
      // so every thread visits only its own per-thread elements once.
      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{stride_} {}

      // Iterate over `extent` elements, striding by the block span until the extent is covered.
      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      // Iterate over the half-open index range starting at `first`, up to `extent`.
      // Note: `extent` is an absolute end index, not a count relative to `first`.
      ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
            stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }

      // The end iterator has first_ == index_ == extent_, matching the clamped state
      // that operator++ produces once the extent is exhausted.
      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

      class const_iterator {
        friend class IndependentGroupElementsAlong;

        // `first` is clamped to `extent` so an out-of-range starting thread
        // immediately compares equal to end().
        ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
            : elements_{elements},
              stride_{stride},
              extent_{extent},
              first_{std::min(first, extent)},
              index_{first_},
              range_{std::min(first + elements, extent)} {}

      public:
        ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // increment the index along the elements processed by the current thread
            ++index_;
            if (index_ < range_)
              return *this;
          }

          // increment the thread index with the block stride
          first_ += stride_;
          index_ = first_;
          range_ = std::min(first_ + elements_, extent_);
          if (index_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          index_ = extent_;
          range_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        // Comparing index_ and first_ is sufficient: range_ is derived from them.
        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
          return (index_ == other.index_) and (first_ == other.first_);
        }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx elements_;
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;   // first element of the current per-thread chunk
        Idx index_;   // element currently pointed to
        Idx range_;   // one past the last element of the current per-thread chunk
      };

    private:
      const Idx elements_;  // elements processed by each thread along Dim
      const Idx thread_;    // starting index for the current thread
      const Idx stride_;    // block-wide stride: threads per block times elements per thread
      const Idx extent_;    // end of the iteration range (absolute index)
    };

  }  // namespace detail
1322 
  /* independent_group_elements
   *
   * `independent_group_elements(acc, ...)` returns the per-group element range of a
   * one-dimensional kernel, equivalent to `independent_group_elements_along<0>(acc, ...)`.
   */
1325 
1326   template <typename TAcc,
1327             typename... TArgs,
1328             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
1329   ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
1330     return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
1331   }
1332 
1333   /* independent_group_elements_along<Dim>
1334    *
1335    * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
1336    * `detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
1337    */
1338 
1339   template <typename TAcc,
1340             std::size_t Dim,
1341             typename... TArgs,
1342             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1343   ALPAKA_FN_ACC inline auto independent_group_elements_along(TAcc const& acc, TArgs... args) {
1344     return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
1345   }
1346 
1347   /* independent_group_elements_x, _y, _z
1348    *
1349    * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1350    * dimensions.
1351    */
1352 
1353   template <typename TAcc,
1354             typename... TArgs,
1355             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1356   ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
1357     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1358   }
1359 
1360   template <typename TAcc,
1361             typename... TArgs,
1362             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1363   ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
1364     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1365   }
1366 
1367   template <typename TAcc,
1368             typename... TArgs,
1369             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1370   ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
1371     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1372   }
1373 
1374   /* once_per_grid
1375    *
1376    * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
1377    *
1378    * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
1379    */
1380 
1381   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1382   ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
1383     return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1384   }
1385 
1386   /* once_per_block
1387    *
1388    * `once_per_block(acc)` returns true for a single thread within the block.
1389    *
1390    * Usually the condition is true for thread 0, but this index should not be relied upon.
1391    */
1392 
1393   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1394   ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
1395     return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1396   }
1397 
1398 }  // namespace cms::alpakatools
1399 
1400 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h