#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"

namespace cms::alpakatools {

  using namespace alpaka_common;

  // If the first argument is not a multiple of the second argument, round it up to the next multiple
  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }

  // Return the integer division of the first argument by the second argument, rounded up to the next integer
  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
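
  // For example, with 400 elements and a block size of 64, divide_up_by(400, 64) == 7
  // gives the number of blocks needed to cover all the elements, while
  // round_up_by(400, 64) == 448 pads the extent to a whole number of blocks.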

  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  struct requires_single_thread_per_block : public std::true_type {};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

  // Create an accelerator-dependent work division for 1-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
    }
  }
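
  /* For example, a minimal sketch of launching a 1-dimensional kernel over `size` elements
   * with up to 256 elements per block; `queue`, `size`, `data` and `Kernel` are hypothetical,
   * and `Acc1D` is assumed to be the 1-dimensional accelerator type from the configuration:
   *
   *   const Idx elementsPerBlock = 256;
   *   const Idx blocks = divide_up_by(size, elementsPerBlock);
   *   auto workDiv = make_workdiv<Acc1D>(blocks, elementsPerBlock);
   *   alpaka::exec<Acc1D>(queue, workDiv, Kernel{}, data, size);
   */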

  // Create the accelerator-dependent workdiv for N-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
    using Dim = alpaka::Dim<TAcc>;
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
    }
  }
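
  /* For example, a sketch of launching a 2-dimensional kernel over `rows` x `cols` elements
   * with blocks of 32x32 elements; `queue`, `rows`, `cols`, `data` and `Kernel2D` are
   * hypothetical, and `Acc2D` and `Vec2D` are assumed from the configuration:
   *
   *   const auto elementsPerBlock = Vec2D{32, 32};
   *   const auto blocks = Vec2D{divide_up_by(rows, 32), divide_up_by(cols, 32)};
   *   auto workDiv = make_workdiv<Acc2D>(blocks, elementsPerBlock);
   *   alpaka::exec<Acc2D>(queue, workDiv, Kernel2D{}, data, rows, cols);
   */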

  /* ElementIndex
   *
   * an aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the objects
   * returned by `elements_in_block` and similar functions.
   */

  struct ElementIndex {
    Idx global;
    Idx local;
  };
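
  /* For example, in the loop below `idx.global` indexes the full problem space while
   * `idx.local` indexes the elements within the current block; `buffer`, `data`, `group`
   * and `size` are hypothetical:
   *
   *   for (auto idx : uniform_group_elements(acc, group, size)) {
   *     buffer[idx.local] = data[idx.global];
   *   }
   */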

  /* uniform_elements_along
   *
   * `uniform_elements_along<Dim>(acc [, first], extent)` returns a one-dimensional iterable range that spans the
   * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension.
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop), followed
   * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
   * `uniform_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *  for (auto group : uniform_groups_along<Dim>(acc, extent)) {
   *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
   *       // first part of the computation
   *       // no synchronisations here
   *       ...
   *    }
   *    // wait for all threads to complete the first part
   *    alpaka::syncBlockThreads();
   *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
   *       // second part of the computation
   *       // no synchronisations here
   *       ...
   *    }
   *    // wait for all threads to complete the second part
   *    alpaka::syncBlockThreads();
   *    ...
   *  }
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
   * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
   * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
   * explicitly inside the loop:
   *
   *  for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
   *    bool flag = false;
   *    if (element < extent) {
   *      // do some work and compute a result flag only for the valid elements
   *      flag = do_some_work();
   *    }
   *    // check if any valid element had a positive result
   *    if (alpaka::warp::any(acc, flag)) {
   *      // ...
   *    }
   *  }
   *
   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_elements_along {
  public:
    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx first, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

    class const_iterator {
      friend class uniform_elements_along;

      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
          : elements_{elements},
            stride_{stride},
            extent_{extent},
            first_{std::min(first, extent)},
            index_{first_},
            range_{std::min(first + elements, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // increment the index along the elements processed by the current thread
          ++index_;
          if (index_ < range_)
            return *this;
        }

        // increment the thread index with the grid stride
        first_ += stride_;
        index_ = first_;
        range_ = std::min(first_ + elements_, extent_);
        if (index_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        index_ = extent_;
        range_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
        return (index_ == other.index_) and (first_ == other.first_);
      }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx elements_;
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
      Idx index_;
      Idx range_;
    };

  private:
    const Idx elements_;
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };

  /* uniform_elements
   *
   * `uniform_elements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element indices
   * from `first` (inclusive) to `extent` (exclusive).
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size.
   *
   * `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *  for (auto group : uniform_groups(acc, extent)) {
   *    for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // first part of the computation
   *       // no synchronisations here
   *       ...
   *    }
   *    // wait for all threads to complete the first part
   *    alpaka::syncBlockThreads();
   *    for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // second part of the computation
   *       // no synchronisations here
   *       ...
   *    }
   *    // wait for all threads to complete the second part
   *    alpaka::syncBlockThreads();
   *    ...
   *  }
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
   * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
   * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
   * explicitly inside the loop:
   *
   *  for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
   *    bool flag = false;
   *    if (element < extent) {
   *      // do some work and compute a result flag only for elements up to extent
   *      flag = do_some_work();
   *    }
   *    // check if any valid element had a positive result
   *    if (alpaka::warp::any(acc, flag)) {
   *      // ...
   *    }
   *  }
   *
   * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
   *   - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop;
   *   - `uniform_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
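
  /* For example, a minimal kernel sketch using `uniform_elements`; the functor, the buffer
   * and the scale factor are hypothetical:
   *
   *   struct ScaleKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, float factor, Idx size) const {
   *       for (Idx i : uniform_elements(acc, size)) {
   *         data[i] *= factor;
   *       }
   *     }
   *   };
   */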

  /* uniform_elements_x, _y, _z
   *
   * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }
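
  /* For example, a sketch of a 2-dimensional kernel that loops over the rows with
   * `uniform_elements_y` (the slower dimension) and over the columns with
   * `uniform_elements_x` (the faster dimension); the functor and its arguments are
   * hypothetical:
   *
   *   struct TransposeKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Idx rows, Idx cols) const {
   *       for (Idx row : uniform_elements_y(acc, rows)) {
   *         for (Idx col : uniform_elements_x(acc, cols)) {
   *           out[col * rows + row] = in[row * cols + col];
   *         }
   *       }
   *     }
   *   };
   */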

  /* elements_with_stride
   *
   * `elements_with_stride(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
   * indices from `first` (inclusive) to `extent` (exclusive).
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size.
   *
   * `elements_with_stride(acc, ...)` is a legacy name for `uniform_elements(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto elements_with_stride(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_elements_nd
   *
   * `uniform_elements_nd(acc, extent)` returns an N-dimensional iterable range that spans the element indices
   * required to cover the given problem size, indicated by `extent`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *  for (auto group0 : uniform_groups_along<0>(acc, extent[0])) {
   *    for (auto group1 : uniform_groups_along<1>(acc, extent[1])) {
   *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
   *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
   *           // first part of the computation
   *           // no synchronisations here
   *           ...
   *        }
   *      }
   *      // wait for all threads to complete the first part
   *      alpaka::syncBlockThreads();
   *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
   *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
   *           // second part of the computation
   *           // no synchronisations here
   *           ...
   *        }
   *      }
   *      // wait for all threads to complete the second part
   *      alpaka::syncBlockThreads();
   *      ...
   *    }
   *  }
   *
   * For more details, see `uniform_elements_along<Dim>(acc, ...)`.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  class uniform_elements_nd {
  public:
    using Dim = alpaka::Dim<TAcc>;
    using Vec = alpaka::Vec<Dim, Idx>;

    ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc, Vec extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{extent} {}

    // tag used to construct an end iterator
    struct at_end_t {};

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const {
      // check that all dimensions of the current thread index are within the extent
      if ((thread_ < extent_).all()) {
        // construct an iterator pointing to the first element to be processed by the current thread
        return const_iterator{this, thread_};
      } else {
        // construct an end iterator, pointing past the end of the extent
        return const_iterator{this, at_end_t{}};
      }
    }

    ALPAKA_FN_ACC inline const_iterator end() const {
      // construct an end iterator, pointing past the end of the extent
      return const_iterator{this, at_end_t{}};
    }

    class const_iterator {
      friend class uniform_elements_nd;

    public:
      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC constexpr inline const_iterator operator++() {
        increment();
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) {
        const_iterator old = *this;
        increment();
        return old;
      }

      ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const {
        return (index_ == other.index_);
      }

      ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // construct an iterator pointing to the first element to be processed by the current thread
      ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, Vec first)
          : loop_{loop},
            first_{alpaka::elementwise_min(first, loop->extent_)},
            range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
            index_{first_} {}

      // construct an end iterator, pointing past the end of the extent
      ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, at_end_t const&)
          : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

      template <size_t I>
      ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
        bool overflow = false;
        ++index_[I];
        if (index_[I] >= range_[I]) {
          index_[I] = first_[I];
          overflow = true;
        }
        return overflow;
      }

      template <size_t N>
      ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
        if constexpr (N == 0) {
          // overflow
          return true;
        } else {
          if (not nth_elements_loop<N - 1>()) {
            return false;
          } else {
            return do_elements_loops<N - 1>();
          }
        }
      }

      template <size_t I>
      ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
        bool overflow = false;
        first_[I] += loop_->stride_[I];
        if (first_[I] >= loop_->extent_[I]) {
          first_[I] = loop_->thread_[I];
          overflow = true;
        }
        index_[I] = first_[I];
        range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
        return overflow;
      }

      template <size_t N>
      ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
        if constexpr (N == 0) {
          // overflow
          return true;
        } else {
          if (not nth_strided_loop<N - 1>()) {
            return false;
          } else {
            return do_strided_loops<N - 1>();
          }
        }
      }

      // increment the iterator
      ALPAKA_FN_ACC inline constexpr void increment() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // linear N-dimensional loops over the elements associated to the thread;
          // do_elements_loops<>() returns true if any of those loops overflows
          if (not do_elements_loops<Dim::value>()) {
            // the elements loops did not overflow, return the next index
            return;
          }
        }

        // strided N-dimensional loop over the threads in the kernel launch grid;
        // do_strided_loops<>() returns true if any of those loops overflows
        if (not do_strided_loops<Dim::value>()) {
          // the strided loops did not overflow, return the next index
          return;
        }

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = loop_->extent_;
        range_ = loop_->extent_;
        index_ = loop_->extent_;
      }

      // const pointer to the uniform_elements_nd that the iterator refers to
      const uniform_elements_nd* loop_;

      // modified by the pre/post-increment operator
      Vec first_;  // first element processed by this thread
      Vec range_;  // end of the range processed by this thread (exclusive)
      Vec index_;  // current element processed by this thread
    };

  private:
    const Vec elements_;
    const Vec thread_;
    const Vec stride_;
    const Vec extent_;
  };
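
  /* For example, a sketch of filling a 2-dimensional area with a single N-dimensional loop;
   * the functor and its arguments are hypothetical, and `Vec2D` is assumed from the
   * configuration:
   *
   *   struct FillKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* out, Vec2D extent, float value) const {
   *       for (auto ndindex : uniform_elements_nd(acc, extent)) {
   *         out[ndindex[0] * extent[1] + ndindex[1]] = value;
   *       }
   *     }
   *   };
   */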

  /* elements_with_stride_nd
   *
   * `elements_with_stride_nd(acc, extent)` returns an N-dimensional iterable range that spans the element indices
   * required to cover the given problem size, indicated by `extent`.
   *
   * `elements_with_stride_nd(acc, ...)` is a legacy name for `uniform_elements_nd(acc, ...)`.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc) {
    return uniform_elements_nd<TAcc>(acc);
  }

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
    return uniform_elements_nd<TAcc>(acc, extent);
  }

  /* uniform_groups_along
   *
   * `uniform_groups_along<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group indices
   * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
   * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size
   * along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for
   * `uniform_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * `uniform_groups_along<Dim>` should be called consistently by all the threads in a block. All threads in a block see
   * the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
   *
   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
   * than the block size. However, also in this case all threads in the block will execute the same number of iterations
   * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
   * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
   * `uniform_group_elements_along<Dim>(acc, group, elements)`.
   *
   * For example, if the block size is 64 and there are 400 elements
   *
   *   for (auto group : uniform_groups_along<Dim>(acc, 400))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
   * the inner loop to not process the non-existing elements after 399.
   *
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   *
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
   * group 3.
   *
   * See `uniform_elements_along<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
   * `uniform_group_elements_along<Dim>`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_groups_along {
  public:
    ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{stride_} {}

    // extent is the total number of elements (not blocks)
    ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc, Idx extent)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

    class const_iterator {
      friend class uniform_groups_along;

      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        // increment the group index by the grid stride
        first_ += stride_;
        if (first_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
    };

  private:
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };

  /* uniform_groups
   *
   * `uniform_groups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required to
   * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across
   * all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`.
   *
   * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see
   * the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
   *
   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
   * than the block size. However, also in this case all threads in the block will execute the same number of iterations
   * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
   * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
   * `uniform_group_elements(acc, group, elements)`.
   *
   * For example, if the block size is 64 and there are 400 elements
   *
   *   for (auto group : uniform_groups(acc, 400))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
   * the inner loop to not process the non-existing elements after 399.
   *
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   *
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
   * group 3.
   *
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
   *   - `uniform_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
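
  /* For example, a sketch of staging each group's elements into shared memory before a
   * per-block computation; the functor, the buffers and the fixed block size of 64 are
   * hypothetical:
   *
   *   struct BlockSumKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* sums, Idx size) const {
   *       auto& buffer = alpaka::declareSharedVar<float[64], __COUNTER__>(acc);
   *       for (auto group : uniform_groups(acc, size)) {
   *         for (auto idx : uniform_group_elements(acc, group, size)) {
   *           buffer[idx.local] = in[idx.global];
   *         }
   *         // wait for all threads to finish filling the shared memory buffer
   *         alpaka::syncBlockThreads(acc);
   *         // hypothetical block-level reduction of `buffer` into sums[group]
   *         ...
   *         alpaka::syncBlockThreads(acc);
   *       }
   *     }
   *   };
   */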

  /* uniform_groups_x, _y, _z
   *
   * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  /* blocks_with_stride
   *
   * `blocks_with_stride(acc, elements)` returns a one-dimensional iterable range that spans the group indices
   * required to cover the given problem size, in units of the block size. `elements` indicates the total number of
   * elements, across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `blocks_with_stride(acc, ...)` is a legacy name for `uniform_groups(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto blocks_with_stride(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_group_elements_along
   *
   * `uniform_group_elements_along<Dim>(acc, group, elements)` returns a one-dimensional iterable range that spans all
   * the elements within the given `group` along dimension `Dim`, as obtained from `uniform_groups_along<Dim>`, up to
   * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it
   * defaults to the kernel grid size.
   *
   * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
   * `uniform_group_elements_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
   * shorthands for `uniform_group_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local` indices of
   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
   * local index spans the range from 0 to the block size (excluded).
   *
   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
   * global element index reaches `elements`.
   *
   * If the problem size is not a multiple of the block size, different threads may execute a different number of
   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
   * See `uniform_elements_along<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
   * `uniform_group_elements_along<Dim>`.
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *  for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
   *    bool flag = false;
   *    if (element.global < elements) {
   *      // do some work and compute a result flag only for the valid elements
   *      flag = do_some_work();
   *    }
   *    // check if any valid element had a positive result
   *    if (alpaka::warp::any(acc, flag)) {
   *      // ...
   *    }
   *  }
   *
   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_group_elements_along {
  public:
    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block)
        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}

    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block, Idx extent)
        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
          local_{std::min(extent - first_,
                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }

    class const_iterator {
      friend class uniform_group_elements_along;

      ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
          : index_{local}, first_{first}, range_{range} {}

    public:
      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // increment the index along the elements processed by the current thread
          ++index_;
          if (index_ < range_)
            return *this;
        }

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        index_ = range_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // modified by the pre/post-increment operator
      Idx index_;
      // non-const to support iterator copy and assignment
      Idx first_;
      Idx range_;
    };

  private:
    const Idx first_;
    const Idx local_;
    const Idx range_;
  };

  /* uniform_group_elements
   *
   * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
   * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
   * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_group_elements(acc, ...)` is a shorthand for `uniform_group_elements_along<0>(acc, ...)`.
   *
   * Iterating over the range yields values of type `ElementIndex`, which provide the `.global` and `.local` indices of
   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
   * local index spans the range from 0 to the block size (excluded).
   *
   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
   * global element index reaches `elements`.
   *
   * If the problem size is not a multiple of the block size, different threads may execute a different number of
   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *  for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
   *    bool flag = false;
   *    if (element.global < elements) {
   *      // do some work and compute a result flag only for the valid elements
   *      flag = do_some_work();
   *    }
   *    // check if any valid element had a positive result
   *    if (alpaka::warp::any(acc, flag)) {
   *      // ...
   *    }
   *  }
   *
   * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
   * kernels, use
   *   - `uniform_group_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
   *     `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_group_elements_x, _y, _z
   *
   * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }
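
  /* For example, a sketch of a 2-dimensional tiled copy that pairs the `_y`/`_x` group
   * loops with the corresponding group-element loops; the buffers, `rows` and `cols` are
   * hypothetical:
   *
   *   for (auto gy : uniform_groups_y(acc, rows)) {
   *     for (auto gx : uniform_groups_x(acc, cols)) {
   *       for (auto ey : uniform_group_elements_y(acc, gy, rows)) {
   *         for (auto ex : uniform_group_elements_x(acc, gx, cols)) {
   *           out[ey.global * cols + ex.global] = in[ey.global * cols + ex.global];
   *         }
   *       }
   *     }
   *   }
   */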

  /* elements_in_block
   *
   * `elements_in_block(acc, group, elements)` returns a one-dimensional iterable range that spans all the elements
   * within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements` indicates the
   * total number of elements across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `elements_in_block(acc, ...)` is a legacy name for `uniform_group_elements(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto elements_in_block(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* independent_groups_along
   *
   * `independent_groups_along<Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
   * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
   * specified, it defaults to the number of blocks along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
   * `independent_groups_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases most slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
   * for `independent_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * `independent_groups_along<Dim>` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more than
   * one iteration, in order to cover the whole problem space.
   *
   * For example,
   *
   *   for (auto group : independent_groups_along<Dim>(acc, 7))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division.
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the loop,
   * in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will process the
   * groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3 will process
   * group 3.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class independent_groups_along {
  public:
    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc, Idx groups)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{groups} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

    class const_iterator {
      friend class independent_groups_along;

      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        // increment the group index by the grid stride
        first_ += stride_;
        if (first_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
    };

  private:
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };
1105 
  /* independent_groups
   *
   * `independent_groups(acc, groups)` returns a one-dimensional iterable range that spans the group indices from 0 to
   * `groups`. If `groups` is not specified, it defaults to the number of blocks.
   *
   * `independent_groups(acc, ...)` is a shorthand for `independent_groups_along<0>(acc, ...)`.
   *
   * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * For example,
   *
   *   for (auto group: independent_groups(acc, 7))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division.
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example, if the work division has 8 blocks, the blocks from 0 to 6
   * will process one group, while block 7 will not process any.
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example, if the work division has 4 blocks, block 0 will
   * process groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   *
   * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
   * use
   *   - `independent_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
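
  /* A usage sketch for a one-dimensional kernel, illustrative only. The kernel `FillKernel`, the
   * `flags` buffer, the `queue` object and the `Acc1D` accelerator type are hypothetical or assumed to
   * be in scope; `make_workdiv` is the helper defined earlier in this file.
   *
   *   struct FillKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, int* flags, Idx groups) const {
   *       for (Idx group : independent_groups(acc, groups)) {
   *         // ... all the threads in one block cooperate on one group ...
   *       }
   *     }
   *   };
   *
   *   // on the host: launch 4 blocks of 64 threads (or elements) each
   *   auto workDiv = make_workdiv<Acc1D>(4, 64);
   *   alpaka::exec<Acc1D>(queue, workDiv, FillKernel{}, flags, groups);
   */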

  /* independent_groups_x, _y, _z
   *
   * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }
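
  /* In a two-dimensional kernel the groups along each dimension can be iterated independently.
   * An illustrative sketch, with a hypothetical kernel and arguments:
   *
   *   struct TileKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, Idx rows, Idx cols) const {
   *       for (Idx row : independent_groups_y(acc, rows)) {
   *         for (Idx col : independent_groups_x(acc, cols)) {
   *           // ... the whole block works on the tile (row, col) ...
   *         }
   *       }
   *     }
   *   };
   */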

  /* independent_group_elements_along
   *
   * `independent_group_elements_along<Dim>(acc, ...)` returns a one-dimensional iterable range that spans the
   * elements within the current group, along the `Dim` dimension, distributed over the threads of the current block:
   *   - `independent_group_elements_along<Dim>(acc)` spans the elements assigned to the block by the work division;
   *   - `independent_group_elements_along<Dim>(acc, extent)` spans the elements from 0 to `extent`;
   *   - `independent_group_elements_along<Dim>(acc, first, extent)` spans the elements from `first` to `extent`.
   *
   * `independent_group_elements_along<Dim>(acc, ...)` should be called consistently by all the threads in a block:
   * all threads in a block see the same extent, while each thread visits its own subset of the elements.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class independent_group_elements_along {
  public:
    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx first, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
          stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

    class const_iterator {
      friend class independent_group_elements_along;

      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
          : elements_{elements},
            stride_{stride},
            extent_{extent},
            first_{std::min(first, extent)},
            index_{first_},
            range_{std::min(first + elements, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // increment the index along the elements processed by the current thread
          ++index_;
          if (index_ < range_)
            return *this;
        }

        // increment the thread index with the block stride
        first_ += stride_;
        index_ = first_;
        range_ = std::min(first_ + elements_, extent_);
        if (index_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        index_ = extent_;
        range_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
        return (index_ == other.index_) and (first_ == other.first_);
      }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx elements_;
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
      Idx index_;
      Idx range_;
    };

  private:
    const Idx elements_;
    const Idx thread_;
    const Idx stride_;
    const Idx extent_;
  };
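
  /* An illustrative sketch of the three-argument form, with a hypothetical kernel and arguments:
   * skip the first element of each group (e.g. a per-group header) and let the threads of the block
   * share the remaining elements.
   *
   *   struct SkipHeaderKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Idx size) const {
   *       // iterate over the elements from 1 to "size", distributed over the threads of the block
   *       for (Idx i : independent_group_elements_along<TAcc, 0>(acc, 1, size)) {
   *         out[i] = in[i];
   *       }
   *     }
   *   };
   */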

  /* independent_group_elements
   *
   * `independent_group_elements(acc, ...)` is a shorthand for `independent_group_elements_along<0>(acc, ...)`, and is
   * only suitable for one-dimensional kernels.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
    return independent_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
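
  /* A typical pattern combines `independent_groups` and `independent_group_elements`: each block
   * loops over whole groups, and the threads of the block share the elements within each group.
   * An illustrative sketch, with a hypothetical kernel, arguments and data layout:
   *
   *   struct GroupedKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, Idx groups, Idx groupSize) const {
   *       for (Idx group : independent_groups(acc, groups)) {
   *         for (Idx i : independent_group_elements(acc, groupSize)) {
   *           // ... process element "i" of group "group", e.g. data[group * groupSize + i] ...
   *         }
   *       }
   *     }
   *   };
   */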

  /* independent_group_elements_x, _y, _z
   *
   * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
    return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }
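
  /* In two dimensions the two families can be combined, for example assigning one matrix row to each
   * block along the second-fastest dimension, with the threads of the block sharing the columns along
   * the fastest dimension. An illustrative sketch, with a hypothetical kernel and arguments:
   *
   *   struct RowKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* matrix, Idx rows, Idx cols) const {
   *       for (Idx row : independent_groups_y(acc, rows)) {
   *         for (Idx col : independent_group_elements_x(acc, cols)) {
   *           matrix[row * cols + col] *= 2.f;
   *         }
   *       }
   *     }
   *   };
   */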

  /* once_per_grid
   *
   * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
   *
   * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
    return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
  }
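
  /* An illustrative sketch, with a hypothetical kernel and arguments: let a single thread in the whole
   * grid initialize a device-side counter. Note that, without a separate synchronization or a separate
   * kernel launch, the other threads cannot assume that they will observe the write.
   *
   *   struct InitKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, unsigned int* counter) const {
   *       if (once_per_grid(acc)) {
   *         // executed by exactly one thread in the whole grid
   *         *counter = 0;
   *       }
   *     }
   *   };
   */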

  /* once_per_block
   *
   * `once_per_block(acc)` returns true for a single thread within the block.
   *
   * Usually the condition is true for thread 0, but this index should not be relied upon.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
    return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
  }
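
  /* An illustrative sketch, with a hypothetical kernel and arguments: let a single thread per block
   * initialize a block-shared variable, with an explicit synchronization before the other threads in
   * the block use it.
   *
   *   struct BlockSumKernel {
   *     template <typename TAcc>
   *     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Idx size) const {
   *       auto& sum = alpaka::declareSharedVar<float, __COUNTER__>(acc);
   *       if (once_per_block(acc)) {
   *         sum = 0.f;
   *       }
   *       alpaka::syncBlockThreads(acc);
   *       // ... accumulate into "sum" (e.g. with alpaka::atomicAdd) and synchronize again ...
   *     }
   *   };
   */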

}  // namespace cms::alpakatools

#endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h