Renamed host executor and prepared for multi-threaded additions #511

Merged 1 commit on Nov 2, 2023
2 changes: 1 addition & 1 deletion docs/_sources/basics/concepts.rst
@@ -97,7 +97,7 @@ may even use C++ execution policies behind the scenes. Executors are designed so
on a variety of different targets. Currently the following executors are defined:

* cudaExecutor - Execute on a CUDA-supported device
* SingleThreadHostExecutor - Execute on a single host thread
* HostExecutor - Execute on one or more host (CPU) threads

More executor types will be added in future releases.

4 changes: 2 additions & 2 deletions docs/_sources/quickstart.rst
@@ -290,11 +290,11 @@ The example above uses the ``ones`` generator to create a tensor with only the v
value ``1`` any time an element of it is requested, and no data is ever loaded from memory.

Implicit in the ``run`` call above is a CUDA executor type. As a beta feature, MatX also supports executing code on the host using a different executor.
To run the same code on the host, a ``SingleThreadHostExecutor`` can be passed into ``run``:
To run the same code on the host, a ``HostExecutor`` can be passed into ``run``:

.. code-block:: cpp

(c = (a*a) + ones(a.Shape())).run(SingleThreadHostExecutor{});
(c = (a*a) + ones(a.Shape())).run(HostExecutor{});

Instead of a CUDA stream, we pass an executor to ``run`` that instructs MatX to execute the code on the host instead of the device using a single CPU thread.
Unlike CUDA calls, host executors are synchronous, and the line above will block until finished executing.
2 changes: 1 addition & 1 deletion docs_input/api/logic/truth/allclose.rst
@@ -7,7 +7,7 @@ Reduce the closeness of two operators to a single scalar (0D) output. The output
from allclose is an ``int`` value, since boolean reductions are not available in hardware.


.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, SingleThreadHostExecutor exec)
.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, HostExecutor exec)
.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)

Examples
2 changes: 1 addition & 1 deletion docs_input/basics/concepts.rst
@@ -97,7 +97,7 @@ may even use C++ execution policies behind the scenes. Executors are designed so
on a variety of different targets. Currently the following executors are defined:

* cudaExecutor - Execute on a CUDA-supported device
* SingleThreadHostExecutor - Execute on a single host thread
* HostExecutor - Execute on one or more host (CPU) threads

More executor types will be added in future releases.

4 changes: 2 additions & 2 deletions docs_input/quickstart.rst
@@ -290,11 +290,11 @@ The example above uses the ``ones`` generator to create a tensor with only the v
value ``1`` any time an element of it is requested, and no data is ever loaded from memory.

Implicit in the ``run`` call above is a CUDA executor type. As a beta feature, MatX also supports executing code on the host using a different executor.
To run the same code on the host, a ``SingleThreadHostExecutor`` can be passed into ``run``:
To run the same code on the host, a ``HostExecutor`` can be passed into ``run``:

.. code-block:: cpp

(c = (a*a) + ones(a.Shape())).run(SingleThreadHostExecutor{});
(c = (a*a) + ones(a.Shape())).run(HostExecutor{});

Instead of a CUDA stream, we pass an executor to ``run`` that instructs MatX to execute the code on the host instead of the device using a single CPU thread.
Unlike CUDA calls, host executors are synchronous, and the line above will block until finished executing.
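The synchronous, single-thread behavior described above can be sketched in plain C++. This is illustrative only: ``eval_host`` is a hypothetical stand-in, not a MatX API, but it mirrors what the host executor conceptually does for ``c = (a*a) + ones(a.Shape())``.

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Conceptual sketch of run(HostExecutor{}) for c = (a*a) + ones(a.Shape()):
// one synchronous elementwise loop on the host. eval_host is a hypothetical
// helper used for illustration; it is not part of MatX.
template <std::size_t N>
std::array<float, N> eval_host(const std::array<float, N>& a) {
    std::array<float, N> c{};
    for (std::size_t i = 0; i < N; i++) {
        // ones() materializes no data; it simply contributes 1 per element.
        c[i] = a[i] * a[i] + 1.0f;
    }
    return c;  // returns only after every element is computed (blocking)
}
```

Because the loop runs to completion before returning, the caller never needs an explicit synchronization step, unlike the CUDA stream case.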
4 changes: 2 additions & 2 deletions include/matx/core/type_utils.h
@@ -256,7 +256,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v<detail::is_matx_s
namespace detail {
template <typename T> struct is_executor : std::false_type {};
template <> struct is_executor<cudaExecutor> : std::true_type {};
template <> struct is_executor<SingleThreadHostExecutor> : std::true_type {};
template <> struct is_executor<HostExecutor> : std::true_type {};
}

/**
@@ -286,7 +286,7 @@ inline constexpr bool is_device_executor_v = detail::is_device_executor<typename

namespace detail {
template<typename T> struct is_single_thread_host_executor : std::false_type {};
template<> struct is_single_thread_host_executor<matx::SingleThreadHostExecutor> : std::true_type {};
template<> struct is_single_thread_host_executor<matx::HostExecutor> : std::true_type {};
}

/**
55 changes: 42 additions & 13 deletions include/matx/executors/host.h
@@ -38,15 +38,39 @@

namespace matx
{

// Matches current Linux max
static constexpr int MAX_CPUS = 1024;
struct cpu_set_t {
using set_type = uint64_t;

std::array<set_type, MAX_CPUS / (8 * sizeof(set_type))> bits_;
};

struct HostExecParams {
HostExecParams(int threads = 1) : threads_(threads) {}
HostExecParams(cpu_set_t cpu_set) : cpu_set_(cpu_set) {
MATX_ASSERT_STR(false, matxNotSupported, "CPU affinity not supported yet");
}

int GetNumThreads() const { return threads_; }

private:
int threads_;
cpu_set_t cpu_set_;
};

/**
* @brief Executor for running an operator on a single host thread
*
*/
class SingleThreadHostExecutor {
class HostExecutor {
public:
using matx_cpu = bool; ///< Type trait indicating this is an executor
using matx_cpu = bool; ///< Type trait indicating this is a CPU executor
using matx_executor = bool; ///< Type trait indicating this is an executor

HostExecutor(const HostExecParams &params = HostExecParams{}) : params_(params) {}

/**
* @brief Execute an operator
*
@@ -55,19 +55,24 @@ class SingleThreadHostExecutor {
*/
template <typename Op>
void Exec(Op &op) const noexcept {
if constexpr (Op::Rank() == 0) {
op();
}
else {
index_t size = TotalSize(op);
for (index_t i = 0; i < size; i++) {
auto idx = GetIdxFromAbs(op, i);
std::apply([&](auto... args) {
return op(args...);
}, idx);
}
if (params_.GetNumThreads() == 1) {
if constexpr (Op::Rank() == 0) {
op();
}
else {
index_t size = TotalSize(op);
for (index_t i = 0; i < size; i++) {
auto idx = GetIdxFromAbs(op, i);
std::apply([&](auto... args) {
return op(args...);
}, idx);
}
}
}
}

private:
HostExecParams params_;
};

}
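The single-threaded path of ``Exec`` above walks every absolute element index, unflattens it into per-dimension coordinates, and invokes the operator with them. A self-contained sketch of that pattern, assuming a rank-2 row-major shape (``idx_from_abs`` and ``exec_single_thread`` are hypothetical stand-ins for MatX's ``GetIdxFromAbs`` and the ``Exec`` loop):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

using index_t = std::int64_t;

// Row-major unflatten of an absolute index into (row, col) coordinates;
// the last dimension varies fastest. Plays the role of GetIdxFromAbs.
std::array<index_t, 2> idx_from_abs(index_t abs, const std::array<index_t, 2>& shape) {
    return { abs / shape[1], abs % shape[1] };
}

// Single-threaded executor loop: visit every element exactly once, in order.
template <typename Op>
void exec_single_thread(Op&& op, const std::array<index_t, 2>& shape) {
    const index_t size = shape[0] * shape[1];  // TotalSize equivalent
    for (index_t i = 0; i < size; i++) {
        auto idx = idx_from_abs(i, shape);
        op(idx[0], idx[1]);  // std::apply over the index tuple in MatX
    }
}
```

A multi-threaded variant would partition the ``[0, size)`` range across workers, which is presumably what the ``GetNumThreads() == 1`` guard above leaves room for.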
10 changes: 5 additions & 5 deletions include/matx/transforms/cub.h
@@ -1412,7 +1412,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
template <typename OutputTensor, typename InputOperator>
void sort_impl(OutputTensor &a_out, const InputOperator &a,
const SortDirection_t dir,
[[maybe_unused]] SingleThreadHostExecutor exec)
[[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

@@ -1509,7 +1509,7 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,

template <typename OutputTensor, typename InputOperator>
void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
[[maybe_unused]] SingleThreadHostExecutor exec)
[[maybe_unused]] HostExecutor exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1783,7 +1783,7 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
* Single-threaded host executor
*/
template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] SingleThreadHostExecutor exec)
void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] HostExecutor exec)
{
static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1904,7 +1904,7 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
* Single host executor
*/
template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] SingleThreadHostExecutor exec)
void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] HostExecutor exec)
{
static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2021,7 +2021,7 @@ void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperato
* Single thread executor
*/
template <typename CountTensor, typename OutputTensor, typename InputOperator>
void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, [[maybe_unused]] SingleThreadHostExecutor exec)
void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, [[maybe_unused]] HostExecutor exec)
{
#ifdef __CUDACC__
static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
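The host ``find_impl`` variants above take a selector and a rank-0 count output. A hedged plain-C++ sketch of that select-and-count pattern (``find_host`` is hypothetical and operates on ``std::vector`` rather than tensors; MatX writes the count into a rank-0 tensor instead of returning it):

```cpp
#include <cassert>
#include <vector>

// Sketch of a host-side find: scan the input, copy every element the
// selector accepts into the output, and report how many were found.
template <typename T, typename Sel>
int find_host(const std::vector<T>& in, std::vector<T>& out, Sel sel) {
    int num_found = 0;
    for (const T& v : in) {
        if (sel(v)) {
            out.push_back(v);
            num_found++;
        }
    }
    return num_found;
}
```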
22 changes: 11 additions & 11 deletions include/matx/transforms/reduce.h
@@ -1530,7 +1530,7 @@ void __MATX_INLINE__ mean_impl(OutType dest, const InType &in,
* Single thread host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ mean_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ mean_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("mean_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -1796,7 +1796,7 @@ void __MATX_INLINE__ median_impl(OutType dest,
* Single thread host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("median_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -1888,7 +1888,7 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, cudaExecutor exec
* Single thread host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("sum_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -1956,7 +1956,7 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, cudaExecutor exec
* Single thread host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("prod_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -2033,7 +2033,7 @@ void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, cudaExecutor exec
* Single threaded host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("rmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -2111,7 +2111,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
* Single threaded host executor
*/
template <typename OutType, typename TensorIndexType, typename InType>
void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("argmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -2187,7 +2187,7 @@ void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, cudaExecutor exec
* Single threaded host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("rmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -2261,7 +2261,7 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
* Single host executor
*/
template <typename OutType, typename TensorIndexType, typename InType>
void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -2334,7 +2334,7 @@ void __MATX_INLINE__ any_impl(OutType dest, const InType &in, cudaExecutor exec
* Single threaded host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("any_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -2407,7 +2407,7 @@ void __MATX_INLINE__ all_impl(OutType dest, const InType &in, cudaExecutor exec
* Single threaded host executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("all_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)

@@ -2493,7 +2493,7 @@ void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &i
* Single threaded host executor
*/
template <typename OutType, typename InType1, typename InType2>
void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, [[maybe_unused]] SingleThreadHostExecutor exec)
void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, [[maybe_unused]] HostExecutor exec)
{
MATX_NVTX_START("allclose(" + get_type_str(in1) + ", " + get_type_str(in2) + ")", matx::MATX_NVTX_LOG_API)
static_assert(OutType::Rank() == 0, "allclose output must be rank 0");
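The host ``allclose`` overload above reduces two operators to a single ``int``. A hedged sketch of that reduction over plain vectors (``allclose_host`` is hypothetical; the exact tolerance convention MatX uses is not shown in this diff, so this follows the common NumPy-style test ``|a - b| <= atol + rtol * |b|``):

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Reduce two sequences to a single int: 1 iff every pair satisfies
// |a - b| <= atol + rtol * |b|. An int result rather than bool, mirroring
// the note above that boolean reductions are not available in hardware.
int allclose_host(const std::vector<double>& a, const std::vector<double>& b,
                  double rtol, double atol) {
    if (a.size() != b.size()) return 0;
    for (std::size_t i = 0; i < a.size(); i++) {
        if (std::abs(a[i] - b[i]) > atol + rtol * std::abs(b[i])) return 0;
    }
    return 1;
}
```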
2 changes: 1 addition & 1 deletion include/matx/transforms/transpose.h
@@ -104,7 +104,7 @@ namespace matx

template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void transpose_matrix_impl([[maybe_unused]] OutputTensor &out,
const InputTensor &in, SingleThreadHostExecutor exec)
const InputTensor &in, HostExecutor exec)
{
static_assert(InputTensor::Rank() >= 2, "transpose_matrix operator must be on rank 2 or greater");

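``transpose_matrix_impl`` above requires rank 2 or greater; the core index swap it performs can be sketched for the rank-2, row-major case (``transpose_host`` is a hypothetical vector-based stand-in, not the MatX implementation):

```cpp
#include <cassert>
#include <vector>

// Rank-2 transpose over row-major storage: out(j, i) = in(i, j).
// in is rows x cols; out is cols x rows.
std::vector<int> transpose_host(const std::vector<int>& in, int rows, int cols) {
    std::vector<int> out(in.size());
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            out[j * rows + i] = in[i * cols + j];
        }
    }
    return out;
}
```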
2 changes: 1 addition & 1 deletion test/include/test_types.h
@@ -72,7 +72,7 @@ template <> auto inline GenerateData<cuda::std::complex<double>>()
return cuda::std::complex<double>(1.5, -2.5);
}

using ExecutorTypesAll = std::tuple<matx::cudaExecutor, matx::SingleThreadHostExecutor>;
using ExecutorTypesAll = std::tuple<matx::cudaExecutor, matx::HostExecutor>;

// Define the types to test for each group. If a type is put into a list that
// isn't compatible with a test type, a compiler error will occur