diff --git a/docs/_sources/basics/concepts.rst b/docs/_sources/basics/concepts.rst
index 7e033fc1..baabaec7 100644
--- a/docs/_sources/basics/concepts.rst
+++ b/docs/_sources/basics/concepts.rst
@@ -97,7 +97,7 @@ may even use C++ execution policies behind the scenes. Executors are designed
 so on a variety of different targets. Currently the following executors are defined:
 
 * cudaExecutor - Execute on a CUDA-supported device
-* SingleThreadHostExecutor - Execute on a single host thread
+* HostExecutor - Execute on a single host thread
 
 More executor types will be added in future releases.
diff --git a/docs/_sources/quickstart.rst b/docs/_sources/quickstart.rst
index 5107d2ca..06579595 100644
--- a/docs/_sources/quickstart.rst
+++ b/docs/_sources/quickstart.rst
@@ -290,11 +290,11 @@ The example above uses the ``ones`` generator to create a tensor with only the v
 value ``1`` any time an element of it is requested, and no data is ever loaded from memory.
 Implicit in the ``run`` call above is a CUDA executor type.
 As a beta feature, MatX also supports executing code on the host using a different executor.
-To run the same code on the host, a ``SingleThreadHostExecutor`` can be passed into ``run``:
+To run the same code on the host, a ``HostExecutor`` can be passed into ``run``:
 
 .. code-block:: cpp
 
-    (c = (a*a) + ones(a.Shape())).run(SingleThreadHostExecutor{});
+    (c = (a*a) + ones(a.Shape())).run(HostExecutor{});
 
 Instead of a CUDA stream, we pass an executor to ``run`` that instructs MatX to execute the code on the host instead of the device using a single CPU thread.
 Unlike CUDA calls, host executors are synchronous, and the line above will block until finished executing.
diff --git a/docs_input/api/logic/truth/allclose.rst b/docs_input/api/logic/truth/allclose.rst
index 5db53eee..e6fd82e4 100644
--- a/docs_input/api/logic/truth/allclose.rst
+++ b/docs_input/api/logic/truth/allclose.rst
@@ -7,7 +7,7 @@ Reduce the closeness of two operators to a single scalar (0D) output. The output
 from allclose is an ``int`` value since boolean reductions are not available in hardware
 
-.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, SingleThreadHostExecutor exec)
+.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, HostExecutor exec)
 .. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
 
 Examples
diff --git a/docs_input/basics/concepts.rst b/docs_input/basics/concepts.rst
index 7e033fc1..2d34fd00 100644
--- a/docs_input/basics/concepts.rst
+++ b/docs_input/basics/concepts.rst
@@ -97,7 +97,7 @@ may even use C++ execution policies behind the scenes. Executors are designed
 so on a variety of different targets. Currently the following executors are defined:
 
 * cudaExecutor - Execute on a CUDA-supported device
-* SingleThreadHostExecutor - Execute on one or more host (CPU) threads
+* HostExecutor - Execute on one or more host (CPU) threads
 
 More executor types will be added in future releases.
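The rename above changes only the executor's name in the beta host-execution docs; the call-site pattern is unchanged. As a minimal sketch (not part of the diff; the tensors ``a`` and ``c`` and the ``stream`` variable are hypothetical, mirroring the quickstart excerpt):

.. code-block:: cpp

    // Device path: launches asynchronously on the given CUDA stream.
    (c = (a * a) + ones(a.Shape())).run(cudaExecutor{stream});

    // Host path: the renamed executor; runs synchronously on the CPU.
    (c = (a * a) + ones(a.Shape())).run(HostExecutor{});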
diff --git a/docs_input/quickstart.rst b/docs_input/quickstart.rst
index 5107d2ca..06579595 100644
--- a/docs_input/quickstart.rst
+++ b/docs_input/quickstart.rst
@@ -290,11 +290,11 @@ The example above uses the ``ones`` generator to create a tensor with only the v
 value ``1`` any time an element of it is requested, and no data is ever loaded from memory.
 Implicit in the ``run`` call above is a CUDA executor type.
 As a beta feature, MatX also supports executing code on the host using a different executor.
-To run the same code on the host, a ``SingleThreadHostExecutor`` can be passed into ``run``:
+To run the same code on the host, a ``HostExecutor`` can be passed into ``run``:
 
 .. code-block:: cpp
 
-    (c = (a*a) + ones(a.Shape())).run(SingleThreadHostExecutor{});
+    (c = (a*a) + ones(a.Shape())).run(HostExecutor{});
 
 Instead of a CUDA stream, we pass an executor to ``run`` that instructs MatX to execute the code on the host instead of the device using a single CPU thread.
 Unlike CUDA calls, host executors are synchronous, and the line above will block until finished executing.
diff --git a/include/matx/core/type_utils.h b/include/matx/core/type_utils.h
index 738775ce..ba3df398 100644
--- a/include/matx/core/type_utils.h
+++ b/include/matx/core/type_utils.h
@@ -256,7 +256,7 @@ inline constexpr bool is_settable_xform_v = std::conjunction_v
 namespace detail {
 template <typename T> struct is_executor : std::false_type {};
 template <> struct is_executor<cudaExecutor> : std::true_type {};
-template <> struct is_executor<SingleThreadHostExecutor> : std::true_type {};
+template <> struct is_executor<HostExecutor> : std::true_type {};
 }
 
 /**
@@ -286,7 +286,7 @@ inline constexpr bool is_device_executor_v = detail::is_device_executor
 namespace detail {
 template <typename T> struct is_single_thread_host_executor : std::false_type {};
-template<> struct is_single_thread_host_executor<SingleThreadHostExecutor> : std::true_type {};
+template<> struct is_single_thread_host_executor<HostExecutor> : std::true_type {};
 }
 
 /**
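The trait specializations above are what let generic code detect the renamed type at compile time. A sketch of what they now report (assumes the ``matx::detail`` namespace shown in the hunk context; not part of the diff):

.. code-block:: cpp

    #include <matx.h>

    // Both detail-level traits accept the renamed executor.
    static_assert(matx::detail::is_executor<matx::HostExecutor>::value);
    static_assert(matx::detail::is_executor<matx::cudaExecutor>::value);
    static_assert(matx::detail::is_single_thread_host_executor<matx::HostExecutor>::value);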
diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h
index cbe1b5b9..4fa87055 100644
--- a/include/matx/executors/host.h
+++ b/include/matx/executors/host.h
@@ -38,15 +38,39 @@ namespace matx
 {
+
+// Matches current Linux max
+static constexpr int MAX_CPUS = 1024;
+struct cpu_set_t {
+  using set_type = uint64_t;
+
+  std::array<set_type, MAX_CPUS / (8 * sizeof(set_type))> bits_;
+};
+
+struct HostExecParams {
+  HostExecParams(int threads = 1) : threads_(threads) {}
+  HostExecParams(cpu_set_t cpu_set) : cpu_set_(cpu_set) {
+    MATX_ASSERT_STR(false, matxNotSupported, "CPU affinity not supported yet");
+  }
+
+  int GetNumThreads() const { return threads_; }
+
+  private:
+    int threads_;
+    cpu_set_t cpu_set_;
+};
+
 /**
  * @brief Executor for running an operator on a single host thread
  *
  */
-class SingleThreadHostExecutor {
+class HostExecutor {
 public:
-  using matx_cpu = bool; ///< Type trait indicating this is an executor
+  using matx_cpu = bool; ///< Type trait indicating this is a CPU executor
   using matx_executor = bool; ///< Type trait indicating this is an executor
 
+  HostExecutor(const HostExecParams &params = HostExecParams{}) : params_(params) {}
+
   /**
    * @brief Execute an operator
    *
@@ -55,19 +79,24 @@ class SingleThreadHostExecutor {
    */
   template <typename Op>
  void Exec(Op &op) const noexcept {
-    if constexpr (Op::Rank() == 0) {
-      op();
-    }
-    else {
-      index_t size = TotalSize(op);
-      for (index_t i = 0; i < size; i++) {
-        auto idx = GetIdxFromAbs(op, i);
-        std::apply([&](auto... args) {
-          return op(args...);
-        }, idx);
-      }
+    if (params_.GetNumThreads() == 1) {
+      if constexpr (Op::Rank() == 0) {
+        op();
+      }
+      else {
+        index_t size = TotalSize(op);
+        for (index_t i = 0; i < size; i++) {
+          auto idx = GetIdxFromAbs(op, i);
+          std::apply([&](auto... args) {
+            return op(args...);
+          }, idx);
+        }
+      }
     }
   }
+
+  private:
+    HostExecParams params_;
 };
 }
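``HostExecutor`` now carries a ``HostExecParams``, although ``Exec`` above only runs when ``GetNumThreads() == 1`` and the CPU-affinity constructor asserts. A usage sketch under those constraints (tensor names are hypothetical):

.. code-block:: cpp

    // Default-constructed params request one thread, so these two are equivalent:
    matx::HostExecutor exec{};
    matx::HostExecutor exec2{matx::HostExecParams{1}};

    // Executes element-by-element on the host and blocks until complete.
    (c = a + b).run(exec);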
diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
index e59a97da..27942402 100644
--- a/include/matx/transforms/cub.h
+++ b/include/matx/transforms/cub.h
@@ -1412,7 +1412,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
 template <typename OutputTensor, typename InputOperator>
 void sort_impl(OutputTensor &a_out, const InputOperator &a,
                const SortDirection_t dir,
-               [[maybe_unused]] SingleThreadHostExecutor exec)
+               [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1509,7 +1509,7 @@ void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
 template <typename OutputTensor, typename InputOperator>
 void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
-                 [[maybe_unused]] SingleThreadHostExecutor exec)
+                 [[maybe_unused]] HostExecutor exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1783,7 +1783,7 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
  * Single-threaded host executor
  */
 template <typename OutputTensor, typename CountTensor, typename InputOperator, typename SelectType>
-void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] SingleThreadHostExecutor exec)
+void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] HostExecutor exec)
 {
   static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1904,7 +1904,7 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
  * Single host executor
  */
 template <typename OutputTensor, typename CountTensor, typename InputOperator, typename SelectType>
-void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] SingleThreadHostExecutor exec)
+void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, [[maybe_unused]] HostExecutor exec)
 {
   static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2021,7 +2021,7 @@ void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperato
  * Single thread executor
  */
 template <typename OutputTensor, typename CountTensor, typename InputOperator>
-void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, [[maybe_unused]] SingleThreadHostExecutor exec)
+void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, [[maybe_unused]] HostExecutor exec)
 {
 #ifdef __CUDACC__
   static_assert(num_found.Rank() == 0, "Num found output tensor rank must be 0");
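Each renamed overload above is the host fallback for a CUB-backed transform. Assuming public wrappers that mirror the ``_impl`` signatures (the ``sort`` name and ``SORT_DIR_ASC`` constant follow MatX's existing sort API; tensor names are hypothetical), a host-side sort would look like:

.. code-block:: cpp

    auto vals   = matx::make_tensor<float>({16});
    auto sorted = matx::make_tensor<float>({16});
    // ... fill vals ...

    // Same argument order as sort_impl above: output, input, direction, executor.
    matx::sort(sorted, vals, matx::SORT_DIR_ASC, matx::HostExecutor{});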
diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h
index e409ca9a..a4196fb5 100644
--- a/include/matx/transforms/reduce.h
+++ b/include/matx/transforms/reduce.h
@@ -1530,7 +1530,7 @@ void __MATX_INLINE__ mean_impl(OutType dest, const InType &in,
  * Single thread host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ mean_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ mean_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("mean_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -1796,7 +1796,7 @@ void __MATX_INLINE__ median_impl(OutType dest,
  * Single thread host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("median_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
   auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -1888,7 +1888,7 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single thread host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("sum_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
   auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -1956,7 +1956,7 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single thread host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("prod_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
   auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -2033,7 +2033,7 @@ void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single threaded host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ rmax_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("rmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -2111,7 +2111,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
  * Single threaded host executor
  */
 template <typename OutType, typename TensorIndexType, typename InType>
-void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("argmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -2187,7 +2187,7 @@ void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single threaded host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ rmin_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("rmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
   auto ft = [&](auto &&lin, auto &&lout, [[maybe_unused]] auto &&lbegin, [[maybe_unused]] auto &&lend) {
@@ -2261,7 +2261,7 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
  * Single host executor
  */
 template <typename OutType, typename TensorIndexType, typename InType>
-void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("argmin_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -2334,7 +2334,7 @@ void __MATX_INLINE__ any_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single threaded host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("any_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -2407,7 +2407,7 @@ void __MATX_INLINE__ all_impl(OutType dest, const InType &in, cudaExecutor exec
  * Single threaded host executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("all_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -2493,7 +2493,7 @@ void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &i
  * Single threaded host executor
  */
 template <typename OutType, typename InType1, typename InType2>
-void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, [[maybe_unused]] SingleThreadHostExecutor exec)
+void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, [[maybe_unused]] HostExecutor exec)
 {
   MATX_NVTX_START("allclose(" + get_type_str(in1) + ", " + get_type_str(in2) + ")", matx::MATX_NVTX_LOG_API)
   static_assert(OutType::Rank() == 0, "allclose output must be rank 0");
diff --git a/include/matx/transforms/transpose.h b/include/matx/transforms/transpose.h
index 20047c45..dc1fa300 100644
--- a/include/matx/transforms/transpose.h
+++ b/include/matx/transforms/transpose.h
@@ -104,7 +104,7 @@ namespace matx
   template <typename OutputTensor, typename InputTensor>
   __MATX_INLINE__ void transpose_matrix_impl([[maybe_unused]] OutputTensor &out,
-                                        const InputTensor &in, SingleThreadHostExecutor exec)
+                                        const InputTensor &in, HostExecutor exec)
   {
     static_assert(InputTensor::Rank() >= 2, "transpose_matrix operator must be on rank 2 or greater");
diff --git a/test/include/test_types.h b/test/include/test_types.h
index afbb9cdb..e4b3d8bb 100644
--- a/test/include/test_types.h
+++ b/test/include/test_types.h
@@ -72,7 +72,7 @@ template <> auto inline GenerateData<cuda::std::complex<double>>()
   return cuda::std::complex<double>(1.5, -2.5);
 }
 
-using ExecutorTypesAll = std::tuple<matx::cudaExecutor, matx::SingleThreadHostExecutor>;
+using ExecutorTypesAll = std::tuple<matx::cudaExecutor, matx::HostExecutor>;
 
 // Define the types to test for each group. If a type is put into a list that
 // isn't compatible with a test type, a compiler error will occur
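With the reductions, transpose, and the test executor tuple updated, the whole host path is reachable through the renamed type. A final hedged sketch of a host-side reduction (assumes the public ``sum`` wrapper forwards to ``sum_impl`` above, and that ``make_tensor<float>({})`` yields the rank-0 output the static_asserts expect):

.. code-block:: cpp

    auto in  = matx::make_tensor<float>({4, 4});
    auto out = matx::make_tensor<float>({});  // rank-0 (scalar) result

    (in = matx::ones(in.Shape())).run(matx::HostExecutor{});

    // Host reduction; blocks until done. out() == 16.0f afterwards.
    matx::sum(out, in, matx::HostExecutor{});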