Commit f09c6aa

Add kokkos recycling example

G-071 committed May 29, 2024
1 parent 33097e3 commit f09c6aa

Showing 4 changed files with 523 additions and 19 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -262,6 +262,17 @@ if (CPPUDDLE_WITH_TESTS)
SOURCES
examples/cuda_vector_add.cu
)

+  if (CPPUDDLE_WITH_KOKKOS)
+    add_hpx_executable(
+      recycling-with-hpx-kokkos
+      DEPENDENCIES
+      Boost::boost Boost::program_options Kokkos::kokkos HPXKokkos::hpx_kokkos HPX::hpx buffer_manager stream_manager
+      COMPONENT_DEPENDENCIES iostreams
+      SOURCES
+      examples/recycling-with-hpx-kokkos.cpp
+    )
+  endif()
endif()
endif()
#------------------------------------------------------------------------------------------------------------
50 changes: 31 additions & 19 deletions examples/cuda_vector_add.cu
@@ -3,6 +3,10 @@
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

+// Developer TODOs regarding CPPuddle usability:
+// TODO(daissgr) Simplify specifying an executor pool (at least when using the
+// default round_robin_pool_impl). The current way seems awfully verbose

#include <algorithm>
#include <cstdlib>
#include <hpx/include/async.hpp>
@@ -13,6 +17,7 @@

#include <boost/program_options.hpp>

+#include <cppuddle/memory_recycling/buffer_management_interface.hpp>
#include <cppuddle/memory_recycling/std_recycling_allocators.hpp>
#include <cppuddle/memory_recycling/cuda_recycling_allocators.hpp>
#include <cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp>
@@ -22,28 +27,31 @@
#include <stdexcept>
#include <vector>

-// Compile-time options
-using float_t = float;
-using device_executor_t = hpx::cuda::experimental::cuda_executor;

/** \file This example shows how to use HPX + CPPuddle with GPU-accelerated
 * applications. Particularly, we focus on how to use a) recycled pinned host
 * memory, b) recycled device memory, c) the executor pool, d) the HPX-CUDA
 * futures and the basic CPU/GPU load balancing based on executor usage in an
 * HPX application. To demonstrate these features we just use the simplest of
- * kernels: a vector add, that is repeated over a multitude of tasks (with
+ * kernels: a vector addition that is repeated over a multitude of tasks (with
 * varying, artificial dependencies in between). So while the compute kernel is
- * basic, we still get to see how the CPPuddle/HPX features may be used.
+ * basic, we still get to see how the CPPuddle/HPX features may be used with
+ * it.
 *
 * The example has three parts: first the GPU part, then the HPX task-graph
 * management, and lastly the remaining initialization/boilerplate code.
 */

//=================================================================================================
-// PART I: The GPU kernel and how to launch it with CPPuddle + HPX whilst avoid
+// PART I: The (CUDA) GPU kernel and how to launch it with CPPuddle + HPX whilst avoiding
// any CPU/GPU barriers
//=================================================================================================

+// Compile-time options: float type...
+using float_t = float;
+// ... and we will use the HPX CUDA executor inside the executor pool later on
+using device_executor_t = hpx::cuda::experimental::cuda_executor;

/** Just some example CUDA kernel. For simplicity, it adds two vectors. */
__global__ void kernel_add(const float_t *input_a, const float_t *input_b, float_t *output_c) {
const int index = blockIdx.x * blockDim.x + threadIdx.x;
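The diff view truncates the kernel body at this point. For reference, a standard vector-add body under this signature would look as follows (a sketch, not necessarily the commit's exact code; it assumes the launch grid exactly covers the buffers, since the signature carries no element count; float_t is the alias defined above):

__global__ void kernel_add_sketch(const float_t *input_a, const float_t *input_b,
                                  float_t *output_c) {
  // One thread per element; the launcher must size grid * block to the buffer
  // length exactly, as there is no bound to check against in this signature.
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  output_c[index] = input_a[index] + input_b[index];
}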
@@ -97,15 +105,15 @@ void launch_gpu_kernel_task(const size_t task_id, const size_t entries_per_task,

// 4. Run Kernel on either CPU or GPU
if (!device_executor_available) {
-    // 4a. Launch CPU Fallback Version
    number_cpu_kernel_launches++;
+    // 4a. Launch CPU Fallback Version
    for (size_t entry_id = 0; entry_id < entries_per_task; entry_id++) {
      host_c[entry_id] = host_a[entry_id] + host_b[entry_id];
    }
  } else {
-    number_gpu_kernel_launches++;
    // 4b. Create per-task device-side buffers (using recycled device memory)
    // and draw a GPU executor from the CPPuddle executor pool
+    number_gpu_kernel_launches++;
cppuddle::executor_recycling::executor_interface<
device_executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>
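The hunk is cut off mid-statement above. The pattern it begins is the RAII executor checkout described earlier; a sketch of its typical shape (the variable names and the helper call are assumptions, not the commit's exact code):

void sketch_pooled_executor_launch(const std::size_t gpu_id) {
  // Check a CUDA executor out of the round-robin pool; it is handed back
  // automatically when the handle goes out of scope.
  cppuddle::executor_recycling::executor_interface<
      device_executor_t,
      cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>
      executor(gpu_id);
  // Async copies and the kernel launch are then issued through the wrapped
  // hpx::cuda::experimental::cuda_executor, yielding HPX futures that chain
  // into the task graph without blocking CPU/GPU synchronization, e.g.:
  //   auto fut = run_vector_add(executor, a, b, c);  // hypothetical helper
  //   fut.get();
}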
@@ -236,23 +244,23 @@ build_task_graph(const size_t number_repetitions, const size_t number_tasks,
 * Polling usually has superior performance; however, it requires that the
 * polling is initialized at startup (or at least before the CUDA futures are
 * used). The CPPuddle executor pool also needs initializing, as we need to set it
- * to a specified number of executors (which CPPuddle cannot know having the
- * number_executors parameter). We will use the round_robin_pool_impl for
+ * to a specified number of executors (which CPPuddle cannot know without the
+ * number_gpu_executors parameter). We will use the round_robin_pool_impl for
 * simplicity. A priority_pool_impl is also available.
*/
-void init_executor_pool_and_polling(const size_t number_executors, const size_t gpu_id) {
+void init_executor_pool_and_polling(const size_t number_gpu_executors, const size_t gpu_id) {
assert(gpu_id == 0); // MultiGPU not used in this example
hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0));
cppuddle::executor_recycling::executor_pool::init_executor_pool<
device_executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>(
-      gpu_id, number_executors, gpu_id, true);
+      gpu_id, number_gpu_executors, gpu_id, true);
}

/// Processes the CLI options via boost program_options to configure the example
bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
size_t &number_tasks, bool &in_order_repetitions,
-                         size_t &number_repetitions, size_t &number_executors,
+                         size_t &number_repetitions, size_t &number_gpu_executors,
size_t &max_queue_length) {
try {
boost::program_options::options_description desc{"Options"};
@@ -274,8 +282,8 @@ bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
        boost::program_options::value<size_t>(&number_repetitions)
            ->default_value(20),
        "Sets the number of repetitions")(
-       "number_executors",
-       boost::program_options::value<size_t>(&number_executors)
+       "number_gpu_executors",
+       boost::program_options::value<size_t>(&number_gpu_executors)
->default_value(32),
"Number of GPU executors in the pool")(
"max_queue_length_per_executor",
@@ -302,7 +310,7 @@ bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
<< " --tasks_per_repetition = " << number_tasks << std::endl
<< " --number_repetitions = " << number_repetitions << std::endl
<< " --in_order_repetitions = " << in_order_repetitions << std::endl
-            << " --number_executors = " << number_executors << std::endl
+            << " --number_gpu_executors = " << number_gpu_executors << std::endl
<< " --max_queue_length_per_executor = " << max_queue_length << std::endl
<< " --hpx:threads = " << hpx::get_os_thread_count()
<< std::endl << std::endl;
@@ -328,17 +336,17 @@ int hpx_main(int argc, char *argv[]) {
size_t number_repetitions = 20;
bool in_order_repetitions = false;
size_t max_queue_length = 5;
-  size_t number_executors = 1;
+  size_t number_gpu_executors = 1;
size_t gpu_id = 0;
if(!process_cli_options(argc, argv, entries_per_task, number_tasks,
in_order_repetitions, number_repetitions,
-                         number_executors, max_queue_length)) {
+                         number_gpu_executors, max_queue_length)) {
return hpx::finalize(); // problem with CLI parameters detected -> exiting..
}

// Init HPX CUDA polling + executor pool
hpx::cout << "Start initializing CUDA polling and executor pool..." << std::endl;
-  init_executor_pool_and_polling(number_executors, gpu_id);
+  init_executor_pool_and_polling(number_gpu_executors, gpu_id);
hpx::cout << "Init done!" << std::endl << std::endl;


Expand All @@ -361,6 +369,10 @@ int hpx_main(int argc, char *argv[]) {

// Finalize HPX (CPPuddle finalizes automatically)
hpx::cout << "Finalizing..." << std::endl;
+  // Deallocates everything CPPuddle manages and prevents further usage.
+  // Technically not required as long as static variables with CPPuddle-managed
+  // memory are not used; however, it does not hurt either.
+  cppuddle::memory_recycling::finalize();
hpx::cuda::experimental::detail::unregister_polling(
hpx::resource::get_thread_pool(0));
return hpx::finalize();