Commit f09c6aa

Add kokkos recycling example

G-071 committed May 29, 2024
1 parent 33097e3 commit f09c6aa

Showing 4 changed files with 523 additions and 19 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -262,6 +262,17 @@ if (CPPUDDLE_WITH_TESTS)
SOURCES
examples/cuda_vector_add.cu
)

+  if (CPPUDDLE_WITH_KOKKOS)
+    add_hpx_executable(
+      recycling-with-hpx-kokkos
+      DEPENDENCIES
+      Boost::boost Boost::program_options Kokkos::kokkos HPXKokkos::hpx_kokkos HPX::hpx buffer_manager stream_manager
+      COMPONENT_DEPENDENCIES iostreams
+      SOURCES
+      examples/recycling-with-hpx-kokkos.cpp
+    )
+  endif()
endif()
endif()
#------------------------------------------------------------------------------------------------------------
50 changes: 31 additions & 19 deletions examples/cuda_vector_add.cu
@@ -3,6 +3,10 @@
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

+// Developer TODOs regarding CPPuddle usability:
+// TODO(daissgr) Simplify specifying an executor pool (at least when using the
+// default round_robin_pool_impl). The current way seems awfully verbose

#include <algorithm>
#include <cstdlib>
#include <hpx/include/async.hpp>
@@ -13,6 +17,7 @@

#include <boost/program_options.hpp>

+#include <cppuddle/memory_recycling/buffer_management_interface.hpp>
#include <cppuddle/memory_recycling/std_recycling_allocators.hpp>
#include <cppuddle/memory_recycling/cuda_recycling_allocators.hpp>
#include <cppuddle/memory_recycling/util/cuda_recycling_device_buffer.hpp>
@@ -22,28 +27,31 @@
#include <stdexcept>
#include <vector>

-// Compile-time options
-using float_t = float;
-using device_executor_t = hpx::cuda::experimental::cuda_executor;

/** \file This example shows how to use HPX + CPPuddle with GPU-accelerated
 * applications. Particularly, we focus on how to use a) recycled pinned host
 * memory, b) recycled device memory, c) the executor pool, d) the HPX-CUDA
 * futures and the basic CPU/GPU load balancing based on executor usage in an
 * HPX application. To demonstrate these features we just use the simplest of
- * kernels: a vector add, that is repeated over a multitude of tasks (with
+ * kernels: a vector addition that is repeated over a multitude of tasks (with
 * varying, artificial dependencies in between). So while the compute kernel is
- * basic, we still get to see how the CPPuddle/HPX features may be used.
+ * basic, we still get to see how the CPPuddle/HPX features may be used with
+ * it.
 *
 * The example has three parts: first the GPU part, then the HPX task-graph
 * management, and lastly the remaining initialization/boilerplate code.
 */

//=================================================================================================
-// PART I: The GPU kernel and how to launch it with CPPuddle + HPX whilst avoid
+// PART I: The (CUDA) GPU kernel and how to launch it with CPPuddle + HPX whilst avoiding
// any CPU/GPU barriers
//=================================================================================================

+// Compile-time options: float type...
+using float_t = float;
+// ... and we will use the HPX CUDA executor inside the executor pool later on
+using device_executor_t = hpx::cuda::experimental::cuda_executor;

/** Just some example CUDA kernel. For simplicity, it adds two vectors. */
__global__ void kernel_add(const float_t *input_a, const float_t *input_b, float_t *output_c) {
const int index = blockIdx.x * blockDim.x + threadIdx.x;
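The diff view truncates the kernel body at this point. For reference, a standard vector-add body under this signature would look as follows (a sketch, not necessarily the commit's exact code; it assumes the launch grid exactly covers the buffers, since the signature carries no element count; float_t is the alias defined above):

__global__ void kernel_add_sketch(const float_t *input_a, const float_t *input_b,
                                  float_t *output_c) {
  // One thread per element; the launcher must size grid * block to the buffer
  // length exactly, as there is no bound to check against in this signature.
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  output_c[index] = input_a[index] + input_b[index];
}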
@@ -97,15 +105,15 @@ void launch_gpu_kernel_task(const size_t task_id, const size_t entries_per_task,

// 4. Run Kernel on either CPU or GPU
if (!device_executor_available) {
-    // 4a. Launch CPU Fallback Version
    number_cpu_kernel_launches++;
+    // 4a. Launch CPU Fallback Version
    for (size_t entry_id = 0; entry_id < entries_per_task; entry_id++) {
      host_c[entry_id] = host_a[entry_id] + host_b[entry_id];
    }
  } else {
-    number_gpu_kernel_launches++;
    // 4b. Create per-task device-side buffers (using recycled device memory)
    // and draw a GPU executor from the CPPuddle executor pool
+    number_gpu_kernel_launches++;
cppuddle::executor_recycling::executor_interface<
device_executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>
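The hunk is cut off mid-statement above. The pattern it begins is the RAII executor checkout described earlier; a sketch of its typical shape (the variable names and the helper call are assumptions, not the commit's exact code):

void sketch_pooled_executor_launch(const std::size_t gpu_id) {
  // Check a CUDA executor out of the round-robin pool; it is handed back
  // automatically when the handle goes out of scope.
  cppuddle::executor_recycling::executor_interface<
      device_executor_t,
      cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>
      executor(gpu_id);
  // Async copies and the kernel launch are then issued through the wrapped
  // hpx::cuda::experimental::cuda_executor, yielding HPX futures that chain
  // into the task graph without blocking CPU/GPU synchronization, e.g.:
  //   auto fut = run_vector_add(executor, a, b, c);  // hypothetical helper
  //   fut.get();
}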
@@ -236,23 +244,23 @@ build_task_graph(const size_t number_repetitions, const size_t number_tasks,
 * Polling usually has superior performance; however, it requires that the
 * polling is initialized at startup (or at least before the CUDA futures are
 * used). The CPPuddle executor pool also needs initializing, as we need to set it
- * to a specified number of executors (which CPPuddle cannot know having the
- * number_executors parameter). We will use the round_robin_pool_impl for
+ * to a specified number of executors (which CPPuddle cannot know without the
+ * number_gpu_executors parameter). We will use the round_robin_pool_impl for
 * simplicity. A priority_pool_impl is also available.
*/
-void init_executor_pool_and_polling(const size_t number_executors, const size_t gpu_id) {
+void init_executor_pool_and_polling(const size_t number_gpu_executors, const size_t gpu_id) {
assert(gpu_id == 0); // MultiGPU not used in this example
hpx::cuda::experimental::detail::register_polling(hpx::resource::get_thread_pool(0));
cppuddle::executor_recycling::executor_pool::init_executor_pool<
device_executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<device_executor_t>>(
-      gpu_id, number_executors, gpu_id, true);
+      gpu_id, number_gpu_executors, gpu_id, true);
}

/// Processes the CLI options via boost program_options to configure the example
bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
size_t &number_tasks, bool &in_order_repetitions,
-                         size_t &number_repetitions, size_t &number_executors,
+                         size_t &number_repetitions, size_t &number_gpu_executors,
size_t &max_queue_length) {
try {
boost::program_options::options_description desc{"Options"};
@@ -274,8 +282,8 @@ bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
        boost::program_options::value<size_t>(&number_repetitions)
            ->default_value(20),
        "Sets the number of repetitions")(
-       "number_executors",
-       boost::program_options::value<size_t>(&number_executors)
+       "number_gpu_executors",
+       boost::program_options::value<size_t>(&number_gpu_executors)
->default_value(32),
"Number of GPU executors in the pool")(
"max_queue_length_per_executor",
@@ -302,7 +310,7 @@ bool process_cli_options(int argc, char *argv[], size_t &entries_per_task,
<< " --tasks_per_repetition = " << number_tasks << std::endl
<< " --number_repetitions = " << number_repetitions << std::endl
<< " --in_order_repetitions = " << in_order_repetitions << std::endl
-            << " --number_executors = " << number_executors << std::endl
+            << " --number_gpu_executors = " << number_gpu_executors << std::endl
<< " --max_queue_length_per_executor = " << max_queue_length << std::endl
<< " --hpx:threads = " << hpx::get_os_thread_count()
<< std::endl << std::endl;
@@ -328,17 +336,17 @@ int hpx_main(int argc, char *argv[]) {
size_t number_repetitions = 20;
bool in_order_repetitions = false;
size_t max_queue_length = 5;
-  size_t number_executors = 1;
+  size_t number_gpu_executors = 1;
size_t gpu_id = 0;
if(!process_cli_options(argc, argv, entries_per_task, number_tasks,
in_order_repetitions, number_repetitions,
-                         number_executors, max_queue_length)) {
+                         number_gpu_executors, max_queue_length)) {
return hpx::finalize(); // problem with CLI parameters detected -> exiting..
}

// Init HPX CUDA polling + executor pool
hpx::cout << "Start initializing CUDA polling and executor pool..." << std::endl;
-  init_executor_pool_and_polling(number_executors, gpu_id);
+  init_executor_pool_and_polling(number_gpu_executors, gpu_id);
hpx::cout << "Init done!" << std::endl << std::endl;


Expand All @@ -361,6 +369,10 @@ int hpx_main(int argc, char *argv[]) {

// Finalize HPX (CPPuddle finalizes automatically)
hpx::cout << "Finalizing..." << std::endl;
+  // Deallocates everything CPPuddle manages and prevents further usage.
+  // Technically not required as long as static variables with CPPuddle-managed
+  // memory are not used; however, it does not hurt either.
+  cppuddle::memory_recycling::finalize();
hpx::cuda::experimental::detail::unregister_polling(
hpx::resource::get_thread_pool(0));
return hpx::finalize();