Skip to content

Commit

Permalink
Add in-order flag
Browse files Browse the repository at this point in the history
  • Loading branch information
G-071 committed May 27, 2024
1 parent b3d8d67 commit d4e026c
Showing 1 changed file with 97 additions and 15 deletions.
112 changes: 97 additions & 15 deletions examples/cuda_vector_add.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <hpx/include/iostreams.hpp>
#include <hpx/async_cuda/cuda_executor.hpp>

#include <boost/program_options.hpp>

#include <cppuddle/memory_recycling/std_recycling_allocators.hpp>
#include <cppuddle/memory_recycling/cuda_recycling_allocators.hpp>
Expand All @@ -25,6 +26,7 @@ constexpr size_t number_repetitions = 20;
constexpr size_t max_queue_length = 5;
constexpr size_t number_executors = 1;
constexpr size_t gpu_id = 0;
constexpr bool in_order_repetitions = true;

static_assert(vector_size % entries_per_task == 0);

Expand All @@ -35,6 +37,47 @@ __global__ void kernel_add(const float_t *input_a, const float_t *input_b, float
}

int hpx_main(int argc, char *argv[]) {

/* try { */
/* boost::program_options::options_description desc{"Options"}; */
/* desc.add_options()("help", "Help screen")( */
/* "elements_per_task", */
/* boost::program_options::value<size_t>(&array_size) */
/* ->default_value(5000000), */
/* "Size of the buffers")( */
/* "tasks_per_repetition", */
/* boost::program_options::value<size_t>(&number_futures) */
/* ->default_value(64), */
/* "Sets the number of futures to be (potentially) executed in parallel")( */
/* "number_repetitions", */
/* boost::program_options::value<size_t>(&passes)->default_value(200), */
/* "Sets the number of repetitions")( */
/* "outputfile", */
/* boost::program_options::value<std::string>(&filename)->default_value( */
/* ""), */
/* "Redirect stdout/stderr to this file"); */

/* boost::program_options::variables_map vm; */
/* boost::program_options::parsed_options options = */
/* parse_command_line(argc, argv, desc); */
/* boost::program_options::store(options, vm); */
/* boost::program_options::notify(vm); */

/* if (vm.count("help") == 0u) { */
/* std::cout << "Running with parameters:" << std::endl */
/* << " --arraysize = " << array_size << std::endl */
/* << " --futures = " << number_futures << std::endl */
/* << " --passes = " << passes << std::endl */
/* << " --hpx:threads = " << hpx::get_os_thread_count() */
/* << std::endl; */
/* } else { */
/* std::cout << desc << std::endl; */
/* return hpx::finalize(); */
/* } */
/* } catch (const boost::program_options::error &ex) { */
/* std::cerr << "CLI argument problem found: " << ex.what() << '\n'; */
/* } */

// HPX and CPPuddle Setup for executor (polling + pool init)
// =========================================== 0.a Init HPX CUDA polling
hpx::cout << "Start initializing CUDA polling and executor pool..." << std::endl;
Expand All @@ -52,12 +95,15 @@ int hpx_main(int argc, char *argv[]) {
// Launch tasks
// Note: Repetitions may be out of order since they do not depend on each other in this toy sample
hpx::cout << "Start launching tasks..." << std::endl;

hpx::shared_future<void> previous_iteration_fut = hpx::make_ready_future<void>();
std::vector<hpx::future<void>> repetition_futs(number_repetitions);
for (size_t repetition = 0; repetition < number_repetitions; repetition++) {
std::vector<hpx::future<void>> futs(number_tasks);
for (size_t task_id = 0; task_id < number_tasks; task_id++) {
futs[task_id] = hpx::async([task_id, &number_cpu_kernel_launches,
&number_gpu_kernel_launches]() {
auto gpu_task_lambda = [](const auto task_id,
auto &number_cpu_kernel_launches,
auto &number_gpu_kernel_launches) {
// Inner Task Setup to launch the CUDA kernels:
// ===========================================

Expand Down Expand Up @@ -142,27 +188,63 @@ int hpx_main(int argc, char *argv[]) {

// Inner Task Done!
// ===========================================
});
};
if (in_order_repetitions) {
futs[task_id] = previous_iteration_fut.then([task_id, &number_cpu_kernel_launches,
&number_gpu_kernel_launches, gpu_task_lambda](auto && fut) {
gpu_task_lambda(task_id, number_cpu_kernel_launches,
number_gpu_kernel_launches);
});
} else {
futs[task_id] =
hpx::async([task_id, &number_cpu_kernel_launches,
&number_gpu_kernel_launches, gpu_task_lambda]() {
gpu_task_lambda(task_id, number_cpu_kernel_launches,
number_gpu_kernel_launches);
});
}
}
// Schedule output task to run once a repetition is done
auto repetition_finished = hpx::when_all(futs);
repetition_futs.emplace_back(repetition_finished.then([repetition](auto &&fut) {
if (in_order_repetitions) {
previous_iteration_fut =
repetition_finished.then([repetition](auto &&fut) {
hpx::cout << "Repetition " << repetition << " done!" << std::endl;
});
} else {
repetition_futs.emplace_back(
repetition_finished.then([repetition](auto &&fut) {
hpx::cout << "Repetition " << repetition << " done!" << std::endl;
}));
}
}
hpx::cout << "All tasks launched asynchronously!" << std::endl << std::endl;
// Schedule output task to run once all other tasks are done
auto all_done_fut =
hpx::when_all(repetition_futs)
.then([&number_cpu_kernel_launches,
&number_gpu_kernel_launches](auto &&fut) {
hpx::cout << "All tasks are done!" << std::endl;
hpx::cout << " => " << number_gpu_kernel_launches
<< " kernels were run on the GPU" << std::endl;
hpx::cout << " => " << number_cpu_kernel_launches
<< " kernels were using the CPU fallback" << std::endl << std::endl;
});
all_done_fut.get();
if (in_order_repetitions) {
previous_iteration_fut
.then([&number_cpu_kernel_launches,
&number_gpu_kernel_launches](auto &&fut) {
hpx::cout << "All tasks are done! [in-order repetitions version]" << std::endl;
hpx::cout << " => " << number_gpu_kernel_launches
<< " kernels were run on the GPU" << std::endl;
hpx::cout << " => " << number_cpu_kernel_launches
<< " kernels were using the CPU fallback" << std::endl
<< std::endl;
})
.get();
} else {
hpx::when_all(repetition_futs)
.then([&number_cpu_kernel_launches,
&number_gpu_kernel_launches](auto &&fut) {
hpx::cout << "All tasks are done! [out-of-order repetitions version]" << std::endl;
hpx::cout << " => " << number_gpu_kernel_launches
<< " kernels were run on the GPU" << std::endl;
hpx::cout << " => " << number_cpu_kernel_launches
<< " kernels were using the CPU fallback" << std::endl
<< std::endl;
})
.get();
}

hpx::cuda::experimental::detail::unregister_polling(hpx::resource::get_thread_pool(0));
hpx::cout << "Finalizing..." << std::endl;
Expand Down

0 comments on commit d4e026c

Please sign in to comment.