From 0c32a8377140a4eaac13e1e54718014a530ed300 Mon Sep 17 00:00:00 2001 From: HJA Bird Date: Wed, 3 Jul 2019 23:38:55 +0100 Subject: [PATCH] Made radix sort CPU parallel Radix sorting is now GPU parallel. For a test case starting at 4800 particles and ending with 38000 particles over 100 steps, parallelisation reduced meshing time from 20% of sim time to 10%. Also removed original radix-1 sort. CPU parallel radix-8 is now only option. --- CMakeLists.txt | 17 +++--- src/P3D.c | 2 +- src/sorting.c | 139 ++++++++++++++++++++++++------------------------- src/sorting.h | 7 +-- 4 files changed, 78 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c291a48..a93781d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,14 +11,6 @@ set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -if(USE_OPENMP) - find_package(OpenMP) - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - endif() -endif(USE_OPENMP) - set_property(GLOBAL PROPERTY USE_FOLDERS ON) file (GLOB CVORTEX_INCLUDE "include/cvortex/libcvtx.h") # So shoot me for GLOBing. file (GLOB CVORTEX_SOURCE "src/*.[ch]") @@ -41,6 +33,15 @@ endif(BUILD_STATIC_LIBRARY) # We don't want warnings... target_compile_definitions(cvortex PRIVATE _CRT_SECURE_NO_WARNINGS) +if(USE_OPENMP) + find_package(OpenMP) + if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + target_compile_definitions(cvortex PRIVATE CVTX_USING_OPENMP) + endif() +endif(USE_OPENMP) + if(USE_OPENCL) find_package(OpenCL REQUIRED) target_link_libraries(cvortex PRIVATE ${OpenCL_LIBRARIES}) diff --git a/src/P3D.c b/src/P3D.c index e64c983..6ebe18a 100644 --- a/src/P3D.c +++ b/src/P3D.c @@ -446,7 +446,7 @@ CVTX_EXPORT int cvtx_P3D_redistribute_on_grid( /* Now merge our new particles */ /* qsort_r(nidx_array, n_input_particles * ppop, sizeof(unsigned int), comp_Gridkey3D_by_idx, nkey_array); */ - sort_uintkey_by_uivar_radix_p((char*)nkey_array, + sort_uintkey_by_uivar_radix((char*)nkey_array, sizeof(struct Gridkey3D), nidx_array, n_input_particles* ppop); nnkey_array = malloc(sizeof(struct Gridkey3D) * n_input_particles * ppop); diff --git a/src/sorting.c b/src/sorting.c index 609eeba..5e850a9 100644 --- a/src/sorting.c +++ b/src/sorting.c @@ -28,66 +28,11 @@ SOFTWARE. #include #include -#define CACHELINE_SIZE 64 /* Bytes */ +#ifdef CVTX_USING_OPENMP +# include +#endif void sort_uintkey_by_uivar_radix( - unsigned char *ui_start, size_t uibytes, - unsigned int *key_start, size_t num_items) { - - assert(uibytes > 0); - assert(num_items >= 0); - assert(ui_start != NULL); - assert(key_start != NULL); - - /* Buffer (buffer) swaps with input array - as working array (wa) or output array (oa) */ - unsigned int *buffer = NULL, *wa=NULL, *oa=NULL; - /* Bit = 0/1 counts and offsets*/ - unsigned int bc0 = 0, bc1 = 0, bc0o = 0, bc1o = 0; - /* swap = what are we writing the result of this iter into? - True: Work from buffer into key_start - False: Work from key_start into buffer - */ - unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items; - size_t i, j; - buffer = malloc(num_items * uibytes); - - for (byte = 0; byte < uibytes; ++byte) { - for (bit = 0; bit < 8; ++bit) { - bc1 = bc0 = 0; - wa = swap ? buffer : key_start; - oa = swap ? key_start : buffer; - unsigned char mask = 0x1 << bit; - for (i = 0; i < uini; ++i) { - j = wa[i] * uibytes + byte; - ui_start[j] & mask ? ++bc1 : ++bc0; - } - assert(bc0 + bc1 == uini); - bc1o = bc0o = 0; - for (i = 0; i < uini; ++i) { - j = wa[i] * uibytes + byte; - if (ui_start[j] & mask) { - oa[bc1o + bc0] = wa[i]; - ++bc1o; - } - else { - oa[bc0o] = wa[i]; - ++bc0o; - } - } - swap = swap ? 0 : 1; - } - } - /* If we wrote our solution into the buffer, we need to copy - it back. */ - if (!swap) { - memcpy(key_start, buffer, num_items); - } - free(buffer); - return; -} - -void sort_uintkey_by_uivar_radix_p( unsigned char* ui_start, size_t uibytes, unsigned int* key_start, size_t num_items) { @@ -100,30 +45,77 @@ void sort_uintkey_by_uivar_radix_p( as working array (wa) or output array (oa) */ unsigned int *buffer = NULL, *wa = NULL, *oa = NULL; unsigned int n_para = 0x1 << sizeof(char) * 8; +#ifdef CVTX_USING_OPENMP + unsigned int nthreads = omp_get_num_procs(); + omp_set_dynamic(0); + omp_set_num_threads(nthreads); +#else + unsigned int nthreads = 1; +#endif unsigned int *counts, *offsets, info_size = sizeof(unsigned int) * n_para; /* swap = what are we writing the result of this iter into? */ unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items; buffer = malloc(num_items * uibytes); - counts = malloc(info_size); - offsets = malloc(info_size); + counts = malloc(info_size * nthreads); + offsets = malloc(info_size * nthreads); for (byte = 0; byte < uibytes; ++byte) { - memset(counts, 0, info_size); - memset(offsets, 0, info_size); + memset(counts, 0, info_size * nthreads); + memset(offsets, 0, info_size * nthreads); wa = swap ? buffer : key_start; oa = swap ? key_start : buffer; - size_t i = 0, j = 0; - for (i = 0; i < uini; ++i) { - counts[ui_start[wa[i] * uibytes + byte]]++; + size_t i = 0, j = 0, m = 0, n = 0; + + /* Counting pass */ +#pragma omp parallel for private(m, n, j) + for (i = 0; i < nthreads; ++i) { +#ifdef CVTX_USING_OPENMP + unsigned int threadid = omp_get_thread_num(); +#else + unsigned int threadid = 0; +#endif + m = (uini / nthreads) * threadid; + n = threadid == nthreads - 1 ? + uini : (uini / nthreads) * (threadid + 1); + unsigned int k; + for (j = m; j < n; ++j) { + k = ui_start[wa[j] * uibytes + byte]; + counts[k + n_para * threadid]++; + } } - for (i = 0; i < n_para-1; ++i) { - offsets[i + 1] = offsets[i] + counts[i]; - assert(offsets[i + 1] >= offsets[i]); + /* Compute offsets */ +#pragma omp parallel for private(n, j, i) + for (m = 0; m < nthreads; ++m) { + for (n = 0; n < n_para; ++n) { + for (j = 0; j < nthreads; ++j) { + for (i = 0; i < n; ++i) { + offsets[n + m * n_para] += + counts[i + j * n_para]; + } + } + for (j = 0; j < m; ++j) { + offsets[n + m * n_para] += + counts[n + j * n_para]; + } + } } - for (i = 0; i < uini; ++i) { - j = ui_start[wa[i] * uibytes + byte]; - oa[offsets[j]] = wa[i]; - ++offsets[j]; + /* Reorder pass*/ +#pragma omp parallel for private(m, n, j) + for (i = 0; i < nthreads; ++i) { +#ifdef CVTX_USING_OPENMP + unsigned int threadid = omp_get_thread_num(); +#else + unsigned int threadid = 0; +#endif + m = (uini / nthreads) * threadid; + n = threadid == nthreads - 1 ? + uini : (uini / nthreads) * (threadid + 1); + for (j = m; j < n; ++j) { + unsigned int k = ui_start[wa[j] * uibytes + byte] + + n_para * threadid; + oa[offsets[k]] = wa[j]; + ++offsets[k]; + } } swap = swap ? 0 : 1; } @@ -135,5 +127,8 @@ void sort_uintkey_by_uivar_radix_p( free(buffer); free(counts); free(offsets); +#ifdef CVTX_USING_OPENMP + omp_set_dynamic(1); +#endif return; } diff --git a/src/sorting.h b/src/sorting.h index f89092f..aa4ceb8 100644 --- a/src/sorting.h +++ b/src/sorting.h @@ -42,7 +42,7 @@ END: UI = [3, 2, 6, 4] KEYS = [1, 0, 3, 2] UI is assumed to be array of unsigned integers of uibytes size. -Uses radix sorting method. +Uses parallel radix-8 sorting method. ui_start is an array of uibytes * num_items bytes long. key_start is an array of unsigned ints num_items long. @@ -51,9 +51,4 @@ void sort_uintkey_by_uivar_radix( unsigned char* ui_start, size_t uibytes, unsigned int* key_start, size_t num_items); -/* Parallel variant of sort_uintkey_by_uivar_radix(..) */ -void sort_uintkey_by_uivar_radix_p( - unsigned char* ui_start, size_t uibytes, - unsigned int* key_start, size_t num_items); - #endif