Skip to content

Commit

Permalink
Made radix sort CPU parallel
Browse files Browse the repository at this point in the history
Radix sorting is now GPU parallel.
For a test case starting at 4800 particles and ending
with 38000 particles over 100 steps, parallelisation
reduced meshing time from 20% of sim time to 10%.

Also removed original radix-1 sort. CPU parallel radix-8
is now only option.
  • Loading branch information
hjabird committed Jul 3, 2019
1 parent cc3a832 commit 0c32a83
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 87 deletions.
17 changes: 9 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,6 @@ set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

if(USE_OPENMP)
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
endif(USE_OPENMP)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)
file (GLOB CVORTEX_INCLUDE "include/cvortex/libcvtx.h") # So shoot me for GLOBing.
file (GLOB CVORTEX_SOURCE "src/*.[ch]")
Expand All @@ -41,6 +33,15 @@ endif(BUILD_STATIC_LIBRARY)
# We don't want warnings...
target_compile_definitions(cvortex PRIVATE _CRT_SECURE_NO_WARNINGS)

if(USE_OPENMP)
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
target_compile_definitions(cvortex PRIVATE CVTX_USING_OPENMP)
endif()
endif(USE_OPENMP)

if(USE_OPENCL)
find_package(OpenCL REQUIRED)
target_link_libraries(cvortex PRIVATE ${OpenCL_LIBRARIES})
Expand Down
2 changes: 1 addition & 1 deletion src/P3D.c
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ CVTX_EXPORT int cvtx_P3D_redistribute_on_grid(
/* Now merge our new particles */
/* qsort_r(nidx_array, n_input_particles * ppop, sizeof(unsigned int),
comp_Gridkey3D_by_idx, nkey_array); */
sort_uintkey_by_uivar_radix_p((char*)nkey_array,
sort_uintkey_by_uivar_radix((char*)nkey_array,
sizeof(struct Gridkey3D), nidx_array, n_input_particles* ppop);

nnkey_array = malloc(sizeof(struct Gridkey3D) * n_input_particles * ppop);
Expand Down
139 changes: 67 additions & 72 deletions src/sorting.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,66 +28,11 @@ SOFTWARE.
#include <stdlib.h>
#include <string.h>

#define CACHELINE_SIZE 64 /* Bytes */
#ifdef CVTX_USING_OPENMP
# include <omp.h>
#endif

void sort_uintkey_by_uivar_radix(
unsigned char *ui_start, size_t uibytes,
unsigned int *key_start, size_t num_items) {

assert(uibytes > 0);
assert(num_items >= 0);
assert(ui_start != NULL);
assert(key_start != NULL);

/* Buffer (buffer) swaps with input array
as working array (wa) or output array (oa) */
unsigned int *buffer = NULL, *wa=NULL, *oa=NULL;
/* Bit = 0/1 counts and offsets*/
unsigned int bc0 = 0, bc1 = 0, bc0o = 0, bc1o = 0;
/* swap = what are we writing the result of this iter into?
True: Work from buffer into key_start
False: Work from key_start into buffer
*/
unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items;
size_t i, j;
buffer = malloc(num_items * uibytes);

for (byte = 0; byte < uibytes; ++byte) {
for (bit = 0; bit < 8; ++bit) {
bc1 = bc0 = 0;
wa = swap ? buffer : key_start;
oa = swap ? key_start : buffer;
unsigned char mask = 0x1 << bit;
for (i = 0; i < uini; ++i) {
j = wa[i] * uibytes + byte;
ui_start[j] & mask ? ++bc1 : ++bc0;
}
assert(bc0 + bc1 == uini);
bc1o = bc0o = 0;
for (i = 0; i < uini; ++i) {
j = wa[i] * uibytes + byte;
if (ui_start[j] & mask) {
oa[bc1o + bc0] = wa[i];
++bc1o;
}
else {
oa[bc0o] = wa[i];
++bc0o;
}
}
swap = swap ? 0 : 1;
}
}
/* If we wrote our solution into the buffer, we need to copy
it back. */
if (!swap) {
memcpy(key_start, buffer, num_items);
}
free(buffer);
return;
}

void sort_uintkey_by_uivar_radix_p(
unsigned char* ui_start, size_t uibytes,
unsigned int* key_start, size_t num_items) {

Expand All @@ -100,30 +45,77 @@ void sort_uintkey_by_uivar_radix_p(
as working array (wa) or output array (oa) */
unsigned int *buffer = NULL, *wa = NULL, *oa = NULL;
unsigned int n_para = 0x1 << sizeof(char) * 8;
#ifdef CVTX_USING_OPENMP
unsigned int nthreads = omp_get_num_procs();
omp_set_dynamic(0);
omp_set_num_threads(nthreads);
#else
unsigned int nthreads = 1;
#endif
unsigned int *counts, *offsets, info_size = sizeof(unsigned int) * n_para;
/* swap = what are we writing the result of this iter into? */
unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items;
buffer = malloc(num_items * uibytes);
counts = malloc(info_size);
offsets = malloc(info_size);
counts = malloc(info_size * nthreads);
offsets = malloc(info_size * nthreads);

for (byte = 0; byte < uibytes; ++byte) {
memset(counts, 0, info_size);
memset(offsets, 0, info_size);
memset(counts, 0, info_size * nthreads);
memset(offsets, 0, info_size * nthreads);
wa = swap ? buffer : key_start;
oa = swap ? key_start : buffer;
size_t i = 0, j = 0;
for (i = 0; i < uini; ++i) {
counts[ui_start[wa[i] * uibytes + byte]]++;
size_t i = 0, j = 0, m = 0, n = 0;

/* Counting pass */
#pragma omp parallel for private(m, n, j)
for (i = 0; i < nthreads; ++i) {
#ifdef CVTX_USING_OPENMP
unsigned int threadid = omp_get_thread_num();
#else
unsigned int threadid = 0;
#endif
m = (uini / nthreads) * threadid;
n = threadid == nthreads - 1 ?
uini : (uini / nthreads) * (threadid + 1);
unsigned int k;
for (j = m; j < n; ++j) {
k = ui_start[wa[j] * uibytes + byte];
counts[k + n_para * threadid]++;
}
}
for (i = 0; i < n_para-1; ++i) {
offsets[i + 1] = offsets[i] + counts[i];
assert(offsets[i + 1] >= offsets[i]);
/* Compute offsets */
#pragma omp parallel for private(n, j, i)
for (m = 0; m < nthreads; ++m) {
for (n = 0; n < n_para; ++n) {
for (j = 0; j < nthreads; ++j) {
for (i = 0; i < n; ++i) {
offsets[n + m * n_para] +=
counts[i + j * n_para];
}
}
for (j = 0; j < m; ++j) {
offsets[n + m * n_para] +=
counts[n + j * n_para];
}
}
}
for (i = 0; i < uini; ++i) {
j = ui_start[wa[i] * uibytes + byte];
oa[offsets[j]] = wa[i];
++offsets[j];
/* Reorder pass*/
#pragma omp parallel for private(m, n, j)
for (i = 0; i < nthreads; ++i) {
#ifdef CVTX_USING_OPENMP
unsigned int threadid = omp_get_thread_num();
#else
unsigned int threadid = 0;
#endif
m = (uini / nthreads) * threadid;
n = threadid == nthreads - 1 ?
uini : (uini / nthreads) * (threadid + 1);
for (j = m; j < n; ++j) {
unsigned int k = ui_start[wa[j] * uibytes + byte]
+ n_para * threadid;
oa[offsets[k]] = wa[j];
++offsets[k];
}
}
swap = swap ? 0 : 1;
}
Expand All @@ -135,5 +127,8 @@ void sort_uintkey_by_uivar_radix_p(
free(buffer);
free(counts);
free(offsets);
#ifdef CVTX_USING_OPENMP
omp_set_dynamic(1);
#endif
return;
}
7 changes: 1 addition & 6 deletions src/sorting.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ END: UI = [3, 2, 6, 4]
KEYS = [1, 0, 3, 2]
UI is assumed to be array of unsigned integers of
uibytes size.
Uses radix sorting method.
Uses parallel radix-8 sorting method.
ui_start is an array of uibytes * num_items bytes long.
key_start is an array of unsigned ints num_items long.
Expand All @@ -51,9 +51,4 @@ void sort_uintkey_by_uivar_radix(
unsigned char* ui_start, size_t uibytes,
unsigned int* key_start, size_t num_items);

/* Parallel variant of sort_uintkey_by_uivar_radix(..) */
void sort_uintkey_by_uivar_radix_p(
unsigned char* ui_start, size_t uibytes,
unsigned int* key_start, size_t num_items);

#endif

0 comments on commit 0c32a83

Please sign in to comment.