From 0c32a8377140a4eaac13e1e54718014a530ed300 Mon Sep 17 00:00:00 2001
From: HJA Bird <h.bird.1@research.gla.ac.uk>
Date: Wed, 3 Jul 2019 23:38:55 +0100
Subject: [PATCH] Made radix sort CPU parallel

Radix sorting is now GPU parallel.
For a test case starting at 4800 particles and ending
with 38000 particles over 100 steps, parallelisation
reduced meshing time from 20% of sim time to 10%.

Also removed original radix-1 sort. CPU parallel radix-8
is now only option.
---
 CMakeLists.txt |  17 +++---
 src/P3D.c      |   2 +-
 src/sorting.c  | 139 ++++++++++++++++++++++++-------------------------
 src/sorting.h  |   7 +--
 4 files changed, 78 insertions(+), 87 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c291a48..a93781d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,14 +11,6 @@ set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 
-if(USE_OPENMP)
-    find_package(OpenMP)
-    if (OPENMP_FOUND)
-        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-    endif()
-endif(USE_OPENMP)
-
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 file (GLOB CVORTEX_INCLUDE "include/cvortex/libcvtx.h") # So shoot me for GLOBing.
 file (GLOB CVORTEX_SOURCE  "src/*.[ch]")
@@ -41,6 +33,15 @@ endif(BUILD_STATIC_LIBRARY)
 # We don't want warnings...
 target_compile_definitions(cvortex PRIVATE _CRT_SECURE_NO_WARNINGS)
 
+if(USE_OPENMP)
+    find_package(OpenMP)
+    if (OPENMP_FOUND)
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+		target_compile_definitions(cvortex PRIVATE CVTX_USING_OPENMP)
+    endif()
+endif(USE_OPENMP)
+
 if(USE_OPENCL)
     find_package(OpenCL REQUIRED)
     target_link_libraries(cvortex PRIVATE ${OpenCL_LIBRARIES})
diff --git a/src/P3D.c b/src/P3D.c
index e64c983..6ebe18a 100644
--- a/src/P3D.c
+++ b/src/P3D.c
@@ -446,7 +446,7 @@ CVTX_EXPORT int cvtx_P3D_redistribute_on_grid(
 	/* Now merge our new particles */
 	/* qsort_r(nidx_array, n_input_particles * ppop, sizeof(unsigned int),
 		comp_Gridkey3D_by_idx, nkey_array); */
-	sort_uintkey_by_uivar_radix_p((char*)nkey_array,
+	sort_uintkey_by_uivar_radix((char*)nkey_array,
 		sizeof(struct Gridkey3D), nidx_array, n_input_particles* ppop);
 
 	nnkey_array = malloc(sizeof(struct Gridkey3D) * n_input_particles * ppop);
diff --git a/src/sorting.c b/src/sorting.c
index 609eeba..5e850a9 100644
--- a/src/sorting.c
+++ b/src/sorting.c
@@ -28,66 +28,11 @@ SOFTWARE.
 #include <stdlib.h>
 #include <string.h>
 
-#define CACHELINE_SIZE 64	/* Bytes */
+#ifdef CVTX_USING_OPENMP
+#	include <omp.h>
+#endif
 
 void sort_uintkey_by_uivar_radix(
-	unsigned char *ui_start, size_t uibytes,
-	unsigned int *key_start, size_t num_items) {
-
-	assert(uibytes > 0);
-	assert(num_items >= 0);
-	assert(ui_start != NULL);
-	assert(key_start != NULL);
-
-	/* Buffer (buffer) swaps with input array
-	as working array (wa) or output array (oa) */
-	unsigned int *buffer = NULL, *wa=NULL, *oa=NULL;
-	/* Bit = 0/1 counts and offsets*/
-	unsigned int bc0 = 0, bc1 = 0, bc0o = 0, bc1o = 0;
-	/* swap = what are we writing the result of this iter into?
-	True:	Work from buffer into key_start
-	False:	Work from key_start into buffer
-	*/
-	unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items;
-	size_t i, j;
-	buffer = malloc(num_items * uibytes);
-
-	for (byte = 0; byte < uibytes; ++byte) {
-		for (bit = 0; bit < 8; ++bit) {
-			bc1 = bc0 = 0;
-			wa = swap ? buffer : key_start;
-			oa = swap ? key_start : buffer;
-			unsigned char mask = 0x1 << bit;
-			for (i = 0; i < uini; ++i) {
-				j = wa[i] * uibytes + byte;
-				ui_start[j] & mask ? ++bc1 : ++bc0;
-			}
-			assert(bc0 + bc1 == uini);
-			bc1o = bc0o = 0;
-			for (i = 0; i < uini; ++i) {
-				j = wa[i] * uibytes + byte;
-				if (ui_start[j] & mask) {
-					oa[bc1o + bc0] = wa[i];
-					++bc1o;
-				}
-				else {
-					oa[bc0o] = wa[i];
-					++bc0o;
-				}
-			}
-			swap = swap ? 0 : 1;
-		}
-	}
-	/* If we wrote our solution into the buffer, we need to copy
-	it back. */
-	if (!swap) {
-		memcpy(key_start, buffer, num_items);
-	}
-	free(buffer);
-	return;
-}
-
-void sort_uintkey_by_uivar_radix_p(
 	unsigned char* ui_start, size_t uibytes,
 	unsigned int* key_start, size_t num_items) {
 
@@ -100,30 +45,77 @@ void sort_uintkey_by_uivar_radix_p(
 	as working array (wa) or output array (oa) */
 	unsigned int *buffer = NULL, *wa = NULL, *oa = NULL;
 	unsigned int n_para = 0x1 << sizeof(char) * 8;
+#ifdef CVTX_USING_OPENMP
+	unsigned int nthreads = omp_get_num_procs();
+	omp_set_dynamic(0);
+	omp_set_num_threads(nthreads);
+#else
+	unsigned int nthreads = 1;
+#endif
 	unsigned int *counts, *offsets, info_size = sizeof(unsigned int) * n_para;
 	/* swap = what are we writing the result of this iter into? */
 	unsigned int bit = 0, byte = 0, swap = 0, uini = (unsigned int)num_items;
 	buffer = malloc(num_items * uibytes);
-	counts = malloc(info_size);
-	offsets = malloc(info_size);
+	counts = malloc(info_size * nthreads);
+	offsets = malloc(info_size * nthreads);
 
 	for (byte = 0; byte < uibytes; ++byte) {
-		memset(counts, 0, info_size);
-		memset(offsets, 0, info_size);
+		memset(counts, 0, info_size * nthreads);
+		memset(offsets, 0, info_size * nthreads);
 		wa = swap ? buffer : key_start;
 		oa = swap ? key_start : buffer;
-		size_t i = 0, j = 0;
-		for (i = 0; i < uini; ++i) {
-			counts[ui_start[wa[i] * uibytes + byte]]++;
+		size_t i = 0, j = 0, m = 0, n = 0;
+
+		/* Counting pass */
+#pragma omp parallel for private(m, n, j)
+		for (i = 0; i < nthreads; ++i) {
+#ifdef CVTX_USING_OPENMP
+			unsigned int threadid = omp_get_thread_num();
+#else
+			unsigned int threadid = 0;
+#endif
+			m = (uini / nthreads) * threadid;
+			n = threadid == nthreads - 1 ? 
+				uini : (uini / nthreads) * (threadid + 1);
+			unsigned int k;
+			for (j = m; j < n; ++j) {
+				k = ui_start[wa[j] * uibytes + byte];
+				counts[k + n_para * threadid]++;
+			}
 		}
-		for (i = 0; i < n_para-1; ++i) {
-			offsets[i + 1] = offsets[i] + counts[i];
-			assert(offsets[i + 1] >= offsets[i]);
+		/* Compute offsets */
+#pragma omp parallel for private(n, j, i)
+		for (m = 0; m < nthreads; ++m) {
+			for (n = 0; n < n_para; ++n) {
+				for (j = 0; j < nthreads; ++j) {
+					for (i = 0; i < n; ++i) {
+						offsets[n + m * n_para] +=
+							counts[i + j * n_para];
+					}
+				}
+				for (j = 0; j < m; ++j) {
+					offsets[n + m * n_para] +=
+						counts[n + j * n_para];
+				}
+			}
 		}
-		for (i = 0; i < uini; ++i) {
-			j = ui_start[wa[i] * uibytes + byte];
-			oa[offsets[j]] = wa[i];
-			++offsets[j];
+		/* Reorder pass*/
+#pragma omp parallel for private(m, n, j)
+		for (i = 0; i < nthreads; ++i) {
+#ifdef CVTX_USING_OPENMP
+			unsigned int threadid = omp_get_thread_num();
+#else
+			unsigned int threadid = 0;
+#endif
+			m = (uini / nthreads) * threadid;
+			n = threadid == nthreads - 1 ?
+				uini : (uini / nthreads) * (threadid + 1);
+			for (j = m; j < n; ++j) {
+				unsigned int k = ui_start[wa[j] * uibytes + byte] 
+					+ n_para * threadid;
+				oa[offsets[k]] = wa[j];
+				++offsets[k];
+			} 
 		}
 		swap = swap ? 0 : 1;
 	}
@@ -135,5 +127,8 @@ void sort_uintkey_by_uivar_radix_p(
 	free(buffer);
 	free(counts);
 	free(offsets);
+#ifdef CVTX_USING_OPENMP
+	omp_set_dynamic(1);
+#endif
 	return;
 }
diff --git a/src/sorting.h b/src/sorting.h
index f89092f..aa4ceb8 100644
--- a/src/sorting.h
+++ b/src/sorting.h
@@ -42,7 +42,7 @@ END:	UI		= [3, 2, 6, 4]
 		KEYS	= [1, 0, 3, 2]
 UI is assumed to be array of unsigned integers of
 uibytes size.
-Uses radix sorting method.
+Uses parallel radix-8 sorting method.
 
 ui_start is an array of uibytes * num_items bytes long.
 key_start is an array of unsigned ints num_items long.
@@ -51,9 +51,4 @@ void sort_uintkey_by_uivar_radix(
 	unsigned char* ui_start, size_t uibytes,
 	unsigned int* key_start, size_t num_items);
 
-/* Parallel variant of sort_uintkey_by_uivar_radix(..) */
-void sort_uintkey_by_uivar_radix_p(
-	unsigned char* ui_start, size_t uibytes,
-	unsigned int* key_start, size_t num_items);
-
 #endif