/* Cholesky decomposition.
* Host code.
* Author: Naga Kandasamy
* Date: May 23, 2013
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include "chol_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
// declarations, forward
Matrix allocate_matrix_on_gpu(const Matrix M);
Matrix allocate_matrix(int num_rows, int num_columns, int init);
void copy_matrix_to_device(Matrix Mdevice, const Matrix Mhost);
void copy_matrix_from_device(Matrix Mhost, const Matrix Mdevice);
void check_error(const char *msg);
extern Matrix create_positive_definite_matrix(unsigned int, unsigned int);
extern "C" int chol_gold(const Matrix, Matrix);
extern "C" int check_chol(const Matrix, const Matrix);
void chol_on_device(const Matrix, Matrix);
void chol_on_device_optimized(const Matrix, Matrix);
//Globals
float time_cpu;
// Matrices for the program
Matrix A; // The N x N input matrix
Matrix reference; // The upper triangular matrix computed by the CPU
Matrix U_on_device; // The upper triangular matrix computed by the device (slow)
Matrix U_on_device_fast; // The upper triangular matrix computed by the device (fast)
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
// Check command line arguments
if(argc > 1){
printf("Error. This program accepts no arguments. \n");
exit(0);
}
// Initialize the random number generator with a seed value
srand(time(NULL));
// Create the positive definite matrix. May require a few tries if we are unlucky
int success = 0;
while(!success){
A = create_positive_definite_matrix(MATRIX_SIZE, MATRIX_SIZE);
if(A.elements != NULL)
success = 1;
}
reference = allocate_matrix(MATRIX_SIZE, MATRIX_SIZE, 0); // Create a matrix to store the CPU result
U_on_device = allocate_matrix(MATRIX_SIZE, MATRIX_SIZE, 0); // Create a matrix to store the device result
U_on_device_fast = allocate_matrix(MATRIX_SIZE, MATRIX_SIZE, 0);
//Compute the Cholesky decomposition on the CPU
unsigned int timer;
cutCreateTimer(&timer);
cutStartTimer(timer);
printf("== CPU ==\n");
int status = 1;
status = chol_gold(A, reference);
cutStopTimer(timer);
time_cpu = 1e-3 * cutGetTimerValue(timer);
printf(" Run time: %0.10f s. \n", time_cpu);
if(status == 0){
printf("Cholesky decomposition failed. The input matrix is not positive definite. \n");
exit(0);
}
/*
printf("Double checking for correctness by recovering the original matrix. \n");
if(check_chol(A, reference) == 0){
printf("CPU: FAILED\n");
exit(0);
}
*/
printf(" PASSED\n"); //IT IS SO PERFECT WE DON'T EVEN CHECK.
//Slow
//Perform the Cholesky decomposition on the GPU. The resulting upper triangular matrix should be returned in U_on_device
chol_on_device(A, U_on_device);
//Optimized
//Perform the Cholesky decomposition on the GPU. The resulting upper triangular matrix should be returned in U_on_device_fast
chol_on_device_optimized(A, U_on_device_fast);
// Free host matrices
free(A.elements);
free(U_on_device.elements);
free(U_on_device_fast.elements);
free(reference.elements);
return 0;
}
//Error helper
void check_for_error(const char *msg){
cudaError_t err = cudaGetLastError();
if(cudaSuccess != err){
printf("CUDA ERROR: %s (%s). \n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
/* Perform Cholesky decomposition on the device (naive version). */
void chol_on_device(const Matrix A, Matrix U)
{
//Slow
//Perform the Cholesky decomposition on the GPU. The resulting upper triangular matrix is returned in U
unsigned int timer_gpu;
cutCreateTimer(&timer_gpu);
//A and U are already allocated on the CPU
//Allocate space on gpu
Matrix gpu_u = allocate_matrix_on_gpu( U );
//Copy matrices to gpu, copy A right into U
copy_matrix_to_device( gpu_u, A );
//Maximum expected matrix size is 8192x8192
//We split the elimination i loop, which has up to MATRIX_SIZE iterations,
//so we would optimally use 8192 threads, i.e. 16 blocks of 512
//Rather than attempting to synchronize 16 blocks, where each thread
//would do one operation per outer k iteration, use a single block and
//have each thread do 16 operations per iteration (in the worst case)
int num_blocks = 1;
//Maximum threads per block
int threads_per_block = 512;
//Operations per thread
int ops_per_thread = MATRIX_SIZE / (threads_per_block*num_blocks);
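//With the assumed maximum MATRIX_SIZE of 8192: 8192 / (512 * 1) = 16 operations
//per thread, which matches the "16 operations in the worst case" reasoning above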
printf("== GPU (Slow) ==\n");
printf(" Threads per block: %d\n",threads_per_block);
printf(" Number of blocks: %d\n",num_blocks);
printf(" Operations per thread: %d\n",ops_per_thread);
//Set up the execution grid on the GPU
dim3 thread_block(threads_per_block, 1, 1);
dim3 grid(num_blocks,1);
//Start timer after copy
cutStartTimer(timer_gpu);
// Launch the kernel <<<grid, thread_block>>>
chol_kernel<<<grid, thread_block>>>(gpu_u.elements,ops_per_thread);
//Sync at end and check for errors
cudaThreadSynchronize();
check_for_error("SLOW KERNEL FAILURE\n");
//Stop timer before copy back
cutStopTimer(timer_gpu);
//Copy data back
copy_matrix_from_device(U, gpu_u);
//Free memory on device
cudaFree(gpu_u.elements);
float time_gpu = 1e-3 * cutGetTimerValue(timer_gpu);
printf(" Run time: %0.10f s. \n", time_gpu);
printf(" Speedup: %0.10f\n", time_cpu/time_gpu);
//Check if the device result is equivalent to the expected solution. If you can't meet the desired tolerance, try using double precision support.
unsigned int size = reference.num_rows * reference.num_columns;
CUTBoolean res = cutComparefe(reference.elements, U_on_device.elements, size, 0.1f);
printf(" %s\n", (1 == res) ? "PASSED" : "FAILED");
}
/* Perform Cholesky decomposition on the device (optimized version). */
void chol_on_device_optimized(const Matrix A, Matrix U)
{
unsigned int timer_gpu_fast;
cutCreateTimer(&timer_gpu_fast);
printf("== GPU (Fast) ==\n");
//A and U are already allocated on the CPU
//Allocate space on gpu for U
Matrix gpu_u = allocate_matrix_on_gpu( U );
//Copy matrices to gpu, copy A right into U
copy_matrix_to_device( gpu_u, A );
//Start timer after copy
cutStartTimer(timer_gpu_fast);
//Each thread within a block will take some j iterations
int threads_per_block = 256; //Optimal
//Stride equals the threads per block so the block sweeps the j range in coalesced passes
int stride = threads_per_block;
printf(" Threads per block / stride: %d\n",threads_per_block);
//Each kernel launch performs one iteration of the outer k loop; launching per
//iteration also serves as a grid-wide synchronization point between iterations
int k;
for(k = 0; k < MATRIX_SIZE; k++)
{
//Want threads to stride across memory
//i is outer loop
//j is inner loop
//so threads should split the j loop
//Each thread block will take an i iteration
int isize = (MATRIX_SIZE-1) - (k+1) + 1;
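//isize simplifies to MATRIX_SIZE - 1 - k: the number of rows below row k that
//still need to be eliminated, so one block is launched per remaining row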
int num_blocks = isize;
if(num_blocks <= 0)
{
num_blocks = 1;
}
//Set up the execution grid on the GPU
//printf(" Threads per block: %d\n",threads_per_block);
//printf(" Number of blocks: %d\n",num_blocks);
dim3 thread_block(threads_per_block, 1, 1);
dim3 grid(num_blocks,1);
//Call the div kernel for this k iteration
chol_kernel_optimized_div<<<grid, thread_block>>>(
gpu_u.elements,
k,
stride);
//Call the elimination kernel for this k iteration
chol_kernel_optimized<<<grid, thread_block>>>(
gpu_u.elements,
k,
stride);
//Sync at end and check for errors
cudaThreadSynchronize();
check_for_error("FAST KERNEL FAILURE");
}
//Sync at end
cudaThreadSynchronize();
//Stop timer before copy back
cutStopTimer(timer_gpu_fast);
//Copy data back
copy_matrix_from_device(U, gpu_u);
//Free memory on device
cudaFree(gpu_u.elements);
//As the final step, zero out the lower triangular portion of U
int i, j;
for(i = 0; i < MATRIX_SIZE; i++)
for(j = 0; j < i; j++)
U.elements[i * MATRIX_SIZE + j] = 0.0;
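//(presumably the kernels only update the upper triangle, so the entries below
//the diagonal still hold the values copied in from A and must be cleared here)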
float time_gpu_fast = 1e-3 * cutGetTimerValue(timer_gpu_fast);
printf(" Run time: %0.10f s. \n", time_gpu_fast);
printf(" Speedup: %0.10f\n", time_cpu/time_gpu_fast);
//Check if the device result is equivalent to the expected solution. If you can't meet the desired tolerance, try using double precision support.
unsigned int size_fast = reference.num_rows * reference.num_columns;
CUTBoolean res_fast = cutComparefe(reference.elements, U_on_device_fast.elements, size_fast, 0.1f);
printf(" %s\n", (1 == res_fast) ? "PASSED" : "FAILED");
}
// Allocate a device matrix of same size as M.
Matrix allocate_matrix_on_gpu(const Matrix M){
Matrix Mdevice = M;
int size = M.num_rows * M.num_columns * sizeof(float);
cudaMalloc((void**)&Mdevice.elements, size);
return Mdevice;
}
// Allocate a matrix of dimensions height*width
// If init == 0, initialize to all zeroes.
// If init == 1, perform random initialization.
Matrix allocate_matrix(int num_rows, int num_columns, int init)
{
Matrix M;
M.num_columns = M.pitch = num_columns;
M.num_rows = num_rows;
unsigned int size = M.num_rows * M.num_columns;
M.elements = (float *) malloc(size * sizeof(float));
for(unsigned int i = 0; i < size; i++){
if(init == 0) M.elements[i] = 0;
else
M.elements[i] = (float)rand()/(float)RAND_MAX;
}
return M;
}
// Copy a host matrix to a device matrix.
void copy_matrix_to_device(Matrix Mdevice, const Matrix Mhost)
{
int size = Mhost.num_rows * Mhost.num_columns * sizeof(float);
Mdevice.num_rows = Mhost.num_rows;
Mdevice.num_columns = Mhost.num_columns;
Mdevice.pitch = Mhost.pitch;
cudaMemcpy(Mdevice.elements, Mhost.elements, size, cudaMemcpyHostToDevice);
}
// Copy a device matrix to a host matrix.
void copy_matrix_from_device(Matrix Mhost, const Matrix Mdevice){
int size = Mdevice.num_rows * Mdevice.num_columns * sizeof(float);
cudaMemcpy(Mhost.elements, Mdevice.elements, size, cudaMemcpyDeviceToHost);
}
void check_error(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
printf("CUDA ERROR: %s (%s).\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
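/* For reference: chol_gold() is defined elsewhere and is assumed to implement the
 * standard sequential Cholesky factorization A = U^T * U with U upper triangular.
 * A sketch under that assumption (not the actual gold code):
 *
 *   copy A into U
 *   for k = 0 .. n-1:
 *       U[k][k] = sqrt(U[k][k])          // fails if the pivot is not positive
 *       for j = k+1 .. n-1:              // division step
 *           U[k][j] /= U[k][k]
 *       for i = k+1 .. n-1:              // elimination step
 *           for j = i .. n-1:
 *               U[i][j] -= U[k][i] * U[k][j]
 *   zero out the strictly lower triangle of U
 *
 * The div and elimination kernels in chol_kernel.cu presumably parallelize the
 * two inner loop nests for each value of k.
 */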