-
Notifications
You must be signed in to change notification settings - Fork 1
/
clwrap.hpp
605 lines (491 loc) · 14.1 KB
/
clwrap.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
#ifndef __CLWRAP_HPP_DEFINED__
#define __CLWRAP_HPP_DEFINED__
// clwrap.hpp is a C++ header file that wraps OpenCL host boilerplate
// codes and provides some performance and power measurement hooks.
//
// Written by Kaz Yoshii <kazutomo@mcs.anl.gov>
//
// Tested platforms:
// Intel OpenCL SDK for Intel embedded GPUs (Gen9)
// Intel OpenCL SDK for FPGAs (e.g., Nallatech 385A)
// Intel OpenCL CPUs
//
// LICENSE: BSD 3-clause
//
// (setq c-basic-offset 8)
#include <sys/stat.h>
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iomanip>
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
#include <CL/cl2.hpp>
// cl.hpp
// typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
// cl2.hpp
// w/ CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
// typedef vector<std::pair<const void*, size_type> > Binaries
// w/o
// typedef vector<vector<unsigned char>> Binaries
// e.g., local reference
// /soft/fpga/altera/pro/16.0.2.222/hld/host/include/CL/cl.hpp
// uncomment below for reading the board power. only tested this on Nallatech385A
#define ENABLE_AOCL_MMD_HACK
#ifdef ENABLE_AOCL_MMD_HACK
// Declaration (C linkage) of an external routine used to query board
// information; it is only declared when ENABLE_AOCL_MMD_HACK is defined.
// No definition is provided in this header, so the symbol has to be
// supplied at link time by whoever calls it.
// NOTE(review): presumably exported by the FPGA board's MMD library --
// confirm. Within this file it is invoked with a device name string, an
// integer info id, the byte size of the output buffer, the output buffer
// itself and a pointer that receives a size value.
extern "C" void aocl_mmd_card_info(const char *name , int id,
size_t sz,
void *v, size_t *retsize);
#endif
// clwrap bundles the usual OpenCL host-side steps: selecting a platform
// and device, loading a kernel (from source or binary), setting kernel
// arguments, copying buffers and launching the kernel. When profiling is
// enabled, every buffer transfer and kernel launch issued through this
// class is recorded as a profile_event.
class clwrap {
public:
        const int version_major = 0;
        const int version_minor = 10;

        // VALUE: pass by value, otherwise passed by reference
        // (HOST2DEV: written to the device before the kernel runs,
        //  DEV2HOST: read back after the kernel runs, DUPLEX: both)
        enum dir_enum { VALUE, HOST2DEV, DEV2HOST, DUPLEX };

        typedef const char* here_t;

        // One recorded command (buffer write/read or kernel launch).
        class profile_event {
        public:
                enum evtype {EV_NDRANGE, EV_WRITE, EV_READ, EV_MARKER};
                evtype et;
                int argidx; // for EV_{WRITE|READ}
                cl::Event ev;
                double start_sec;  // filled by finish(); 0 until then
                double end_sec;    // filled by finish(); 0 until then
                size_t sz;         // transfer size in bytes (EV_WRITE/EV_READ), else 0

                // All numeric fields start at zero so that an event that has
                // not been timed yet never exposes indeterminate values.
                profile_event(evtype et, int argidx = 0)
                        : et(et), argidx(argidx),
                          start_sec(0.0), end_sec(0.0), sz(0) {
                }
        };

private:
        std::vector<profile_event> p_evs;
        bool profiling_enabled = false; // set by prepKernel()

        // Book-keeping for one kernel argument added via appendArg().
        struct arg_struct {
                dir_enum dir;
                size_t sz;
                void *data;
                bool buffered;  // true when the argument is backed by 'buf'
                cl::Buffer buf;
        };

        std::vector<cl::Platform> pfs; // initialized only in c'tor
        std::vector<cl::Device> devs; // initialized only in c'tor
        std::vector<cl::Device> dev_selected; // initialized only in c'tor
        std::vector<cl::Program> prgs; // clear and push_back in prepKernel()
        cl::Context ctx;

        // selected ids (-1: nothing selected)
        int platform_id = -1;
        int device_id = -1;
        int program_id = 0;

        cl::Event kernel_event;
        cl::CommandQueue queue;
        cl::Kernel kernel;
        std::vector<struct arg_struct> kargs;
        std::vector<char> kernelbuf;

        enum loader {
                SRC=0,
                BIN=1
        };
        typedef std::pair<std::string,enum loader> kext_t;
        std::vector<kext_t> kexts; // known file extensions and how to load them

        bool flag_dumpkernel = false;
        int flag_verbose = 0;

        // Pointer to the bytes most recently read by loadkernel().
        const char* getkernelbuf()
        {
                // data() is well-defined even when the buffer is empty
                return kernelbuf.data();
        }

        // Number of bytes held in kernelbuf (includes the terminating NUL
        // when loadkernel() was asked to add one).
        size_t getkernelbufsize()
        {
                return kernelbuf.size();
        }

        // Read the whole file 'fn' into kernelbuf; when 'nullterminate' is
        // set, one extra zero byte is appended.
        // return true if success, false if the file cannot be opened,
        // its size cannot be determined, or the read comes up short
        bool loadkernel(std::string fn, bool nullterminate=false)
        {
                std::ifstream f(fn.c_str(), std::ifstream::binary);
                if (! f.is_open()) {
                        return false;
                }

                f.seekg(0, f.end);
                const std::streamoff len = f.tellg();
                if (len < 0) {
                        // tellg() reports failure as -1
                        return false;
                }
                f.seekg(0, f.beg);

                const size_t sz = static_cast<size_t>(len);
                kernelbuf.assign(nullterminate ? sz + 1 : sz, '\0');

                if (sz > 0) {
                        f.read(&kernelbuf[0], static_cast<std::streamsize>(sz));
                        if (! f) {
                                return false;
                        }
                }
                return true;
        }

        // Write 'sz' bytes starting at 'buf' to the file 'fn'.
        // return true only if the file was opened and written successfully
        bool dumpkernel(std::string fn, const cl_ulong sz, const char *buf)
        {
                std::ofstream f(fn.c_str(), std::ostream::binary);
                if (! f.is_open()) {
                        return false;
                }
                f.write(buf, static_cast<std::streamsize>(sz));
                return f.good();
        }

        // return true if stat() succeeds on 'fn'
        bool fexists(const std::string fn)
        {
                struct stat st;
                return (stat(fn.c_str(), &st) == 0);
        }

        // Create a program from the binary file 'fn' for the selected
        // device and append it to prgs. return true if success
        bool loadprog_bin(std::string fn) {
                if (! loadkernel(fn))
                        return false;

                cl::Program::Binaries bin;
                bin.push_back({getkernelbuf(),getkernelbufsize()});

                std::vector<int> binaryStatus;
                cl_int err = CL_SUCCESS;
                cl::Program p(ctx, dev_selected, bin, &binaryStatus, &err);
                // std::cout << "err=" << err << std::endl;
                if (err != CL_SUCCESS) {
                        std::cout << "fn=" << fn << std::endl;
                        std::cout << "Program failed to build: " << err << std::endl;
                        std::cout << p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(dev_selected[0]);
                        return false;
                }

                prgs.push_back(p);
                if (flag_verbose > 0) {
                        std::cout << "binary prog: " << fn << " is loaded.\n";
                }
                return true;
        }

        // Create a program from the source file 'fn', build it for the
        // selected device and append it to prgs. return true if success
        bool loadprog_src(std::string fn) {
                cl_int err = CL_SUCCESS;

                if (! loadkernel(fn, true))
                        return false;

                cl::Program::Sources src;
                // kernelbuf carries one trailing NUL added by loadkernel();
                // the length handed to OpenCL is the string length, so the
                // terminator is not counted.
                src.push_back({getkernelbuf(),getkernelbufsize() - 1});

                cl::Program p(ctx, src, &err);
                if (err != CL_SUCCESS) {
                        std::cout << "Program failed" << err << std::endl;
                        return false;
                }

                err = p.build(dev_selected);
                // err = p.build(dev_selected, "-cl-intel-gtpin-rera");
                if (err != CL_SUCCESS) {
                        std::cout << "Program failed to build: " << err << std::endl;
                        std::cout << p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(dev_selected[0]);
                        return false;
                }

                prgs.push_back(p);
                if (flag_verbose > 0) {
                        std::cout << "text prog: " << fn << " is loaded.\n";
                }
#if 0
                std::vector<unsigned long> kszs =
                p.getInfo<CL_PROGRAM_BINARY_SIZES>();
                std::vector<char*> bins;
                bins.push_back( new char[*kszs.begin()] );
                p.getInfo(CL_PROGRAM_BINARIES, &bins[0]);
                savefile("test.bin", *kszs.begin(), bins[0] );
#endif
                return true;
        }

public:
        // Events recorded so far (only populated when profiling is enabled).
        const std::vector<profile_event> &get_p_evs() {return p_evs; }

        // The guard uses the same macro that enables the
        // aocl_mmd_card_info() declaration at the top of this file.
#ifdef ENABLE_AOCL_MMD_HACK
        // technically this function should be called in other thread context
        // to measure the power consumption while kernel is running.
        float readboardpower(void) {
                float pwr = 0.0f; // stays 0 if the query does not fill it in
                size_t retsize = 0;

                aocl_mmd_card_info("aclnalla_pcie0", 9,
                                   sizeof(float),
                                   (void*)&pwr, &retsize);
                return pwr;
        }
#else
        // Board power reading is compiled out; always reports 0.
        float readboardpower(void) {
                return 0.0;
        }
#endif

        // constructor
        //
        // Selects a platform and a device and creates a context on the
        // selected device. On failure a message is printed and the object
        // is left with nothing selected; prepKernel() then returns false.
        //
        // Platform selection: the platform whose name contains a search
        // key is chosen. The key is "HD" when built with INTEL, "CUDA"
        // when built with NVIDIA, and can be overridden at run time with
        // the CLW_PF environment variable. When no key is available at
        // all, the platform index 'pid' is used instead.
        //
        // did: index of the device within the selected platform.
        //
        // Environment: CLW_VERBOSE=<int> sets the verbosity level,
        // CLW_DUMPKERNEL (any value) sets the dump-kernel flag.
        clwrap(int pid=0, int did=0) {
                kexts = {{".aocx", BIN},
                         {".bin", BIN},
                         {".cl", SRC}
                };

                flag_dumpkernel = false;
                flag_verbose = 0;
                if(const char *env = std::getenv("CLW_VERBOSE"))
                        flag_verbose = std::atoi(env);
                if(std::getenv("CLW_DUMPKERNEL"))
                        flag_dumpkernel = true;

                cl::Platform::get(&pfs);
                if (pfs.size() == 0) {
                        std::cout << "No platform found" << std::endl;
                        return;
                }

                /*
                  platform id
                  Intel CPU: "Intel(R) CPU Runtime for OpenCL(TM) Applications"
                  Intel CPU: "Experimental OpenCL 2.1 CPU Only Platform"
                  Intel Gen GPU (NEO): "Intel(R) OpenCL HD Graphics"
                  Intel FPGA: "Intel(R) FPGA SDK for OpenCL(TM)"
                  pocl: "Portable Computing Language"
                */
                std::string pfkey;
                bool have_pfkey = false;
#if defined(INTEL)
                pfkey = "HD";
                have_pfkey = true;
#elif defined(NVIDIA)
                pfkey = "CUDA";
                have_pfkey = true;
#endif
                if(const char *env = std::getenv("CLW_PF")) {
                        pfkey = env;
                        have_pfkey = true;
                        std::cout << "Platform search: " << pfkey << std::endl;
                }

                platform_id = -1;
                if (have_pfkey) {
                        // pick the first platform whose name contains the key
                        for (int i = 0; i < (int)pfs.size(); i++) {
                                std::string pn = pfs[i].getInfo<CL_PLATFORM_NAME>();
                                std::cout << pn << std::endl;
                                if (pn.find(pfkey) != std::string::npos) {
                                        platform_id = i;
                                        break;
                                }
                        }
                } else if (pid >= 0 && pid < (int)pfs.size()) {
                        // no search key: fall back to the caller's index
                        platform_id = pid;
                }
                if (platform_id < 0) {
                        std::cout << "No platform found" << std::endl;
                        return;
                }

                pfs[platform_id].getDevices(CL_DEVICE_TYPE_ALL, &devs);
                if (devs.size() == 0) {
                        std::cout << "No device found" << std::endl;
                        return;
                }
                if (did < 0 || did >= (int)devs.size()) {
                        std::cout << "Invalid device id: " << did << std::endl;
                        return;
                }

                ctx = devs[did];
                device_id = did;
                dev_selected.push_back(devs[device_id]);
                program_id = 0; //
        }

        // Print every platform found, marking the selected one.
        void listPlatforms(void) {
                std::cout << "[Platforms]\n";
                for (int i = 0; i < (int)pfs.size(); i++) {
                        std::cout << i << ": " << pfs[i].getInfo<CL_PLATFORM_NAME>();
                        if (i == platform_id) std::cout << " [selected]";
                        std::cout << std::endl;
                }
        }

        // Print every device of the selected platform, marking the
        // selected one.
        void listDevices(void) {
                std::cout << "[Devices]\n";
                for (int i = 0; i < (int)devs.size(); i++) {
                        std::cout << "Device" << i << ": " << devs[i].getInfo<CL_DEVICE_NAME>();
                        // std::cout << " " << devs[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << " ";
                        if (i == device_id) std::cout << " [selected]";
#if 0
                        // to query shared virtual memory capabilities
                        cl_device_svm_capabilities svmcap;
                        devs[i].getInfo(CL_DEVICE_SVM_CAPABILITIES, &svmcap);
                        // CL_INVALID_VALUE is returned, no svm is supported
                        if( svmcap&CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ) std::cout << "CGBUF ";
                        if( svmcap&CL_DEVICE_SVM_FINE_GRAIN_BUFFER ) std::cout << "FGBUF ";
                        if( svmcap&CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ) std::cout << "FGSYS ";
                        if( svmcap&CL_DEVICE_SVM_ATOMICS ) std::cout << "ATOM "; // only for fine grain
#endif
                        std::cout << std::endl;
                }
        }

        // Print the wrapper version followed by the platform and device
        // lists.
        void info(void) {
                std::cout << "clwrap version " << version_major << "." << version_minor << std::endl;
                listPlatforms();
                listDevices();
        }

        // Load a kernel file, then create the command queue and the kernel
        // object.
        //
        // filename: path of the kernel file. If the last path component
        //   has no extension, the known extensions (.aocx, .bin, .cl) are
        //   appended in that order and the first existing file is used.
        //   .aocx/.bin are loaded as binaries, .cl as source.
        // funcname: kernel function name. When NULL, the file's base name
        //   (directory and extension removed) is used.
        // enableProfile: create the queue with profiling enabled and
        //   record events for later transfers/launches.
        //
        // return true if success; on failure a message is printed in most
        // cases and false is returned.
        bool prepKernel(const char *filename, const char *funcname = NULL, bool enableProfile = true) {
                if (! filename) {
                        std::cout << "Error: no kernel file found! " << "(null)" << std::endl;
                        return false;
                }
                if (dev_selected.empty()) {
                        // the constructor failed to select a device
                        std::cout << "Error: no device selected" << std::endl;
                        return false;
                }

                std::string fn = filename;
                cl_int err = CL_SUCCESS;
                kext_t kext_found = {"", BIN};

                profiling_enabled = enableProfile;
                prgs.clear();

                // A dot only denotes an extension when it belongs to the
                // last path component ("./foo" has no extension).
                const size_t slash = fn.find_last_of("/\\");
                size_t pos = fn.find_last_of('.');
                if (pos != std::string::npos &&
                    slash != std::string::npos && pos < slash) {
                        pos = std::string::npos;
                }

                // Default kernel name: base name without directory and
                // extension. 'stem' owns the characters funcname points to.
                std::string stem;
                if (! funcname) {
                        const size_t first = (slash == std::string::npos) ? 0 : slash + 1;
                        if (pos == std::string::npos)
                                stem = fn.substr(first);
                        else
                                stem = fn.substr(first, pos - first);
                        funcname = stem.c_str();
                }

                if (pos == std::string::npos) {
                        // the file extension is omitted
                        for (const kext_t &k : kexts) {
                                const std::string candidate = fn + k.first;
                                if (fexists(candidate)) {
                                        fn = candidate;
                                        kext_found = k;
                                        break;
                                }
                        }
                } else {
                        // assume that the file extension is explicitly specified
                        const std::string e = fn.substr(pos);
                        for (const kext_t &k : kexts) {
                                if (e == k.first) {
                                        kext_found = k;
                                        break;
                                }
                        }
                }

                if (kext_found.first == "") {
                        std::cout << "Error: no kernel file found! " << filename << std::endl;
                        return false;
                }

                const bool loaded = (kext_found.second == BIN)
                        ? loadprog_bin(fn)
                        : loadprog_src(fn);
                if (! loaded) {
                        return false;
                }

                // create a command queue and kernel
                cl_command_queue_properties prop = 0;
                if (profiling_enabled)
                        prop = CL_QUEUE_PROFILING_ENABLE;

                queue = cl::CommandQueue(ctx, dev_selected[0], prop, &err);
                if (err != CL_SUCCESS) {
                        std::cout << "cl::CommandQueue() failed:" << err << std::endl;
                        return false;
                }

                kernel = cl::Kernel(prgs[program_id], funcname, &err);
                if (err != CL_SUCCESS) {
                        switch(err) {
                        case CL_INVALID_PROGRAM: std::cout << "CL_INVALID_PROGRAM\n"; break;
                        case CL_INVALID_PROGRAM_EXECUTABLE: std::cout << "CL_INVALID_PROGRAM_EXECUTABLE\n"; break;
                        case CL_INVALID_KERNEL_NAME: std::cout << "CL_INVALID_KERNEL_NAME\n"; break;
                        case CL_INVALID_KERNEL_DEFINITION: std::cout << "CL_INVALID_KERNEL_DEFINITION\n"; break;
                        default:
                                std::cout << "cl::Kernel() failed:" << err << std::endl;
                        }
                        return false;
                }
                return true;
        }

        /* accessors for the underlying OpenCL objects */
        cl::Kernel getKernel(void) { return kernel; }
        cl::CommandQueue getQueue(void) { return queue; }
        cl::Context getContext(void) { return ctx; }

        // Map a transfer direction to the buffer access flag:
        // HOST2DEV -> read-only, DEV2HOST -> write-only, else read-write.
        cl_mem_flags get_mem_flag(dir_enum dir) {
                switch (dir) {
                case HOST2DEV:
                        return CL_MEM_READ_ONLY;
                case DEV2HOST:
                        return CL_MEM_WRITE_ONLY;
                default:
                        return CL_MEM_READ_WRITE;
                }
        }

        // Forget all arguments added with appendArg().
        void clearArgs(void) {
                kargs.clear();
        }

        // Add the next kernel argument.
        // sz: size of the argument in bytes; data: host pointer to it.
        // dir: VALUE sets the argument directly from 'data'; any other
        //   direction creates a device buffer of 'sz' bytes and sets that
        //   buffer as the argument ('data' is then used by
        //   writeToDevice()/readFromDevice()).
        // return the index of the added argument
        int appendArg(size_t sz, void *data,
                      dir_enum dir = VALUE) {
                const bool buffered = (dir != VALUE);

                kargs.push_back(arg_struct());
                const int idx = static_cast<int>(kargs.size()) - 1;
                arg_struct &a = kargs[idx];
                a.sz = sz;
                a.data = data;
                a.dir = dir;
                a.buffered = buffered;

                if (buffered) {
                        cl::Buffer buf(ctx, get_mem_flag(dir), sz);
                        a.buf = buf;
                        kernel.setArg(idx, buf);
                } else {
                        kernel.setArg(idx, sz, data);
                }
                return idx;
        }

        // Start timestamp of 'ev' converted from nanoseconds to seconds
        // (0 if the profiling query does not provide a value).
        double _ev_start_sec(cl::Event &ev) {
                cl_ulong tmp = 0;
                ev.getProfilingInfo(CL_PROFILING_COMMAND_START, &tmp);
                return (double)tmp * 1e-9;
        }

        // End timestamp of 'ev' converted from nanoseconds to seconds
        // (0 if the profiling query does not provide a value).
        double _ev_end_sec(cl::Event &ev) {
                cl_ulong tmp = 0;
                ev.getProfilingInfo(CL_PROFILING_COMMAND_END, &tmp);
                return (double)tmp * 1e-9;
        }

        // Fill start_sec/end_sec of every recorded event; does nothing
        // when profiling is disabled.
        void _fill_start_end_sec() {
                if (! profiling_enabled) return;

                for (profile_event &pe : p_evs) {
                        pe.start_sec = _ev_start_sec(pe.ev);
                        pe.end_sec = _ev_end_sec(pe.ev);
                }
        }

        // Copy every HOST2DEV/DUPLEX argument from host memory into its
        // device buffer, recording an EV_WRITE event per copy when
        // profiling is enabled.
        void writeToDevice(void) {
                for (size_t i = 0; i < kargs.size(); ++i) {
                        arg_struct &a = kargs[i];
                        if (a.dir != HOST2DEV && a.dir != DUPLEX)
                                continue;

                        cl::Event *evsp = NULL;
                        if (profiling_enabled) {
                                p_evs.push_back(profile_event(profile_event::EV_WRITE, static_cast<int>(i)));
                                p_evs.back().sz = a.sz;
                                // used right away, before p_evs can grow again
                                evsp = &(p_evs.back().ev);
                        }
                        // request a blocking WriteBuffer
                        queue.enqueueWriteBuffer(a.buf, CL_TRUE, 0, a.sz, a.data, NULL, evsp);
                }
        }

        // Copy every DEV2HOST/DUPLEX argument from its device buffer back
        // to host memory, recording an EV_READ event per copy when
        // profiling is enabled.
        void readFromDevice(void) {
                for (size_t i = 0; i < kargs.size(); ++i) {
                        arg_struct &a = kargs[i];
                        if (a.dir != DEV2HOST && a.dir != DUPLEX)
                                continue;

                        cl::Event *evsp = NULL;
                        if (profiling_enabled) {
                                p_evs.push_back(profile_event(profile_event::EV_READ, static_cast<int>(i)));
                                p_evs.back().sz = a.sz;
                                // used right away, before p_evs can grow again
                                evsp = &(p_evs.back().ev);
                        }
                        // request a blocking ReadBuffer
                        queue.enqueueReadBuffer(a.buf, CL_TRUE, 0, a.sz, a.data, NULL, evsp);
                }
        }

        // Write the input buffers, then enqueue the kernel with a global
        // and local size of 1 without waiting for it.
        void runProducer(void) {
                cl::NDRange gsz(1);
                cl::NDRange lsz(1);

                writeToDevice();
                queue.enqueueNDRangeKernel(
                        kernel,
                        cl::NullRange, // offset
                        gsz,
                        lsz,
                        NULL,
                        NULL);
        }

        // Launch the kernel with the given global/local sizes and wait
        // for it to complete.
        // docopy: when true, input buffers are written before the launch
        //   and output buffers are read back after it.
        // If the launch itself fails, a message is printed and neither a
        // profile event is recorded nor outputs are read back.
        void runKernel(cl::NDRange &gsz, cl::NDRange &lsz,
                       bool docopy = true) {
                if (docopy) writeToDevice();

                const cl_int err = queue.enqueueNDRangeKernel(
                        kernel,
                        cl::NullRange, // offset
                        gsz,
                        lsz,
                        NULL, // events
                        &kernel_event);
                if (err != CL_SUCCESS) {
                        std::cout << "enqueueNDRangeKernel() failed:" << err << std::endl;
                        return;
                }
                kernel_event.wait();

                if (profiling_enabled) {
                        p_evs.push_back(profile_event(profile_event::EV_NDRANGE));
                        p_evs.back().ev = kernel_event;
                }

                if (docopy) readFromDevice();
        }

        // Wait for the queue to drain, then fill in the timestamps of the
        // recorded events.
        void finish(){
                queue.finish();
                _fill_start_end_sec();
        }

        // One-dimensional convenience overload; lsz == 0 lets the OpenCL
        // runtime choose the local size (cl::NullRange).
        void runKernel(int gsz, int lsz = 0, bool docopy = true) {
                cl::NDRange ngsz(gsz);
                cl::NDRange nlsz(lsz);
                if (lsz == 0) nlsz = cl::NullRange;

                runKernel(ngsz, nlsz, docopy);
        }

        // Elapsed time of the most recent kernel launch in nanoseconds
        // (0 if the profiling queries do not provide values).
        double getKernelElapsedNanoSec(void) {
                cl_ulong start = 0;
                cl_ulong end = 0;

                kernel_event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
                kernel_event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
                return static_cast<double>(end - start);
        }
};
#endif