-
Notifications
You must be signed in to change notification settings - Fork 9
/
matrix.cuh
154 lines (135 loc) · 5.46 KB
/
matrix.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#pragma once
#include "handle_cuda_err.hpp"
#include <cassert>
namespace qvis {
bool isManaged(cudaPointerAttributes &attr) {
#if CUDART_VERSION < 10000 // isManaged deprecated in CUDA 10.
return attr.isManaged != 0;
#else // attr.type doesn't exist before CUDA 10
return attr.type == cudaMemoryTypeManaged;
#endif
}
template <typename ElementType> struct MatrixPitched {
// data is optimized for aligned (e.g. GPU) reading, you should not access it directly,
// using [] operater instead
ElementType *data_; // Host or GPU, row * col, matrix data
size_t pitch; // Host & GPU, pitch bytes, for alligment
unsigned col; // Host & GPU
unsigned row; // Host & GPU
__host__ __device__ MatrixPitched() {
col = 0;
row = 0;
data_ = nullptr;
}
__host__ __device__ MatrixPitched(unsigned row, unsigned col, size_t pitch, ElementType *data) {
this->row = row;
this->col = col;
this->pitch = pitch;
this->data_ = data;
}
__host__ void free() {
if (data_ != nullptr) {
HANDLE_ERROR(cudaFree(data_));
data_ = nullptr;
}
}
__host__ __device__ ElementType *&data() { return data_; }
__host__ __device__ const ElementType *data() const { return data_; }
__host__ void allocate_memory_managed(int col, int row) {
this->col = col;
this->row = row;
if (col * row == 0) {
return;
}
pitch = (sizeof(ElementType) * col + 128 - 1) / 128 * 128;
HANDLE_ERROR(cudaMallocManaged((void **)&data_, pitch * row));
}
// transpose data and set
// @param data row * data_row_stride,
// @param row data row
// @param col data colom
__host__ void set_data_transpose_gpu(const ElementType *data, unsigned row, unsigned col,
unsigned data_row_stride) {
this->col = row;
this->row = col;
if (row == 0 || col == 0) {
return;
}
assert(data_ == nullptr);
HANDLE_ERROR(cudaMallocPitch((void **)&data_, &pitch, sizeof(ElementType) * this->col, this->row));
uint8_t *data_host;
HANDLE_ERROR(cudaMallocHost((void **)&data_host, this->row * pitch)); // Unpaged Memory
for (size_t j = 0; j < col; j++) {
ElementType *data_row = (ElementType *)(data_host + j * pitch);
for (size_t i = 0; i < row; i++) {
data_row[i] = data[i * data_row_stride + j];
}
}
HANDLE_ERROR(cudaMemcpy(data_, data_host, this->row * pitch, cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaFreeHost(data_host));
}
__host__ void set_to_gpu(const MatrixPitched<ElementType> &matrix) {
if (matrix.data() == nullptr) {
return;
}
cudaPointerAttributes att;
HANDLE_ERROR(cudaPointerGetAttributes(&att, matrix.data()));
assert(att.memoryType == cudaMemoryTypeHost ||
isManaged(att)); // it has been deprecated in favour of cudaPointerAttributes::type.
*this = matrix;
if (isManaged(att)) { // use CUDA unified memory
return;
}
HANDLE_ERROR(cudaMalloc((void **)&data_, matrix.row * matrix.pitch));
HANDLE_ERROR(cudaMemcpy(data_, matrix.data(), this->row * pitch, cudaMemcpyHostToDevice));
}
__device__ __host__ const ElementType *operator[](int index) const {
return (ElementType *)(((uint8_t *)data_) + index * pitch);
}
__device__ __host__ ElementType *operator[](int index) {
return (ElementType *)(((uint8_t *)data_) + index * pitch);
}
__host__ void save_data_cpu(const char *filename) {
std::ofstream out(filename, std::ios::binary);
if (!out.is_open()) {
std::cout << "can not open data output file" << std::endl;
return;
}
for (unsigned i = 0; i < row; i++) {
out.write(reinterpret_cast<const char *>(&col), sizeof(unsigned));
out.write(reinterpret_cast<const char *>(data_ + i * (pitch / sizeof(ElementType))),
col * sizeof(ElementType));
}
out.close();
}
__host__ void load_data_cpu(const char *filename) {
std::ifstream in(filename, std::ios::binary);
if (!in.is_open()) {
std::cout << "can not open data input file" << std::endl;
return;
}
for (unsigned i = 0; i < row; i++) {
in.seekg(sizeof(unsigned), std::ios::cur);
in.read(reinterpret_cast<char *>(data_ + i * (pitch / sizeof(ElementType))), col * sizeof(ElementType));
}
in.close();
}
__host__ void save_data_gpu(const char *filename) {
std::ofstream out(filename, std::ios::binary);
if (!out.is_open()) {
std::cout << "can not open data output file" << std::endl;
return;
}
ElementType *data_host;
HANDLE_ERROR(cudaMallocHost((void **)&data_host, this->row * pitch)); // Unpaged Memory
HANDLE_ERROR(cudaMemcpy(data_host, data, this->row * pitch, cudaMemcpyDeviceToHost));
for (unsigned i = 0; i < row; i++) {
out.write(reinterpret_cast<const char *>(&col), sizeof(unsigned));
out.write(reinterpret_cast<const char *>(data_host + i * (pitch / sizeof(ElementType))),
col * sizeof(ElementType));
}
out.close();
HANDLE_ERROR(cudaFreeHost(data_host));
}
};
} // namespace qvis