Tensor.cpp
#include <iostream>
#include <cmath>
#include <random>
#include <iomanip>
#include <cstring>
#ifdef AVX
#include <omp.h>
#include <immintrin.h>
#endif
#include "Tensor.h"
#include "exceptions.h"
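
// Tensor.cpp: a simple row-major float matrix. Provides Kaiming, identity,
// sequential and constant initialisers, matrix multiplication (a naive path
// under NORMAL, an AVX+FMA/OpenMP path under AVX), an in-place transpose,
// and a column-wise split.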
Tensor::Tensor(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {
    data = new float[size];
}

Tensor::~Tensor() {
    if (data != nullptr) {
        delete[] data;
        data = nullptr;
    }
}

// Kaiming (He) initialisation: draw weights from N(0, sqrt(2 / fan_in)).
void Tensor::kaimingInit(int fan_in) {
    std::random_device rd;
    std::mt19937 gen(rd());
    const float stddev = std::sqrt(2.0 / fan_in);
    std::normal_distribution<float> distribution(0.0f, stddev);
    for (int i = 0; i < size; ++i) {
        data[i] = distribution(gen);
    }
}

void Tensor::identityInit() {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            data[i * cols + j] = (i == j) ? 1.0f : 0.0f;
        }
    }
}

void Tensor::sequentialInit() {
    for (int i = 0; i < size; ++i) {
        data[i] = i + 1;
    }
}

void Tensor::setOneInit(float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

std::ostream& operator<<(std::ostream& os, const Tensor& tensor) {
    for (int i = 0; i < tensor.rows; ++i) {
        for (int j = 0; j < tensor.cols; ++j) {
            os << tensor.data[i * tensor.cols + j] << ", ";
        }
        os << std::endl;
    }
    return os;
}

Tensor* Tensor::operator*(const Tensor& other) const {
    // std::cout << "Mult" << std::endl;
    // std::cout << *this << " \n With \n" << other << std::endl;
    if (cols != other.rows) {
        throw SizeMismatchException(rows, cols, other.rows, other.cols);
    }
    Tensor *result = new Tensor(rows, other.cols);
    result->setOneInit(0);
    // One of NORMAL or AVX must be defined at compile time; otherwise the
    // zero-initialised result is returned unchanged.
#if defined(NORMAL)
    // Naive triple loop.
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < other.cols; j++) {
            result->data[i * other.cols + j] = 0;
            for (int k = 0; k < cols; k++) {
                result->data[i * other.cols + j] += data[i * cols + k] * other.data[k * other.cols + j];
            }
        }
    }
#elif defined(AVX)
    // Vectorised path: each __m256 holds output column k for 8 consecutive rows
    // (lane r corresponds to row i+r), and num_pipeline columns of *this are
    // folded in per inner iteration. Assumes rows is a multiple of 8 and cols
    // is a multiple of num_pipeline.
    int i, j, k, pipeline_id;
    const int num_pipeline = 4;
    __m256 a[num_pipeline], b[num_pipeline], result_register;
    float temp_result[8];
    #pragma omp parallel for private(a, b, result_register, temp_result, i, j, k, pipeline_id)
    for (i = 0; i < rows; i += 8) {
        for (j = 0; j < cols; j += num_pipeline) {
            for (k = 0; k < other.cols; k++) {
                // Reload the running sums for rows i..i+7 of output column k.
                result_register = _mm256_setr_ps(result->data[(i)*other.cols + k], result->data[(i+1)*other.cols + k],
                                                 result->data[(i+2)*other.cols + k], result->data[(i+3)*other.cols + k],
                                                 result->data[(i+4)*other.cols + k], result->data[(i+5)*other.cols + k],
                                                 result->data[(i+6)*other.cols + k], result->data[(i+7)*other.cols + k]);
                #pragma unroll
                for (pipeline_id = 0; pipeline_id < num_pipeline; pipeline_id++) {
                    // Column j+pipeline_id of *this for rows i..i+7, multiplied by the
                    // matching element of other and accumulated with a fused multiply-add.
                    a[pipeline_id] = _mm256_setr_ps(data[(i)*cols + j+pipeline_id], data[(i+1)*cols + j+pipeline_id],
                                                    data[(i+2)*cols + j+pipeline_id], data[(i+3)*cols + j+pipeline_id],
                                                    data[(i+4)*cols + j+pipeline_id], data[(i+5)*cols + j+pipeline_id],
                                                    data[(i+6)*cols + j+pipeline_id], data[(i+7)*cols + j+pipeline_id]);
                    b[pipeline_id] = _mm256_set1_ps(other.data[(j+pipeline_id)*other.cols + k]);
                    result_register = _mm256_fmadd_ps(a[pipeline_id], b[pipeline_id], result_register);
                }
                // Write the 8 partial sums back, lane r into row i+r.
                _mm256_storeu_ps(&temp_result[0], result_register);
                #pragma unroll
                for (int res_id = 0; res_id < 8; res_id++) {
                    result->data[(i+res_id)*other.cols + k] = temp_result[res_id];
                }
            }
        }
    }
#endif
    // std::cout << "Result: " << result << "\n end mult" << std::endl;
    return result;
}

// Copy constructor: allocate a fresh buffer for the copy before memcpy-ing the source data.
Tensor::Tensor(const Tensor& other) : rows(other.rows), cols(other.cols), size(other.size) {
    data = new float[size];
    std::memcpy(data, other.data, sizeof(float) * size);
}

// In-place transpose: write the transposed layout into a new buffer, swap the
// buffer in, then exchange the row and column counts.
void Tensor::transpose() {
    float *result = new float[rows * cols];
    // std::cout << "Transposing \n" << *this << std::endl;
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            result[j * rows + i] = data[i * cols + j];
        }
    }
    delete[] data;
    data = result;
    const int temp = rows;
    rows = cols;
    cols = temp;
}

// Need to change this to 1D
// Splits the tensor column-wise into num_splits tensors of shape
// rows x (cols / num_splits); assumes cols is divisible by num_splits.
Tensor** Tensor::vertical_split(int num_splits) {
    int splitSize = cols / num_splits;
    Tensor **result = new Tensor*[num_splits];
    for (int i = 0; i < num_splits; ++i) {
        result[i] = new Tensor(rows, splitSize);
    }
    for (int i = 0; i < num_splits; ++i) {
        int col_offset = i * splitSize;
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < splitSize; ++col) {
                result[i]->data[row * splitSize + col] = data[row * cols + col_offset + col];
            }
        }
    }
    return result;
}
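
// A minimal usage sketch (not part of the original file). It assumes a separate
// translation unit that includes Tensor.h, and a build with NORMAL defined so
// operator* takes the naive path (the AVX path expects rows to be a multiple of
// 8 and cols a multiple of 4):
//
//     Tensor a(2, 3), b(3, 3);
//     a.sequentialInit();                      // a = [[1, 2, 3], [4, 5, 6]]
//     b.identityInit();                        // b = 3x3 identity
//     Tensor *c = a * b;                       // c equals a, since b is the identity
//     std::cout << *c;
//     c->transpose();                          // c is now 3 x 2
//     Tensor **halves = c->vertical_split(2);  // two 3 x 1 tensors
//     delete c;
//     for (int s = 0; s < 2; ++s) delete halves[s];
//     delete[] halves;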