Skip to content

Commit

Permalink
Add omp parallel for in MatMul lib (NNPA) (onnx#2702)
Browse files Browse the repository at this point in the history
* parallel for

Signed-off-by: Chen Tong <chentong@us.ibm.com>

* run through

Signed-off-by: Tong Chen <chentong@linux0d.pok.ibm.com>

* flag

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* format

Signed-off-by: chentong319 <chentong@us.ibm.com>

* nested

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* format

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* no omp.h

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* lit test

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* test

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* polish

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* move code

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* format

Signed-off-by: Tong Chen <chentong@us.ibm.com>

* cleanup

Signed-off-by: Tong Chen <chentong@us.ibm.com>

---------

Signed-off-by: Chen Tong <chentong@us.ibm.com>
Signed-off-by: Tong Chen <chentong@linux0d.pok.ibm.com>
Signed-off-by: Tong Chen <chentong@us.ibm.com>
Signed-off-by: chentong319 <chentong@us.ibm.com>
Co-authored-by: Tung D. Le <tung@jp.ibm.com>
  • Loading branch information
chentong319 and tungld authored Feb 19, 2024
1 parent 15030f3 commit 0124472
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/Accelerators/NNPA/NNPAAccelerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ NNPAAccelerator::NNPAAccelerator() : Accelerator(Accelerator::Kind::NNPA) {

acceleratorTargets.push_back(this);
// Order is important! libRuntimeNNPA depends on libzdnn
addCompilerConfig(CCM_SHARED_LIB_DEPS, {"RuntimeNNPA", "zdnn"});
addCompilerConfig(CCM_SHARED_LIB_DEPS, {"RuntimeNNPA", "zdnn"}, true);
};

NNPAAccelerator::~NNPAAccelerator() { delete instance; }
Expand Down
2 changes: 1 addition & 1 deletion src/Accelerators/NNPA/Runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ set_target_properties(RuntimeNNPA
PROPERTIES
LANGUAGE C
POSITION_INDEPENDENT_CODE TRUE
COMPILE_OPTIONS -O3
COMPILE_OPTIONS "-O3;-fopenmp"
)

37 changes: 30 additions & 7 deletions src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <assert.h>
#include <math.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
Expand All @@ -41,14 +42,18 @@ static inline zdnn_status call_zdnn_matmul_op(const zdnn_ztensor *inputA,
inputA, inputB, inputC, (zdnn_matmul_ops)opType, output);
}

// <sched.h> is expected to declare sched_getcpu(), and a standalone test case
// compiled with clang or g++ does pick up that declaration. In the onnx-mlir
// build, however, the declaration is not visible, so it is declared explicitly
// here.
// ToFix: find the correct include file or feature-test macro (glibc only
// declares sched_getcpu() when _GNU_SOURCE is defined before <sched.h>).
extern int sched_getcpu();

static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
zdnn_ztensor *output, bool isBcast) {
double totalTime = 0.;
clock_t start_time = 0, end_time = 0;

if (OMZTensorSplitDebug)
start_time = clock();
struct timeval start_t, end_t;
struct timeval start_t1, end_t1;

// For a MatMul of A(M,N)*B(N,P)+C(P),
// We split M that is e2 in (e4, e3, e2, e1), and P that is e1.
Expand All @@ -65,6 +70,10 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
.axis = E2,
.numOfElemsPerTile = OMZTensorSplitSize};

if (OMZTensorSplitDebug) {
gettimeofday(&start_t, NULL);
}

initSplitInfo(&splitInfoA, true, "MatMul A");
initSplitInfo(&splitInfoB, true, "MatMul B");
initSplitInfo(&splitInfoC, true, "MatMul C");
Expand All @@ -75,6 +84,10 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
copyData(&splitInfoB, FULL_TO_TILES);
copyData(&splitInfoC, FULL_TO_TILES);

if (OMZTensorSplitDebug) {
gettimeofday(&start_t1, NULL);
}

// Call zdnn_matmul_op on each tile.
// Iterate over the tiles along the first dim of A.
for (uint32_t i = 0; i < splitInfoA.numOfTiles; ++i) {
Expand All @@ -85,6 +98,7 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
.axis = E1,
.numOfElemsPerTile = OMZTensorSplitSize};
initSplitInfo(&splitInfoYB, true, "MatMul YB");

// Iterate over the tiles along the second dim of B.
for (uint32_t j = 0; j < splitInfoB.numOfTiles; ++j) {
zdnn_ztensor *zbTensor = splitInfoB.tiles + j;
Expand All @@ -93,11 +107,20 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
zdnn_status status = call_zdnn_matmul_op(
zaTensor, zbTensor, zcTensor, opType, zybTensor, isBcast);
assert(status == ZDNN_OK);
if (OMZTensorSplitDebug) {
printf("thread [%u, %u] is on cpu %d\n", i, j, sched_getcpu());
}
}
copyData(&splitInfoYB, TILES_TO_FULL);
FreeSplitInfoData(&splitInfoYB);
}

if (OMZTensorSplitDebug) {
gettimeofday(&end_t1, NULL);
totalTime = GetElapseTime(start_t1, end_t1);
printf("[MatMul] mm loop time, %f (milliseconds)\n", totalTime);
}

// Copy data from the tiles back to the full ztensor.
copyData(&splitInfoY, TILES_TO_FULL);

Expand All @@ -108,8 +131,8 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
FreeSplitInfoData(&splitInfoY);

if (OMZTensorSplitDebug) {
end_time = clock();
totalTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
gettimeofday(&end_t, NULL);
totalTime = GetElapseTime(start_t, end_t);
printf("[MatMul] total time, %f (milliseconds)\n", totalTime);
}

Expand All @@ -135,7 +158,7 @@ zdnn_status zdnn_matmul_bcast_op_ext(const zdnn_ztensor *inputA,
inputA, inputB, inputC, opType, output, /*isBcast=*/true);
// Compiler does not check the return result at this moment. Thus, check it
// here.
assert(status == ZDNN_OK && "Failed to execute MatMul on NNPA");
assert(status == ZDNN_OK);
return status;
}

Expand Down
6 changes: 6 additions & 0 deletions src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,12 @@ void printSplitInfo(const SplitInfo *splitInfo, const char *tag) {
splitInfo->reuseFullBuffer);
}

// Return the time elapsed from start_t to end_t, in milliseconds.
//
// Both arguments are timestamps expressed as struct timeval (seconds plus
// microseconds), e.g. as filled in by gettimeofday(). The result is negative
// when end_t is earlier than start_t.
float GetElapseTime(const struct timeval start_t, const struct timeval end_t) {
  // Take the differences first and scale afterwards. The differences are
  // computed in double so that no integer multiplication of tv_sec by 1000000
  // is performed (which could overflow where time_t is 32 bits wide), and so
  // that both operands are treated the same way.
  const double deltaSec = (double)end_t.tv_sec - (double)start_t.tv_sec;
  const double deltaUsec = (double)end_t.tv_usec - (double)start_t.tv_usec;
  // 1 second = 1000 milliseconds; 1000 microseconds = 1 millisecond.
  return (float)(deltaSec * 1000. + deltaUsec / 1000.);
}

#ifdef __cplusplus
}
#endif
8 changes: 8 additions & 0 deletions src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#ifndef ONNX_MLIR_ZDNNEXTENSION_H
#define ONNX_MLIR_ZDNNEXTENSION_H

#include <stdlib.h>
#include <sys/time.h>

#include "zdnn.h"

#ifdef __cplusplus
Expand Down Expand Up @@ -254,6 +257,11 @@ zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
zdnn_softmax_act act_func, zdnn_ztensor *output);
zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output);

// -----------------------------------------------------------------------------
// Misc Utility Functions
// -----------------------------------------------------------------------------
float GetElapseTime(const struct timeval start_t, const struct timeval end_t);

#ifdef __cplusplus
}
#endif
Expand Down
4 changes: 2 additions & 2 deletions src/Compiler/CompilerOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -957,10 +957,10 @@ std::vector<std::string> getCompilerConfig(std::string k) {

// Add strings in a vector to the string vector associated
// with the specified key
void addCompilerConfig(std::string k, std::vector<std::string> v) {
void addCompilerConfig(std::string k, std::vector<std::string> v, bool head) {
std::vector<std::string> u = CompilerConfigMap[k];

u.insert(u.end(), v.begin(), v.end());
u.insert(head ? u.begin() : u.end(), v.begin(), v.end());
CompilerConfigMap[k] = u;
}

Expand Down
3 changes: 2 additions & 1 deletion src/Compiler/CompilerOptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ std::string getCompilerOption(const onnx_mlir::OptionKind kind);
// The add and del functions are not thread-safe and should only be
// called from one thread.
std::vector<std::string> getCompilerConfig(std::string k);
void addCompilerConfig(std::string k, std::vector<std::string> v);
void addCompilerConfig(
std::string k, std::vector<std::string> v, bool head = false);
void delCompilerConfig(std::string k, std::vector<std::string> v);

// Functions related to initializing compiler configuration states based on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ module {
}
// CHECK: {{.*}} opt {{.*}} -o {{.*}}.bc
// CHECK-NEXT: {{.*}} llc {{.*}} {{.*}} {{.*}}.bc
// CHECK-NEXT: {{.*}} {{clang|c|g}}++{{.*}} {{.*}}.o -o {{.*}}.so -shared -fPIC -L{{.*}}/lib -lcruntime -lRuntimeNNPA -lzdnn
// CHECK-NEXT: {{.*}} {{clang|c|g}}++{{.*}} {{.*}}.o -o {{.*}}.so -shared -fPIC -L{{.*}}/lib -lRuntimeNNPA -lzdnn -lcruntime

0 comments on commit 0124472

Please sign in to comment.