Add omp parallel for in MatMul lib (NNPA) (onnx#2702)

* parallel for Signed-off-by: Chen Tong <chentong@us.ibm.com> * run through Signed-off-by: Tong Chen <chentong@linux0d.pok.ibm.com> * flag Signed-off-by: Tong Chen <chentong@us.ibm.com> * format Signed-off-by: chentong319 <chentong@us.ibm.com> * nested Signed-off-by: Tong Chen <chentong@us.ibm.com> * format Signed-off-by: Tong Chen <chentong@us.ibm.com> * no omp.h Signed-off-by: Tong Chen <chentong@us.ibm.com> * lit test Signed-off-by: Tong Chen <chentong@us.ibm.com> * test Signed-off-by: Tong Chen <chentong@us.ibm.com> * polish Signed-off-by: Tong Chen <chentong@us.ibm.com> * move code Signed-off-by: Tong Chen <chentong@us.ibm.com> * format Signed-off-by: Tong Chen <chentong@us.ibm.com> * cleanup Signed-off-by: Tong Chen <chentong@us.ibm.com> --------- Signed-off-by: Chen Tong <chentong@us.ibm.com> Signed-off-by: Tong Chen <chentong@linux0d.pok.ibm.com> Signed-off-by: Tong Chen <chentong@us.ibm.com> Signed-off-by: chentong319 <chentong@us.ibm.com> Co-authored-by: Tung D. Le <tung@jp.ibm.com>
cjvolzka · Feb 19, 2024 · 0124472 · 0124472
1 parent 15030f3
commit 0124472
Show file tree

Hide file tree

Showing 8 changed files with 51 additions and 13 deletions.
diff --git a/src/Accelerators/NNPA/NNPAAccelerator.cpp b/src/Accelerators/NNPA/NNPAAccelerator.cpp
@@ -58,7 +58,7 @@ NNPAAccelerator::NNPAAccelerator() : Accelerator(Accelerator::Kind::NNPA) {
 
   acceleratorTargets.push_back(this);
   // Order is important! libRuntimeNNPA depends on libzdnn
-  addCompilerConfig(CCM_SHARED_LIB_DEPS, {"RuntimeNNPA", "zdnn"});
+  addCompilerConfig(CCM_SHARED_LIB_DEPS, {"RuntimeNNPA", "zdnn"}, true);
 };
 
 NNPAAccelerator::~NNPAAccelerator() { delete instance; }

diff --git a/src/Accelerators/NNPA/Runtime/CMakeLists.txt b/src/Accelerators/NNPA/Runtime/CMakeLists.txt
@@ -21,6 +21,6 @@ set_target_properties(RuntimeNNPA
   PROPERTIES
   LANGUAGE C
   POSITION_INDEPENDENT_CODE TRUE
-  COMPILE_OPTIONS -O3
+  COMPILE_OPTIONS "-O3;-fopenmp"
   )
 
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
@@ -20,6 +20,7 @@
 
 #include <assert.h>
 #include <math.h>
+#include <sched.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
@@ -41,14 +42,18 @@ static inline zdnn_status call_zdnn_matmul_op(const zdnn_ztensor *inputA,
       inputA, inputB, inputC, (zdnn_matmul_ops)opType, output);
 }
 
+// sched_getcpu() is a GNU extension: glibc's <sched.h> declares it only when
+// _GNU_SOURCE is defined before the first #include. A standalone test case
+// compiled with clang or g++ sees the declaration, but in onnx-mlir it is
+// missing, so declare it here. ToFix: define _GNU_SOURCE and remove this.
+extern int sched_getcpu();
+
 static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
     const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
     zdnn_ztensor *output, bool isBcast) {
   double totalTime = 0.;
-  clock_t start_time = 0, end_time = 0;
-
-  if (OMZTensorSplitDebug)
-    start_time = clock();
+  struct timeval start_t, end_t;
+  struct timeval start_t1, end_t1;
 
   // For a MatMul of A(M,N)*B(N,P)+C(P),
   // We split M that is e2 in (e4, e3, e2, e1), and P that is e1.
@@ -65,6 +70,10 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
       .axis = E2,
       .numOfElemsPerTile = OMZTensorSplitSize};
 
+  if (OMZTensorSplitDebug) {
+    gettimeofday(&start_t, NULL);
+  }
+
   initSplitInfo(&splitInfoA, true, "MatMul A");
   initSplitInfo(&splitInfoB, true, "MatMul B");
   initSplitInfo(&splitInfoC, true, "MatMul C");
@@ -75,6 +84,10 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
   copyData(&splitInfoB, FULL_TO_TILES);
   copyData(&splitInfoC, FULL_TO_TILES);
 
+  if (OMZTensorSplitDebug) {
+    gettimeofday(&start_t1, NULL);
+  }
+
   // Call zdnn_matmul_op on each tile.
   // Iterate over the tiles along the first dim of A.
   for (uint32_t i = 0; i < splitInfoA.numOfTiles; ++i) {
@@ -85,6 +98,7 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
         .axis = E1,
         .numOfElemsPerTile = OMZTensorSplitSize};
     initSplitInfo(&splitInfoYB, true, "MatMul YB");
+
     // Iterate over the tiles along the second dim of B.
     for (uint32_t j = 0; j < splitInfoB.numOfTiles; ++j) {
       zdnn_ztensor *zbTensor = splitInfoB.tiles + j;
@@ -93,11 +107,20 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
       zdnn_status status = call_zdnn_matmul_op(
           zaTensor, zbTensor, zcTensor, opType, zybTensor, isBcast);
       assert(status == ZDNN_OK);
+      if (OMZTensorSplitDebug) {
+        printf("thread [%u, %u] is on cpu %d\n", i, j, sched_getcpu());
+      }
     }
     copyData(&splitInfoYB, TILES_TO_FULL);
     FreeSplitInfoData(&splitInfoYB);
   }
 
+  if (OMZTensorSplitDebug) {
+    gettimeofday(&end_t1, NULL);
+    totalTime = GetElapseTime(start_t1, end_t1);
+    printf("[MatMul] mm loop time, %f (milliseconds)\n", totalTime);
+  }
+
   // Copy data from the tiles back to the full ztensor.
   copyData(&splitInfoY, TILES_TO_FULL);
 
@@ -108,8 +131,8 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
   FreeSplitInfoData(&splitInfoY);
 
   if (OMZTensorSplitDebug) {
-    end_time = clock();
-    totalTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+    gettimeofday(&end_t, NULL);
+    totalTime = GetElapseTime(start_t, end_t);
     printf("[MatMul] total time, %f (milliseconds)\n", totalTime);
   }
 
@@ -135,7 +158,7 @@ zdnn_status zdnn_matmul_bcast_op_ext(const zdnn_ztensor *inputA,
       inputA, inputB, inputC, opType, output, /*isBcast=*/true);
   // Compiler does not check the return result at this moment. Thus, check it
   // here.
-  assert(status == ZDNN_OK && "Failed to execute MatMul on NNPA");
+  assert(status == ZDNN_OK);
   return status;
 }
 

diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
@@ -578,6 +578,12 @@ void printSplitInfo(const SplitInfo *splitInfo, const char *tag) {
       splitInfo->reuseFullBuffer);
 }
 
+// Return the time elapsed from start_t to end_t, in milliseconds.
+// The seconds and microseconds fields are subtracted separately before being
+// scaled, so no large integer product (tv_sec * 1000000) is ever formed and
+// both timestamps are treated the same way.
+float GetElapseTime(const struct timeval start_t, const struct timeval end_t) {
+  const double elapsedSec = (double)(end_t.tv_sec - start_t.tv_sec);
+  const double elapsedUsec = (double)(end_t.tv_usec - start_t.tv_usec);
+  // 1 s = 1000 ms and 1000 us = 1 ms.
+  return (float)(elapsedSec * 1000. + elapsedUsec / 1000.);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
@@ -15,6 +15,9 @@
 #ifndef ONNX_MLIR_ZDNNEXTENSION_H
 #define ONNX_MLIR_ZDNNEXTENSION_H
 
+#include <stdlib.h>
+#include <sys/time.h>
+
 #include "zdnn.h"
 
 #ifdef __cplusplus
@@ -254,6 +257,11 @@ zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
     zdnn_softmax_act act_func, zdnn_ztensor *output);
 zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
 
+// -----------------------------------------------------------------------------
+// Misc Utility Functions
+// -----------------------------------------------------------------------------
+float GetElapseTime(const struct timeval start_t, const struct timeval end_t);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/src/Compiler/CompilerOptions.cpp b/src/Compiler/CompilerOptions.cpp
@@ -957,10 +957,10 @@ std::vector<std::string> getCompilerConfig(std::string k) {
 
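 // Add strings in a vector to the string vector associated
 // with the specified key
+// When head is true the strings are inserted at the front of the existing
+// vector; otherwise they are appended at the end.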
-void addCompilerConfig(std::string k, std::vector<std::string> v) {
+void addCompilerConfig(std::string k, std::vector<std::string> v, bool head) {
   std::vector<std::string> u = CompilerConfigMap[k];
 
-  u.insert(u.end(), v.begin(), v.end());
+  u.insert(head ? u.begin() : u.end(), v.begin(), v.end());
   CompilerConfigMap[k] = u;
 }
 

diff --git a/src/Compiler/CompilerOptions.hpp b/src/Compiler/CompilerOptions.hpp
@@ -191,7 +191,8 @@ std::string getCompilerOption(const onnx_mlir::OptionKind kind);
 // The add and del functions are not thread-safe and should only be
 // called from one thread.
 std::vector<std::string> getCompilerConfig(std::string k);
-void addCompilerConfig(std::string k, std::vector<std::string> v);
+void addCompilerConfig(
+    std::string k, std::vector<std::string> v, bool head = false);
 void delCompilerConfig(std::string k, std::vector<std::string> v);
 
 // Functions related to initializing compiler configuration states based on

diff --git a/test/mlir/accelerators/nnpa/module_op_be/compiler-config.mlir b/test/mlir/accelerators/nnpa/module_op_be/compiler-config.mlir
@@ -13,4 +13,4 @@ module {
 }
 // CHECK: {{.*}} opt {{.*}} -o {{.*}}.bc
 // CHECK-NEXT: {{.*}} llc {{.*}}  {{.*}} {{.*}}.bc
-// CHECK-NEXT: {{.*}} {{clang|c|g}}++{{.*}} {{.*}}.o -o {{.*}}.so -shared -fPIC -L{{.*}}/lib -lcruntime -lRuntimeNNPA -lzdnn
+// CHECK-NEXT: {{.*}} {{clang|c|g}}++{{.*}} {{.*}}.o -o {{.*}}.so -shared -fPIC -L{{.*}}/lib -lRuntimeNNPA -lzdnn -lcruntime