
MatMulNBits: optimize memory layout for AVX instructions #22203

Draft
wants to merge 14 commits into base: main

Conversation

liqunfu
Contributor

@liqunfu liqunfu commented Sep 24, 2024

Description

Motivation and Context

Signed-off-by: liqunfu <liqun.fu@microsoft.com>
@liqunfu liqunfu requested a review from a team as a code owner September 24, 2024 15:50
@liqunfu liqunfu marked this pull request as draft September 24, 2024 15:50
Signed-off-by: liqunfu <liqun.fu@microsoft.com>
Contributor

@github-actions bot left a comment

You can commit the suggested changes from lintrunner.

Comment on lines 367 to 394
TEST(MatMulNBits, Float32_Accuracy4) {
TestMatMulNBitsTyped<float, 1, 1, 16, 16, 4>();
TestMatMulNBitsTyped<float, 1, 2, 16, 16, 4>();
TestMatMulNBitsTyped<float, 1, 32, 16, 16, 4>();
TestMatMulNBitsTyped<float, 1, 32, 32, 16, 4>();
TestMatMulNBitsTyped<float, 1, 32, 16, 128, 4>();
TestMatMulNBitsTyped<float, 1, 288, 16, 16, 4>();
TestMatMulNBitsTyped<float, 1, 288, 1024, 16, 4>();
TestMatMulNBitsTyped<float, 1, 288, 1024, 128, 4>();
TestMatMulNBitsTyped<float, 1, 288, 93, 32, 4>();
TestMatMulNBitsTyped<float, 1, 288, 93, 128, 4>();
TestMatMulNBitsTyped<float, 1, 288, 1234, 16, 4>();
TestMatMulNBitsTyped<float, 2, 1, 16, 16, 4>();
TestMatMulNBitsTyped<float, 2, 2, 16, 16, 4>();
TestMatMulNBitsTyped<float, 100, 1, 16, 16, 4>();
TestMatMulNBitsTyped<float, 100, 2, 16, 16, 4>();
TestMatMulNBitsTyped<float, 100, 32, 16, 16, 4>();
TestMatMulNBitsTyped<float, 100, 32, 32, 16, 4>();
TestMatMulNBitsTyped<float, 100, 32, 16, 128, 4>();
TestMatMulNBitsTyped<float, 100, 288, 16, 16, 4>();
TestMatMulNBitsTyped<float, 100, 288, 1024, 16, 4>();
TestMatMulNBitsTyped<float, 100, 288, 1024, 128, 4>();
TestMatMulNBitsTyped<float, 100, 288, 93, 32, 4>();
TestMatMulNBitsTyped<float, 100, 288, 93, 128, 4>();
TestMatMulNBitsTyped<float, 100, 288, 1234, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 1, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 2, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 32, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 32, 32, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 32, 16, 128, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 1024, 16, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 1024, 128, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 93, 32, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 93, 128, 4>();
//TestMatMulNBitsTyped<float, 1, 288, 1234, 16, 4>();
//TestMatMulNBitsTyped<float, 2, 1, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 2, 2, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 1, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 2, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 32, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 32, 32, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 32, 16, 128, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 16, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 1024, 16, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 1024, 128, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 93, 32, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 93, 128, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 1234, 16, 4>();
TestMatMulNBitsTyped<float, 2, 4, 128, 32, 4>();
//TestMatMulNBitsTyped<float, 100, 288, 1234, 32, 4>();
}
Suggested change
TEST(MatMulNBits, Float32_Accuracy4) {
// TestMatMulNBitsTyped<float, 1, 1, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 2, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 32, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 32, 32, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 32, 16, 128, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 1024, 16, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 1024, 128, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 93, 32, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 93, 128, 4>();
// TestMatMulNBitsTyped<float, 1, 288, 1234, 16, 4>();
// TestMatMulNBitsTyped<float, 2, 1, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 2, 2, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 1, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 2, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 32, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 32, 32, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 32, 16, 128, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 16, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 1024, 16, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 1024, 128, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 93, 32, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 93, 128, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 1234, 16, 4>();
TestMatMulNBitsTyped<float, 2, 4, 128, 32, 4>();
// TestMatMulNBitsTyped<float, 100, 288, 1234, 32, 4>();
}
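
The template parameters of TestMatMulNBitsTyped appear to be <float, M, N, K, BlkLen, accuracy_level>, so the case kept active above exercises M=2, N=4, K=128 with a 32-element quantization block. As a point of reference, a minimal scalar sketch of the blockwise 4-bit dequantized matmul these tests verify is shown below. It assumes a [N, K] layout for B with two 4-bit weights packed per byte along K and one scale plus one zero point per BlkLen block; all names and the nibble order are illustrative, not the actual MLAS kernel or API.

#include <cstddef>
#include <cstdint>

// Reference only: C[MxN] = A[MxK] * Dequant(B), with B stored as 4-bit values
// and one (scale, zero point) pair per BlkLen-sized block along K.
void MatMulNBitsReference(const float* A, const uint8_t* B_quant,
                          const float* B_scale, const uint8_t* B_zero,
                          float* C, size_t M, size_t N, size_t K, size_t BlkLen) {
  const size_t BlkCount = (K + BlkLen - 1) / BlkLen;
  for (size_t m = 0; m < M; ++m) {
    for (size_t n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (size_t k = 0; k < K; ++k) {
        const size_t blk = k / BlkLen;
        // Two 4-bit weights are packed per byte along K (low nibble first, assumed).
        const uint8_t packed = B_quant[(n * K + k) / 2];
        const uint8_t q = (k & 1) ? (packed >> 4) : (packed & 0x0F);
        const float scale = B_scale[n * BlkCount + blk];
        const float zero = static_cast<float>(B_zero[n * BlkCount + blk]);
        acc += A[m * K + k] * scale * (static_cast<float>(q) - zero);
      }
      C[m * N + n] = acc;
    }
  }
}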

float* AScaledBlkSum // scale_k * Sum_blklen(a_i)
)
{
const size_t BlkLen = 32;

Check warning (Code scanning / PREfast): The const variable 'BlkLen' can be computed at compile-time. Consider using constexpr (con.5).
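
The AScaledBlkSum comment above describes one value per block of A: the block's quantization scale times the sum of its quantized elements. A minimal scalar sketch of that computation follows; QuantA and QuantAScale are illustrative names, not the actual function signature.

#include <cstddef>
#include <cstdint>

// For each BlkLen-sized block along K, store scale_k * Sum_blklen(a_i), which the
// kernel can later combine with B's zero points to correct the int8 dot products.
static void ComputeAScaledBlkSum(const int8_t* QuantA, const float* QuantAScale,
                                 float* AScaledBlkSum, size_t CountK, size_t BlkLen) {
  const size_t BlkCount = (CountK + BlkLen - 1) / BlkLen;
  for (size_t blk = 0; blk < BlkCount; ++blk) {
    const size_t k_begin = blk * BlkLen;
    const size_t k_end = (k_begin + BlkLen < CountK) ? k_begin + BlkLen : CountK;
    int32_t sum = 0;
    for (size_t k = k_begin; k < k_end; ++k) {
      sum += QuantA[k];
    }
    AScaledBlkSum[blk] = QuantAScale[blk] * static_cast<float>(sum);
  }
}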
)
{
const size_t BlkLen = 32;
const int64_t SubBlkLen = 4 * BlkLen; // process 128 weights at a time and then process the remaining weights

Check warning (Code scanning / PREfast): The const variable 'SubBlkLen' can be computed at compile-time. Consider using constexpr (con.5).
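
The fix PREfast suggests for both constants is a one-line change, since each value is known at compile time:

constexpr size_t BlkLen = 32;
constexpr int64_t SubBlkLen = 4 * BlkLen;  // process 128 weights at a time, then handle the remainder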

// Convert int32 to int8
i_16_epi8[index] = _mm512_cvtepi32_epi8(i0);
//_mm_storeu_si128(dst++, i0_8);

Check notice (Code scanning / CodeQL): Commented-out code. This comment appears to contain commented-out code.
}
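
For context, _mm512_cvtepi32_epi8 narrows sixteen 32-bit lanes to sixteen 8-bit lanes in a __m128i, which is what the commented-out _mm_storeu_si128 would then write out. A minimal sketch of that conversion plus store, with illustrative names:

#include <immintrin.h>
#include <cstdint>

// Narrow 16 packed int32 values to 16 int8 values (truncating) and store them.
static inline void ConvertAndStoreEpi32ToEpi8(const int32_t* src, int8_t* dst) {
  const __m512i i0 = _mm512_loadu_si512(src);     // 16 x int32
  const __m128i i0_8 = _mm512_cvtepi32_epi8(i0);  // 16 x int8 (truncation)
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), i0_8);
}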

while (k_remaining > 0) {
// for (size_t k = 0; k < CountK; k += BlkLen) {

Check notice (Code scanning / CodeQL): Commented-out code. This comment appears to contain commented-out code.
}
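
The while (k_remaining > 0) form replaces the commented-out for loop over K. A self-contained sketch of how such a block loop with a partial final block can be structured is below; the per-block callback stands in for the kernel's actual block body and is purely illustrative.

#include <cstddef>

// Illustrative only: walk K in BlkLen-sized steps; the last step may be a partial block.
template <typename PerBlockFn>
void ForEachKBlock(size_t CountK, size_t BlkLen, PerBlockFn&& per_block) {
  size_t k = 0;
  size_t k_remaining = CountK;
  while (k_remaining > 0) {
    const size_t k_this_blk = (k_remaining < BlkLen) ? k_remaining : BlkLen;
    per_block(k, k_this_blk);  // offset into K and the (possibly partial) block length
    k += k_this_blk;
    k_remaining -= k_this_blk;
  }
}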

TEST(MatMulNBits, LongTestFloat32) {
// onnxruntime::profiling::Profiler::Profiler::Instance().StartProfiling<char>("profile.json");

Check notice (Code scanning / CodeQL): Commented-out code. This comment appears to contain commented-out code.
Signed-off-by: liqunfu <liqun.fu@microsoft.com>
…hus not to implement avx512

Signed-off-by: liqunfu <liqun.fu@microsoft.com>
… to be in a separate loop. defer this work later

Signed-off-by: liqunfu <liqun.fu@microsoft.com>
Signed-off-by: liqunfu <liqun.fu@microsoft.com>
Signed-off-by: liqunfu <liqun.fu@microsoft.com>
Signed-off-by: liqunfu <liqun.fu@microsoft.com>
static MLAS_FORCEINLINE
__m512 load_1blksum_512(const float* BlksumPtr) {
// Create a mask to set only the lowest element
const __mmask16 mask = 0x01; // Binary: 0000 0000 0000 0001

Check warning (Code scanning / PREfast): The const variable 'mask' can be computed at compile-time. Consider using constexpr (con.5).
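
With mask 0x01 only lane 0 is loaded and the remaining fifteen lanes are zeroed. A minimal sketch of such a helper using _mm512_maskz_loadu_ps, written with constexpr as PREfast suggests (a sketch of the idea, not necessarily the PR's exact body):

#include <immintrin.h>

// Load one float into lane 0 of a __m512; the maskz load zeroes lanes 1..15.
static inline __m512 Load1BlkSum512(const float* BlksumPtr) {
  constexpr __mmask16 mask = 0x01;  // only the lowest element
  return _mm512_maskz_loadu_ps(mask, BlksumPtr);
}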
// Function to load a single float value into the lowest element of a __m256 register
static MLAS_FORCEINLINE
__m256 load_1blksum_256(const float* BlksumPtr) {
const __mmask8 mask = 0x01; // Binary: 0000 0001

Check warning (Code scanning / PREfast): The const variable 'mask' can be computed at compile-time. Consider using constexpr (con.5).
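
The 256-bit variant is analogous; note that _mm256_maskz_loadu_ps requires AVX-512VL. A sketch under the same assumptions:

#include <immintrin.h>

// Load one float into lane 0 of a __m256 (AVX-512VL); lanes 1..7 are zeroed.
static inline __m256 Load1BlkSum256(const float* BlksumPtr) {
  constexpr __mmask8 mask = 0x01;
  return _mm256_maskz_loadu_ps(mask, BlksumPtr);
}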
@@ -32,6 +32,42 @@
return result;
}

static MLAS_FORCEINLINE
__m512 load_broadcast_512(const float& combined_scale) {
const __mmask16 mask = 0x01; // Binary: 0000 0000 0000 0001, lowest element

Check warning (Code scanning / PREfast): The const variable 'mask' can be computed at compile-time. Consider using constexpr (con.5).
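
If the intent of load_broadcast_512 is to replicate a single scale across all sixteen lanes, no mask is needed at all; assuming the semantics implied by the name, a simpler sketch is:

#include <immintrin.h>

// Broadcast one scalar to all 16 lanes of a __m512.
static inline __m512 LoadBroadcast512(const float& combined_scale) {
  return _mm512_set1_ps(combined_scale);
}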
// const __m512 scale_b_16_ps = _mm512_broadcast_f32x8(scale_b_ps);
// return;

const __mmask8 mask = 0xff; // Binary: 1111 1111, to set all elements

Check warning (Code scanning / PREfast): The const variable 'mask' can be computed at compile-time. Consider using constexpr (con.5).
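
The commented-out line hints at widening an 8-lane scale vector into both 256-bit halves of a 16-lane register, which _mm512_broadcast_f32x8 (AVX-512DQ) does directly; a 0xff mask on an 8-lane maskz load is equivalent to a plain unmasked load. A sketch of that widening, with scale_b as an illustrative pointer name:

#include <immintrin.h>

// Load 8 scales and replicate them into both halves of a __m512 (AVX-512DQ).
static inline __m512 LoadScaleBroadcastF32x8(const float* scale_b) {
  const __m256 scale_b_ps = _mm256_loadu_ps(scale_b);
  return _mm512_broadcast_f32x8(scale_b_ps);
}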
liqunfu and others added 2 commits December 13, 2024 10:09
Signed-off-by: Liqun Fu <liqun.fu@microsoft.com>
Signed-off-by: Liqun Fu <liqun_fu@hotmail.com>