[Cuda Codegen] Emit launch bounds
CUDA functions can be annotated with launch bounds, that is, the maximum
number of threads per block (a minimum number of blocks per multiprocessor
can also be specified). nvrtc/nvcc use this information during register
allocation (and probably in other phases as well).
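For context, this is the shape of the annotation the codegen now emits; a minimal sketch with an illustrative kernel and illustrative bounds (256 threads per block, at least 2 resident blocks per SM), not code from this commit:

// The optional second argument to __launch_bounds__ is the minimum number of
// blocks per multiprocessor; the compiler uses both values to budget
// registers so the requested occupancy remains achievable.
__global__ __launch_bounds__(256, 2) void scale(float* out, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.0f * in[i]; // simple elementwise work
  }
}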
Theodoros Theodoridis committed Jun 20, 2018
1 parent 45ca22e commit f6a78dc
Showing 2 changed files with 13 additions and 5 deletions.
14 changes: 11 additions & 3 deletions tc/core/polyhedral/cuda/codegen.cc
@@ -153,9 +153,17 @@ void emitArgs(stringstream& ss, const Scop& scop) {
 void emitKernelSignature(
     stringstream& ss,
     const std::string& specializedName,
-    const Scop& scop) {
+    const Scop& scop,
+    const Block& block) {
   TC_CHECK_NE(specializedName, "") << "name not provided";
-  ss << "__global__ void " << specializedName << "(";
+  auto b0 = block.view[0];
+  b0 = b0 == 0 ? 1 : b0;
+  auto b1 = block.view[1];
+  b1 = b1 == 0 ? 1 : b1;
+  auto b2 = block.view[2];
+  b2 = b2 == 0 ? 1 : b2;
+  ss << "__global__ __launch_bounds__(" << b0 * b1 * b2 << ") void "
+     << specializedName << "(";
   emitArgs(ss, scop);
   ss << ") {" << endl;
 }
@@ -753,7 +761,7 @@ string emitCudaKernel(
   }
 
   stringstream ss;
-  emitKernelSignature(ss, specializedName, scop);
+  emitKernelSignature(ss, specializedName, scop, mscop.numThreads);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
   emitTensorViews(ss, scop.halide.inputs, paramValues);
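A side note on the clamping above: the mapping may leave some block dimensions unused, and those are reported as 0 in block.view, so each dimension is clamped to 1 before taking the product. A standalone sketch of the same computation (a plain array stands in for the TC-internal Block::view):

#include <cstddef>
#include <iostream>

// Mirrors the clamping in emitKernelSignature: a dimension of 0 means
// "unused" and must contribute a factor of 1 to the thread count.
size_t launchBound(const size_t (&block)[3]) {
  size_t bound = 1;
  for (size_t b : block) {
    bound *= (b == 0) ? 1 : b;
  }
  return bound;
}

int main() {
  size_t block[3] = {128, 2, 0}; // z-dimension unmapped
  std::cout << "__launch_bounds__(" << launchBound(block) << ")\n"; // prints 256
  return 0;
}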
4 changes: 2 additions & 2 deletions test/test_cuda_mapper.cc
@@ -451,7 +451,7 @@ def fun(float(N, N) A) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected(
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA) {
+      R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA) {
 int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
 float32 (*O)[N] = reinterpret_cast<float32 (*)[N]>(pO);
@@ -480,7 +480,7 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected =
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
+      R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
 int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
 float32 (*O)[512] = reinterpret_cast<float32 (*)[512]>(pO);
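A usage note on the expected strings above: __launch_bounds__(1) is what a fully specialized single-thread mapping produces, and the declared bound is enforced at launch time. A hedged host-side demo of that enforcement (illustrative kernel; the exact error code for an oversized launch may vary by toolkit):

#include <cstdio>

__global__ __launch_bounds__(32) void bounded() {}

int main() {
  bounded<<<1, 32>>>(); // within the declared bound
  printf("32 threads: %s\n", cudaGetErrorString(cudaGetLastError()));
  bounded<<<1, 64>>>(); // exceeds the bound: the launch fails
  printf("64 threads: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaDeviceSynchronize();
  return 0;
}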
