Add inline-c-cuda package

fpco · Aug 27, 2023 · 9f3dc32 · 9f3dc32
1 parent 7d6dd8f
commit 9f3dc32
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 61 deletions.
diff --git a/inline-c-cuda/README.md b/inline-c-cuda/README.md
@@ -1 +1 @@
-Small set of utilities to inline C++ code.  See tests for example.
+Small set of utilities to inline CUDA code.  See tests for example.
diff --git a/inline-c-cuda/inline-c-cuda.cabal b/inline-c-cuda/inline-c-cuda.cabal
@@ -1,6 +1,6 @@
 cabal-version:       2.2
 name:                inline-c-cuda
-version:             0.5.0.0
+version:             0.1.0.0
 synopsis:            Lets you embed CUDA code into Haskell.
 description:         Utilities to inline CUDA code into Haskell using inline-c.  See
                      tests for example on how to build.
@@ -12,45 +12,12 @@ copyright:           (c) 2015-2016 FP Complete Corporation, (c) 2017-2019 France
 category:            FFI
 tested-with:         GHC == 8.4.4, GHC == 8.6.5, GHC == 8.8.4, GHC == 8.10.2
 build-type:          Simple
-extra-source-files:  test/*.h
 
 source-repository head
   type:     git
   location: https://github.com/fpco/inline-c
 
-flag std-vector-example
-  description:         Build std::vector example
-  default:             False
-
-common cxx-opts
-  -- These options are for compilation of C++ _files_. We need to duplicate
-  -- these in ghc-options to apply them on inline-c-cpp snippets.
-  -- This is partly(?) due to Cabal < 3.2.1.0 not passing cxx-options to
-  -- GHC 8.10 correctly. See https://github.com/haskell/cabal/issues/6421
-  cxx-options:
-    -- Compilers strive to be ABI compatible regardless of the C++ language
-    -- version (except perhaps experimental features).
-    -- Discussion: https://stackoverflow.com/questions/46746878/is-it-safe-to-link-c17-c14-and-c11-objects/49118876
-    -- We only have to raise this if a new inline-c-cpp feature requires us to
-    -- bundle C++ code that requires a newer version of the standard.
-    -- Generated code in user libraries will be compiled with the language
-    -- version configured there.
-    -std=c++11
-    -Wall
-
-  -- Linking to the C++ standard library
-  if impl(ghc >= 9.4)
-    build-depends: system-cxx-std-lib == 1.0
-  elif os(linux)
-    extra-libraries: stdc++
-  extra-libraries: cudart
-
-  ghc-options:
-    -optcxx-std=c++11
-    -optcxx-Wall
-
 library
-  import:              cxx-opts
   exposed-modules:     Language.C.Inline.Cuda
   build-depends:       base >=4.7 && <5
                      , bytestring
@@ -60,16 +27,13 @@ library
                      , text
                      , safe-exceptions
                      , containers
+                     , process
   hs-source-dirs:      src
   default-language:    Haskell2010
   ghc-options:         -Wall
-  include-dirs:        include
-  install-includes:    HaskellException.hxx HaskellStablePtr.hxx
-  cxx-sources:         cxx-src/HaskellException.cxx
-                       cxx-src/HaskellStablePtr.cxx
+  extra-libraries:     cudart
 
 test-suite tests
-  import:              cxx-opts
   type:                exitcode-stdio-1.0
   hs-source-dirs:      test
   main-is:             tests.hs
@@ -84,8 +48,3 @@ test-suite tests
                      , template-haskell
                      , vector
   default-language:    Haskell2010
-  cxx-options:         -Werror -std=c++11
-
-  if impl(ghc >= 8.10)
-    ghc-options:
-      -optcxx-Werror
diff --git a/inline-c-cuda/src/Language/C/Inline/Cuda.hs b/inline-c-cuda/src/Language/C/Inline/Cuda.hs
@@ -13,7 +13,6 @@ module Language.C.Inline.Cuda
   , Cpp.AbstractCppExceptionPtr
   ) where
 
-import           Data.Monoid ((<>), mempty)
 import qualified Language.Haskell.TH as TH
 import qualified Language.Haskell.TH.Syntax as TH
 
@@ -23,6 +22,21 @@ import qualified Language.C.Types as CT
 import qualified Language.C.Inline.Cpp as Cpp
 
 import qualified Data.Map as Map
+import           Control.Monad.IO.Class (liftIO)
+import           System.Exit (ExitCode(..))
+import           System.Process (readProcessWithExitCode)
+
+compileCuda :: String -> TH.Q FilePath
+compileCuda src = do
+  cuFile <- TH.addTempFile "cu"
+  oFile <- TH.addTempFile "o"
+  let (cmd,args) = ("nvcc", ["-c","-o",oFile, cuFile])
+  (code, stdout, stderr) <- liftIO $ do
+    writeFile cuFile src
+    readProcessWithExitCode cmd args ""
+  case code of
+    ExitFailure _ -> fail $ "Compile Command: " ++ (foldl (\a b -> a ++ " " ++ b) " " (cmd : args)) ++ "\n" ++ "    Output: " ++ stdout ++ "\n" ++ "    Error: " ++ stderr
+    ExitSuccess -> return oFile
 
 -- | The equivalent of 'C.baseCtx' for CUDA.  It specifies the @.cu@
 -- file extension for the CUDA file, so that nvcc will decide to build CUDA
@@ -33,7 +47,7 @@ cudaCtx = Cpp.cppCtx <> mempty
   { ctxForeignSrcLang = Just TH.RawObject
   , ctxOutput = Just $ \s -> "extern \"C\" {\n" ++ s ++ "\n}"
   , ctxEnableCpp = True
-  , ctxRawObjectCompileCommand = Just (\objFile cuFile -> ("nvcc", ["-c", "-o", objFile, cuFile]))
+  , ctxRawObjectCompile = Just compileCuda
   , ctxTypesTable = Map.singleton (CT.TypeName "std::exception_ptr") [t|Cpp.AbstractCppExceptionPtr|]
   }
 
diff --git a/inline-c-cuda/test/tests.hs b/inline-c-cuda/test/tests.hs
@@ -21,30 +21,24 @@
 
 import           Control.Exception.Safe
 import           Control.Monad
-import qualified Data.ByteString as BS
-import           Data.ByteString (ByteString)
 import qualified Language.C.Inline.Context as CC
 import qualified Language.C.Types as CT
 import qualified Language.C.Inline.Cuda as C
-import           Foreign.C.String (withCString)
-import           Foreign.StablePtr (StablePtr, newStablePtr, castStablePtrToPtr)
 import qualified Test.Hspec as Hspec
 import           Test.Hspec (shouldBe)
 import           Foreign.Ptr (Ptr)
-import           Data.List (isInfixOf)
 import           Data.Monoid
-import qualified Data.Vector.Storable as VS
+import           Foreign.Marshal.Array
+import           Foreign.Marshal.Alloc
+import           Foreign.Storable
 
 
 C.context $ C.cudaCtx
 
 C.include "<iostream>"
-C.include "<vector>"
-C.include "<array>"
-C.include "<tuple>"
 C.include "<stdexcept>"
 
-[C.verbatimBlock|
+[C.emitBlock|
 __global__ void
 vectorAdd(const float *A, const float *B, float *C, int numElements)
 {
@@ -57,11 +51,83 @@ vectorAdd(const float *A, const float *B, float *C, int numElements)
 }
 |]
 
+cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b
+cudaAllocaArray size func = do
+  let csize = fromIntegral size
+  alloca $ \(ptr_d_A :: Ptr (Ptr C.CFloat)) -> do
+    [C.block| void {
+      cudaError_t err = cudaMalloc((void **)$(float** ptr_d_A), $(int csize) * sizeof(float));
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+    } |]
+    d_A <- peekElemOff ptr_d_A 0
+    ret <- func d_A
+    [C.block| void {
+      cudaError_t err = cudaFree($(float* d_A));
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+    } |]
+    return ret
+
+cudaMemcpyHostToDevice :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
+cudaMemcpyHostToDevice num host device = do
+  let cnum = fromIntegral num
+  [C.block| void {
+      cudaError_t err = cudaMemcpy($(float* device), $(float* host), $(int cnum) * sizeof(float), cudaMemcpyHostToDevice);
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+  } |]
+
+cudaMemcpyDeviceToHost :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
+cudaMemcpyDeviceToHost num device host = do
+  let cnum = fromIntegral num
+  [C.block| void {
+      cudaError_t err = cudaMemcpy($(float* host), $(float* device), $(int cnum) * sizeof(float), cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+          fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
+          exit(EXIT_FAILURE);
+      }
+  } |]
+
 
 main :: IO ()
 main = Hspec.hspec $ do
   Hspec.describe "Basic CUDA" $ do
-    Hspec.it "Hello World" $ do
-      let x = 3
-      x `shouldBe` 3
-
+    Hspec.it "Add vectors on device" $ do
+      let numElements = 50000
+          cNumElements = fromIntegral numElements
+      allocaArray numElements $ \(h_A :: Ptr C.CFloat) -> do
+        allocaArray numElements $ \(h_B :: Ptr C.CFloat) -> do
+          allocaArray numElements $ \(h_C :: Ptr C.CFloat) -> do
+            cudaAllocaArray numElements $ \(d_A :: Ptr C.CFloat) -> do
+              cudaAllocaArray numElements $ \(d_B :: Ptr C.CFloat) -> do
+                cudaAllocaArray numElements $ \(d_C :: Ptr C.CFloat) -> do
+                  [C.block| void {
+                    for (int i = 0; i < $(int cNumElements); ++i)
+                    {
+                        $(float* h_A)[i] = rand()/(float)RAND_MAX;
+                        $(float* h_B)[i] = rand()/(float)RAND_MAX;
+                    }
+                  } |]
+                  cudaMemcpyHostToDevice numElements h_A d_A
+                  cudaMemcpyHostToDevice numElements h_B d_B
+                  [C.block| void {
+                    int threadsPerBlock = 256;
+                    int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
+                    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements));
+                  } |]
+                  cudaMemcpyDeviceToHost numElements d_C h_C
+                  lA <- peekArray numElements h_A
+                  lB <- peekArray numElements h_B
+                  lC <- peekArray numElements h_C
+                  all (< 1e-5) (map (\((a,b),c) -> abs(a + b - c)) (zip (zip lA lB) lC)) `shouldBe` True
diff --git a/stack.yaml b/stack.yaml
@@ -2,6 +2,7 @@ resolver: lts-20.11
 packages:
 - inline-c
 - inline-c-cpp
+- inline-c-cuda
 - inline-c-objc
 - sample-cabal-project
 extra-deps: