Skip to content

Commit

Permalink
Add inline-c-cuda package
Browse files Browse the repository at this point in the history
  • Loading branch information
junjihashimoto committed Aug 27, 2023
1 parent 7d6dd8f commit 9f3dc32
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 61 deletions.
2 changes: 1 addition & 1 deletion inline-c-cuda/README.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Small set of utilities to inline C++ code. See tests for example.
Small set of utilities to inline CUDA code. See tests for example.
47 changes: 3 additions & 44 deletions inline-c-cuda/inline-c-cuda.cabal
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cabal-version: 2.2
name: inline-c-cuda
version: 0.5.0.0
version: 0.1.0.0
synopsis: Lets you embed CUDA code into Haskell.
description: Utilities to inline CUDA code into Haskell using inline-c. See
tests for example on how to build.
Expand All @@ -12,45 +12,12 @@ copyright: (c) 2015-2016 FP Complete Corporation, (c) 2017-2019 France
category: FFI
tested-with: GHC == 8.4.4, GHC == 8.6.5, GHC == 8.8.4, GHC == 8.10.2
build-type: Simple
extra-source-files: test/*.h

source-repository head
type: git
location: https://github.com/fpco/inline-c

flag std-vector-example
description: Build std::vector example
default: False

common cxx-opts
-- These options are for compilation of C++ _files_. We need to duplicate
-- these in ghc-options to apply them on inline-c-cpp snippets.
-- This is partly(?) due to Cabal < 3.2.1.0 not passing cxx-options to
-- GHC 8.10 correctly. See https://github.com/haskell/cabal/issues/6421
cxx-options:
-- Compilers strive to be ABI compatible regardless of the C++ language
-- version (except perhaps experimental features).
-- Discussion: https://stackoverflow.com/questions/46746878/is-it-safe-to-link-c17-c14-and-c11-objects/49118876
-- We only have to raise this if a new inline-c-cpp feature requires us to
-- bundle C++ code that requires a newer version of the standard.
-- Generated code in user libraries will be compiled with the language
-- version configured there.
-std=c++11
-Wall

-- Linking to the C++ standard library
if impl(ghc >= 9.4)
build-depends: system-cxx-std-lib == 1.0
elif os(linux)
extra-libraries: stdc++
extra-libraries: cudart

ghc-options:
-optcxx-std=c++11
-optcxx-Wall

library
import: cxx-opts
exposed-modules: Language.C.Inline.Cuda
build-depends: base >=4.7 && <5
, bytestring
Expand All @@ -60,16 +27,13 @@ library
, text
, safe-exceptions
, containers
, process
hs-source-dirs: src
default-language: Haskell2010
ghc-options: -Wall
include-dirs: include
install-includes: HaskellException.hxx HaskellStablePtr.hxx
cxx-sources: cxx-src/HaskellException.cxx
cxx-src/HaskellStablePtr.cxx
extra-libraries: cudart

test-suite tests
import: cxx-opts
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: tests.hs
Expand All @@ -84,8 +48,3 @@ test-suite tests
, template-haskell
, vector
default-language: Haskell2010
cxx-options: -Werror -std=c++11

if impl(ghc >= 8.10)
ghc-options:
-optcxx-Werror
18 changes: 16 additions & 2 deletions inline-c-cuda/src/Language/C/Inline/Cuda.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ module Language.C.Inline.Cuda
, Cpp.AbstractCppExceptionPtr
) where

import Data.Monoid ((<>), mempty)
import qualified Language.Haskell.TH as TH
import qualified Language.Haskell.TH.Syntax as TH

Expand All @@ -23,6 +22,21 @@ import qualified Language.C.Types as CT
import qualified Language.C.Inline.Cpp as Cpp

import qualified Data.Map as Map
import Control.Monad.IO.Class (liftIO)
import System.Exit (ExitCode(..))
import System.Process (readProcessWithExitCode)

compileCuda :: String -> TH.Q FilePath
compileCuda src = do
cuFile <- TH.addTempFile "cu"
oFile <- TH.addTempFile "o"
let (cmd,args) = ("nvcc", ["-c","-o",oFile, cuFile])
(code, stdout, stderr) <- liftIO $ do
writeFile cuFile src
readProcessWithExitCode cmd args ""
case code of
ExitFailure _ -> fail $ "Compile Command: " ++ (foldl (\a b -> a ++ " " ++ b) " " (cmd : args)) ++ "\n" ++ " Output: " ++ stdout ++ "\n" ++ " Error: " ++ stderr
ExitSuccess -> return oFile

-- | The equivalent of 'C.baseCtx' for CUDA. It specifies the @.cu@
-- file extension for the CUDA file, so that nvcc will decide to build CUDA
Expand All @@ -33,7 +47,7 @@ cudaCtx = Cpp.cppCtx <> mempty
{ ctxForeignSrcLang = Just TH.RawObject
, ctxOutput = Just $ \s -> "extern \"C\" {\n" ++ s ++ "\n}"
, ctxEnableCpp = True
, ctxRawObjectCompileCommand = Just (\objFile cuFile -> ("nvcc", ["-c", "-o", objFile, cuFile]))
, ctxRawObjectCompile = Just compileCuda
, ctxTypesTable = Map.singleton (CT.TypeName "std::exception_ptr") [t|Cpp.AbstractCppExceptionPtr|]
}

94 changes: 80 additions & 14 deletions inline-c-cuda/test/tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,24 @@

import Control.Exception.Safe
import Control.Monad
import qualified Data.ByteString as BS
import Data.ByteString (ByteString)
import qualified Language.C.Inline.Context as CC
import qualified Language.C.Types as CT
import qualified Language.C.Inline.Cuda as C
import Foreign.C.String (withCString)
import Foreign.StablePtr (StablePtr, newStablePtr, castStablePtrToPtr)
import qualified Test.Hspec as Hspec
import Test.Hspec (shouldBe)
import Foreign.Ptr (Ptr)
import Data.List (isInfixOf)
import Data.Monoid
import qualified Data.Vector.Storable as VS
import Foreign.Marshal.Array
import Foreign.Marshal.Alloc
import Foreign.Storable


C.context $ C.cudaCtx

C.include "<iostream>"
C.include "<vector>"
C.include "<array>"
C.include "<tuple>"
C.include "<stdexcept>"

[C.verbatimBlock|
[C.emitBlock|
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
Expand All @@ -57,11 +51,83 @@ vectorAdd(const float *A, const float *B, float *C, int numElements)
}
|]

cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b
cudaAllocaArray size func = do
let csize = fromIntegral size
alloca $ \(ptr_d_A :: Ptr (Ptr C.CFloat)) -> do
[C.block| void {
cudaError_t err = cudaMalloc((void **)$(float** ptr_d_A), $(int csize) * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
} |]
d_A <- peekElemOff ptr_d_A 0
ret <- func d_A
[C.block| void {
cudaError_t err = cudaFree($(float* d_A));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
} |]
return ret

cudaMemcpyHostToDevice :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
cudaMemcpyHostToDevice num host device = do
let cnum = fromIntegral num
[C.block| void {
cudaError_t err = cudaMemcpy($(float* device), $(float* host), $(int cnum) * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
} |]

cudaMemcpyDeviceToHost :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO ()
cudaMemcpyDeviceToHost num device host = do
let cnum = fromIntegral num
[C.block| void {
cudaError_t err = cudaMemcpy($(float* host), $(float* device), $(int cnum) * sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
} |]


main :: IO ()
main = Hspec.hspec $ do
Hspec.describe "Basic CUDA" $ do
Hspec.it "Hello World" $ do
let x = 3
x `shouldBe` 3

Hspec.it "Add vectors on device" $ do
let numElements = 50000
cNumElements = fromIntegral numElements
allocaArray numElements $ \(h_A :: Ptr C.CFloat) -> do
allocaArray numElements $ \(h_B :: Ptr C.CFloat) -> do
allocaArray numElements $ \(h_C :: Ptr C.CFloat) -> do
cudaAllocaArray numElements $ \(d_A :: Ptr C.CFloat) -> do
cudaAllocaArray numElements $ \(d_B :: Ptr C.CFloat) -> do
cudaAllocaArray numElements $ \(d_C :: Ptr C.CFloat) -> do
[C.block| void {
for (int i = 0; i < $(int cNumElements); ++i)
{
$(float* h_A)[i] = rand()/(float)RAND_MAX;
$(float* h_B)[i] = rand()/(float)RAND_MAX;
}
} |]
cudaMemcpyHostToDevice numElements h_A d_A
cudaMemcpyHostToDevice numElements h_B d_B
[C.block| void {
int threadsPerBlock = 256;
int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock;
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements));
} |]
cudaMemcpyDeviceToHost numElements d_C h_C
lA <- peekArray numElements h_A
lB <- peekArray numElements h_B
lC <- peekArray numElements h_C
all (< 1e-5) (map (\((a,b),c) -> abs(a + b - c)) (zip (zip lA lB) lC)) `shouldBe` True
1 change: 1 addition & 0 deletions stack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ resolver: lts-20.11
packages:
- inline-c
- inline-c-cpp
- inline-c-cuda
- inline-c-objc
- sample-cabal-project
extra-deps:
Expand Down

0 comments on commit 9f3dc32

Please sign in to comment.