From 63ac4ff26ebff9a70de5a7af6f5c8ba78f326a9e Mon Sep 17 00:00:00 2001 From: Junji Hashimoto Date: Fri, 29 Sep 2023 20:23:53 +0900 Subject: [PATCH] Add inline-c-cuda package for CUDA --- inline-c-cuda/LICENSE | 20 +++ inline-c-cuda/README.md | 1 + inline-c-cuda/Setup.hs | 2 + inline-c-cuda/inline-c-cuda.cabal | 50 ++++++++ inline-c-cuda/src/Language/C/Inline/Cuda.hs | 53 ++++++++ inline-c-cuda/test/tests.hs | 133 ++++++++++++++++++++ inline-c/src/Language/C/Inline.hs | 1 + inline-c/src/Language/C/Inline/Context.hs | 5 + inline-c/src/Language/C/Inline/Internal.hs | 19 ++- 9 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 inline-c-cuda/LICENSE create mode 100644 inline-c-cuda/README.md create mode 100644 inline-c-cuda/Setup.hs create mode 100644 inline-c-cuda/inline-c-cuda.cabal create mode 100644 inline-c-cuda/src/Language/C/Inline/Cuda.hs create mode 100644 inline-c-cuda/test/tests.hs diff --git a/inline-c-cuda/LICENSE b/inline-c-cuda/LICENSE new file mode 100644 index 0000000..273e84e --- /dev/null +++ b/inline-c-cuda/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2015 FP Complete Corporation. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/inline-c-cuda/README.md b/inline-c-cuda/README.md new file mode 100644 index 0000000..a70a731 --- /dev/null +++ b/inline-c-cuda/README.md @@ -0,0 +1 @@ +Small set of utilities to inline CUDA code. See tests for example. diff --git a/inline-c-cuda/Setup.hs b/inline-c-cuda/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/inline-c-cuda/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/inline-c-cuda/inline-c-cuda.cabal b/inline-c-cuda/inline-c-cuda.cabal new file mode 100644 index 0000000..1af8144 --- /dev/null +++ b/inline-c-cuda/inline-c-cuda.cabal @@ -0,0 +1,50 @@ +cabal-version: 2.2 +name: inline-c-cuda +version: 0.1.0.0 +synopsis: Lets you embed CUDA code into Haskell. +description: Utilities to inline CUDA code into Haskell using inline-c. See + tests for example on how to build. +license: MIT +license-file: LICENSE +author: Francesco Mazzoli +maintainer: f@mazzo.li +copyright: (c) 2015-2016 FP Complete Corporation, (c) 2017-2019 Francesco Mazzoli +category: FFI +tested-with: GHC == 9.2.2 +build-type: Simple + +source-repository head + type: git + location: https://github.com/fpco/inline-c + +library + exposed-modules: Language.C.Inline.Cuda + build-depends: base >=4.7 && <5 + , bytestring + , inline-c >= 0.9.0.0 + , inline-c-cpp + , template-haskell + , text + , safe-exceptions + , containers + , process + hs-source-dirs: src + default-language: Haskell2010 + ghc-options: -Wall + extra-libraries: cudart + +test-suite tests + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: tests.hs + build-depends: base >=4 && <5 + , bytestring + , inline-c + , inline-c-cpp + , inline-c-cuda + , safe-exceptions + , hspec + , containers + , template-haskell + , vector + default-language: Haskell2010 diff --git a/inline-c-cuda/src/Language/C/Inline/Cuda.hs b/inline-c-cuda/src/Language/C/Inline/Cuda.hs new file mode 100644 index 0000000..a98557e --- /dev/null +++ b/inline-c-cuda/src/Language/C/Inline/Cuda.hs @@ -0,0 +1,53 @@ +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE OverloadedStrings #-} + +-- | Module exposing a 'Context' to inline CUDA code. We only have used +-- this for experiments, so use with caution. See the CUDA tests to see +-- how to build inline CUDA code. +module Language.C.Inline.Cuda + ( module Language.C.Inline + , cudaCtx + , Cpp.cppTypePairs + , Cpp.using + , Cpp.AbstractCppExceptionPtr + ) where + +import qualified Language.Haskell.TH as TH +import qualified Language.Haskell.TH.Syntax as TH + +import Language.C.Inline +import Language.C.Inline.Context +import qualified Language.C.Types as CT +import qualified Language.C.Inline.Cpp as Cpp + +import qualified Data.Map as Map +import Control.Monad.IO.Class (liftIO) +import System.Exit (ExitCode(..)) +import System.Process (readProcessWithExitCode) + +compileCuda :: String -> TH.Q FilePath +compileCuda src = do + cuFile <- TH.addTempFile "cu" + oFile <- TH.addTempFile "o" + let (cmd,args) = ("nvcc", ["-c","-o",oFile, cuFile]) + (code, stdout, stderr) <- liftIO $ do + writeFile cuFile src + readProcessWithExitCode cmd args "" + case code of + ExitFailure _ -> fail $ "Compile Command: " ++ (foldl (\a b -> a ++ " " ++ b) " " (cmd : args)) ++ "\n" ++ " Output: " ++ stdout ++ "\n" ++ " Error: " ++ stderr + ExitSuccess -> return oFile + +-- | The equivalent of 'C.baseCtx' for CUDA. It specifies the @.cu@ +-- file extension for the CUDA file, so that nvcc will decide to build CUDA +-- instead of C. See the @.cabal@ test target for an example on how to +-- build. +cudaCtx :: Context +cudaCtx = Cpp.cppCtx <> mempty + { ctxForeignSrcLang = Just TH.RawObject + , ctxOutput = Just $ \s -> "extern \"C\" {\n" ++ s ++ "\n}" + , ctxEnableCpp = True + , ctxRawObjectCompile = Just compileCuda + , ctxTypesTable = Map.singleton (CT.TypeName "std::exception_ptr") [t|Cpp.AbstractCppExceptionPtr|] + } + diff --git a/inline-c-cuda/test/tests.hs b/inline-c-cuda/test/tests.hs new file mode 100644 index 0000000..1e5e528 --- /dev/null +++ b/inline-c-cuda/test/tests.hs @@ -0,0 +1,133 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeInType #-} +{-# LANGUAGE TypeOperators #-} +{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE TypeApplications #-} +{-# OPTIONS_GHC -Wno-deprecations #-} + +import Control.Exception.Safe +import Control.Monad +import qualified Language.C.Inline.Context as CC +import qualified Language.C.Types as CT +import qualified Language.C.Inline.Cuda as C +import qualified Test.Hspec as Hspec +import Test.Hspec (shouldBe) +import Foreign.Ptr (Ptr) +import Data.Monoid +import Foreign.Marshal.Array +import Foreign.Marshal.Alloc +import Foreign.Storable + + +C.context $ C.cudaCtx + +C.include "" +C.include "" + +[C.emitBlock| +__global__ void +vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + C[i] = A[i] + B[i]; + } +} +|] + +cudaAllocaArray :: forall b. Int -> (Ptr C.CFloat -> IO b) -> IO b +cudaAllocaArray size func = do + let csize = fromIntegral size + alloca $ \(ptr_d_A :: Ptr (Ptr C.CFloat)) -> do + [C.block| void { + cudaError_t err = cudaMalloc((void **)$(float** ptr_d_A), $(int csize) * sizeof(float)); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } |] + d_A <- peekElemOff ptr_d_A 0 + ret <- func d_A + [C.block| void { + cudaError_t err = cudaFree($(float* d_A)); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } |] + return ret + +cudaMemcpyHostToDevice :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO () +cudaMemcpyHostToDevice num host device = do + let cnum = fromIntegral num + [C.block| void { + cudaError_t err = cudaMemcpy($(float* device), $(float* host), $(int cnum) * sizeof(float), cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to copy vector from host to device (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } |] + +cudaMemcpyDeviceToHost :: Int -> Ptr C.CFloat -> Ptr C.CFloat -> IO () +cudaMemcpyDeviceToHost num device host = do + let cnum = fromIntegral num + [C.block| void { + cudaError_t err = cudaMemcpy($(float* host), $(float* device), $(int cnum) * sizeof(float), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } |] + + +main :: IO () +main = Hspec.hspec $ do + Hspec.describe "Basic CUDA" $ do + Hspec.it "Add vectors on device" $ do + let numElements = 50000 + cNumElements = fromIntegral numElements + allocaArray numElements $ \(h_A :: Ptr C.CFloat) -> do + allocaArray numElements $ \(h_B :: Ptr C.CFloat) -> do + allocaArray numElements $ \(h_C :: Ptr C.CFloat) -> do + cudaAllocaArray numElements $ \(d_A :: Ptr C.CFloat) -> do + cudaAllocaArray numElements $ \(d_B :: Ptr C.CFloat) -> do + cudaAllocaArray numElements $ \(d_C :: Ptr C.CFloat) -> do + [C.block| void { + for (int i = 0; i < $(int cNumElements); ++i) + { + $(float* h_A)[i] = rand()/(float)RAND_MAX; + $(float* h_B)[i] = rand()/(float)RAND_MAX; + } + } |] + cudaMemcpyHostToDevice numElements h_A d_A + cudaMemcpyHostToDevice numElements h_B d_B + [C.block| void { + int threadsPerBlock = 256; + int blocksPerGrid =($(int cNumElements) + threadsPerBlock - 1) / threadsPerBlock; + vectorAdd<<>>($(float* d_A), $(float* d_B), $(float* d_C), $(int cNumElements)); + } |] + cudaMemcpyDeviceToHost numElements d_C h_C + lA <- peekArray numElements h_A + lB <- peekArray numElements h_B + lC <- peekArray numElements h_C + all (< 1e-5) (map (\((a,b),c) -> abs(a + b - c)) (zip (zip lA lB) lC)) `shouldBe` True diff --git a/inline-c/src/Language/C/Inline.hs b/inline-c/src/Language/C/Inline.hs index f864b11..7d7f1f2 100644 --- a/inline-c/src/Language/C/Inline.hs +++ b/inline-c/src/Language/C/Inline.hs @@ -42,6 +42,7 @@ module Language.C.Inline , block , include , verbatim + , emitBlock -- * 'Ptr' utils , withPtr diff --git a/inline-c/src/Language/C/Inline/Context.hs b/inline-c/src/Language/C/Inline/Context.hs index 516490a..42d12ed 100644 --- a/inline-c/src/Language/C/Inline/Context.hs +++ b/inline-c/src/Language/C/Inline/Context.hs @@ -161,6 +161,8 @@ data Context = Context , ctxForeignSrcLang :: Maybe TH.ForeignSrcLang -- ^ TH.LangC by default , ctxEnableCpp :: Bool + -- ^ Compile source code to raw object. + , ctxRawObjectCompile :: Maybe (String -> TH.Q FilePath) } @@ -172,6 +174,7 @@ instance Semigroup Context where , ctxOutput = ctxOutput ctx1 <|> ctxOutput ctx2 , ctxForeignSrcLang = ctxForeignSrcLang ctx1 <|> ctxForeignSrcLang ctx2 , ctxEnableCpp = ctxEnableCpp ctx1 || ctxEnableCpp ctx2 + , ctxRawObjectCompile = ctxRawObjectCompile ctx1 <|> ctxRawObjectCompile ctx2 } #endif @@ -182,6 +185,7 @@ instance Monoid Context where , ctxOutput = Nothing , ctxForeignSrcLang = Nothing , ctxEnableCpp = False + , ctxRawObjectCompile = Nothing } #if !MIN_VERSION_base(4,11,0) @@ -191,6 +195,7 @@ instance Monoid Context where , ctxOutput = ctxOutput ctx1 <|> ctxOutput ctx2 , ctxForeignSrcLang = ctxForeignSrcLang ctx1 <|> ctxForeignSrcLang ctx2 , ctxEnableCpp = ctxEnableCpp ctx1 || ctxEnableCpp ctx2 + , ctxRawObjectCompile = ctxRawObjectCompile ctx1 <|> ctxRawObjectCompile ctx2 } #endif diff --git a/inline-c/src/Language/C/Inline/Internal.hs b/inline-c/src/Language/C/Inline/Internal.hs index 2945dac..810dec2 100644 --- a/inline-c/src/Language/C/Inline/Internal.hs +++ b/inline-c/src/Language/C/Inline/Internal.hs @@ -32,6 +32,7 @@ module Language.C.Inline.Internal -- ** Emitting C code , emitVerbatim + , emitBlock -- ** Inlining C code -- $embedding @@ -168,11 +169,16 @@ initialiseModuleState mbContext = do Nothing -> fail "inline-c: ModuleState not present (initialiseModuleState)" Just ms -> return ms let lang = fromMaybe TH.LangC (ctxForeignSrcLang context) + addForeignSource = #if MIN_VERSION_base(4,12,0) - TH.addForeignSource lang (concat (reverse (msFileChunks ms))) + TH.addForeignSource #else - TH.addForeignFile lang (concat (reverse (msFileChunks ms))) + TH.addForeignFile #endif + src = (concat (reverse (msFileChunks ms))) + case (lang, ctxRawObjectCompile context) of + (TH.RawObject, Just compile) -> compile src >>= TH.addForeignFilePath lang + (_, _) -> addForeignSource lang src let moduleState = ModuleState { msContext = context , msGeneratedNames = 0 @@ -234,6 +240,15 @@ emitVerbatim s = do (ms{msFileChunks = chunk : msFileChunks ms}, ()) return [] +-- | Simply appends some string of block to the module's C file. Use with care. +emitBlock :: TH.QuasiQuoter +emitBlock = TH.QuasiQuoter + { TH.quoteExp = const $ fail "inline-c: quoteExp not implemented (quoteCode)" + , TH.quotePat = const $ fail "inline-c: quotePat not implemented (quoteCode)" + , TH.quoteType = const $ fail "inline-c: quoteType not implemented (quoteCode)" + , TH.quoteDec = emitVerbatim + } + ------------------------------------------------------------------------ -- Inlining