diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 87bb3a20f2a28..96ebc9192c708 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -205,17 +205,17 @@ effectiveStdenv.mkDerivation ( cmakeFlags = [ - (cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) - (cmakeBool "LLAMA_BLAS" useBlas) - (cmakeBool "LLAMA_CLBLAST" useOpenCL) - (cmakeBool "LLAMA_CUDA" useCuda) - (cmakeBool "LLAMA_HIPBLAS" useRocm) - (cmakeBool "LLAMA_METAL" useMetalKit) - (cmakeBool "LLAMA_VULKAN" useVulkan) - (cmakeBool "LLAMA_STATIC" enableStatic) + (cmakeBool "GGML_NATIVE" false) + (cmakeBool "GGML_BLAS" useBlas) + (cmakeBool "GGML_CLBLAST" useOpenCL) + (cmakeBool "GGML_CUDA" useCuda) + (cmakeBool "GGML_HIPBLAS" useRocm) + (cmakeBool "GGML_METAL" useMetalKit) + (cmakeBool "GGML_VULKAN" useVulkan) + (cmakeBool "GGML_STATIC" enableStatic) ] ++ optionals useCuda [ ( @@ -231,7 +231,7 @@ effectiveStdenv.mkDerivation ( ] ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") - (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) ]; # Environment variables needed for ROCm diff --git a/.github/labeler.yml b/.github/labeler.yml index 5c12bab735e9c..72c7959cc6ffe 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -2,31 +2,31 @@ Kompute: - changed-files: - any-glob-to-any-file: - - ggml-kompute.h - - ggml-kompute.cpp + - ggml/src/ggml-kompute.h + - ggml/src/ggml-kompute.cpp - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: - - ggml-metal.h - - ggml-metal.cpp + - ggml/src/ggml-metal.h + - ggml/src/ggml-metal.cpp - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: - - ggml-sycl.h - - ggml-sycl.cpp + - ggml/src/ggml-sycl.h + - ggml/src/ggml-sycl.cpp - README-sycl.md Nvidia GPU: - changed-files: - 
any-glob-to-any-file: - - ggml-cuda.h - - ggml-cuda/** + - ggml/src/ggml-cuda.h + - ggml/src/ggml-cuda/** Vulkan: - changed-files: - any-glob-to-any-file: - - ggml_vk_generate_shaders.py - - ggml-vulkan* + - ggml/ggml_vk_generate_shaders.py + - ggml/src/ggml-vulkan* documentation: - changed-files: - any-glob-to-any-file: @@ -73,10 +73,10 @@ server: ggml: - changed-files: - any-glob-to-any-file: - - ggml.c - - ggml.h - - ggml-*.c - - ggml-*.h + - ggml/include/ggml*.h + - ggml/src/ggml*.c + - ggml/src/ggml*.cpp + - ggml/src/ggml*.h - ggml-cuda/** nix: - changed-files: diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 88ab4844ef123..eb69b82c47e64 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -109,7 +109,7 @@ jobs: run: | set -eux cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DLLAMA_CUBLAS=ON \ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a8fcae0435e00..0d91fc4e4e965 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -305,7 +305,7 @@ jobs: run: | mkdir build cd build - cmake .. 
-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - name: Test @@ -335,7 +335,7 @@ jobs: run: | mkdir build cd build - cmake -DLLAMA_RPC=ON .. + cmake -DGGML_RPC=ON .. cmake --build . --config Release -j $(nproc) - name: Test @@ -363,7 +363,7 @@ jobs: run: | mkdir build cd build - cmake -DLLAMA_VULKAN=ON .. + cmake -DGGML_VULKAN=ON .. cmake --build . --config Release -j $(nproc) ubuntu-22-cmake-hip: @@ -384,13 +384,13 @@ jobs: - name: Build with native CMake HIP support id: cmake_build run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON cmake --build build --config Release -j $(nproc) - name: Build with legacy HIP support id: cmake_build_legacy_hip run: | - cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON + cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON cmake --build build2 --config Release -j $(nproc) ubuntu-22-cmake-sycl: @@ -431,7 +431,7 @@ jobs: source /opt/intel/oneapi/setvars.sh mkdir build cd build - cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. cmake --build . --config Release -j $(nproc) ubuntu-22-cmake-sycl-fp16: @@ -472,10 +472,10 @@ jobs: source /opt/intel/oneapi/setvars.sh mkdir build cd build - cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON .. + cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON .. cmake --build . 
--config Release -j $(nproc) - # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124 macOS-latest-make: @@ -497,15 +497,15 @@ jobs: env: LLAMA_FATAL_WARNINGS: 1 run: | - LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) - name: Test id: make_test run: | - LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) - LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu) + GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu) - # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know + # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584 # would be great if we fix these @@ -529,7 +529,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -559,13 +559,14 @@ jobs: mkdir build cd build cmake -G Xcode .. \ - -DLLAMA_METAL_EMBED_LIBRARY=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . 
--config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-cmake-tvos: runs-on: macos-latest @@ -588,13 +589,14 @@ jobs: mkdir build cd build cmake -G Xcode .. \ - -DLLAMA_METAL_EMBED_LIBRARY=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 - cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO macOS-latest-swift: runs-on: macos-latest @@ -662,7 +664,7 @@ jobs: - name: Build using make w/ OpenBLAS shell: msys2 {0} run: | - make LLAMA_OPENBLAS=1 -j $(nproc) + make GGML_OPENBLAS=1 -j $(nproc) - name: Build using CMake shell: msys2 {0} @@ -678,7 +680,7 @@ jobs: - name: Build using CMake w/ OpenBLAS shell: msys2 {0} run: | - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS cmake --build build --config ${{ matrix.build }} -j $(nproc) windows-latest-cmake: @@ -693,25 +695,25 @@ jobs: matrix: include: - build: 'rpc-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'avx-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' 
+ defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' - build: 'openblas-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON 
-DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -724,7 +726,7 @@ jobs: id: clone_kompute if: ${{ matrix.build == 'kompute-x64' }} run: | - git submodule update --init kompute + git submodule update --init ggml/src/kompute - name: Download OpenBLAS id: get_openblas @@ -854,7 +856,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -987,7 +989,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON cmake --build build --config Release ios-xcode-build: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 311abf02af807..99feb28f2a545 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -92,12 +92,12 @@ jobs: if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DLLAMA_OPENMP=OFF ; + -DGGML_OPENMP=OFF ; cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - name: Build @@ -105,7 +105,7 @@ jobs: if: ${{ matrix.sanitizer != 'THREAD' }} run: | cmake -B build \ - -DLLAMA_NATIVE=OFF \ + -DGGML_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ diff --git 
a/.gitignore b/.gitignore index a0c16e880b719..177e6a8dbb117 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ CMakeSettings.json compile_commands.json ggml-metal-embed.metal llama-batched-swift +/rpc-server out/ tmp/ diff --git a/.gitmodules b/.gitmodules index b7e8b8ff2f64e..5861d59cb785d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "kompute"] - path = kompute + path = ggml/src/kompute url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cfe08d7b7d59..6b9b541304306 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target project("llama.cpp" C CXX) include(CheckIncludeFileCXX) +#set(CMAKE_WARN_DEPRECATED YES) +set(CMAKE_WARN_UNUSED_CLI YES) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) @@ -32,1291 +35,73 @@ else() endif() endif() +option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) # -# Option list +# option list # -if (APPLE) - set(LLAMA_METAL_DEFAULT ON) - set(LLAMA_BLAS_DEFAULT ON) - set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") -else() - set(LLAMA_METAL_DEFAULT OFF) - set(LLAMA_BLAS_DEFAULT OFF) - set(LLAMA_BLAS_VENDOR_DEFAULT "Generic") -endif() - -set(LLAMA_LLAMAFILE_DEFAULT ON) - # general -option(BUILD_SHARED_LIBS "build shared libraries" OFF) -option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" ON) -option(LLAMA_LTO "llama: enable link time optimization" OFF) -option(LLAMA_CCACHE "llama: use ccache if available" ON) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) -option(LLAMA_GPROF "llama: enable gprof" OFF) +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) 
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) # build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) - -# instruction set specific -if (LLAMA_NATIVE) - set(INS_ENB OFF) -else() - set(INS_ENB ON) -endif() - -option(LLAMA_SVE "llama: enable SVE" OFF) -option(LLAMA_AVX "llama: enable AVX" ${INS_ENB}) -option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB}) -option(LLAMA_AVX512 "llama: enable AVX512" OFF) -option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) -option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) -option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF) -option(LLAMA_FMA "llama: enable FMA" ${INS_ENB}) -# in MSVC F16C is implied with AVX2/AVX512 -if (NOT MSVC) - option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) -endif() +option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) +option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) +option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) -if (WIN32) - set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") -endif() +# extra artifacts +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) -set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING - "llama: BLAS library vendor") -option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) -option(LLAMA_CUDA "llama: use 
CUDA" OFF) -option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) -option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) -option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) -set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") -set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") -option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) -set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") -set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING - "llama: max. batch size for using peer access") -option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) -option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) -option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF) - -option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) -option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) -option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) -option(LLAMA_VULKAN "llama: use Vulkan" OFF) -option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF) -option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF) -option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF) -option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF) -option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF) -option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) -option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) -option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) -option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) -set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING - "llama: metal minimum macOS version") -set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") -option(LLAMA_KOMPUTE "llama: use 
Kompute" OFF) -option(LLAMA_RPC "llama: use RPC" OFF) -option(LLAMA_OPENMP "llama: use OpenMP" ON) -option(LLAMA_SYCL "llama: use SYCL" OFF) -option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) -set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") -option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) -set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") - -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ON) -option(LLAMA_LASX "llama: enable lasx" ON) -option(LLAMA_LSX "llama: enable lsx" ON) - -# add perf arguments -option(LLAMA_PERF "llama: enable perf" OFF) +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) -# -# Compile flags -# - -if (LLAMA_SYCL) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(CMAKE_CXX_STANDARD_REQUIRED true) -set(CMAKE_C_STANDARD 11) -set(CMAKE_C_STANDARD_REQUIRED true) -set(THREADS_PREFER_PTHREAD_FLAG ON) - -find_package(Threads REQUIRED) -include(CheckCXXCompilerFlag) - -add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) - -# enable libstdc++ assertions for debug builds -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) -endif() - -if (NOT MSVC) - if (LLAMA_SANITIZE_THREAD) - add_compile_options(-fsanitize=thread) - link_libraries (-fsanitize=thread) - endif() - - if (LLAMA_SANITIZE_ADDRESS) - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) - endif() - - if (LLAMA_SANITIZE_UNDEFINED) - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) - endif() -endif() - -if (APPLE AND LLAMA_ACCELERATE) - 
find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - add_compile_definitions(GGML_USE_ACCELERATE) - add_compile_definitions(ACCELERATE_NEW_LAPACK) - add_compile_definitions(ACCELERATE_LAPACK_ILP64) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) - else() - message(WARNING "Accelerate framework not found") - endif() -endif() - -if (LLAMA_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - - message(STATUS "Metal framework found") - set(GGML_HEADERS_METAL ggml-metal.h) - set(GGML_SOURCES_METAL ggml-metal.m) - - add_compile_definitions(GGML_USE_METAL) - if (LLAMA_METAL_NDEBUG) - add_compile_definitions(GGML_METAL_NDEBUG) - endif() - - # copy ggml-common.h and ggml-metal.metal to bin directory - configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) - configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) - - if (LLAMA_METAL_EMBED_LIBRARY) - enable_language(ASM) - add_compile_definitions(GGML_METAL_EMBED_LIBRARY) - - set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") - set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") - - # merge ggml-common.h and ggml-metal.metal into a single file - set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") - set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") - - add_custom_command( - OUTPUT ${METALLIB_EMBED_ASM} - COMMAND echo "Embedding Metal library" - COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} - COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_start" 
>> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} - DEPENDS ggml-metal.metal ggml-common.h - COMMENT "Generate assembly for embedded Metal library" - ) - - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) - else() - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - else() - set(XC_FLAGS -O3) - endif() - - # Append macOS metal versioning flags - if (LLAMA_METAL_MACOSX_VERSION_MIN) - message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation") - list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN}) - endif() - if (LLAMA_METAL_STD) - message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation") - list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD}) - endif() - - add_custom_command( - OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ggml-metal.metal ggml-common.h - COMMENT "Compiling Metal kernels" - ) - - add_custom_target( - ggml-metal ALL - DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - endif() # LLAMA_METAL_EMBED_LIBRARY - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} - ${FOUNDATION_LIBRARY} - ${METAL_FRAMEWORK} - ${METALKIT_FRAMEWORK} - ) -endif() - -if (LLAMA_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - message(STATUS "OpenMP found") - add_compile_definitions(GGML_USE_OPENMP) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") - endif() -endif() - -if (LLAMA_BLAS) - if (LLAMA_STATIC) - set(BLA_STATIC ON) - endif() - #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - # set(BLA_SIZEOF_INTEGER 8) - #endif() - - set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) - find_package(BLAS) - - if (BLAS_FOUND) - message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - - if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
- # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_package(PkgConfig REQUIRED) - if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS REQUIRED blas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") - # As of openblas v0.3.22, the 64-bit is named openblas64.pc - pkg_check_modules(DepBLAS openblas64) - if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS REQUIRED openblas) - endif() - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") - pkg_check_modules(DepBLAS REQUIRED blis) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS REQUIRED blas-atlas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS REQUIRED flexiblas_api) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") - # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") - # this doesn't provide pkg-config - # suggest to assign BLAS_INCLUDE_DIRS on your own - if ("${NVHPC_VERSION}" STREQUAL "") - message(WARNING "Better to set NVHPC_VERSION") - else() - set(DepBLAS_FOUND ON) - set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") - endif() - endif() - if (DepBLAS_FOUND) - set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" - " detected by pkgconfig, trying to find cblas.h from possible paths...") - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - /opt/homebrew/opt/openblas/include - /usr/local/opt/openblas/include - /usr/include/x86_64-linux-gnu/openblas/include - ) - endif() - endif() - - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - - add_compile_options(${BLAS_LINKER_FLAGS}) - - add_compile_definitions(GGML_USE_BLAS) - - if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES 
"Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) - add_compile_definitions(GGML_BLAS_USE_MKL) - endif() - - set(GGML_HEADERS_BLAS ggml-blas.h) - set(GGML_SOURCES_BLAS ggml-blas.cpp) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct LLAMA_BLAS_VENDOR") +# override ggml options +set(GGML_CCACHE ${LLAMA_CCACHE}) +set(GGML_BUILD_SHARED_LIBS ${LLAMA_BUILD_SHARED_LIBS}) +set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) +set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) +set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) +set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) +set(GGML_LLAMAFILE ON) + +# transition helpers +function (llama_option_depr TYPE OLD NEW) + if (${OLD}) + message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") + set(${NEW} ON) endif() -endif() - -if (LLAMA_LLAMAFILE) - add_compile_definitions(GGML_USE_LLAMAFILE) - - set(GGML_HEADERS_LLAMAFILE sgemm.h) - set(GGML_SOURCES_LLAMAFILE sgemm.cpp) -endif() - -if (LLAMA_CUBLAS) - message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") - set(LLAMA_CUDA ON) -endif() - -if (LLAMA_CUDA) - cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - - find_package(CUDAToolkit) - if (CUDAToolkit_FOUND) - message(STATUS "CUDA found") - - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # 52 == lowest CUDA 12 standard - # 60 == f16 CUDA intrinsics - # 61 == integer CUDA intrinsics - # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster - if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics - else() - set(CMAKE_CUDA_ARCHITECTURES 
"52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics - #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work - endif() - endif() - message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - - enable_language(CUDA) - - set(GGML_HEADERS_CUDA ggml-cuda.h) - - file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - - add_compile_definitions(GGML_USE_CUDA) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) - if (LLAMA_CUDA_FORCE_DMMV) - add_compile_definitions(GGML_CUDA_FORCE_DMMV) - endif() - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() - if (LLAMA_CUDA_NO_VMM) - add_compile_definitions(GGML_CUDA_NO_VMM) - endif() - add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - if (DEFINED LLAMA_CUDA_DMMV_Y) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility - endif() - if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - add_compile_definitions(GGML_CUDA_F16) - endif() - add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) - if (LLAMA_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() - if (LLAMA_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND 
GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - endif() - - if (LLAMA_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) - else () - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - endif() - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - - if (LLAMA_CUDA_NO_VMM) - # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... - endif() - else() - message(WARNING "CUDA not found") - endif() -endif() - -if (LLAMA_RPC) - add_compile_definitions(GGML_USE_RPC) - - if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32) - endif() - - set(GGML_HEADERS_RPC ggml-rpc.h) - set(GGML_SOURCES_RPC ggml-rpc.cpp) -endif() - -if (LLAMA_VULKAN) - find_package(Vulkan) - if (Vulkan_FOUND) - message(STATUS "Vulkan found") - - set(GGML_HEADERS_VULKAN ggml-vulkan.h) - set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) - - add_compile_definitions(GGML_USE_VULKAN) - - # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build - # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector - if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) - endif() - - if (LLAMA_VULKAN_CHECK_RESULTS) - add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) - endif() - - if (LLAMA_VULKAN_DEBUG) - add_compile_definitions(GGML_VULKAN_DEBUG) - endif() - - if (LLAMA_VULKAN_MEMORY_DEBUG) - add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) - endif() - - if 
(LLAMA_VULKAN_VALIDATE) - add_compile_definitions(GGML_VULKAN_VALIDATE) - endif() - - if (LLAMA_VULKAN_RUN_TESTS) - add_compile_definitions(GGML_VULKAN_RUN_TESTS) - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) - else() - message(WARNING "Vulkan not found") - endif() -endif() - -if (LLAMA_HIPBLAS) - if (NOT EXISTS $ENV{ROCM_PATH}) - if (NOT EXISTS /opt/rocm) - set(ROCM_PATH /usr) - else() - set(ROCM_PATH /opt/rocm) - endif() - else() - set(ROCM_PATH $ENV{ROCM_PATH}) - endif() - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") - - # CMake on Windows doesn't support the HIP language yet - if(WIN32) - set(CXX_IS_HIPCC TRUE) - else() - string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") - endif() - - if(CXX_IS_HIPCC) - if(LINUX) - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() - - message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." - " Prefer setting the HIP compiler directly. See README for details.") - endif() - else() - # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. 
- if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) - endif() - cmake_minimum_required(VERSION 3.21) - enable_language(HIP) - endif() - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - - message(STATUS "HIP and hipBLAS found") - - set(GGML_HEADERS_ROCM ggml-cuda.h) - - file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) - - if (LLAMA_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) - endif() - - if (LLAMA_CUDA_FORCE_DMMV) - add_compile_definitions(GGML_CUDA_FORCE_DMMV) - endif() - - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() - - if (LLAMA_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() - - if (LLAMA_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - endif() - - add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - - if (CXX_IS_HIPCC) - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device) - else() - 
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) - endif() - - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) -endif() - -if (LLAMA_SYCL) - if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") - endif() - - if ( NOT DEFINED ENV{ONEAPI_ROOT}) - message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") - endif() - #todo: AOT - - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) - - message(STATUS "SYCL found") - - add_compile_definitions(GGML_USE_SYCL) - - if (LLAMA_SYCL_F16) - add_compile_definitions(GGML_SYCL_F16) - endif() - - if (LLAMA_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_SYCL_FORCE_MMQ) - endif() - - add_compile_options(-I./) #include DPCT - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - endif() - - set(GGML_HEADERS_SYCL ggml-sycl.h) - file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp") - list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") - - if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) - else() - add_compile_options(-I/${SYCL_INCLUDE_DIR}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") - if (LLAMA_SYCL_TARGET STREQUAL "INTEL") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) - elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl) - endif() - endif() -endif() - -if (LLAMA_KOMPUTE) - add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - 
find_package(Vulkan COMPONENTS glslc REQUIRED) - find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) - if (NOT glslc_executable) - message(FATAL_ERROR "glslc not found") - endif() - - function(compile_shader) - set(options) - set(oneValueArgs) - set(multiValueArgs SOURCES) - cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(source ${compile_shader_SOURCES}) - get_filename_component(filename ${source} NAME) - set(spv_file ${filename}.spv) - add_custom_command( - OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} - COMMENT "Compiling ${source} to ${spv_file}" - ) - - get_filename_component(RAW_FILE_NAME ${spv_file} NAME) - set(FILE_NAME "shader${RAW_FILE_NAME}") - string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) - string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) - string(REPLACE "." 
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - 
endforeach() - endfunction() - - if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") - message(STATUS "Kompute found") - set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") - add_subdirectory(kompute) - - # Compile our shaders - compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f32.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_f16.comp - kompute-shaders/op_rope_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp - ) - - # Create a custom target for our generated shaders - add_custom_target(generated_shaders DEPENDS - shaderop_scale.h - shaderop_scale_8.h - shaderop_add.h - shaderop_addrow.h - shaderop_mul.h - shaderop_silu.h - shaderop_relu.h - shaderop_gelu.h - shaderop_softmax.h - shaderop_norm.h - shaderop_rmsnorm.h - shaderop_diagmask.h - shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_f16.h - shaderop_mul_mat_q8_0.h - shaderop_mul_mat_q4_0.h - shaderop_mul_mat_q4_1.h - shaderop_mul_mat_q6_k.h - shaderop_getrows_f32.h - shaderop_getrows_f16.h - shaderop_getrows_q4_0.h - shaderop_getrows_q4_1.h - shaderop_getrows_q6_k.h - shaderop_rope_f16.h - shaderop_rope_f32.h - 
shaderop_cpy_f16_f16.h - shaderop_cpy_f16_f32.h - shaderop_cpy_f32_f16.h - shaderop_cpy_f32_f32.h - ) - - # Create a custom command that depends on the generated_shaders - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" - ) - - # Add the stamp to the main sources to ensure dependency tracking - set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - - add_compile_definitions(GGML_USE_KOMPUTE) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) - else() - message(WARNING "Kompute not found") - endif() -endif() - -if (LLAMA_CPU_HBM) - find_library(memkind memkind REQUIRED) - - add_compile_definitions(GGML_USE_CPU_HBM) - - target_link_libraries(ggml PUBLIC memkind) -endif() - -if (LLAMA_PERF) - add_compile_definitions(GGML_PERF) -endif() - -function(get_flags CCID CCVER) - set(C_FLAGS "") - set(CXX_FLAGS "") - - if (CCID MATCHES "Clang") - set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) - set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) - - if ( - (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR - (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) - ) - list(APPEND C_FLAGS -Wdouble-promotion) - endif() - elseif (CCID STREQUAL "GNU") - set(C_FLAGS -Wdouble-promotion) - set(CXX_FLAGS -Wno-array-bounds) - - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) - endif() - if (CCVER VERSION_GREATER_EQUAL 8.1.0) - list(APPEND CXX_FLAGS -Wextra-semi) - endif() - endif() - - set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) - set(GF_CXX_FLAGS 
${CXX_FLAGS} PARENT_SCOPE) endfunction() -if (LLAMA_FATAL_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND C_FLAGS -Werror) - list(APPEND CXX_FLAGS -Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - add_compile_options(/WX) - endif() -endif() - -if (LLAMA_ALL_WARNINGS) - if (NOT MSVC) - list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - - list(APPEND C_FLAGS ${WARNING_FLAGS}) - list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - - get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - - add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" - "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") - else() - # todo : msvc - set(C_FLAGS "") - set(CXX_FLAGS "") - endif() -endif() - -set(CUDA_CXX_FLAGS "") - -if (LLAMA_CUDA) - set(CUDA_FLAGS -use_fast_math) - - if (LLAMA_FATAL_WARNINGS) - list(APPEND CUDA_FLAGS -Werror all-warnings) - endif() - - if (LLAMA_ALL_WARNINGS AND NOT MSVC) - set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) - if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) - endif() - - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler --version - OUTPUT_VARIABLE CUDA_CCFULLVER - ERROR_QUIET - ) - - if (NOT CUDA_CCFULLVER MATCHES clang) - set(CUDA_CCID "GNU") - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" - OUTPUT_VARIABLE CUDA_CCVER - ERROR_QUIET - ) - else() - if (CUDA_CCFULLVER MATCHES Apple) - set(CUDA_CCID "AppleClang") - else() - set(CUDA_CCID "Clang") - endif() - string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) - endif() - - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") - - get_flags(${CUDA_CCID} ${CUDA_CCVER}) - 
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later - endif() - - if (NOT MSVC) - list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) - endif() -endif() - -if (WIN32) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) - - if (BUILD_SHARED_LIBS) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - endif() -endif() - -if (LLAMA_LTO) - include(CheckIPOSupported) - check_ipo_supported(RESULT result OUTPUT output) - if (result) - set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) - else() - message(WARNING "IPO is not supported: ${output}") - endif() -endif() - -if (LLAMA_CCACHE) - find_program(LLAMA_CCACHE_FOUND ccache) - if (LLAMA_CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set(ENV{CCACHE_SLOPPINESS} time_macros) - message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.") - else() - message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF") - endif () -endif() - -# this version of Apple ld64 is buggy -execute_process( - COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v - ERROR_VARIABLE output - OUTPUT_QUIET -) - -if (output MATCHES "dyld-1015\.7") - add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) -endif() - -# Architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue -message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") -else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") -endif () - -if (NOT MSVC) - if (LLAMA_STATIC) - add_link_options(-static) - if (MINGW) - add_link_options(-static-libgcc -static-libstdc++) - endif() - endif() - if (LLAMA_GPROF) - add_compile_options(-pg) - endif() -endif() - 
-set(ARCH_FLAGS "") - -if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - message(STATUS "ARM detected") - if (MSVC) - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead - add_compile_definitions(__ARM_NEON) - add_compile_definitions(__ARM_FEATURE_FMA) - - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - add_compile_definitions(__ARM_FEATURE_DOTPROD) - endif () - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) - endif () - - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - endif () - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access 
-funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) - endif() - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (LLAMA_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (LLAMA_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. 
- if (LLAMA_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) - endif() - if (LLAMA_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) - endif() - if (LLAMA_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) - endif() - elseif (LLAMA_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (LLAMA_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (LLAMA_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (LLAMA_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (LLAMA_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (LLAMA_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (LLAMA_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (LLAMA_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (LLAMA_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (LLAMA_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - - list(APPEND ARCH_FLAGS -march=loongarch64) - if (LLAMA_LASX) - list(APPEND ARCH_FLAGS -mlasx) - endif() - if (LLAMA_LSX) - list(APPEND ARCH_FLAGS -mlsx) - endif() - -else() - message(STATUS "Unknown architecture") -endif() - -add_compile_options("$<$:${ARCH_FLAGS}>") -add_compile_options("$<$:${ARCH_FLAGS}>") - -if (LLAMA_CUDA) - list(APPEND 
CUDA_CXX_FLAGS ${ARCH_FLAGS}) - list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") - list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) - endif() - add_compile_options("$<$:${CUDA_FLAGS}>") -endif() - -if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory - add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) -endif() +llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) +llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) +llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) +llama_option_depr(WARNING LLAMA_METAL GGML_METAL) +llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) +llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) +llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP) +llama_option_depr(WARNING LLAMA_RPC GGML_RPC) +llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) +llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) # -# POSIX conformance +# build the library # -# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) -add_compile_definitions(_XOPEN_SOURCE=600) - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - remove_definitions(-D_XOPEN_SOURCE=600) - add_compile_definitions(_XOPEN_SOURCE=700) -endif() - -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - add_compile_definitions(_GNU_SOURCE) -endif() - -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions -# 
similarly on DragonFly, enabling BSD extensions is necessary -if ( - CMAKE_SYSTEM_NAME MATCHES "Darwin" OR - CMAKE_SYSTEM_NAME MATCHES "iOS" OR - CMAKE_SYSTEM_NAME MATCHES "tvOS" OR - CMAKE_SYSTEM_NAME MATCHES "DragonFly" -) - add_compile_definitions(_DARWIN_C_SOURCE) -endif() - -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases -if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_compile_definitions(__BSD_VISIBLE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") - add_compile_definitions(_NETBSD_SOURCE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - add_compile_definitions(_BSD_SOURCE) -endif() - -# -# libraries -# - -# ggml - -add_library(ggml OBJECT - ggml.c - ggml.h - ggml-alloc.c - ggml-alloc.h - ggml-backend.c - ggml-backend.h - ggml-quants.c - ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} - ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} - ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} - ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} - ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} - ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} - ) - -target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) -target_compile_features (ggml PUBLIC c_std_11) # don't bump - -target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) - -add_library(ggml_static STATIC $) - -if (BUILD_SHARED_LIBS) - set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) - add_library(ggml_shared SHARED $) - target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) - install(TARGETS ggml_shared LIBRARY) -endif() - -# llama - -add_library(llama - llama.cpp - llama.h - unicode.h - unicode.cpp - unicode-data.cpp - ) - -target_include_directories(llama PUBLIC .) -target_compile_features (llama PUBLIC cxx_std_11) # don't bump - -target_link_libraries(llama PRIVATE - ggml - ${LLAMA_EXTRA_LIBS} - ) - -if (BUILD_SHARED_LIBS) - set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) - if (LLAMA_METAL) - set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") - endif() -endif() - +add_subdirectory(ggml) +add_subdirectory(src) # # install @@ -1325,17 +110,19 @@ endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} - CACHE PATH "Location of header files") -set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} - CACHE PATH "Location of library files") -set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} - CACHE PATH "Location of binary files") -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) + +set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") + 
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS) +set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) +install(TARGETS llama LIBRARY PUBLIC_HEADER) + configure_package_config_file( ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake @@ -1353,17 +140,6 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama) -set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" - "${GGML_HEADERS_CUDA}" - "${GGML_HEADERS_METAL}" - "${GGML_HEADERS_EXTRA}") - -set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") -install(TARGETS ggml PUBLIC_HEADER) - -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h) -install(TARGETS llama LIBRARY PUBLIC_HEADER) - install( FILES convert-hf-to-gguf.py PERMISSIONS @@ -1375,22 +151,6 @@ install( WORLD_READ WORLD_EXECUTE DESTINATION ${CMAKE_INSTALL_BINDIR}) -if (LLAMA_METAL) - install( - FILES ggml-metal.metal - PERMISSIONS - OWNER_READ - OWNER_WRITE - GROUP_READ - WORLD_READ - DESTINATION ${CMAKE_INSTALL_BINDIR}) - if (NOT LLAMA_METAL_EMBED_LIBRARY) - install( - FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - endif() -endif() configure_file(cmake/llama.pc.in "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" diff --git a/CMakePresets.json b/CMakePresets.json index fba22af9a6bab..d69bc03447ae9 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -19,14 +19,14 @@ "cacheVariables": { "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", "CMAKE_CXX_COMPILER": "icx", - "LLAMA_SYCL": "ON", + "GGML_SYCL": "ON", "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
} }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, - { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "arm64-windows-msvc", "hidden": true, diff --git a/Makefile b/Makefile index 4ea59c0b4ef29..862dbcb712e13 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,85 @@ TEST_TARGETS = \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm -# Code coverage output files -COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report +# Deprecation aliases +ifdef LLAMA_CUBLAS +$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) 
+endif + +ifdef LLAMA_CUDA +GGML_CUDA := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_KOMPUTE +GGML_KOMPUTE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_METAL +GGML_METAL := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_METAL_EMBED_LIBRARY +GGML_METAL_EMBED_LIBRARY := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENMP +GGML_OPENMP := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_RPC +GGML_RPC := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_SYCL +GGML_SYCL := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_SYCL_F16 +GGML_SYCL_F16 := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENBLAS +GGML_OPENBLAS := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_OPENBLAS64 +GGML_OPENBLAS64 := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_BLIS +GGML_BLIS := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_LLAMAFILE +GGML_NO_LLAMAFILE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_ACCELERATE +GGML_NO_ACCELERATE := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_OPENMP +GGML_NO_OPENMP := 1 +DEPRECATE_WARNING := 1 +endif + +ifdef LLAMA_NO_METAL +GGML_NO_METAL := 1 +DEPRECATE_WARNING := 1 +endif ifndef UNAME_S UNAME_S := $(shell uname -s) @@ -90,11 +167,11 @@ endif # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) - ifndef LLAMA_NO_METAL - LLAMA_METAL := 1 + ifndef GGML_NO_METAL + GGML_METAL := 1 endif - LLAMA_NO_OPENMP := 1 + GGML_NO_OPENMP := 1 ifneq ($(UNAME_P),arm) SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) @@ -106,7 +183,7 @@ ifeq ($(UNAME_S),Darwin) endif endif -ifdef LLAMA_RPC +ifdef GGML_RPC BUILD_TARGETS += rpc-server endif @@ -147,18 +224,6 @@ test: $(TEST_TARGETS) all: $(BUILD_TARGETS) $(TEST_TARGETS) -coverage: ## Run code coverage - gcov -pb tests/*.cpp - -lcov-report: coverage ## Generate lcov report - mkdir -p lcov-report - lcov --capture --directory . 
--output-file lcov-report/coverage.info - genhtml lcov-report/coverage.info --output-directory lcov-report - -gcovr-report: coverage ## Generate gcovr report - mkdir -p gcovr-report - gcovr --root . --html --html-details --output gcovr-report/coverage.html - ifdef RISCV_CROSS_COMPILE CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ @@ -169,26 +234,11 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -I. -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 -# -Ofast tends to produce faster code, but may not be available for some compilers. -ifdef LLAMA_FAST -MK_CFLAGS += -Ofast -HOST_CXXFLAGS += -Ofast -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG -else -MK_CFLAGS += -O3 -MK_CXXFLAGS += -O3 -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG -endif # LLAMA_FAST - ifndef LLAMA_NO_CCACHE CCACHE := $(shell which ccache) ifdef CCACHE @@ -243,8 +293,8 @@ ifeq ($(UNAME_S),OpenBSD) MK_CPPFLAGS += -D_BSD_SOURCE endif -ifdef LLAMA_SCHED_MAX_COPIES - MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES) +ifdef GGML_SCHED_MAX_COPIES + MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES) endif ifdef LLAMA_DEBUG @@ -287,19 +337,31 @@ ifdef LLAMA_SERVER_SSL MK_LDFLAGS += -lssl -lcrypto endif -ifdef LLAMA_CODE_COVERAGE - MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' -endif - ifdef LLAMA_DISABLE_LOGS MK_CPPFLAGS += -DLOG_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS # warnings -WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \ - -Werror=implicit-function-declaration -MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn +WARN_FLAGS = \ + -Wall \ + -Wextra \ + -Wpedantic \ + -Wcast-qual \ + -Wno-unused-function + +MK_CFLAGS += \ + 
$(WARN_FLAGS) \ + -Wshadow \ + -Wstrict-prototypes \ + -Wpointer-arith \ + -Wmissing-prototypes \ + -Werror=implicit-int \ + -Werror=implicit-function-declaration + +MK_CXXFLAGS += \ + $(WARN_FLAGS) \ + -Wmissing-declarations \ + -Wmissing-noreturn ifeq ($(LLAMA_FATAL_WARNINGS),1) MK_CFLAGS += -Werror @@ -344,9 +406,6 @@ ifdef LLAMA_GPROF MK_CFLAGS += -pg MK_CXXFLAGS += -pg endif -ifdef LLAMA_PERF - MK_CPPFLAGS += -DGGML_PERF -endif # Architecture specific # TODO: probably these flags need to be tweaked on some architectures @@ -437,7 +496,7 @@ else MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif -ifndef LLAMA_NO_ACCELERATE +ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) @@ -445,138 +504,153 @@ ifndef LLAMA_NO_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate - OBJS += ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas.o endif -endif # LLAMA_NO_ACCELERATE +endif # GGML_NO_ACCELERATE -ifndef LLAMA_NO_OPENMP +ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp -endif # LLAMA_NO_OPENMP +endif # GGML_NO_OPENMP -ifdef LLAMA_OPENBLAS +ifdef GGML_OPENBLAS MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJS += ggml-blas.o -endif # LLAMA_OPENBLAS + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_OPENBLAS -ifdef LLAMA_OPENBLAS64 +ifdef GGML_OPENBLAS64 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJS += ggml-blas.o -endif # LLAMA_OPENBLAS64 + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_OPENBLAS64 -ifdef LLAMA_BLIS +ifdef GGML_BLIS MK_CPPFLAGS += 
-DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib - OBJS += ggml-blas.o -endif # LLAMA_BLIS + OBJ_GGML += ggml/src/ggml-blas.o +endif # GGML_BLIS -ifndef LLAMA_NO_LLAMAFILE +ifndef GGML_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJS += sgemm.o + OBJ_GGML += ggml/src/sgemm.o endif -ifdef LLAMA_RPC - MK_CPPFLAGS += -DGGML_USE_RPC - OBJS += ggml-rpc.o -endif # LLAMA_RPC +ifdef GGML_RPC + MK_CPPFLAGS += -DGGML_USE_RPC + OBJ_GGML += ggml/src/ggml-rpc.o +endif # GGML_RPC -ifdef LLAMA_CUBLAS -# LLAMA_CUBLAS is deprecated and will be removed in the future - LLAMA_CUDA := 1 -endif +OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu)) +OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu)) -OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu)) -OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu)) -ifdef LLAMA_CUDA_FA_ALL_QUANTS - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu)) +ifdef GGML_CUDA_FA_ALL_QUANTS + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu)) else - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) - OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) -endif # LLAMA_CUDA_FA_ALL_QUANTS + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)) + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)) + OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard 
ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)) +endif # GGML_CUDA_FA_ALL_QUANTS -ifdef LLAMA_CUDA +ifdef GGML_CUDA ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /opt/cuda else CUDA_PATH ?= /usr/local/cuda endif + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - OBJS += ggml-cuda.o - OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) - OBJS += $(OBJS_CUDA_TEMP_INST) MK_NVCCFLAGS += -use_fast_math + + OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) + ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS + ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT + ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo endif # LLAMA_DEBUG -ifdef LLAMA_CUDA_DEBUG + +ifdef GGML_CUDA_DEBUG MK_NVCCFLAGS += --device-debug -endif # LLAMA_CUDA_DEBUG -ifdef LLAMA_CUDA_NVCC - NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) +endif # GGML_CUDA_DEBUG + +ifdef GGML_CUDA_NVCC + NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else NVCC = $(CCACHE) nvcc -endif #LLAMA_CUDA_NVCC +endif #GGML_CUDA_NVCC + ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) else ifndef CUDA_POWER_ARCH MK_NVCCFLAGS += -arch=native endif # CUDA_DOCKER_ARCH -ifdef LLAMA_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_FORCE_DMMV MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # LLAMA_CUDA_FORCE_DMMV -ifdef LLAMA_CUDA_FORCE_MMQ +endif # GGML_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_FORCE_MMQ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ -endif # LLAMA_CUDA_FORCE_MMQ -ifdef LLAMA_CUDA_DMMV_X - MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) +endif 
# GGML_CUDA_FORCE_MMQ + +ifdef GGML_CUDA_DMMV_X + MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) else MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 -endif # LLAMA_CUDA_DMMV_X -ifdef LLAMA_CUDA_MMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) -else ifdef LLAMA_CUDA_DMMV_Y - MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility +endif # GGML_CUDA_DMMV_X + +ifdef GGML_CUDA_MMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) +else ifdef GGML_CUDA_DMMV_Y + MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility else MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 -endif # LLAMA_CUDA_MMV_Y -ifdef LLAMA_CUDA_F16 +endif # GGML_CUDA_MMV_Y + +ifdef GGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # LLAMA_CUDA_F16 -ifdef LLAMA_CUDA_DMMV_F16 +endif # GGML_CUDA_F16 + +ifdef GGML_CUDA_DMMV_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 -endif # LLAMA_CUDA_DMMV_F16 -ifdef LLAMA_CUDA_KQUANTS_ITER - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +endif # GGML_CUDA_DMMV_F16 + +ifdef GGML_CUDA_KQUANTS_ITER + MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) else MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif -ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE - MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) + +ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE -ifdef LLAMA_CUDA_NO_PEER_COPY +endif # GGML_CUDA_PEER_MAX_BATCH_SIZE + +ifdef GGML_CUDA_NO_PEER_COPY MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # LLAMA_CUDA_NO_PEER_COPY -ifdef LLAMA_CUDA_CCBIN - MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) -endif # LLAMA_CUDA_CCBIN -ifdef LLAMA_CUDA_FA_ALL_QUANTS +endif # GGML_CUDA_NO_PEER_COPY + +ifdef GGML_CUDA_CCBIN + MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN) +endif # GGML_CUDA_CCBIN + +ifdef 
GGML_CUDA_FA_ALL_QUANTS MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS -endif # LLAMA_CUDA_FA_ALL_QUANTS +endif # GGML_CUDA_FA_ALL_QUANTS ifdef JETSON_EOL_MODULE_DETECT define NVCC_COMPILE @@ -588,135 +662,187 @@ define NVCC_COMPILE endef # NVCC_COMPILE endif # JETSON_EOL_MODULE_DETECT -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: \ + ggml/src/ggml-cuda/%.cu \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h \ + ggml/src/ggml-cuda/common.cuh $(NVCC_COMPILE) -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ + ggml/src/ggml-cuda.h \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend-impl.h \ + ggml/src/ggml-common.h \ + $(wildcard ggml/src/ggml-cuda/*.cuh) $(NVCC_COMPILE) -endif # LLAMA_CUDA +endif # GGML_CUDA -ifdef LLAMA_VULKAN - MK_CPPFLAGS += -DGGML_USE_VULKAN - MK_LDFLAGS += -lvulkan - OBJS += ggml-vulkan.o +ifdef GGML_VULKAN + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += -lvulkan + OBJ_GGML += ggml/src/ggml-vulkan.o -ifdef LLAMA_VULKAN_CHECK_RESULTS +ifdef GGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS endif -ifdef LLAMA_VULKAN_DEBUG +ifdef GGML_VULKAN_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_DEBUG endif -ifdef LLAMA_VULKAN_MEMORY_DEBUG +ifdef GGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG endif -ifdef LLAMA_VULKAN_VALIDATE +ifdef GGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE endif -ifdef LLAMA_VULKAN_RUN_TESTS +ifdef GGML_VULKAN_RUN_TESTS MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS endif -ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h +ggml/src/ggml-vulkan.o: \ + ggml/src/ggml-vulkan.cpp \ + ggml/src/ggml-vulkan.h $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # LLAMA_VULKAN +endif # GGML_VULKAN -ifdef LLAMA_HIPBLAS +ifdef GGML_HIPBLAS ifeq ($(wildcard /opt/rocm),) - ROCM_PATH ?= /usr + ROCM_PATH ?= /usr AMDGPU_TARGETS ?= 
$(shell $(shell which amdgpu-arch)) else ROCM_PATH ?= /opt/rocm AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc - LLAMA_CUDA_DMMV_X ?= 32 - LLAMA_CUDA_MMV_Y ?= 1 - LLAMA_CUDA_KQUANTS_ITER ?= 2 + + GGML_CUDA_DMMV_X ?= 32 + GGML_CUDA_MMV_Y ?= 1 + GGML_CUDA_KQUANTS_ITER ?= 2 + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA -ifdef LLAMA_HIP_UMA + +ifdef GGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA -endif # LLAMA_HIP_UMA - MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib - MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 - MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas - HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) - HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) - HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) - HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) -ifdef LLAMA_CUDA_FORCE_DMMV - HIPFLAGS += -DGGML_CUDA_FORCE_DMMV -endif # LLAMA_CUDA_FORCE_DMMV -ifdef LLAMA_CUDA_NO_PEER_COPY - HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY -endif # LLAMA_CUDA_NO_PEER_COPY - OBJS += ggml-cuda.o - OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) - OBJS += $(OBJS_CUDA_TEMP_INST) - -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) +endif # GGML_HIP_UMA + + MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib + MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 + MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas + + HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc + + HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) + HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) + HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) + HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) + +ifdef GGML_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # GGML_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_NO_PEER_COPY + HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # 
GGML_CUDA_NO_PEER_COPY + + OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) + +ggml/src/ggml-cuda.o: \ + ggml/src/ggml-cuda.cu \ + ggml/src/ggml-cuda.h \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend-impl.h \ + ggml/src/ggml-common.h \ + $(wildcard ggml/src/ggml-cuda/*.cuh) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh +ggml/src/ggml-cuda/%.o: \ + ggml/src/ggml-cuda/%.cu \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h \ + ggml/src/ggml-cuda/common.cuh $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +endif # GGML_HIPBLAS -endif # LLAMA_HIPBLAS - -ifdef LLAMA_METAL +ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJS += ggml-metal.o -ifdef LLAMA_METAL_NDEBUG + OBJ_GGML += ggml/src/ggml-metal.o +ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif -ifdef LLAMA_METAL_EMBED_LIBRARY +ifdef GGML_METAL_EMBED_LIBRARY MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY - OBJS += ggml-metal-embed.o + OBJ_GGML += ggml/src/ggml-metal-embed.o endif -endif # LLAMA_METAL +endif # GGML_METAL -ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h +ifdef GGML_METAL +ggml/src/ggml-metal.o: \ + ggml/src/ggml-metal.m \ + ggml/src/ggml-metal.h \ + ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ -ifdef LLAMA_METAL_EMBED_LIBRARY -ggml-metal-embed.o: ggml-metal.metal ggml-common.h +ifdef GGML_METAL_EMBED_LIBRARY +ggml/src/ggml-metal-embed.o: \ + ggml/src/ggml-metal.metal \ + ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal + @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > 
ggml/src/ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) @$(AS) $(TEMP_ASSEMBLY) -o $@ @rm -f ${TEMP_ASSEMBLY} endif -endif # LLAMA_METAL +endif # GGML_METAL -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o -COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o +OBJ_GGML += \ + ggml/src/ggml.o \ + ggml/src/ggml-alloc.o \ + ggml/src/ggml-backend.o \ + ggml/src/ggml-quants.o -ifndef LLAMA_NO_LLAMAFILE -sgemm.o: sgemm.cpp sgemm.h ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif +OBJ_LLAMA = \ + src/llama.o \ + src/unicode.o \ + src/unicode-data.o -ifdef LLAMA_RPC -ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +OBJ_COMMON = \ + common/common.o \ + common/console.o \ + common/ngram-cache.o \ + common/sampling.o \ + common/train.o \ + common/grammar-parser.o \ + common/build-info.o \ + common/json-schema-to-grammar.o -rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON) -rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif # LLAMA_RPC +LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT) +LIB_GGML_S = 
$(LIB_PRE)ggml.a + +LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT) +LIB_LLAMA_S = $(LIB_PRE)llama.a + +LIB_COMMON = $(LIB_PRE)common$(DSO_EXT) +LIB_COMMON_S = $(LIB_PRE)common.a + +LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON) +LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S) GF_CC := $(CC) include scripts/get-flags.mk @@ -759,81 +885,199 @@ ifdef LLAMA_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) + ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus ) endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH + endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # LLAMA_CUDA $(info ) -ifdef LLAMA_CUBLAS -$(info !!!!) -$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.) -$(info !!!!) +ifdef DEPRECATE_WARNING +$(info !!! DEPRECATION WARNING !!!) +$(info The following LLAMA_ options are deprecated and will be removed in the future. 
Use the GGML_ prefix instead) +$(info - LLAMA_CUDA) +$(info - LLAMA_METAL) +$(info - LLAMA_METAL_EMBED_LIBRARY) +$(info - LLAMA_OPENMP) +$(info - LLAMA_RPC) +$(info - LLAMA_SYCL) +$(info - LLAMA_SYCL_F16) +$(info - LLAMA_OPENBLAS) +$(info - LLAMA_OPENBLAS64) +$(info - LLAMA_BLIS) +$(info - LLAMA_NO_LLAMAFILE) +$(info - LLAMA_NO_ACCELERATE) +$(info - LLAMA_NO_OPENMP) +$(info - LLAMA_NO_METAL) $(info ) endif # -# Build library +# Build libraries # -ggml.o: ggml.c ggml.h ggml-cuda.h +# ggml + +ggml/src/ggml.o: \ + ggml/src/ggml.c \ + ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h +ggml/src/ggml-alloc.o: \ + ggml/src/ggml-alloc.c \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h +ggml/src/ggml-backend.o: \ + ggml/src/ggml-backend.c \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h +ggml/src/ggml-quants.o: \ + ggml/src/ggml-quants.c \ + ggml/include/ggml.h \ + ggml/src/ggml-quants.h \ + ggml/src/ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-blas.o: ggml-blas.cpp ggml-blas.h +ggml/src/ggml-blas.o: \ + ggml/src/ggml-blas.cpp \ + ggml/src/ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ifndef GGML_NO_LLAMAFILE +ggml/src/sgemm.o: \ + ggml/src/sgemm.cpp \ + ggml/src/sgemm.h \ + ggml/include/ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_NO_LLAMAFILE -unicode.o: unicode.cpp unicode.h +ifdef GGML_RPC +ggml/src/ggml-rpc.o: \ + ggml/src/ggml-rpc.cpp \ + ggml/src/ggml-rpc.h $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # GGML_RPC -unicode-data.o: unicode-data.cpp unicode-data.h +$(LIB_GGML): \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + +$(LIB_GGML_S): \ + $(OBJ_GGML) + ar rcs $(LIB_GGML_S) $^ + +# llama + +src/unicode.o: \ + src/unicode.cpp \ + src/unicode.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/unicode-data.o: \ + 
src/unicode-data.cpp \ + src/unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ -llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h +src/llama.o: \ + src/llama.cpp \ + src/unicode.h \ + include/llama.h \ + ggml/src/ggml-cuda.h \ + ggml/src/ggml-metal.h \ + ggml/include/ggml.h \ + ggml/include/ggml-alloc.h \ + ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: common/common.cpp $(COMMON_H_DEPS) +$(LIB_LLAMA): \ + $(OBJ_LLAMA) \ + $(LIB_GGML) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + +$(LIB_LLAMA_S): \ + $(OBJ_LLAMA) + ar rcs $(LIB_LLAMA_S) $^ + +# common + +common/common.o: \ + common/common.cpp \ + common/common.h \ + common/console.h \ + common/sampling.h \ + common/json.hpp \ + common/json-schema-to-grammar.h \ + include/llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -sampling.o: common/sampling.cpp $(COMMON_H_DEPS) +common/sampling.o: \ + common/sampling.cpp \ + common/sampling.h \ + include/llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -console.o: common/console.cpp common/console.h +common/console.o: \ + common/console.cpp \ + common/console.h $(CXX) $(CXXFLAGS) -c $< -o $@ -grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h +common/grammar-parser.o: \ + common/grammar-parser.cpp \ + common/grammar-parser.h $(CXX) $(CXXFLAGS) -c $< -o $@ -json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h +common/json-schema-to-grammar.o: \ + common/json-schema-to-grammar.cpp \ + common/json-schema-to-grammar.h $(CXX) $(CXXFLAGS) -c $< -o $@ -train.o: common/train.cpp common/train.h +common/train.o: \ + common/train.cpp \ + common/train.h $(CXX) $(CXXFLAGS) -c $< -o $@ -ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h +common/ngram-cache.o: \ + common/ngram-cache.cpp \ + common/ngram-cache.h $(CXX) $(CXXFLAGS) -c $< -o $@ -libllama.so: llama.o ggml.o $(OBJS) +$(LIB_COMMON): \ + $(OBJ_COMMON) \ + $(LIB_LLAMA) \ + $(LIB_GGML) $(CXX) $(CXXFLAGS) -shared 
-fPIC -o $@ $^ $(LDFLAGS) -libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) - ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) +$(LIB_COMMON_S): \ + $(OBJ_COMMON) + ar rcs $(LIB_COMMON_S) $^ clean: - rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) - rm -vrf ggml-cuda/*.o - rm -vrf ggml-cuda/template-instances/*.o + rm -vrf ggml/src/*.o src/*.o tests/*.o common/*.o *.a *.so ggml*.so *.dll common/build-info.cpp *.dot $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf ggml/src/*.o + rm -rvf src/*.o + rm -rvf tests/*.o + rm -rvf common/*.o + rm -rvf *.a + rm -rvf *.dll + rm -rvf *.so + rm -rvf *.dot + rm -rvf ggml/*.a + rm -rvf ggml/*.dll + rm -rvf ggml/*.so + rm -rvf common/build-info.cpp + rm -vrf ggml/src/ggml-metal-embed.metal + rm -vrf ggml/src/ggml-cuda/*.o + rm -vrf ggml/src/ggml-cuda/template-instances/*.o + rm -rvf $(BUILD_TARGETS) + rm -rvf $(TEST_TARGETS) find examples pocs -type f -name "*.o" -delete # @@ -847,162 +1091,233 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) -llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-cli: examples/main/main.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo @echo '==== Run ./llama-cli -h for help. 
====' @echo -llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) +llama-infill: examples/infill/infill.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-simple: examples/simple/simple.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-tokenize: examples/tokenize/tokenize.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched: examples/batched/batched.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-batched-bench: examples/batched-bench/batched-bench.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-quantize: examples/quantize/quantize.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) +llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call 
GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-perplexity: examples/perplexity/perplexity.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-imatrix: examples/imatrix/imatrix.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-embedding: examples/embedding/embedding.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-gritlm: examples/gritlm/gritlm.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-save-load-state: examples/save-load-state/save-load-state.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp 
examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-gguf: examples/gguf/gguf.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) - -# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: -examples/server/%.hpp: examples/server/public/% Makefile - @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ - echo "unsigned char $${NAME}[] = {" && \ - cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ - echo "};" && \ - echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ - ) > $@ + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) +llama-gguf-split: examples/gguf-split/gguf-split.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-eval-callback: examples/eval-callback/eval-callback.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-cvector-generator: 
examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ + $(OBJ_GGML) $(OBJ_LLAMA) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) +llama-bench: examples/llama-bench/llama-bench.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-baby-llama: examples/baby-llama/baby-llama.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual - -llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-finetune: examples/finetune/finetune.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) 
-Wno-cast-qual - $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-export-lora: examples/export-lora/export-lora.cpp \ + $(OBJ_GGML) common/log.h $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) +llama-retrieval: examples/retrieval/retrieval.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) +llama-speculative: examples/speculative/speculative.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-parallel: examples/parallel/parallel.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-lookahead: examples/lookahead/lookahead.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-parallel: 
examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-lookup: examples/lookup/lookup.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +llama-lookup-create: examples/lookup/lookup-create.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-lookup-merge: examples/lookup/lookup-merge.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-lookup-stats: examples/lookup/lookup-stats.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-passkey: examples/passkey/passkey.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) +llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +ifdef GGML_RPC +rpc-server: examples/rpc/rpc-server.cpp \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) $^ -o $@ 
$(LDFLAGS) +endif # GGML_RPC + +llama-server: \ + examples/server/server.cpp \ + examples/server/utils.hpp \ + examples/server/httplib.h \ + examples/server/colorthemes.css.hpp \ + examples/server/style.css.hpp \ + examples/server/theme-beeninorder.css.hpp \ + examples/server/theme-ketivah.css.hpp \ + examples/server/theme-mangotango.css.hpp \ + examples/server/theme-playground.css.hpp \ + examples/server/theme-polarnight.css.hpp \ + examples/server/theme-snowstorm.css.hpp \ + examples/server/index.html.hpp \ + examples/server/index-new.html.hpp \ + examples/server/index.js.hpp \ + examples/server/completion.js.hpp \ + examples/server/system-prompts.js.hpp \ + examples/server/prompt-formats.js.hpp \ + examples/server/json-schema-to-grammar.mjs.hpp \ + common/json.hpp \ + common/stb_image.h \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) + +# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: +examples/server/%.hpp: examples/server/public/% Makefile + @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ + echo "unsigned char $${NAME}[] = {" && \ + cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ + echo "};" && \ + echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ + ) > $@ + +libllava.a: examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + common/stb_image.h \ + common/base64.hpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual -llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +llama-llava-cli: examples/llava/llava-cli.cpp \ + examples/llava/clip.h \ + examples/llava/clip.cpp \ + examples/llava/llava.h \ + 
examples/llava/llava.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift @@ -1017,7 +1332,7 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh rm $@.tmp; \ fi -build-info.o: common/build-info.cpp +common/build-info.o: common/build-info.cpp $(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@ # @@ -1026,7 +1341,8 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) -llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) +llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \ + $(OBJ_GGML) common/build-info.o $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1035,85 +1351,108 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift -llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp \ + $(OBJ_GGML) $(OBJ_COMMON) 
src/unicode.o src/unicode-data.o $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) +tests/test-double-float: tests/test-double-float.cpp $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) +tests/test-grad0: tests/test-grad0.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) +tests/test-opt: tests/test-opt.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-fns: 
tests/test-quantize-fns.cpp ggml.o $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) +tests/test-sampling: tests/test-sampling.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) +tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ 
$(LDFLAGS) -tests/test-c.o: tests/test-c.c llama.h +tests/test-c.o: tests/test-c.c include/llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ -tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) +tests/test-backend-ops: tests/test-backend-ops.cpp \ + $(OBJ_GGML) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) +tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-chat-template: tests/test-chat-template.cpp \ + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +# +# PoCs +# + +llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ + $(OBJ_GGML) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 183e647575b42..77fed86df3105 100644 --- a/Package.swift +++ b/Package.swift @@ -3,14 +3,13 @@ import 
PackageDescription var sources = [ - "ggml.c", - "sgemm.cpp", - "llama.cpp", - "unicode.cpp", - "unicode-data.cpp", - "ggml-alloc.c", - "ggml-backend.c", - "ggml-quants.c", + "src/llama.cpp", + "src/unicode.cpp", + "src/unicode-data.cpp", + "ggml/src/ggml.c", + "ggml/src/ggml-alloc.c", + "ggml/src/ggml-backend.c", + "ggml/src/ggml-quants.c", ] var resources: [Resource] = [] @@ -26,8 +25,8 @@ var cSettings: [CSetting] = [ ] #if canImport(Darwin) -sources.append("ggml-metal.m") -resources.append(.process("ggml-metal.metal")) +sources.append("ggml/src/ggml-metal.m") +resources.append(.process("ggml/src/ggml-metal.metal")) linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ @@ -63,8 +62,6 @@ let package = Package( "models", "tests", "CMakeLists.txt", - "ggml-cuda.cu", - "ggml-cuda.h", "Makefile" ], sources: sources, diff --git a/README-sycl.md b/README-sycl.md index b7e2bb12a68e8..885983e92277e 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets. ### Build image ```sh # Using FP16 -docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . +docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile . ``` *Notes*: -To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. +To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command. You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative. 
@@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh # Build LLAMA with MKL BLAS acceleration for intel GPU # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -422,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release # Option 2: Or FP16 -cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx 
-DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON cmake --build build --config Release -j ``` @@ -440,7 +440,7 @@ Or, use CMake presets to build: cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli -cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release +cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli cmake --preset x64-windows-sycl-debug @@ -544,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |--------------------|-----------------------------------|---------------------------------------------| -| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. | -| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | -| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | +| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/README.md b/README.md index 40793c8eab880..bec7061f81a93 100644 --- a/README.md +++ b/README.md @@ -415,7 +415,7 @@ Flox follows the nixpkgs build of llama.cpp. ### Metal Build On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. -To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option. +To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option. 
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line argument. @@ -435,7 +435,7 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: - On Linux: ```bash - make LLAMA_OPENBLAS=1 + make GGML_OPENBLAS=1 ``` - On Windows: @@ -450,13 +450,13 @@ Building the program with BLAS support may lead to some performance improvements 8. From here you can run: ```bash - make LLAMA_OPENBLAS=1 + make GGML_OPENBLAS=1 ``` - Using `CMake` on Linux: ```bash - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS cmake --build build --config Release ``` @@ -475,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md). - Using manual oneAPI installation: - By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: + By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. 
Otherwise please install oneAPI and follow the below steps: ```bash source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation - cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON + cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON cmake --build build --config Release ``` @@ -495,27 +495,27 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: ```bash - make LLAMA_CUDA=1 + make GGML_CUDA=1 ``` - Using `CMake`: ```bash - cmake -B build -DLLAMA_CUDA=ON + cmake -B build -DGGML_CUDA=ON cmake --build build --config Release ``` The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - | Option | Legal values | Default | Description | - |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. 
Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | - | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | | - | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | - | LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. 
| + | Option | Legal values | Default | Description | + |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | + | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | + | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | | + | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | + | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. 
Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | + | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | - #### hipBLAS @@ -525,15 +525,15 @@ Building the program with BLAS support may lead to some performance improvements - Using `make`: ```bash - make LLAMA_HIPBLAS=1 + make GGML_HIPBLAS=1 ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` - On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`. + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). Note that if you get the following error: @@ -547,19 +547,19 @@ Building the program with BLAS support may lead to some performance improvements ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIP_DEVICE_LIB_PATH= \ - cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . 
-B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` - Using `make` (example for target gfx1030, build with 16 CPU threads): ```bash - make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 + make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 ``` - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) @@ -570,11 +570,11 @@ Building the program with BLAS support may lead to some performance improvements If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. 
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): - | Option | Legal values | Default | Description | - |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | Option | Legal values | Default | Description | + |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. 
| + | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### Vulkan @@ -612,7 +612,7 @@ Building the program with BLAS support may lead to some performance improvements Then, build llama.cpp using the cmake command below: ```bash - cmake -B build -DLLAMA_VULKAN=1 + cmake -B build -DGGML_VULKAN=1 cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 diff --git a/ci/run.sh b/ci/run.sh index 291c44f47b86d..9b46f26bc46a4 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -36,11 +36,11 @@ SRC=`pwd` CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON" fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1" fi if [ ! -z ${GG_BUILD_SYCL} ]; then @@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then exit 1 fi - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON" fi ## helpers @@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf @@ -550,7 +550,7 @@ function gg_run_pythia_2_8b { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 171530c915332..c6fccc0255d1e 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,5 +1,6 @@ # common +find_package(Threads REQUIRED) # Build info header # @@ -83,5 +84,5 @@ if (LLAMA_CURL) endif () target_include_directories(${TARGET} PUBLIC .) -target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +target_compile_features (${TARGET} PUBLIC cxx_std_11) +target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/docs/BLIS.md b/docs/BLIS.md index c933766b7f4ec..697317bc756ca 100644 --- a/docs/BLIS.md +++ b/docs/BLIS.md @@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used. Makefile: ```bash -make LLAMA_BLIS=1 -j -# make LLAMA_BLIS=1 benchmark-matmult +make GGML_BLIS=1 -j +# make GGML_BLIS=1 benchmark-matmult ``` CMake: @@ -39,7 +39,7 @@ CMake: ```bash mkdir build cd build -cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME .. +cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME .. 
make -j ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0b51c44c05e4e..7d9ab34572b74 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,13 +39,13 @@ else() add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (LLAMA_RPC) + if (GGML_RPC) add_subdirectory(rpc) endif() if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() - if (LLAMA_SYCL) + if (GGML_SYCL) add_subdirectory(sycl) endif() add_subdirectory(save-load-state) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 38b36ee5a26fd..29602881a0d21 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -LLAMA_CUDA=1 make -j +GGML_CUDA=1 make -j # generate importance matrix (imatrix.dat) ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 05a8207e67b88..f6c619c87df55 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens ## Orin compile and run ### compile ```sh -make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 ``` ### run on Orin ### case 1 diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 86544e3fea2c3..e1da801f285c6 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d ## Usage -On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. +On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options. 
For example, to build the CUDA backend with RPC support: ```bash mkdir build-rpc-cuda cd build-rpc-cuda -cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON +cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON cmake --build . --config Release ``` @@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: +On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: ```bash mkdir build-rpc cd build-rpc -cmake .. -DLLAMA_RPC=ON +cmake .. -DGGML_RPC=ON cmake --build . --config Release ``` diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index db46d57cabe0b..8fe0a67902cbd 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -8,10 +8,10 @@ cd build source /opt/intel/oneapi/setvars.sh #for FP16 -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference +#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference #for FP32 -cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx #build example/main #cmake --build . --config Release --target main diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index 027173b0a974b..cdae5a52855a2 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON :: for FP32 -cmake -G "Ninja" .. 
-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release if %errorlevel% neq 0 goto ERROR :: build example/main only :: make main diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt new file mode 100644 index 0000000000000..bdbda4255eea7 --- /dev/null +++ b/ggml/CMakeLists.txt @@ -0,0 +1,238 @@ +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. +project("ggml" C CXX) +include(CheckIncludeFileCXX) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(GGML_STANDALONE ON) + + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + + # configure project version + # TODO +else() + set(GGML_STANDALONE OFF) +endif() + +if (EMSCRIPTEN) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + + option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON) +else() + if (MINGW) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + else() + set(BUILD_SHARED_LIBS_DEFAULT ON) + endif() +endif() + +option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) + +# +# option list +# + +# TODO: mark all options as advanced when not GGML_STANDALONE + +if (APPLE) + set(GGML_METAL_DEFAULT ON) + set(GGML_BLAS_DEFAULT ON) + set(GGML_BLAS_VENDOR_DEFAULT "Apple") +else() + set(GGML_METAL_DEFAULT OFF) + set(GGML_BLAS_DEFAULT OFF) + set(GGML_BLAS_VENDOR_DEFAULT "Generic") +endif() + +# general +option(GGML_STATIC "ggml: static link libraries" OFF) +option(GGML_NATIVE "ggml: enable -march=native flag" ON) +option(GGML_LTO "ggml: enable link time optimization" OFF) 
+option(GGML_CCACHE "ggml: use ccache if available" ON) + +# debug +option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) +option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF) +option(GGML_GPROF "ggml: enable gprof" OFF) + +# build +option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF) + +# sanitizers +option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF) +option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) +option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) + +# instruction set specific +if (GGML_NATIVE) + set(INS_ENB OFF) +else() + set(INS_ENB ON) +endif() + +option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) + +option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) +option(GGML_AVX512 "ggml: enable AVX512" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) +option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) +if (NOT MSVC) + option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 +endif() +option(GGML_LASX "ggml: enable lasx" ON) +option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_SVE "ggml: enable SVE" OFF) + +if (WIN32) + set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") +endif() + +# ggml core +set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") +option(GGML_PERF "ggml: enable perf" OFF) + +# 3rd party libs / backends +option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) +option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) +set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING + "ggml: BLAS library vendor") +option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF) + +option(GGML_CUDA "ggml: use CUDA" OFF) +option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) 
+option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) +set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") +set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") +option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) +set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING + "ggml: iters./thread per block for Q2_K/Q6_K") +set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING + "ggml: max. batch size for using peer access") +option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) +option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) +option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) + +option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) +option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) +option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) +option(GGML_VULKAN "ggml: use Vulkan" OFF) +option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) +option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) +option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) +option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) +option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) +option(GGML_KOMPUTE "ggml: use Kompute" OFF) +option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) +option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) +option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) +option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" OFF) +set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING + "ggml: metal minimum macOS version") +set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") +option(GGML_OPENMP "ggml: use OpenMP" ON) +option(GGML_RPC "ggml: use RPC" OFF) +option(GGML_SYCL "ggml: use SYCL" OFF) +option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) +set 
(GGML_SYCL_TARGET "INTEL" CACHE STRING + "ggml: sycl target device") + +# extra artifacts +option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) +option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) + +# +# dependencies +# + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED true) + +if (GGML_SYCL) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 11) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED true) + +set(THREADS_PREFER_PTHREAD_FLAG ON) + +find_package(Threads REQUIRED) + +# +# build the library +# + +add_subdirectory(src) + +# +# tests and examples +# + +if (GGML_BUILD_TESTS) + enable_testing() + add_subdirectory(tests) +endif () + +if (GGML_BUILD_EXAMPLES) + add_subdirectory(examples) +endif () + +# +# install +# + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + +set(GGML_PUBLIC_HEADERS + include/ggml.h + include/ggml-alloc.h + include/ggml-backend.h + "${GGML_HEADERS_CUDA}" + "${GGML_HEADERS_METAL}" + "${GGML_HEADERS_EXTRA}") + +set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") +#if (GGML_METAL) +# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") +#endif() +install(TARGETS ggml PUBLIC_HEADER) + +if (BUILD_SHARED_LIBS) + install(TARGETS ggml_shared LIBRARY) +endif() + +if (GGML_METAL) + install( + FILES src/ggml-metal.metal + PERMISSIONS + OWNER_READ + OWNER_WRITE + GROUP_READ + WORLD_READ + DESTINATION ${CMAKE_INSTALL_BINDIR}) + if (NOT GGML_METAL_EMBED_LIBRARY) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() +endif() + +if (GGML_STANDALONE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + @ONLY) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc + DESTINATION share/pkgconfig) +endif() diff --git a/cmake/FindSIMD.cmake b/ggml/cmake/FindSIMD.cmake similarity index 94% rename from cmake/FindSIMD.cmake 
rename to ggml/cmake/FindSIMD.cmake index 33377ec44de12..5533668ec4ab1 100644 --- a/cmake/FindSIMD.cmake +++ b/ggml/cmake/FindSIMD.cmake @@ -79,22 +79,22 @@ endmacro() # flags are for MSVC only! check_sse("AVX" " ;/arch:AVX") if (NOT ${AVX_FOUND}) - set(LLAMA_AVX OFF) + set(GGML_AVX OFF) else() - set(LLAMA_AVX ON) + set(GGML_AVX ON) endif() check_sse("AVX2" " ;/arch:AVX2") check_sse("FMA" " ;/arch:AVX2") if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND})) - set(LLAMA_AVX2 OFF) + set(GGML_AVX2 OFF) else() - set(LLAMA_AVX2 ON) + set(GGML_AVX2 ON) endif() check_sse("AVX512" " ;/arch:AVX512") if (NOT ${AVX512_FOUND}) - set(LLAMA_AVX512 OFF) + set(GGML_AVX512 OFF) else() - set(LLAMA_AVX512 ON) + set(GGML_AVX512 ON) endif() diff --git a/ggml_vk_generate_shaders.py b/ggml/ggml_vk_generate_shaders.py similarity index 100% rename from ggml_vk_generate_shaders.py rename to ggml/ggml_vk_generate_shaders.py diff --git a/ggml-alloc.h b/ggml/include/ggml-alloc.h similarity index 100% rename from ggml-alloc.h rename to ggml/include/ggml-alloc.h diff --git a/ggml-backend.h b/ggml/include/ggml-backend.h similarity index 100% rename from ggml-backend.h rename to ggml/include/ggml-backend.h diff --git a/ggml.h b/ggml/include/ggml.h similarity index 100% rename from ggml.h rename to ggml/include/ggml.h diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt new file mode 100644 index 0000000000000..84bc8e19d3d63 --- /dev/null +++ b/ggml/src/CMakeLists.txt @@ -0,0 +1,1139 @@ +include(CheckCXXCompilerFlag) + +add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) + +# enable libstdc++ assertions for debug builds +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>) +endif() + +if (NOT MSVC) + if (GGML_SANITIZE_THREAD) + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (GGML_SANITIZE_ADDRESS) + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries 
(-fsanitize=address) + endif() + + if (GGML_SANITIZE_UNDEFINED) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() +endif() + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + add_compile_definitions(GGML_USE_ACCELERATE) + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + + message(STATUS "Metal framework found") + set(GGML_HEADERS_METAL ggml-metal.h) + set(GGML_SOURCES_METAL ggml-metal.m) + + add_compile_definitions(GGML_USE_METAL) + if (GGML_METAL_NDEBUG) + add_compile_definitions(GGML_METAL_NDEBUG) + endif() + + # copy ggml-common.h and ggml-metal.metal to bin directory + configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) + configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + + if (GGML_METAL_EMBED_LIBRARY) + enable_language(ASM) + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") + set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") + + add_custom_command( + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e 
'/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (GGML_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) + endif() + + # Append macOS metal versioning flags + if (GGML_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) + endif() + + if (GGML_METAL_STD) + message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation") + list (APPEND XC_FLAGS -std=${GGML_METAL_STD}) + endif() + + add_custom_command( + OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Compiling Metal kernels" + ) + + add_custom_target( + ggml-metal ALL + DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + ) + endif() # GGML_METAL_EMBED_LIBRARY + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ) +endif() + +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + add_compile_definitions(GGML_USE_OPENMP) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() +endif() + +if (GGML_BLAS) + if (GGML_STATIC) + set(BLA_STATIC ON) + endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() + + set(BLA_VENDOR ${GGML_BLAS_VENDOR}) + find_package(BLAS) + + if (BLAS_FOUND) + message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${GGML_BLAS_VENDOR} MATCHES "Apple")) + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
+ # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${GGML_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS REQUIRED blas) + elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") + # As of openblas v0.3.22, the 64-bit is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS REQUIRED openblas) + endif() + elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") + pkg_check_modules(DepBLAS REQUIRED blis) + elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS REQUIRED blas-atlas) + elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + + add_compile_options(${BLAS_LINKER_FLAGS}) + + add_compile_definitions(GGML_USE_BLAS) + + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR 
${GGML_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() + + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.cpp) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set correct GGML_BLAS_VENDOR") + endif() +endif() + +if (GGML_LLAMAFILE) + message(STATUS "Using ggml SGEMM") + + add_compile_definitions(GGML_USE_LLAMAFILE) + + set(GGML_HEADERS_LLAMAFILE sgemm.h) + set(GGML_SOURCES_LLAMAFILE sgemm.cpp) +endif() + +if (GGML_CUDA) + cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES + + find_package(CUDAToolkit) + if (CUDAToolkit_FOUND) + message(STATUS "CUDA found") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # 52 == lowest CUDA 12 standard + # 60 == f16 CUDA intrinsics + # 61 == integer CUDA intrinsics + # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + + enable_language(CUDA) + + file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh") + list(APPEND GGML_HEADERS_CUDA "ggml-cuda.h") + + file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB 
SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() + + add_compile_definitions(GGML_USE_CUDA) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) # TODO: remove + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + if (GGML_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() + if (DEFINED GGML_CUDA_DMMV_Y) + add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility + endif() + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) + endif() + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib 
(libcuda.so) + else() + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() + else() + message(WARNING "CUDA not found") + endif() +endif() + +if (GGML_HIPBLAS) + if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() + else() + set(ROCM_PATH $ENV{ROCM_PATH}) + endif() + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") + + # CMake on Windows doesn't support the HIP language yet + if(WIN32) + set(CXX_IS_HIPCC TRUE) + else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") + endif() + + if(CXX_IS_HIPCC) + if(LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() + else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. 
+ if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) + endif() + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + + message(STATUS "HIP and hipBLAS found") + + file(GLOB GGML_HEADERS_ROCM "ggml-cuda/*.cuh") + list(APPEND GGML_HEADERS_ROCM "ggml-cuda.h") + + file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") + file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + endif() + + + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) + add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) + + if (GGML_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() + + if (GGML_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (CXX_IS_HIPCC) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + set(GGML_EXTRA_LIBS 
${GGML_EXTRA_LIBS} hip::device) + else() + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) + endif() + + if (GGML_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) +endif() + +if (GGML_SYCL) + if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + endif() + + if ( NOT DEFINED ENV{ONEAPI_ROOT}) + message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") + endif() + #todo: AOT + + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + + message(STATUS "SYCL found") + + add_compile_definitions(GGML_USE_SYCL) + + if (GGML_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_SYCL_FORCE_MMQ) + endif() + + add_compile_options(-I./) #include DPCT + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + endif() + + file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp") + list(APPEND GGML_HEADERS_SYCL "ggml-sycl.h") + + file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp") + list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") + + if (WIN32) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + else() + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + + if (GGML_SYCL_TARGET STREQUAL "INTEL") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) + endif() + endif() 
+endif() + +if (GGML_RPC) + message(STATUS "RPC found") + + add_compile_definitions(GGML_USE_RPC) + + if (WIN32) + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32) + endif() + + set(GGML_HEADERS_RPC ggml-rpc.h) + set(GGML_SOURCES_RPC ggml-rpc.cpp) +endif() + +if (GGML_VULKAN) + find_package(Vulkan) + + if (Vulkan_FOUND) + message(STATUS "Vulkan found") + + set(GGML_HEADERS_VULKAN ggml-vulkan.h) + set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) + + add_compile_definitions(GGML_USE_VULKAN) + + # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build + # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector + if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0) + endif() + + if (GGML_VULKAN_CHECK_RESULTS) + add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) + endif() + + if (GGML_VULKAN_DEBUG) + add_compile_definitions(GGML_VULKAN_DEBUG) + endif() + + if (GGML_VULKAN_MEMORY_DEBUG) + add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) + endif() + + if (GGML_VULKAN_VALIDATE) + add_compile_definitions(GGML_VULKAN_VALIDATE) + endif() + + if (GGML_VULKAN_RUN_TESTS) + add_compile_definitions(GGML_VULKAN_RUN_TESTS) + endif() + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan) + else() + message(WARNING "Vulkan not found") + endif() +endif() + +if (GGML_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source ${compile_shader_SOURCES}) + get_filename_component(filename ${source} NAME) + set(spv_file 
${filename}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${spv_file}" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT 
EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f32.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + 
kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_scale_8.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h + shaderop_getrows_f32.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h + shaderop_rope_f16.h + shaderop_rope_f32.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + + add_compile_definitions(GGML_USE_KOMPUTE) + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + +if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + + message(STATUS "Using memkind for CPU HBM") + + add_compile_definitions(GGML_USE_CPU_HBM) + + target_link_libraries(ggml PUBLIC memkind) +endif() + +if (GGML_PERF) + add_compile_definitions(GGML_PERF) +endif() + +function(get_flags CCID CCVER) + set(C_FLAGS "") + 
set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) + list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 7.1.0) + list(APPEND CXX_FLAGS -Wno-format-truncation) + endif() + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() + +if (GGML_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "") + set(CXX_FLAGS "") + endif() +endif() + +set(CUDA_CXX_FLAGS "") + +if (GGML_CUDA) + set(CUDA_FLAGS -use_fast_math) + + if (GGML_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) + endif() + + if (GGML_ALL_WARNINGS AND NOT MSVC) + set(NVCC_CMD 
${CMAKE_CUDA_COMPILER} .c) + if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + endif() + + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler --version + OUTPUT_VARIABLE CUDA_CCFULLVER + ERROR_QUIET + ) + + if (NOT CUDA_CCFULLVER MATCHES clang) + set(CUDA_CCID "GNU") + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" + OUTPUT_VARIABLE CUDA_CCVER + ERROR_QUIET + ) + else() + if (CUDA_CCFULLVER MATCHES Apple) + set(CUDA_CCID "AppleClang") + else() + set(CUDA_CCID "Clang") + endif() + string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) + endif() + + message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + + get_flags(${CUDA_CCID} ${CUDA_CCVER}) + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later + endif() + + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() +endif() + +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() +endif() + +if (GGML_LTO) + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if (result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(WARNING "IPO is not supported: ${output}") + endif() +endif() + +if (GGML_CCACHE) + find_program(GGML_CCACHE_FOUND ccache) + + if (GGML_CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached. 
Disable with GGML_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF") + endif () +endif() + +# this version of Apple ld64 is buggy +execute_process( + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v + ERROR_VARIABLE output + OUTPUT_QUIET +) + +if (output MATCHES "dyld-1015\.7") + add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) +endif() + +# architecture specific +# TODO: probably these flags need to be tweaked on some architectures +# feel free to update the Makefile for your architecture and send a pull request or issue +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +if (MSVC) + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") +else () + set(CMAKE_GENERATOR_PLATFORM_LWR "") +endif () + +if (NOT MSVC) + if (GGML_STATIC) + add_link_options(-static) + if (MINGW) + add_link_options(-static-libgcc -static-libstdc++) + endif() + endif() + if (GGML_GPROF) + add_compile_options(-pg) + endif() +endif() + +set(ARCH_FLAGS "") + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + + message(STATUS "ARM detected") + + if (MSVC) + add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead + add_compile_definitions(__ARM_NEON) + add_compile_definitions(__ARM_FEATURE_FMA) + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + add_compile_definitions(__ARM_FEATURE_DOTPROD) + 
endif () + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + endif () + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + endif() + endif() +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + message(STATUS "x86 detected") + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + # TODO: improve, should not reference files 
from the parent folder + include(../cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + add_compile_definitions($<$:__AVX512VBMI__>) + add_compile_definitions($<$:__AVX512VBMI__>) + endif() + if (GGML_AVX512_VNNI) + add_compile_definitions($<$:__AVX512VNNI__>) + add_compile_definitions($<$:__AVX512VNNI__>) + endif() + if (GGML_AVX512_BF16) + add_compile_definitions($<$:__AVX512BF16__>) + add_compile_definitions($<$:__AVX512BF16__>) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + endif() + else() + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + endif() + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND 
ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() +else() + message(STATUS "Unknown architecture") +endif() + +add_compile_options("$<$:${ARCH_FLAGS}>") +add_compile_options("$<$:${ARCH_FLAGS}>") + +if (GGML_CUDA) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + + add_compile_options("$<$:${CUDA_FLAGS}>") +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) +add_compile_definitions(_XOPEN_SOURCE=600) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + remove_definitions(-D_XOPEN_SOURCE=600) + add_compile_definitions(_XOPEN_SOURCE=700) +endif() + +# Data types, macros and functions related to controlling CPU affinity and +# some memory allocation are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if ( + CMAKE_SYSTEM_NAME MATCHES "Darwin" OR + CMAKE_SYSTEM_NAME MATCHES "iOS" OR + CMAKE_SYSTEM_NAME MATCHES "tvOS" OR + CMAKE_SYSTEM_NAME MATCHES "DragonFly" +) + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a 
non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +# +# libraries +# + +# ggml + +add_library(ggml OBJECT + ../include/ggml.h + ../include/ggml-alloc.h + ../include/ggml-backend.h + ggml.c + ggml-alloc.c + ggml-backend.c + ggml-quants.c + ggml-quants.h + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} + ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} + ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} + ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ) + +target_include_directories(ggml PUBLIC . 
../include ${GGML_EXTRA_INCLUDES}) +target_compile_features (ggml PUBLIC c_std_11) # don't bump + +target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) + +add_library(ggml_static STATIC $) + +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) + add_library(ggml_shared SHARED $) + target_link_libraries(ggml_shared PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) +endif() diff --git a/ggml-alloc.c b/ggml/src/ggml-alloc.c similarity index 100% rename from ggml-alloc.c rename to ggml/src/ggml-alloc.c diff --git a/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h similarity index 100% rename from ggml-backend-impl.h rename to ggml/src/ggml-backend-impl.h diff --git a/ggml-backend.c b/ggml/src/ggml-backend.c similarity index 100% rename from ggml-backend.c rename to ggml/src/ggml-backend.c diff --git a/ggml-blas.cpp b/ggml/src/ggml-blas.cpp similarity index 100% rename from ggml-blas.cpp rename to ggml/src/ggml-blas.cpp diff --git a/ggml-blas.h b/ggml/src/ggml-blas.h similarity index 100% rename from ggml-blas.h rename to ggml/src/ggml-blas.h diff --git a/ggml-common.h b/ggml/src/ggml-common.h similarity index 100% rename from ggml-common.h rename to ggml/src/ggml-common.h diff --git a/ggml-cuda.cu b/ggml/src/ggml-cuda.cu similarity index 100% rename from ggml-cuda.cu rename to ggml/src/ggml-cuda.cu diff --git a/ggml-cuda.h b/ggml/src/ggml-cuda.h similarity index 100% rename from ggml-cuda.h rename to ggml/src/ggml-cuda.h diff --git a/ggml-cuda/acc.cu b/ggml/src/ggml-cuda/acc.cu similarity index 100% rename from ggml-cuda/acc.cu rename to ggml/src/ggml-cuda/acc.cu diff --git a/ggml-cuda/acc.cuh b/ggml/src/ggml-cuda/acc.cuh similarity index 100% rename from ggml-cuda/acc.cuh rename to ggml/src/ggml-cuda/acc.cuh diff --git a/ggml-cuda/arange.cu b/ggml/src/ggml-cuda/arange.cu similarity index 100% rename from ggml-cuda/arange.cu rename to ggml/src/ggml-cuda/arange.cu diff --git a/ggml-cuda/arange.cuh 
b/ggml/src/ggml-cuda/arange.cuh similarity index 100% rename from ggml-cuda/arange.cuh rename to ggml/src/ggml-cuda/arange.cuh diff --git a/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu similarity index 100% rename from ggml-cuda/argsort.cu rename to ggml/src/ggml-cuda/argsort.cu diff --git a/ggml-cuda/argsort.cuh b/ggml/src/ggml-cuda/argsort.cuh similarity index 100% rename from ggml-cuda/argsort.cuh rename to ggml/src/ggml-cuda/argsort.cuh diff --git a/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu similarity index 100% rename from ggml-cuda/binbcast.cu rename to ggml/src/ggml-cuda/binbcast.cu diff --git a/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh similarity index 100% rename from ggml-cuda/binbcast.cuh rename to ggml/src/ggml-cuda/binbcast.cuh diff --git a/ggml-cuda/clamp.cu b/ggml/src/ggml-cuda/clamp.cu similarity index 100% rename from ggml-cuda/clamp.cu rename to ggml/src/ggml-cuda/clamp.cu diff --git a/ggml-cuda/clamp.cuh b/ggml/src/ggml-cuda/clamp.cuh similarity index 100% rename from ggml-cuda/clamp.cuh rename to ggml/src/ggml-cuda/clamp.cuh diff --git a/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh similarity index 100% rename from ggml-cuda/common.cuh rename to ggml/src/ggml-cuda/common.cuh diff --git a/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu similarity index 100% rename from ggml-cuda/concat.cu rename to ggml/src/ggml-cuda/concat.cu diff --git a/ggml-cuda/concat.cuh b/ggml/src/ggml-cuda/concat.cuh similarity index 100% rename from ggml-cuda/concat.cuh rename to ggml/src/ggml-cuda/concat.cuh diff --git a/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu similarity index 100% rename from ggml-cuda/convert.cu rename to ggml/src/ggml-cuda/convert.cu diff --git a/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh similarity index 100% rename from ggml-cuda/convert.cuh rename to ggml/src/ggml-cuda/convert.cuh diff --git a/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu similarity index 100% rename from 
ggml-cuda/cpy.cu rename to ggml/src/ggml-cuda/cpy.cu diff --git a/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh similarity index 100% rename from ggml-cuda/cpy.cuh rename to ggml/src/ggml-cuda/cpy.cuh diff --git a/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh similarity index 100% rename from ggml-cuda/dequantize.cuh rename to ggml/src/ggml-cuda/dequantize.cuh diff --git a/ggml-cuda/diagmask.cu b/ggml/src/ggml-cuda/diagmask.cu similarity index 100% rename from ggml-cuda/diagmask.cu rename to ggml/src/ggml-cuda/diagmask.cu diff --git a/ggml-cuda/diagmask.cuh b/ggml/src/ggml-cuda/diagmask.cuh similarity index 100% rename from ggml-cuda/diagmask.cuh rename to ggml/src/ggml-cuda/diagmask.cuh diff --git a/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu similarity index 100% rename from ggml-cuda/dmmv.cu rename to ggml/src/ggml-cuda/dmmv.cu diff --git a/ggml-cuda/dmmv.cuh b/ggml/src/ggml-cuda/dmmv.cuh similarity index 100% rename from ggml-cuda/dmmv.cuh rename to ggml/src/ggml-cuda/dmmv.cuh diff --git a/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh similarity index 100% rename from ggml-cuda/fattn-common.cuh rename to ggml/src/ggml-cuda/fattn-common.cuh diff --git a/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu similarity index 100% rename from ggml-cuda/fattn-tile-f16.cu rename to ggml/src/ggml-cuda/fattn-tile-f16.cu diff --git a/ggml-cuda/fattn-tile-f16.cuh b/ggml/src/ggml-cuda/fattn-tile-f16.cuh similarity index 100% rename from ggml-cuda/fattn-tile-f16.cuh rename to ggml/src/ggml-cuda/fattn-tile-f16.cuh diff --git a/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu similarity index 100% rename from ggml-cuda/fattn-tile-f32.cu rename to ggml/src/ggml-cuda/fattn-tile-f32.cu diff --git a/ggml-cuda/fattn-tile-f32.cuh b/ggml/src/ggml-cuda/fattn-tile-f32.cuh similarity index 100% rename from ggml-cuda/fattn-tile-f32.cuh rename to ggml/src/ggml-cuda/fattn-tile-f32.cuh diff --git 
a/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh similarity index 100% rename from ggml-cuda/fattn-vec-f16.cuh rename to ggml/src/ggml-cuda/fattn-vec-f16.cuh diff --git a/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh similarity index 100% rename from ggml-cuda/fattn-vec-f32.cuh rename to ggml/src/ggml-cuda/fattn-vec-f32.cuh diff --git a/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh similarity index 100% rename from ggml-cuda/fattn-wmma-f16.cuh rename to ggml/src/ggml-cuda/fattn-wmma-f16.cuh diff --git a/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu similarity index 100% rename from ggml-cuda/fattn.cu rename to ggml/src/ggml-cuda/fattn.cu diff --git a/ggml-cuda/fattn.cuh b/ggml/src/ggml-cuda/fattn.cuh similarity index 100% rename from ggml-cuda/fattn.cuh rename to ggml/src/ggml-cuda/fattn.cuh diff --git a/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu similarity index 100% rename from ggml-cuda/getrows.cu rename to ggml/src/ggml-cuda/getrows.cu diff --git a/ggml-cuda/getrows.cuh b/ggml/src/ggml-cuda/getrows.cuh similarity index 100% rename from ggml-cuda/getrows.cuh rename to ggml/src/ggml-cuda/getrows.cuh diff --git a/ggml-cuda/im2col.cu b/ggml/src/ggml-cuda/im2col.cu similarity index 100% rename from ggml-cuda/im2col.cu rename to ggml/src/ggml-cuda/im2col.cu diff --git a/ggml-cuda/im2col.cuh b/ggml/src/ggml-cuda/im2col.cuh similarity index 100% rename from ggml-cuda/im2col.cuh rename to ggml/src/ggml-cuda/im2col.cuh diff --git a/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh similarity index 100% rename from ggml-cuda/mma.cuh rename to ggml/src/ggml-cuda/mma.cuh diff --git a/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu similarity index 100% rename from ggml-cuda/mmq.cu rename to ggml/src/ggml-cuda/mmq.cu diff --git a/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh similarity index 100% rename from ggml-cuda/mmq.cuh rename to ggml/src/ggml-cuda/mmq.cuh diff --git a/ggml-cuda/mmvq.cu 
b/ggml/src/ggml-cuda/mmvq.cu similarity index 100% rename from ggml-cuda/mmvq.cu rename to ggml/src/ggml-cuda/mmvq.cu diff --git a/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh similarity index 100% rename from ggml-cuda/mmvq.cuh rename to ggml/src/ggml-cuda/mmvq.cuh diff --git a/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu similarity index 100% rename from ggml-cuda/norm.cu rename to ggml/src/ggml-cuda/norm.cu diff --git a/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh similarity index 100% rename from ggml-cuda/norm.cuh rename to ggml/src/ggml-cuda/norm.cuh diff --git a/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu similarity index 100% rename from ggml-cuda/pad.cu rename to ggml/src/ggml-cuda/pad.cu diff --git a/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh similarity index 100% rename from ggml-cuda/pad.cuh rename to ggml/src/ggml-cuda/pad.cuh diff --git a/ggml-cuda/pool2d.cu b/ggml/src/ggml-cuda/pool2d.cu similarity index 100% rename from ggml-cuda/pool2d.cu rename to ggml/src/ggml-cuda/pool2d.cu diff --git a/ggml-cuda/pool2d.cuh b/ggml/src/ggml-cuda/pool2d.cuh similarity index 100% rename from ggml-cuda/pool2d.cuh rename to ggml/src/ggml-cuda/pool2d.cuh diff --git a/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu similarity index 100% rename from ggml-cuda/quantize.cu rename to ggml/src/ggml-cuda/quantize.cu diff --git a/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh similarity index 100% rename from ggml-cuda/quantize.cuh rename to ggml/src/ggml-cuda/quantize.cuh diff --git a/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu similarity index 100% rename from ggml-cuda/rope.cu rename to ggml/src/ggml-cuda/rope.cu diff --git a/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh similarity index 100% rename from ggml-cuda/rope.cuh rename to ggml/src/ggml-cuda/rope.cuh diff --git a/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu similarity index 100% rename from ggml-cuda/scale.cu rename to ggml/src/ggml-cuda/scale.cu diff --git 
a/ggml-cuda/scale.cuh b/ggml/src/ggml-cuda/scale.cuh similarity index 100% rename from ggml-cuda/scale.cuh rename to ggml/src/ggml-cuda/scale.cuh diff --git a/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu similarity index 100% rename from ggml-cuda/softmax.cu rename to ggml/src/ggml-cuda/softmax.cu diff --git a/ggml-cuda/softmax.cuh b/ggml/src/ggml-cuda/softmax.cuh similarity index 100% rename from ggml-cuda/softmax.cuh rename to ggml/src/ggml-cuda/softmax.cuh diff --git a/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu similarity index 100% rename from ggml-cuda/sumrows.cu rename to ggml/src/ggml-cuda/sumrows.cu diff --git a/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh similarity index 100% rename from ggml-cuda/sumrows.cuh rename to ggml/src/ggml-cuda/sumrows.cuh diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu 
b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu rename to ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu rename to 
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu similarity index 100% rename from ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu rename to ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu diff --git a/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py similarity index 100% rename 
from ggml-cuda/template-instances/generate_cu_files.py rename to ggml/src/ggml-cuda/template-instances/generate_cu_files.py diff --git a/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q2_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q3_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_1.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q4_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_1.cu 
rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q5_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q6_k.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu diff --git a/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu similarity index 100% rename from ggml-cuda/template-instances/mmq-instance-q8_0.cu rename to ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu diff --git a/ggml-cuda/tsembd.cu b/ggml/src/ggml-cuda/tsembd.cu similarity index 100% rename from ggml-cuda/tsembd.cu rename to ggml/src/ggml-cuda/tsembd.cu diff --git a/ggml-cuda/tsembd.cuh b/ggml/src/ggml-cuda/tsembd.cuh similarity index 100% rename from ggml-cuda/tsembd.cuh rename to ggml/src/ggml-cuda/tsembd.cuh diff --git a/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu similarity index 100% rename from ggml-cuda/unary.cu rename to ggml/src/ggml-cuda/unary.cu diff --git a/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh similarity index 100% rename from ggml-cuda/unary.cuh rename to ggml/src/ggml-cuda/unary.cuh diff --git a/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu similarity index 100% rename from ggml-cuda/upscale.cu rename to ggml/src/ggml-cuda/upscale.cu diff --git a/ggml-cuda/upscale.cuh b/ggml/src/ggml-cuda/upscale.cuh similarity index 100% rename from ggml-cuda/upscale.cuh rename to ggml/src/ggml-cuda/upscale.cuh diff --git a/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh similarity index 100% rename from ggml-cuda/vecdotq.cuh rename to 
ggml/src/ggml-cuda/vecdotq.cuh diff --git a/ggml-impl.h b/ggml/src/ggml-impl.h similarity index 100% rename from ggml-impl.h rename to ggml/src/ggml-impl.h diff --git a/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp similarity index 100% rename from ggml-kompute.cpp rename to ggml/src/ggml-kompute.cpp diff --git a/ggml-kompute.h b/ggml/src/ggml-kompute.h similarity index 100% rename from ggml-kompute.h rename to ggml/src/ggml-kompute.h diff --git a/ggml-metal.h b/ggml/src/ggml-metal.h similarity index 100% rename from ggml-metal.h rename to ggml/src/ggml-metal.h diff --git a/ggml-metal.m b/ggml/src/ggml-metal.m similarity index 100% rename from ggml-metal.m rename to ggml/src/ggml-metal.m diff --git a/ggml-metal.metal b/ggml/src/ggml-metal.metal similarity index 100% rename from ggml-metal.metal rename to ggml/src/ggml-metal.metal diff --git a/ggml-quants.c b/ggml/src/ggml-quants.c similarity index 100% rename from ggml-quants.c rename to ggml/src/ggml-quants.c diff --git a/ggml-quants.h b/ggml/src/ggml-quants.h similarity index 100% rename from ggml-quants.h rename to ggml/src/ggml-quants.h diff --git a/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp similarity index 100% rename from ggml-rpc.cpp rename to ggml/src/ggml-rpc.cpp diff --git a/ggml-rpc.h b/ggml/src/ggml-rpc.h similarity index 100% rename from ggml-rpc.h rename to ggml/src/ggml-rpc.h diff --git a/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp similarity index 100% rename from ggml-sycl.cpp rename to ggml/src/ggml-sycl.cpp diff --git a/ggml-sycl.h b/ggml/src/ggml-sycl.h similarity index 100% rename from ggml-sycl.h rename to ggml/src/ggml-sycl.h diff --git a/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp similarity index 100% rename from ggml-sycl/backend.hpp rename to ggml/src/ggml-sycl/backend.hpp diff --git a/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp similarity index 100% rename from ggml-sycl/common.cpp rename to ggml/src/ggml-sycl/common.cpp diff --git a/ggml-sycl/common.hpp 
b/ggml/src/ggml-sycl/common.hpp similarity index 100% rename from ggml-sycl/common.hpp rename to ggml/src/ggml-sycl/common.hpp diff --git a/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp similarity index 100% rename from ggml-sycl/convert.cpp rename to ggml/src/ggml-sycl/convert.cpp diff --git a/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp similarity index 100% rename from ggml-sycl/convert.hpp rename to ggml/src/ggml-sycl/convert.hpp diff --git a/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp similarity index 100% rename from ggml-sycl/dequantize.hpp rename to ggml/src/ggml-sycl/dequantize.hpp diff --git a/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp similarity index 100% rename from ggml-sycl/dmmv.cpp rename to ggml/src/ggml-sycl/dmmv.cpp diff --git a/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp similarity index 100% rename from ggml-sycl/dmmv.hpp rename to ggml/src/ggml-sycl/dmmv.hpp diff --git a/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp similarity index 100% rename from ggml-sycl/dpct/helper.hpp rename to ggml/src/ggml-sycl/dpct/helper.hpp diff --git a/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp similarity index 100% rename from ggml-sycl/mmq.cpp rename to ggml/src/ggml-sycl/mmq.cpp diff --git a/ggml-sycl/mmq.hpp b/ggml/src/ggml-sycl/mmq.hpp similarity index 100% rename from ggml-sycl/mmq.hpp rename to ggml/src/ggml-sycl/mmq.hpp diff --git a/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp similarity index 100% rename from ggml-sycl/mmvq.cpp rename to ggml/src/ggml-sycl/mmvq.cpp diff --git a/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp similarity index 100% rename from ggml-sycl/mmvq.hpp rename to ggml/src/ggml-sycl/mmvq.hpp diff --git a/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp similarity index 100% rename from ggml-sycl/presets.hpp rename to ggml/src/ggml-sycl/presets.hpp diff --git a/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp similarity index 100% rename from 
ggml-sycl/vecdotq.hpp rename to ggml/src/ggml-sycl/vecdotq.hpp diff --git a/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan-shaders.hpp similarity index 100% rename from ggml-vulkan-shaders.hpp rename to ggml/src/ggml-vulkan-shaders.hpp diff --git a/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp similarity index 100% rename from ggml-vulkan.cpp rename to ggml/src/ggml-vulkan.cpp diff --git a/ggml-vulkan.h b/ggml/src/ggml-vulkan.h similarity index 100% rename from ggml-vulkan.h rename to ggml/src/ggml-vulkan.h diff --git a/ggml.c b/ggml/src/ggml.c similarity index 100% rename from ggml.c rename to ggml/src/ggml.c diff --git a/kompute b/ggml/src/kompute similarity index 100% rename from kompute rename to ggml/src/kompute diff --git a/kompute-shaders/common.comp b/ggml/src/kompute-shaders/common.comp similarity index 100% rename from kompute-shaders/common.comp rename to ggml/src/kompute-shaders/common.comp diff --git a/kompute-shaders/op_add.comp b/ggml/src/kompute-shaders/op_add.comp similarity index 100% rename from kompute-shaders/op_add.comp rename to ggml/src/kompute-shaders/op_add.comp diff --git a/kompute-shaders/op_addrow.comp b/ggml/src/kompute-shaders/op_addrow.comp similarity index 100% rename from kompute-shaders/op_addrow.comp rename to ggml/src/kompute-shaders/op_addrow.comp diff --git a/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/kompute-shaders/op_cpy_f16_f16.comp similarity index 100% rename from kompute-shaders/op_cpy_f16_f16.comp rename to ggml/src/kompute-shaders/op_cpy_f16_f16.comp diff --git a/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/kompute-shaders/op_cpy_f16_f32.comp similarity index 100% rename from kompute-shaders/op_cpy_f16_f32.comp rename to ggml/src/kompute-shaders/op_cpy_f16_f32.comp diff --git a/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/kompute-shaders/op_cpy_f32_f16.comp similarity index 100% rename from kompute-shaders/op_cpy_f32_f16.comp rename to ggml/src/kompute-shaders/op_cpy_f32_f16.comp diff --git 
a/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/kompute-shaders/op_cpy_f32_f32.comp similarity index 100% rename from kompute-shaders/op_cpy_f32_f32.comp rename to ggml/src/kompute-shaders/op_cpy_f32_f32.comp diff --git a/kompute-shaders/op_diagmask.comp b/ggml/src/kompute-shaders/op_diagmask.comp similarity index 100% rename from kompute-shaders/op_diagmask.comp rename to ggml/src/kompute-shaders/op_diagmask.comp diff --git a/kompute-shaders/op_gelu.comp b/ggml/src/kompute-shaders/op_gelu.comp similarity index 100% rename from kompute-shaders/op_gelu.comp rename to ggml/src/kompute-shaders/op_gelu.comp diff --git a/kompute-shaders/op_getrows.comp b/ggml/src/kompute-shaders/op_getrows.comp similarity index 100% rename from kompute-shaders/op_getrows.comp rename to ggml/src/kompute-shaders/op_getrows.comp diff --git a/kompute-shaders/op_getrows_f16.comp b/ggml/src/kompute-shaders/op_getrows_f16.comp similarity index 100% rename from kompute-shaders/op_getrows_f16.comp rename to ggml/src/kompute-shaders/op_getrows_f16.comp diff --git a/kompute-shaders/op_getrows_f32.comp b/ggml/src/kompute-shaders/op_getrows_f32.comp similarity index 100% rename from kompute-shaders/op_getrows_f32.comp rename to ggml/src/kompute-shaders/op_getrows_f32.comp diff --git a/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/kompute-shaders/op_getrows_q4_0.comp similarity index 100% rename from kompute-shaders/op_getrows_q4_0.comp rename to ggml/src/kompute-shaders/op_getrows_q4_0.comp diff --git a/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/kompute-shaders/op_getrows_q4_1.comp similarity index 100% rename from kompute-shaders/op_getrows_q4_1.comp rename to ggml/src/kompute-shaders/op_getrows_q4_1.comp diff --git a/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/kompute-shaders/op_getrows_q6_k.comp similarity index 100% rename from kompute-shaders/op_getrows_q6_k.comp rename to ggml/src/kompute-shaders/op_getrows_q6_k.comp diff --git a/kompute-shaders/op_mul.comp 
b/ggml/src/kompute-shaders/op_mul.comp similarity index 100% rename from kompute-shaders/op_mul.comp rename to ggml/src/kompute-shaders/op_mul.comp diff --git a/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/kompute-shaders/op_mul_mat_f16.comp similarity index 100% rename from kompute-shaders/op_mul_mat_f16.comp rename to ggml/src/kompute-shaders/op_mul_mat_f16.comp diff --git a/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp similarity index 100% rename from kompute-shaders/op_mul_mat_mat_f32.comp rename to ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp diff --git a/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q4_0.comp rename to ggml/src/kompute-shaders/op_mul_mat_q4_0.comp diff --git a/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q4_1.comp rename to ggml/src/kompute-shaders/op_mul_mat_q4_1.comp diff --git a/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/kompute-shaders/op_mul_mat_q6_k.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q6_k.comp rename to ggml/src/kompute-shaders/op_mul_mat_q6_k.comp diff --git a/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/kompute-shaders/op_mul_mat_q8_0.comp similarity index 100% rename from kompute-shaders/op_mul_mat_q8_0.comp rename to ggml/src/kompute-shaders/op_mul_mat_q8_0.comp diff --git a/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n.comp similarity index 100% rename from kompute-shaders/op_mul_mv_q_n.comp rename to ggml/src/kompute-shaders/op_mul_mv_q_n.comp diff --git a/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp similarity index 100% rename from kompute-shaders/op_mul_mv_q_n_pre.comp rename to ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp diff --git a/kompute-shaders/op_norm.comp 
b/ggml/src/kompute-shaders/op_norm.comp similarity index 100% rename from kompute-shaders/op_norm.comp rename to ggml/src/kompute-shaders/op_norm.comp diff --git a/kompute-shaders/op_relu.comp b/ggml/src/kompute-shaders/op_relu.comp similarity index 100% rename from kompute-shaders/op_relu.comp rename to ggml/src/kompute-shaders/op_relu.comp diff --git a/kompute-shaders/op_rmsnorm.comp b/ggml/src/kompute-shaders/op_rmsnorm.comp similarity index 100% rename from kompute-shaders/op_rmsnorm.comp rename to ggml/src/kompute-shaders/op_rmsnorm.comp diff --git a/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp similarity index 100% rename from kompute-shaders/op_rope_f16.comp rename to ggml/src/kompute-shaders/op_rope_f16.comp diff --git a/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp similarity index 100% rename from kompute-shaders/op_rope_f32.comp rename to ggml/src/kompute-shaders/op_rope_f32.comp diff --git a/kompute-shaders/op_scale.comp b/ggml/src/kompute-shaders/op_scale.comp similarity index 100% rename from kompute-shaders/op_scale.comp rename to ggml/src/kompute-shaders/op_scale.comp diff --git a/kompute-shaders/op_scale_8.comp b/ggml/src/kompute-shaders/op_scale_8.comp similarity index 100% rename from kompute-shaders/op_scale_8.comp rename to ggml/src/kompute-shaders/op_scale_8.comp diff --git a/kompute-shaders/op_silu.comp b/ggml/src/kompute-shaders/op_silu.comp similarity index 100% rename from kompute-shaders/op_silu.comp rename to ggml/src/kompute-shaders/op_silu.comp diff --git a/kompute-shaders/op_softmax.comp b/ggml/src/kompute-shaders/op_softmax.comp similarity index 100% rename from kompute-shaders/op_softmax.comp rename to ggml/src/kompute-shaders/op_softmax.comp diff --git a/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp similarity index 100% rename from kompute-shaders/rope_common.comp rename to ggml/src/kompute-shaders/rope_common.comp diff --git a/sgemm.cpp 
b/ggml/src/sgemm.cpp similarity index 100% rename from sgemm.cpp rename to ggml/src/sgemm.cpp diff --git a/sgemm.h b/ggml/src/sgemm.h similarity index 100% rename from sgemm.h rename to ggml/src/sgemm.h diff --git a/vulkan-shaders/add.comp b/ggml/src/vulkan-shaders/add.comp similarity index 100% rename from vulkan-shaders/add.comp rename to ggml/src/vulkan-shaders/add.comp diff --git a/vulkan-shaders/argsort.comp b/ggml/src/vulkan-shaders/argsort.comp similarity index 100% rename from vulkan-shaders/argsort.comp rename to ggml/src/vulkan-shaders/argsort.comp diff --git a/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp similarity index 100% rename from vulkan-shaders/clamp.comp rename to ggml/src/vulkan-shaders/clamp.comp diff --git a/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp similarity index 100% rename from vulkan-shaders/copy.comp rename to ggml/src/vulkan-shaders/copy.comp diff --git a/vulkan-shaders/dequant_f32.comp b/ggml/src/vulkan-shaders/dequant_f32.comp similarity index 100% rename from vulkan-shaders/dequant_f32.comp rename to ggml/src/vulkan-shaders/dequant_f32.comp diff --git a/vulkan-shaders/dequant_funcs.comp b/ggml/src/vulkan-shaders/dequant_funcs.comp similarity index 100% rename from vulkan-shaders/dequant_funcs.comp rename to ggml/src/vulkan-shaders/dequant_funcs.comp diff --git a/vulkan-shaders/dequant_head.comp b/ggml/src/vulkan-shaders/dequant_head.comp similarity index 100% rename from vulkan-shaders/dequant_head.comp rename to ggml/src/vulkan-shaders/dequant_head.comp diff --git a/vulkan-shaders/dequant_q2_k.comp b/ggml/src/vulkan-shaders/dequant_q2_k.comp similarity index 100% rename from vulkan-shaders/dequant_q2_k.comp rename to ggml/src/vulkan-shaders/dequant_q2_k.comp diff --git a/vulkan-shaders/dequant_q3_k.comp b/ggml/src/vulkan-shaders/dequant_q3_k.comp similarity index 100% rename from vulkan-shaders/dequant_q3_k.comp rename to ggml/src/vulkan-shaders/dequant_q3_k.comp diff --git 
a/vulkan-shaders/dequant_q4_0.comp b/ggml/src/vulkan-shaders/dequant_q4_0.comp similarity index 100% rename from vulkan-shaders/dequant_q4_0.comp rename to ggml/src/vulkan-shaders/dequant_q4_0.comp diff --git a/vulkan-shaders/dequant_q4_1.comp b/ggml/src/vulkan-shaders/dequant_q4_1.comp similarity index 100% rename from vulkan-shaders/dequant_q4_1.comp rename to ggml/src/vulkan-shaders/dequant_q4_1.comp diff --git a/vulkan-shaders/dequant_q4_k.comp b/ggml/src/vulkan-shaders/dequant_q4_k.comp similarity index 100% rename from vulkan-shaders/dequant_q4_k.comp rename to ggml/src/vulkan-shaders/dequant_q4_k.comp diff --git a/vulkan-shaders/dequant_q5_0.comp b/ggml/src/vulkan-shaders/dequant_q5_0.comp similarity index 100% rename from vulkan-shaders/dequant_q5_0.comp rename to ggml/src/vulkan-shaders/dequant_q5_0.comp diff --git a/vulkan-shaders/dequant_q5_1.comp b/ggml/src/vulkan-shaders/dequant_q5_1.comp similarity index 100% rename from vulkan-shaders/dequant_q5_1.comp rename to ggml/src/vulkan-shaders/dequant_q5_1.comp diff --git a/vulkan-shaders/dequant_q5_k.comp b/ggml/src/vulkan-shaders/dequant_q5_k.comp similarity index 100% rename from vulkan-shaders/dequant_q5_k.comp rename to ggml/src/vulkan-shaders/dequant_q5_k.comp diff --git a/vulkan-shaders/dequant_q6_k.comp b/ggml/src/vulkan-shaders/dequant_q6_k.comp similarity index 100% rename from vulkan-shaders/dequant_q6_k.comp rename to ggml/src/vulkan-shaders/dequant_q6_k.comp diff --git a/vulkan-shaders/dequant_q8_0.comp b/ggml/src/vulkan-shaders/dequant_q8_0.comp similarity index 100% rename from vulkan-shaders/dequant_q8_0.comp rename to ggml/src/vulkan-shaders/dequant_q8_0.comp diff --git a/vulkan-shaders/diag_mask_inf.comp b/ggml/src/vulkan-shaders/diag_mask_inf.comp similarity index 100% rename from vulkan-shaders/diag_mask_inf.comp rename to ggml/src/vulkan-shaders/diag_mask_inf.comp diff --git a/vulkan-shaders/div.comp b/ggml/src/vulkan-shaders/div.comp similarity index 100% rename from 
vulkan-shaders/div.comp rename to ggml/src/vulkan-shaders/div.comp diff --git a/vulkan-shaders/gelu.comp b/ggml/src/vulkan-shaders/gelu.comp similarity index 100% rename from vulkan-shaders/gelu.comp rename to ggml/src/vulkan-shaders/gelu.comp diff --git a/vulkan-shaders/generic_binary_head.comp b/ggml/src/vulkan-shaders/generic_binary_head.comp similarity index 100% rename from vulkan-shaders/generic_binary_head.comp rename to ggml/src/vulkan-shaders/generic_binary_head.comp diff --git a/vulkan-shaders/generic_head.comp b/ggml/src/vulkan-shaders/generic_head.comp similarity index 100% rename from vulkan-shaders/generic_head.comp rename to ggml/src/vulkan-shaders/generic_head.comp diff --git a/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp similarity index 100% rename from vulkan-shaders/generic_unary_head.comp rename to ggml/src/vulkan-shaders/generic_unary_head.comp diff --git a/vulkan-shaders/get_rows.comp b/ggml/src/vulkan-shaders/get_rows.comp similarity index 100% rename from vulkan-shaders/get_rows.comp rename to ggml/src/vulkan-shaders/get_rows.comp diff --git a/vulkan-shaders/get_rows_quant.comp b/ggml/src/vulkan-shaders/get_rows_quant.comp similarity index 100% rename from vulkan-shaders/get_rows_quant.comp rename to ggml/src/vulkan-shaders/get_rows_quant.comp diff --git a/vulkan-shaders/mul.comp b/ggml/src/vulkan-shaders/mul.comp similarity index 100% rename from vulkan-shaders/mul.comp rename to ggml/src/vulkan-shaders/mul.comp diff --git a/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp similarity index 100% rename from vulkan-shaders/mul_mat_split_k_reduce.comp rename to ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp diff --git a/vulkan-shaders/mul_mat_vec.comp b/ggml/src/vulkan-shaders/mul_mat_vec.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec.comp rename to ggml/src/vulkan-shaders/mul_mat_vec.comp diff --git 
a/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_base.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_base.comp diff --git a/vulkan-shaders/mul_mat_vec_nc.comp b/ggml/src/vulkan-shaders/mul_mat_vec_nc.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_nc.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_nc.comp diff --git a/vulkan-shaders/mul_mat_vec_p021.comp b/ggml/src/vulkan-shaders/mul_mat_vec_p021.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_p021.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_p021.comp diff --git a/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q2_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q3_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q4_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q5_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q5_k.comp diff --git a/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp similarity index 100% rename from vulkan-shaders/mul_mat_vec_q6_k.comp rename to ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp diff --git a/vulkan-shaders/mul_mm.comp b/ggml/src/vulkan-shaders/mul_mm.comp similarity index 100% rename from vulkan-shaders/mul_mm.comp rename to ggml/src/vulkan-shaders/mul_mm.comp diff --git 
a/vulkan-shaders/norm.comp b/ggml/src/vulkan-shaders/norm.comp similarity index 100% rename from vulkan-shaders/norm.comp rename to ggml/src/vulkan-shaders/norm.comp diff --git a/vulkan-shaders/relu.comp b/ggml/src/vulkan-shaders/relu.comp similarity index 100% rename from vulkan-shaders/relu.comp rename to ggml/src/vulkan-shaders/relu.comp diff --git a/vulkan-shaders/rms_norm.comp b/ggml/src/vulkan-shaders/rms_norm.comp similarity index 100% rename from vulkan-shaders/rms_norm.comp rename to ggml/src/vulkan-shaders/rms_norm.comp diff --git a/vulkan-shaders/rope_head.comp b/ggml/src/vulkan-shaders/rope_head.comp similarity index 100% rename from vulkan-shaders/rope_head.comp rename to ggml/src/vulkan-shaders/rope_head.comp diff --git a/vulkan-shaders/rope_neox.comp b/ggml/src/vulkan-shaders/rope_neox.comp similarity index 100% rename from vulkan-shaders/rope_neox.comp rename to ggml/src/vulkan-shaders/rope_neox.comp diff --git a/vulkan-shaders/rope_norm.comp b/ggml/src/vulkan-shaders/rope_norm.comp similarity index 100% rename from vulkan-shaders/rope_norm.comp rename to ggml/src/vulkan-shaders/rope_norm.comp diff --git a/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp similarity index 100% rename from vulkan-shaders/scale.comp rename to ggml/src/vulkan-shaders/scale.comp diff --git a/vulkan-shaders/silu.comp b/ggml/src/vulkan-shaders/silu.comp similarity index 100% rename from vulkan-shaders/silu.comp rename to ggml/src/vulkan-shaders/silu.comp diff --git a/vulkan-shaders/soft_max.comp b/ggml/src/vulkan-shaders/soft_max.comp similarity index 100% rename from vulkan-shaders/soft_max.comp rename to ggml/src/vulkan-shaders/soft_max.comp diff --git a/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp similarity index 100% rename from vulkan-shaders/square.comp rename to ggml/src/vulkan-shaders/square.comp diff --git a/vulkan-shaders/sum_rows.comp b/ggml/src/vulkan-shaders/sum_rows.comp similarity index 100% rename from 
vulkan-shaders/sum_rows.comp rename to ggml/src/vulkan-shaders/sum_rows.comp diff --git a/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp similarity index 100% rename from vulkan-shaders/types.comp rename to ggml/src/vulkan-shaders/types.comp diff --git a/llama.h b/include/llama.h similarity index 100% rename from llama.h rename to include/llama.h diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in index 9311055d925d4..fd2aae288f41a 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/scripts/LlamaConfig.cmake.in @@ -2,11 +2,11 @@ set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) -set(LLAMA_BLAS @LLAMA_BLAS@) -set(LLAMA_CUDA @LLAMA_CUDA@) -set(LLAMA_METAL @LLAMA_METAL@) -set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@) -set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@) +set(GGML_BLAS @GGML_BLAS@) +set(GGML_CUDA @GGML_CUDA@) +set(GGML_METAL @GGML_METAL@) +set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_ACCELERATE @GGML_ACCELERATE@) @PACKAGE_INIT@ @@ -17,25 +17,25 @@ set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") # Ensure transient dependencies satisfied find_package(Threads REQUIRED) -if (APPLE AND LLAMA_ACCELERATE) +if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) endif() -if (LLAMA_BLAS) +if (GGML_BLAS) find_package(BLAS REQUIRED) endif() -if (LLAMA_CUDA) +if (GGML_CUDA) find_package(CUDAToolkit REQUIRED) endif() -if (LLAMA_METAL) +if (GGML_METAL) find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(METAL_FRAMEWORK Metal REQUIRED) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) endif() -if (LLAMA_HIPBLAS) +if (GGML_HIPBLAS) find_package(hip REQUIRED) find_package(hipblas REQUIRED) find_package(rocblas REQUIRED) diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index a45cd3962ac0a..70679f4e56470 100755 --- a/scripts/compare-commits.sh +++ 
b/scripts/compare-commits.sh @@ -12,7 +12,7 @@ bench_args="${@:3}" rm -f llama-bench.sqlite > /dev/null -# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...) +# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) git checkout $1 > /dev/null make clean > /dev/null diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 7b2b601a96477..91946c514e6b2 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" ########################################################### # Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 6ba499a2a2521..586d6ea18af01 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -LLAMA_CUDA=1 make -j +GGML_CUDA=1 make -j ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b @@ -60,7 +60,7 @@ cd /workspace/llama.cpp mkdir build-cublas cd build-cublas -cmake -DLLAMA_CUDA=1 ../ +cmake -DGGML_CUDA=1 ../ make -j if [ "$1" -eq "0" ]; then @@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then # batched cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + GGML_CUDA=1 make -j && ./llama-batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd 
/workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + GGML_CUDA=1 make -j && ./llama-parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 199232440449b..802592a3e0d3b 100644 --- 
a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,7 +380,7 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUDA=1 make -j llama-server $log + GGML_CUDA=1 make -j llama-server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j llama-server $log diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h index a49d385a1b864..0361ffc386a1f 120000 --- a/spm-headers/ggml-alloc.h +++ b/spm-headers/ggml-alloc.h @@ -1 +1 @@ -../ggml-alloc.h \ No newline at end of file +../ggml/include/ggml-alloc.h \ No newline at end of file diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h index 17c2cf14fe02b..7295f0f0da742 120000 --- a/spm-headers/ggml-backend.h +++ b/spm-headers/ggml-backend.h @@ -1 +1 @@ -../ggml-backend.h \ No newline at end of file +../ggml/include/ggml-backend.h \ No newline at end of file diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h new file mode 120000 index 0000000000000..e0f107b124cf3 --- /dev/null +++ b/spm-headers/ggml-metal.h @@ -0,0 +1 @@ +../ggml/src/ggml-metal.h \ No newline at end of file diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h index 39215298f981b..0bdfeacbdbead 120000 --- a/spm-headers/ggml.h +++ b/spm-headers/ggml.h @@ -1 +1 @@ -../ggml.h \ No newline at end of file +../ggml/include/ggml.h \ No newline at end of file diff --git a/spm-headers/llama.h b/spm-headers/llama.h index 9acceb980c264..b31388f0dd652 120000 --- a/spm-headers/llama.h +++ b/spm-headers/llama.h @@ -1 +1 @@ -../llama.h \ No newline at end of file +../include/llama.h \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000000..eaa5045679809 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# libraries +# + +# llama + +add_library(llama + ../include/llama.h + llama.cpp + unicode.h + unicode.cpp + unicode-data.cpp + ) + +target_include_directories(llama PUBLIC . 
../include) +target_compile_features (llama PUBLIC cxx_std_11) # don't bump + +target_link_libraries(llama PUBLIC ggml) + +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() diff --git a/llama.cpp b/src/llama.cpp similarity index 100% rename from llama.cpp rename to src/llama.cpp diff --git a/unicode-data.cpp b/src/unicode-data.cpp similarity index 100% rename from unicode-data.cpp rename to src/unicode-data.cpp diff --git a/unicode-data.h b/src/unicode-data.h similarity index 100% rename from unicode-data.h rename to src/unicode-data.h diff --git a/unicode.cpp b/src/unicode.cpp similarity index 100% rename from unicode.cpp rename to src/unicode.cpp diff --git a/unicode.h b/src/unicode.h similarity index 100% rename from unicode.h rename to src/unicode.h