From 2ea8e7ba4f8c9c2ca52cf0eab65f13c2875a668b Mon Sep 17 00:00:00 2001
From: Finn Kumkar
Date: Sat, 3 Aug 2024 12:15:12 +0200
Subject: [PATCH] Update Changelog and use gemma-2-9b-it-IQ4_XS.gguf model
 across all examples

---
 CHANGELOG.md        | 24 ++++++++++++++++++++++++
 README.md           | 26 +++++++++++++-------------
 examples/server.ps1 | 10 +++++-----
 vendor/llama.cpp    |  2 +-
 4 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5a416b..27e661f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,30 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.21.0] - 2024-08-03
+
+### Added
+- [Server] Add -help option
+- [Server] Add -chatTemplate option
+- [Server] Add human readable file size
+- [Benchmark] Add llama-bench example
+
+### Changed
+- [Build] Update torch to 2.2.1+cu121
+- [Build] Update OpenBLAS to 0.3.27
+- [Build] Update Python to 3.12
+- [Server] Default KV cache type to f16
+- [Documentation] Use gemma-2-9b-it-IQ4_XS.gguf model across all examples
+
+### Fixed
+- [Build] Fix CUDA build after renaming in upstream llama.cpp
+- [Build] Fix gguf_dump.py after renaming in upstream llama.cpp
+- [Build] Add missing tiktoken package to support GLM models
+- [Build] Fix wikitext URI
+
+### Removed
+- [Server] Remove broken chrome startup
+
 ## [1.20.0] - 2024-06-13
 
 ### Changed
diff --git a/README.md b/README.md
index f2a300b..7b3546e 100644
--- a/README.md
+++ b/README.md
@@ -84,9 +84,9 @@ To build llama.cpp binaries for a Windows environment with the best available BL
 
 ### 7. Download a large language model
 
-Download a large language model (LLM) with weights in the GGUF format into the `./vendor/llama.cpp/models` directory. You can for example download the [openchat-3.6-8b-20240522](https://huggingface.co/openchat/openchat-3.6-8b-20240522) 8B model in a quantized GGUF format:
+Download a large language model (LLM) with weights in the GGUF format into the `./vendor/llama.cpp/models` directory. You can for example download the [gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it) model in a quantized GGUF format:
 
-* https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF/blob/main/openchat-3.6-8b-20240522-Q5_K_M.gguf
+* https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-IQ4_XS.gguf
 
 > [!TIP]
 > See the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSYS Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard) for best in class open source LLMs.
@@ -98,7 +98,7 @@ Download a large language model (LLM) with weights in the GGUF format into the `
 You can easily chat with a specific model by using the [.\examples\server.ps1](./examples/server.ps1) script:
 
 ```PowerShell
-.\examples\server.ps1 -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf"
+.\examples\server.ps1 -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
 ```
 
 > [!NOTE]
@@ -116,12 +116,12 @@ You can now chat with the model:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
     --reverse-prompt '[[USER_NAME]]:' `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --file "./vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt" `
     --color `
     --interactive
@@ -133,7 +133,7 @@ You can start llama.cpp as a webserver:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-server `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33
@@ -160,14 +160,14 @@ To extend the context to 32k execute the following:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 32768 `
     --rope-freq-scale 0.25 `
     --rope-freq-base 40000 `
     --threads 16 `
     --n-gpu-layers 33 `
     --reverse-prompt '[[USER_NAME]]:' `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --file "./vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt" `
     --color `
     --interactive
@@ -179,11 +179,11 @@ You can enforce a specific grammar for the response generation. The following wi
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --prompt "The scientific classification (Taxonomy) of a Llama: " `
     --grammar-file "./vendor/llama.cpp/grammars/json.gbnf"
     --color
@@ -195,7 +195,7 @@ Execute the following to measure the perplexity of the GGML formatted model:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-perplexity `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
@@ -208,7 +208,7 @@ You can easily count the tokens of a prompt for a specific model by using the [.
 
 ```PowerShell
 .\examples\count_tokens.ps1 `
-    -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" `
     -file ".\prompts\chat_with_llm.txt"
 ```
 
@@ -216,7 +216,7 @@ To inspect the actual tokenization result you can use the `-debug` flag:
 
 ```PowerShell
 .\examples\count_tokens.ps1 `
-    -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" `
     -prompt "Hello Word!" `
     -debug
 ```
diff --git a/examples/server.ps1 b/examples/server.ps1
index 4fa7df7..1f62d35 100644
--- a/examples/server.ps1
+++ b/examples/server.ps1
@@ -35,19 +35,19 @@ Increases the verbosity of the llama.cpp server.
 Shows the manual on how to use this script.
 
 .EXAMPLE
-.\server.ps1 -model "..\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf"
+.\server.ps1 -model "..\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -chatTemplate "llama3" -parallel 4
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -chatTemplate "llama3" -parallel 4
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -contextSize 4096 -numberOfGPULayers 10
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -contextSize 4096 -numberOfGPULayers 10
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -port 8081 -kvCacheDataType q8_0
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -port 8081 -kvCacheDataType q8_0
 
 .EXAMPLE
-.\server.ps1 -model "..\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -verbose
+.\server.ps1 -model "..\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" -verbose
 #>
 
 Param (
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 345c8c0..b72c20b 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 345c8c0c87a97c1595f9c8b14833d531c8c7d8df
+Subproject commit b72c20b85c1029d135022d39e9a20d4807c11893
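
Below is a minimal sketch of trying out the updated examples once this patch is applied. It assumes the patch is saved as `0001-use-gemma-2-9b-it.patch` (a hypothetical file name) and that the commands run from the repository root; the download URL, model path, and `server.ps1` invocation come from the README hunks above, while `git apply` and `Invoke-WebRequest` are standard tools rather than part of this change.

```PowerShell
# Apply the diff to the working tree (hypothetical patch file name).
git apply .\0001-use-gemma-2-9b-it.patch

# Fetch the quantized gemma-2-9b-it model referenced in the updated README.
Invoke-WebRequest `
    -Uri "https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-IQ4_XS.gguf" `
    -OutFile ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"

# Chat with the model via the example server script, as shown in the README.
.\examples\server.ps1 -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
```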