From 2ea8e7ba4f8c9c2ca52cf0eab65f13c2875a668b Mon Sep 17 00:00:00 2001
From: Finn Kumkar
Date: Sat, 3 Aug 2024 12:15:12 +0200
Subject: [PATCH] Update Changelog and use gemma-2-9b-it-IQ4_XS.gguf model
 across all examples

---
 CHANGELOG.md        | 24 ++++++++++++++++++++++++
 README.md           | 26 +++++++++++++-------------
 examples/server.ps1 | 10 +++++-----
 vendor/llama.cpp    |  2 +-
 4 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5a416b..27e661f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,30 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.21.0] - 2024-08-03
+
+### Added
+- [Server] Add -help option
+- [Server] Add -chatTemplate option
+- [Server] Add human readable file size
+- [Benchmark] Add llama-bench example
+
+### Changed
+- [Build] Update torch to 2.2.1+cu121
+- [Build] Update OpenBLAS to 0.3.27
+- [Build] Update Python to 3.12
+- [Server] Default KV cache type to f16
+- [Documentation] Use gemma-2-9b-it-IQ4_XS.gguf model across all examples
+
+### Fixed
+- [Build] Fix CUDA build after renaming in upstream llama.cpp
+- [Build] Fix gguf_dump.py after renaming in upstream llama.cpp
+- [Build] Add missing tiktoken package to support GLM models
+- [Build] Fix wikitext URI
+
+### Removed
+- [Server] Remove broken chrome startup
+
 ## [1.20.0] - 2024-06-13
 
 ### Changed
diff --git a/README.md b/README.md
index f2a300b..7b3546e 100644
--- a/README.md
+++ b/README.md
@@ -84,9 +84,9 @@ To build llama.cpp binaries for a Windows environment with the best available BL
 
 ### 7. Download a large language model
 
-Download a large language model (LLM) with weights in the GGUF format into the `./vendor/llama.cpp/models` directory. You can for example download the [openchat-3.6-8b-20240522](https://huggingface.co/openchat/openchat-3.6-8b-20240522) 8B model in a quantized GGUF format:
+Download a large language model (LLM) with weights in the GGUF format into the `./vendor/llama.cpp/models` directory. You can for example download the [gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it) model in a quantized GGUF format:
 
-* https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF/blob/main/openchat-3.6-8b-20240522-Q5_K_M.gguf
+* https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-IQ4_XS.gguf
 
 > [!TIP]
 > See the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSYS Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard) for best in class open source LLMs.
@@ -98,7 +98,7 @@ Download a large language model (LLM) with weights in the GGUF format into the `
 You can easily chat with a specific model by using the [.\examples\server.ps1](./examples/server.ps1) script:
 
 ```PowerShell
-.\examples\server.ps1 -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf"
+.\examples\server.ps1 -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
 ```
 
 > [!NOTE]
@@ -116,12 +116,12 @@ You can now chat with the model:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
     --reverse-prompt '[[USER_NAME]]:' `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --file "./vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt" `
     --color `
     --interactive
@@ -133,7 +133,7 @@ You can start llama.cpp as a webserver:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-server `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33
@@ -160,14 +160,14 @@ To extend the context to 32k execute the following:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 32768 `
     --rope-freq-scale 0.25 `
     --rope-freq-base 40000 `
     --threads 16 `
     --n-gpu-layers 33 `
     --reverse-prompt '[[USER_NAME]]:' `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --file "./vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt" `
     --color `
     --interactive
@@ -179,11 +179,11 @@ You can enforce a specific grammar for the response generation. The following wi
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-cli `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
-    --prompt-cache "./cache/openchat-3.6-8b-20240522-Q5_K_M.gguf.prompt" `
+    --prompt-cache "./cache/gemma-2-9b-it-IQ4_XS.gguf.prompt" `
     --prompt "The scientific classification (Taxonomy) of a Llama: " `
     --grammar-file "./vendor/llama.cpp/grammars/json.gbnf"
     --color
@@ -195,7 +195,7 @@ Execute the following to measure the perplexity of the GGML formatted model:
 
 ```PowerShell
 ./vendor/llama.cpp/build/bin/Release/llama-perplexity `
-    --model "./vendor/llama.cpp/models/openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    --model "./vendor/llama.cpp/models/gemma-2-9b-it-IQ4_XS.gguf" `
     --ctx-size 8192 `
     --threads 16 `
     --n-gpu-layers 33 `
@@ -208,7 +208,7 @@ You can easily count the tokens of a prompt for a specific model by using the [.
 
 ```PowerShell
 .\examples\count_tokens.ps1 `
-    -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" `
     -file ".\prompts\chat_with_llm.txt"
 ```
 
@@ -216,7 +216,7 @@ To inspect the actual tokenization result you can use the `-debug` flag:
 
 ```PowerShell
 .\examples\count_tokens.ps1 `
-    -model ".\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" `
+    -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" `
     -prompt "Hello Word!" `
     -debug
 ```
diff --git a/examples/server.ps1 b/examples/server.ps1
index 4fa7df7..1f62d35 100644
--- a/examples/server.ps1
+++ b/examples/server.ps1
@@ -35,19 +35,19 @@ Increases the verbosity of the llama.cpp server.
 Shows the manual on how to use this script.
 
 .EXAMPLE
-.\server.ps1 -model "..\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf"
+.\server.ps1 -model "..\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -chatTemplate "llama3" -parallel 4
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -chatTemplate "llama3" -parallel 4
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -contextSize 4096 -numberOfGPULayers 10
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -contextSize 4096 -numberOfGPULayers 10
 
 .EXAMPLE
-.\server.ps1 -model "C:\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -port 8081 -kvCacheDataType q8_0
+.\server.ps1 -model "C:\models\gemma-2-9b-it-IQ4_XS.gguf" -port 8081 -kvCacheDataType q8_0
 
 .EXAMPLE
-.\server.ps1 -model "..\vendor\llama.cpp\models\openchat-3.6-8b-20240522-Q5_K_M.gguf" -verbose
+.\server.ps1 -model "..\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf" -verbose
 #>
 
 Param (
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 345c8c0..b72c20b 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 345c8c0c87a97c1595f9c8b14833d531c8c7d8df
+Subproject commit b72c20b85c1029d135022d39e9a20d4807c11893
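
Below is a minimal sketch of trying out the updated examples once this patch is applied. It assumes the patch is saved as `0001-use-gemma-2-9b-it.patch` (a hypothetical file name) and that the commands run from the repository root; the download URL, model path, and `server.ps1` invocation come from the README hunks above, while `git apply` and `Invoke-WebRequest` are standard tools rather than part of this change.

```PowerShell
# Apply the diff to the working tree (hypothetical patch file name).
git apply .\0001-use-gemma-2-9b-it.patch

# Fetch the quantized gemma-2-9b-it model referenced in the updated README.
Invoke-WebRequest `
    -Uri "https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-IQ4_XS.gguf" `
    -OutFile ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"

# Chat with the model via the example server script, as shown in the README.
.\examples\server.ps1 -model ".\vendor\llama.cpp\models\gemma-2-9b-it-IQ4_XS.gguf"
```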