diff --git a/Makefile b/Makefile
index 71fbdbdb4bc322..2dd0092465d0a8 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ BUILD_TARGETS = \
 	llama-finetune \
 	llama-gbnf-validator \
 	llama-gguf \
-	llama-gguf-hash
+	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
 	llama-imatrix \
@@ -920,8 +920,15 @@ llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+# Build Libraries specific to llama-gguf-hash
+xxhash.o: examples/gguf-hash/deps/xxhash/xxhash.c examples/gguf-hash/deps/xxhash/xxhash.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+sha1.o: examples/gguf-hash/deps/sha1/sha1.c examples/gguf-hash/deps/sha1/sha1.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o xxhash.o sha1.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
index c740e40ee74a7f..b6a7d71abe37e9 100644
--- a/examples/gguf-hash/README.md
+++ b/examples/gguf-hash/README.md
@@ -6,6 +6,7 @@ CLI to hash GGUF files.
 
 - `--xxhash`: use xhash (default)
 - `--sha1`: use sha1
+- `--uuid`: use uuid
 
 ### Compile Example
 
@@ -16,6 +17,7 @@ make -C build llama-gguf-hash VERBOSE=1
 ./build/bin/llama-gguf-hash test.gguf
 ./build/bin/llama-gguf-hash --xxhash test.gguf
 ./build/bin/llama-gguf-hash --sha1 test.gguf
+./build/bin/llama-gguf-hash --uuid test.gguf
 ```
 
 ### Crypto/Hash Libraries Used
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 2d67165e5793e9..0d67d1b20360ed 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -16,10 +16,15 @@
 #include "sha256/sha256.h"
 #endif
 
+// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
+#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
+#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
+
 struct hash_params {
     std::string input;
     bool xxhash = false;
     bool sha1 = false;
+    bool uuid = false;
 #ifdef SHA256
     bool sha256 = false;
 #endif
@@ -36,6 +41,7 @@ static void hash_print_usage(const char * executable) {
     printf(" -h, --help show this help message and exit\n");
     printf(" --xxhash use xxhash\n");
     printf(" --sha1 use sha1\n");
+    printf(" --uuid use uuid\n");
 #ifdef SHA256
     printf(" --sha256 use sha256\n");
 #endif
@@ -69,6 +75,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             params.sha1 = true;
         }
 
+        if (arg == "--uuid") {
+            arg_found = true;
+            params.uuid = true;
+        }
+
 #ifdef SHA256
         if (arg == "--sha256") {
             arg_found = true;
@@ -83,6 +94,7 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
 
     if (!params.xxhash
         && !params.sha1
+        && !params.uuid
 #ifdef SHA256
         && !params.sha256
 #endif
@@ -254,11 +266,94 @@ static bool gguf_hash(const hash_params & hash_params) {
     return true;
 }
 
+// Build a UUIDv5 from a SHA-1 digest: keep the first 16 digest bytes and
+// overwrite the version (5) and variant (0b10xx) bit fields.
+static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
+    // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
+    // Assumes that digest was processed correctly with the expected namespace
+    for (int i = 0; i < 16; i++) {
+        uuid[i] = sha1_digest[i];
+    }
+
+    // Set bits corresponding to UUID ver 5
+    uuid[ 6] &= ~(0xF << 4);
+    uuid[ 6] |= (5 << 4);
+
+    // Set bits corresponding to UUID variant 0b10XX
+    uuid[ 8] &= ~(0xc << 4);
+    uuid[ 8] |= (0x8 << 4);
+}
+
+// Print a UUIDv5 identifying the tensor data of the GGUF file in hash_params.input.
+// The UUID is SHA-1(UUID_NAMESPACE_LLAMA_CPP bytes + raw bytes of each tensor, in
+// file order). Returns true if --uuid was not requested or the UUID was printed,
+// false if gguf_init_from_file returned NULL for the input file.
+static bool gguf_uuid(const hash_params & hash_params) {
+    if (!hash_params.uuid) {
+        return true;
+    }
+
+    const std::string & fname = hash_params.input;
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx = */ &ctx_data,
+    };
+
+    // sha1 init
+    SHA1_CTX sha1_model_hash_ctx;
+    SHA1Init(&sha1_model_hash_ctx);
+
+    // Hash the namespace bytes first, then the tensor data (the "name")
+    unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
+    SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+    if (ctx == NULL) {
+        // Bail out instead of passing a null context to the gguf_* calls below
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx, i);
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        auto n_bytes = ggml_nbytes(cur);
+        auto *raw_data = cur->data;
+        SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+    }
+
+    unsigned char result[21];
+    SHA1Final(result, &sha1_model_hash_ctx);
+
+    unsigned char uuid[16];
+    generate_uuidv5(result, uuid);
+
+    // 36 characters of canonical UUID text plus the terminating NUL
+    char string_buffer[37] = {0};
+    snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        uuid[0], uuid[1], uuid[2], uuid[3],
+        uuid[4], uuid[5], uuid[6], uuid[7],
+        uuid[8], uuid[9], uuid[10], uuid[11],
+        uuid[12], uuid[13], uuid[14], uuid[15]);
+    printf("UUIDv5 %s %s\n", string_buffer, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
 int main(int argc, const char ** argv) {
     hash_params params;
     hash_params_parse(argc, argv, params);
 
     gguf_hash(params);
+    if (!gguf_uuid(params)) {
+        return 1;
+    }
 
     return 0;
 }