Skip to content

Commit

Permalink
gguf-hash: add --uuid option to c implementation for model ID
Browse files Browse the repository at this point in the history
  • Loading branch information
mofosyne committed Jun 23, 2024
1 parent a410d23 commit 0dbd834
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 3 deletions.
12 changes: 9 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ BUILD_TARGETS = \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-hash
llama-gguf-hash \
llama-gguf-split \
llama-gritlm \
llama-imatrix \
Expand Down Expand Up @@ -920,8 +920,14 @@ llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
xxhash.o: examples/gguf-hash/deps/xxhash/xxhash.c examples/gguf-hash/deps/xxhash/xxhash.h
$(CXX) $(CXXFLAGS) -c $< -o $@

sha1.o: examples/gguf-hash/deps/sha1/sha1.c examples/gguf-hash/deps/sha1/sha1.h
$(CXX) $(CXXFLAGS) -c $< -o $@

llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o xxhash.o sha1.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
Expand Down
2 changes: 2 additions & 0 deletions examples/gguf-hash/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ CLI to hash GGUF files.

- `--xxhash`: use xxhash (default)
- `--sha1`: use sha1
- `--uuid`: generate a UUIDv5 model ID

### Compile Example

Expand All @@ -16,6 +17,7 @@ make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxhash test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
```

### Crypto/Hash Libraries Used
Expand Down
79 changes: 79 additions & 0 deletions examples/gguf-hash/gguf-hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@
#include "sha256/sha256.h"
#endif

// Namespace UUID for llama.cpp model IDs, given twice: once in canonical string
// form and once as the same 16 bytes in order (for use as an array initializer).
// Per the original note, the value is the result of the Python expression:
//   uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
// gguf_uuid() feeds the byte form into SHA-1 ahead of the tensor data.
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5

struct hash_params {
std::string input;
bool xxhash = false;
bool sha1 = false;
bool uuid = false;
#ifdef SHA256
bool sha256 = false;
#endif
Expand All @@ -36,6 +41,7 @@ static void hash_print_usage(const char * executable) {
printf(" -h, --help show this help message and exit\n");
printf(" --xxhash use xxhash\n");
printf(" --sha1 use sha1\n");
printf(" --uuid use uuid\n");
#ifdef SHA256
printf(" --sha256 use sha256\n");
#endif
Expand Down Expand Up @@ -69,6 +75,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
params.sha1 = true;
}

if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}

#ifdef SHA256
if (arg == "--sha256") {
arg_found = true;
Expand All @@ -83,6 +94,7 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par

if (!params.xxhash
&& !params.sha1
&& !params.uuid
#ifdef SHA256
&& !params.sha256
#endif
Expand Down Expand Up @@ -254,11 +266,78 @@ static bool gguf_hash(const hash_params & hash_params) {
return true;
}

// Build a version-5 UUID from a SHA-1 digest.
//
// sha1_digest: 20-byte SHA-1 result; the caller is responsible for having
//              hashed the namespace bytes followed by the name data.
// uuid:        receives the 16-byte UUID.
//
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
    // A UUID is the leading 16 bytes of the digest with two fields overwritten.
    int idx = 0;
    while (idx < 16) {
        uuid[idx] = sha1_digest[idx];
        ++idx;
    }

    // Version field: high nibble of byte 6 becomes 0b0101 (version 5),
    // low nibble is kept from the digest.
    uuid[6] = (unsigned char) ((uuid[6] & 0x0F) | 0x50);

    // Variant field: top two bits of byte 8 become 0b10,
    // the remaining six bits are kept from the digest.
    uuid[8] = (unsigned char) ((uuid[8] & 0x3F) | 0x80);
}

// Compute and print a UUIDv5 model ID for the GGUF file named by hash_params.input.
//
// The ID is derived from a SHA-1 over the llama.cpp namespace UUID bytes
// followed by the raw data of every tensor, visited in gguf tensor-index order,
// and then stamped into UUIDv5 form by generate_uuidv5().
// On success one line is written to stdout: "UUIDv5  <uuid>  <filename>".
//
// Returns true when --uuid was not requested or the UUID was printed.
// Returns false (after a message on stderr) when the file cannot be loaded or
// a tensor named in the metadata cannot be found in the data context.
static bool gguf_uuid(const hash_params & hash_params) {
    if (!hash_params.uuid) {
        return true;
    }

    const std::string & fname = hash_params.input;
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ false,
        /*.ctx      = */ &ctx_data,
    };

    // Load first: there is nothing to hash if the file cannot be opened/parsed.
    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
        return false;
    }

    // sha1 init, seeded with the namespace UUID as required for UUIDv5
    SHA1_CTX sha1_model_hash_ctx;
    SHA1Init(&sha1_model_hash_ctx);

    unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
    SHA1Update(&sha1_model_hash_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));

    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(ctx, i);
        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
        if (cur == NULL) {
            fprintf(stderr, "%s: tensor '%s' not found in '%s'\n", __func__, name, fname.c_str());
            ggml_free(ctx_data);
            gguf_free(ctx);
            return false;
        }
        auto n_bytes = ggml_nbytes(cur);
        auto * raw_data = cur->data;
        SHA1Update(&sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
    }

    // Only the first 16 bytes of the digest end up in the UUID.
    unsigned char result[21];
    SHA1Final(result, &sha1_model_hash_ctx);

    unsigned char uuid[16];
    generate_uuidv5(result, uuid);

    // 8-4-4-4-12 hex digits plus four dashes is 36 characters, plus the terminator.
    char string_buffer[37] = {0};
    snprintf(string_buffer, sizeof(string_buffer),
        "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid[0], uuid[1], uuid[2], uuid[3],
        uuid[4], uuid[5], uuid[6], uuid[7],
        uuid[8], uuid[9], uuid[10], uuid[11],
        uuid[12], uuid[13], uuid[14], uuid[15]);
    printf("UUIDv5  %s  %s\n", string_buffer, fname.c_str());

    ggml_free(ctx_data);
    gguf_free(ctx);

    return true;
}

// Entry point: parse the command line, then run the requested hash and/or
// UUID generation over the input file.
//
// Both steps are always attempted. The process exits with 0 only if both
// report success (each returns true when its option was not requested), and
// with 1 otherwise, so scripts can detect a failed run.
int main(int argc, const char ** argv) {
    hash_params params;
    hash_params_parse(argc, argv, params);

    const bool hash_ok = gguf_hash(params);
    const bool uuid_ok = gguf_uuid(params);

    return (hash_ok && uuid_ok) ? 0 : 1;
}

0 comments on commit 0dbd834

Please sign in to comment.