gguf-hash: add --uuid option to c implementation for model ID

ggerganov · Jun 23, 2024 · 029a963 · 029a963
1 parent a410d23
commit 029a963
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 3 deletions.
diff --git a/Makefile b/Makefile
@@ -14,7 +14,7 @@ BUILD_TARGETS = \
  llama-finetune \
  llama-gbnf-validator \
  llama-gguf \
- llama-gguf-hash
+ llama-gguf-hash \
  llama-gguf-split \
  llama-gritlm \
  llama-imatrix \
@@ -920,8 +920,15 @@ llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+# Build libraries specific to llama-gguf-hash
+xxhash.o: examples/gguf-hash/deps/xxhash/xxhash.c examples/gguf-hash/deps/xxhash/xxhash.h
+ $(CC) $(CFLAGS) -c $< -o $@
+
+sha1.o: examples/gguf-hash/deps/sha1/sha1.c examples/gguf-hash/deps/sha1/sha1.h
+ $(CC) $(CFLAGS) -c $< -o $@
+
+llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o xxhash.o sha1.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)

diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md
@@ -6,6 +6,7 @@ CLI to hash GGUF files.
 
 - `--xxhash`: use xxhash (default)
 - `--sha1`: use sha1
+- `--uuid`: use uuid
 
 ### Compile Example
 
@@ -16,6 +17,7 @@ make -C build llama-gguf-hash VERBOSE=1
 ./build/bin/llama-gguf-hash test.gguf
 ./build/bin/llama-gguf-hash --xxhash test.gguf
 ./build/bin/llama-gguf-hash --sha1 test.gguf
+./build/bin/llama-gguf-hash --uuid test.gguf
 ```
 
 ### Crypto/Hash Libraries Used

diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
@@ -16,10 +16,15 @@
 #include "sha256/sha256.h"
 #endif
 
+// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
+#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
+#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
+
 struct hash_params {
  std::string input;
  bool xxhash = false;
  bool sha1 = false;
+ bool uuid = false;
 #ifdef SHA256
  bool sha256 = false;
 #endif
@@ -36,6 +41,7 @@ static void hash_print_usage(const char * executable) {
  printf(" -h, --help show this help message and exit\n");
  printf(" --xxhash use xxhash\n");
  printf(" --sha1 use sha1\n");
+ printf(" --uuid use uuid\n");
 #ifdef SHA256
  printf(" --sha256 use sha256\n");
 #endif
@@ -69,6 +75,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
  params.sha1 = true;
  }
 
+ if (arg == "--uuid") {
+ arg_found = true;
+ params.uuid = true;
+ }
+
 #ifdef SHA256
  if (arg == "--sha256") {
  arg_found = true;
@@ -83,6 +94,7 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
 
  if (!params.xxhash
  && !params.sha1
+ && !params.uuid
 #ifdef SHA256
  && !params.sha256
 #endif
@@ -254,11 +266,78 @@ static bool gguf_hash(const hash_params & hash_params) {
  return true;
 }
 
+// Build a version-5 UUID from a SHA-1 digest.
+// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
+//
+//   sha1_digest  20-byte digest; only its first 16 bytes are used
+//   uuid         16-byte output buffer
+//
+// The caller is expected to have hashed the namespace ahead of the payload;
+// this function only copies the bytes and stamps the version/variant fields.
+static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
+    const unsigned char * src = sha1_digest;
+    unsigned char * dst = uuid;
+    unsigned char * const dst_end = uuid + 16;
+
+    // Truncate the digest to the 16 bytes a UUID can hold
+    while (dst != dst_end) {
+        *dst++ = *src++;
+    }
+
+    // Version field: high nibble of byte 6 becomes 0b0101 (version 5)
+    uuid[6] = (unsigned char) ((uuid[6] & 0x0F) | 0x50);
+
+    // Variant field: top two bits of byte 8 become 0b10
+    uuid[8] = (unsigned char) ((uuid[8] & 0x3F) | 0x80);
+}
+
+// Compute and print a UUIDv5 model ID for the GGUF file named by
+// hash_params.input.
+//
+// The ID is derived from the SHA-1 of the llama.cpp namespace UUID bytes
+// followed by the raw data of every tensor, visited in index order, and is
+// then folded into UUIDv5 form by generate_uuidv5().
+// Output line format: "UUIDv5 <uuid> <filename>".
+//
+// Returns true when --uuid was not requested or when the UUID was printed.
+// Returns false when the file cannot be loaded or a listed tensor cannot be
+// found in the data context.
+static bool gguf_uuid(const hash_params & hash_params) {
+    if (!hash_params.uuid) {
+        return true;
+    }
+
+    const std::string & fname = hash_params.input;
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    // Load first and bail out early: a missing or malformed file yields a
+    // NULL context, which must not reach gguf_get_n_tensors().
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
+    // sha1 init
+    SHA1_CTX sha1_model_hash_ctx;
+    SHA1Init(&sha1_model_hash_ctx);
+
+    // UUIDv5 hashes the namespace ID ahead of the payload
+    unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
+    SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
+
+    bool ok = true;
+    const int n_tensors = gguf_get_n_tensors(ctx);
+    for (int i = 0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(ctx, i);
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        if (cur == NULL) {
+            fprintf(stderr, "%s: tensor '%s' not found in '%s'\n", __func__, name, fname.c_str());
+            ok = false;
+            break;
+        }
+        auto n_bytes = ggml_nbytes(cur);
+        auto *raw_data = cur->data;
+        SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
+    }
+
+    if (ok) {
+        unsigned char result[21];
+        SHA1Final(result, &sha1_model_hash_ctx);
+
+        unsigned char uuid[16];
+        generate_uuidv5(result, uuid);
+
+        // 32 hex digits + 4 dashes + terminating NUL
+        char string_buffer[37] = {0};
+        snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+            uuid[0], uuid[1], uuid[2], uuid[3],
+            uuid[4], uuid[5], uuid[6], uuid[7],
+            uuid[8], uuid[9], uuid[10], uuid[11],
+            uuid[12], uuid[13], uuid[14], uuid[15]);
+        printf("UUIDv5 %s %s\n", string_buffer, fname.c_str());
+    }
+
+    // Release both contexts on the success and the tensor-lookup-failure path
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return ok;
+}
+
+// Entry point: parse the command line into `params`, then run the hashing
+// pass followed by the UUID pass on the same parameters.
 int main(int argc, const char ** argv) {
  hash_params params;
  hash_params_parse(argc, argv, params);
 
+ // NOTE(review): gguf_hash() and gguf_uuid() both return bool, but the
+ // results are discarded here and the exit status is always 0.
  gguf_hash(params);
+ // gguf_uuid() returns immediately unless params.uuid is set.
+ gguf_uuid(params);
 
  return 0;
 }