chore: Add model vocab support #7117

Closed

wants to merge 70 commits into master from add-stablelm-hash

Changes from 4 commits

Commits (70)
1a9cf92
feat: Add stablelm vocab to gguf update
teleprint-me May 7, 2024
1355c24
chore: Apply update to get_vocab_base_pre method
teleprint-me May 7, 2024
e71789e
feat: Add stablelm vocab
teleprint-me May 7, 2024
8490705
feat: Add generate vocab shell script
teleprint-me May 7, 2024
d8694af
refactor: Clean up and organize url and dir paths
teleprint-me May 8, 2024
9d2fcd0
tests: Add test for qwen tokenizer
teleprint-me May 8, 2024
b8f8a96
feat: Add qwen pattern and tokenizer
teleprint-me May 8, 2024
3ae6c17
chore: Add missing command-r gguf vocab
teleprint-me May 8, 2024
4155e86
feat: Add support for qwen tokenizer
teleprint-me May 8, 2024
cbfed5b
chore: Update generate-vocab.sh script
teleprint-me May 8, 2024
f7dda38
note: Time of check to time of use
teleprint-me May 8, 2024
670e1c3
fix: Attempt to remove potential TOCTOU
teleprint-me May 8, 2024
69efb59
fix: Apply proper paths for handling qwen
teleprint-me May 8, 2024
906c3f7
fix: Apply fix to generate-vocab.sh script
teleprint-me May 8, 2024
0478552
chore: Add tiktoken to convert requirements
teleprint-me May 8, 2024
ccafb87
chore: Add model vocab
teleprint-me May 8, 2024
a6c5d5d
Merge branch 'master' into add-stablelm-hash
teleprint-me May 8, 2024
ca8acea
chore: Group qwen models together
teleprint-me May 8, 2024
c05d2a2
chore: Fix enumeration for qwen, olmo, and dbrx
teleprint-me May 8, 2024
17f2243
patch: Apply patch to fix config and SPM retrieval
teleprint-me May 8, 2024
de3d9e3
patch: Apply fix for downloading related model files
teleprint-me May 8, 2024
bc924e0
Merge branch 'master' into add-stablelm-hash
teleprint-me May 8, 2024
fc0007e
Merge branch 'master' into add-stablelm-hash
teleprint-me May 13, 2024
932ab05
Remove qwen and fix mauled imports
teleprint-me May 13, 2024
58551d0
chore: Apply updates to vocab models
teleprint-me May 13, 2024
4067536
change default temperature of OAI compat API from 0 to 1 (#7226)
Kartoffelsaft May 13, 2024
cfeb962
convert.py: Outfile default name change and additional metadata suppo…
mofosyne May 13, 2024
eaa8457
llama : rename jina tokenizers to v2 (#7249)
JoanFM May 13, 2024
3fa36ac
[SYCL] rm wait() (#7233)
arthw May 13, 2024
89550bb
perplexity: add BF16 vs. FP16 results (#7150)
JohannesGaessler May 13, 2024
d8b6869
llava-cli: fix base64 prompt (#7248)
Adriankhl May 13, 2024
7d85ea8
llama : less KV padding when FA is off (#7257)
ggerganov May 13, 2024
3dfaa1f
convert-hf : support direct Q8_0 conversion (#7234)
compilade May 13, 2024
95390eb
docs: Fix typo and update description for --embeddings flag (#7026)
louixs May 14, 2024
c7b8254
Add left recursion check: quit early instead of going into an infinit…
nuchi May 14, 2024
a94019b
move ndk code to a new library (#6951)
eltonkola May 14, 2024
e30a369
llama : disable pipeline parallelism with nkvo (#7265)
slaren May 14, 2024
7a2f768
ggml : add RPC backend (#6829)
rgerganov May 14, 2024
04a7f32
Revert "move ndk code to a new library (#6951)" (#7282)
mofosyne May 14, 2024
58962a2
server: free sampling contexts on exit (#7264)
stevegrubb May 14, 2024
37e2593
ggml : optimize for ppc64le using VSX intrinsics (ggml/784)
penghongbo May 12, 2024
b95c202
ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (whisper/…
przemoc May 8, 2024
da894f9
ggml : try fix ppc64 (whisper/0)
ggerganov May 12, 2024
48296bf
metal : tune soft_max number of threads (whisper/0)
ggerganov May 13, 2024
2022675
sync : ggml
ggerganov May 14, 2024
4bc6f6e
metal : support FA without mask + add asserts (#7278)
ggerganov May 14, 2024
02f4122
script : sync ggml-rpc
ggerganov May 14, 2024
53332ff
server bench: fix bench not waiting for model load (#7284)
JohannesGaessler May 15, 2024
79bc1ea
ggml : add `ggml_upscale_ext` (ggml/814)
balisujohn May 15, 2024
4aae3a5
sync : ggml
ggerganov May 15, 2024
f3e8fc1
embedding : free the batch after execution (#7297)
dm4 May 15, 2024
da26e4d
Add missing " (#7303)
AidanBeltonS May 15, 2024
6fb91c1
ggml : tag ggml_tensor::backend as deprecated (#7290)
slaren May 15, 2024
dda1347
Avoid unnecessarily disabling CUDA graphs (#7302)
agray3 May 15, 2024
d1e2b6e
ggml : use dynamic thread scheduling for matrix multiplication (#6915)
kunnis May 15, 2024
41b9e5c
readme : remove stray double quote (#7310)
danbev May 15, 2024
b953ca3
Add support for properly optimized Windows ARM64 builds with LLVM and…
max-krasnyansky May 16, 2024
ad34bee
ci: fix bin/Release path for windows-arm64 builds (#7317)
max-krasnyansky May 16, 2024
a8d948c
doc: add references to hugging face GGUF-my-repo quantisation web too…
Vaibhavs10 May 16, 2024
d0a9c31
grammar, json, llama: replace push on emplace if it possible (#7273)
GermanAizek May 16, 2024
c7a926f
convert : get general.name from model dir, not its parent (#5615)
cebtenzzre May 16, 2024
3d210da
rpc : add command line arg for specifying backend memory
rgerganov May 15, 2024
99d5b28
rpc : get available mem for the CPU backend
rgerganov May 15, 2024
657f980
Revert "server bench: fix bench not waiting for model load (#7284)" (…
phymbert May 16, 2024
cd0e3d5
[Server] Added --verbose option to README [no ci] (#7335)
reuank May 17, 2024
e7c7ae8
patch: Add pre-tokenizer metadata to phi-2
teleprint-me May 17, 2024
9a81faf
patch: Fix jina vocab generation
teleprint-me May 17, 2024
8aa4937
feat: Make number of experts configurable
teleprint-me May 17, 2024
a7e0042
chore: Update gguf vocabularies
teleprint-me May 17, 2024
9269594
Merge branch 'master' into add-stablelm-hash
teleprint-me May 17, 2024
29 changes: 21 additions & 8 deletions convert-hf-to-gguf-update.py
@@ -23,14 +23,14 @@
# TODO: automate the update of convert-hf-to-gguf.py
#

import json
import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
from hashlib import sha256

import requests
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
@@ -65,6 +65,13 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
{"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-tokenizer", },
{"name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
{"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
{"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
]
@@ -290,12 +297,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")
shscript = "#!/usr/bin/env bash\n\n"

for model in models:
name = model["name"]
tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
shscript += tmpline
logging.info(tmpline.strip())

print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
with open("generate-vocab.sh", "w", encoding="utf-8") as f:
f.writelines(shscript)
logging.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")

logger.info("\n")
logging.info("Run the following command to generate the vocab files for testing:")
logging.info("Enable execution: chmod +x generate-vocab.sh")
logging.info("Execute with ./generate-vocab.sh")
30 changes: 27 additions & 3 deletions convert-hf-to-gguf.py
@@ -2,18 +2,27 @@

from __future__ import annotations

import logging
import argparse
import contextlib
import json
import logging
import os
import re
import sys
from abc import ABC, abstractmethod
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
ContextManager,
Iterator,
Sequence,
TypeVar,
cast,
)

import numpy as np
import torch
@@ -308,6 +317,21 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
# ref: https://huggingface.co/microsoft/phi-1
res = "phi"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
res = "stablelm"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen-tokenizer
res = "qwen"
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
# ref: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
res = "mistral-bpe"
if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
# ref: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
res = "mixtral-bpe"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
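One detail worth noticing in the hunk above: the mistral-bpe and mixtral-bpe branches test the same checksum (e750a9b1...), which follows from Mistral and Mixtral shipping the same BPE tokenizer; in a plain if-chain the later assignment wins, so res ends up as "mixtral-bpe" for both. A table form makes such collisions easier to spot; a minimal, self-contained sketch (the dict and helper are hypothetical, the hashes are copied from the diff):

# Hypothetical table form of the if-chain in get_vocab_base_pre.
# A linter flags duplicate dict keys here, whereas the if-chain
# silently resolves to whichever branch runs last.
CHKHSH_TO_PRE: dict[str, str] = {
    "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": "phi",
    "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": "stablelm",
    "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": "qwen",
}

def resolve_pre(chkhsh: str) -> str:
    try:
        return CHKHSH_TO_PRE[chkhsh]
    except KeyError:
        # mirrors the converter's behaviour for unrecognized tokenizers
        raise NotImplementedError(f"unknown pre-tokenizer checksum: {chkhsh}")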
21 changes: 21 additions & 0 deletions generate-vocab.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

python3 convert-hf-to-gguf.py models/tokenizers/llama-spm/ --outfile models/ggml-vocab-llama-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/llama-bpe/ --outfile models/ggml-vocab-llama-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/phi-3/ --outfile models/ggml-vocab-phi-3.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/deepseek-llm/ --outfile models/ggml-vocab-deepseek-llm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/deepseek-coder/ --outfile models/ggml-vocab-deepseek-coder.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/falcon/ --outfile models/ggml-vocab-falcon.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/bert-bge/ --outfile models/ggml-vocab-bert-bge.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mpt/ --outfile models/ggml-vocab-mpt.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/starcoder/ --outfile models/ggml-vocab-starcoder.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/gpt-2/ --outfile models/ggml-vocab-gpt-2.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/phi/ --outfile models/ggml-vocab-phi.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/stablelm/ --outfile models/ggml-vocab-stablelm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/qwen/ --outfile models/ggml-vocab-qwen.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mistral-bpe/ --outfile models/ggml-vocab-mistral-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mistral-spm/ --outfile models/ggml-vocab-mistral-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe/ --outfile models/ggml-vocab-mixtral-bpe.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm/ --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/refact/ --outfile models/ggml-vocab-refact.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/command-r/ --outfile models/ggml-vocab-command-r.gguf --vocab-only
Binary file modified models/ggml-vocab-stablelm.gguf
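
Once generate-vocab.sh has run, a generated vocab-only file can be spot-checked by reading the pre-tokenizer metadata back. A hedged sketch using the gguf-py reader vendored in this repository (the field-access pattern is an assumption about the reader API of this period):

from gguf import GGUFReader  # gguf-py package, vendored under gguf-py/

reader = GGUFReader("models/ggml-vocab-stablelm.gguf")
field = reader.fields.get("tokenizer.ggml.pre")
if field is None:
    print("no pre-tokenizer metadata in this vocab file")
else:
    # string fields keep their payload in parts; data[0] indexes the value
    print(bytes(field.parts[field.data[0]]).decode("utf-8"))  # e.g. "stablelm"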