feat(mlx-lm): export the GGUF (fp16) format model weights from fuse.py (#555)

* wip

* wip

* feat: convert mlx model to gguf f16

* chore: convert norm layer to float32 to avoid overflow issue

* chore: add support for mixtral

* chore: clean up

* chore: remove unused import statement

* chore: clean up weight name mapping

* version and readme

* actual version bump

---------

Co-authored-by: Awni Hannun <[email protected]>
mzbac and awni committed Mar 21, 2024
1 parent 8f906c8 commit fe96ef3
Showing 4 changed files with 351 additions and 6 deletions.
23 changes: 19 additions & 4 deletions llms/mlx_lm/LORA.md
@@ -9,6 +9,7 @@ LoRA (QLoRA).[^qlora] LoRA fine-tuning works with the following model families:
- Phi2
- Mixtral
- Qwen2
- Gemma
- OLMo

## Contents
@@ -17,7 +18,7 @@ LoRA (QLoRA).[^qlora] LoRA fine-tuning works with the following model families:
* [Fine-tune](#Fine-tune)
* [Evaluate](#Evaluate)
* [Generate](#Generate)
* [Fuse and Upload](#Fuse-and-Upload)
* [Fuse](#Fuse)
* [Data](#Data)
* [Memory Issues](#Memory-Issues)

@@ -93,11 +94,14 @@ python -m mlx_lm.generate \
--prompt "<your_model_prompt>"
```

## Fuse and Upload
## Fuse

You can generate a model fused with the low-rank adapters using the
`mlx_lm.fuse` command. This command also allows you to upload the fused model
to the Hugging Face Hub.
`mlx_lm.fuse` command. This command also allows you to optionally:

- Upload the fused model to the Hugging Face Hub.
- Export the fused model to GGUF. Note GGUF support is limited to Mistral,
Mixtral, and Llama style models in fp16 precision.

To see supported options run:

@@ -127,6 +131,17 @@ python -m mlx_lm.fuse \
--hf-path mistralai/Mistral-7B-v0.1
```

To export a fused model to GGUF, run:

```shell
python -m mlx_lm.fuse \
--model mistralai/Mistral-7B-v0.1 \
--export-gguf
```

This will save the GGUF model in `lora_fused_model/ggml-model-f16.gguf`. You
can specify the file name with `--gguf-path`.
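
For example, to write the GGUF weights under a custom file name (the name below is illustrative):

```shell
python -m mlx_lm.fuse \
    --model mistralai/Mistral-7B-v0.1 \
    --export-gguf \
    --gguf-path mistral-7b-fused-f16.gguf
```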

## Data

The LoRA command expects you to provide a dataset with `--data`. The MLX
21 changes: 20 additions & 1 deletion llms/mlx_lm/fuse.py
@@ -3,10 +3,10 @@
import json
import shutil
from pathlib import Path
from typing import Any, Dict, Union

from mlx.utils import tree_flatten, tree_unflatten

from .gguf import convert_to_gguf
from .tuner.lora import LoRALinear
from .tuner.utils import apply_lora_layers, dequantize
from .utils import (
@@ -53,6 +53,17 @@ def parse_arguments() -> argparse.Namespace:
help="Generate a de-quantized model.",
action="store_true",
)
parser.add_argument(
"--export-gguf",
help="Export model weights in GGUF format.",
action="store_true",
)
parser.add_argument(
"--gguf-path",
help="Path to save the exported GGUF format model weights. Default is ggml-model-f16.gguf.",
default="ggml-model-f16.gguf",
type=str,
)
return parser.parse_args()


@@ -95,6 +106,14 @@ def main() -> None:

save_config(config, config_path=save_path / "config.json")

if args.export_gguf:
model_type = config["model_type"]
if model_type not in ["llama", "mixtral", "mistral"]:
raise ValueError(
f"Model type {model_type} not supported for GGUF conversion."
)
convert_to_gguf(model_path, weights, config, str(save_path / args.gguf_path))

if args.upload_repo is not None:
hf_path = args.hf_path or (
args.model if not Path(args.model).exists() else None
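
The commit messages above note that norm layers are converted to float32 to avoid overflow while the rest of the fused weights are exported in fp16. A minimal sketch of that idea (the helper name and the `"norm"` substring check are assumptions for illustration, not the actual `mlx_lm.gguf` code):

```python
# Illustrative sketch of the dtype handling described in the commit messages:
# keep most fused weights in fp16 for GGUF export, but promote norm layers to
# float32 to avoid overflow. This is NOT the actual mlx_lm.gguf implementation.
import mlx.core as mx
from mlx.utils import tree_flatten


def prepare_weights_for_gguf(model) -> dict:
    """Return a flat {name: array} dict with GGUF-friendly dtypes."""
    weights = dict(tree_flatten(model.parameters()))
    prepared = {}
    for name, value in weights.items():
        if "norm" in name:
            # Norm scales are tiny tensors; float32 costs little and is safer.
            prepared[name] = value.astype(mx.float32)
        else:
            prepared[name] = value.astype(mx.float16)
    return prepared
```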
