# feat(forge/llm): Add LlamafileProvider #7091

Status: Open. k8si wants to merge 20 commits into `Significant-Gravitas:master` from `Mozilla-Ocho:draft-llamafile-support`.
Diff stats: +486 −3
## Commits (20)
- `03d8e1e` Add minimal implementation of LlamafileProvider, a new ChatModelProvi… (k8si)
- `ed1dfd0` Adapt model prompt message roles to be compatible with the Mistral-7b… (k8si)
- `c56c290` In `OpenAIProvider`, change methods `count_message_tokens`, `count_to… (k8si)
- `234d059` misc cleanup (k8si)
- `05d2b81` add README for llamafile integration including setup instruction + no… (k8si)
- `1cd3e8b` simplify mistral message handling; set seed=0 in chat completion kwar… (k8si)
- `dc36c69` set mistral max_tokens to actual value configured in the model and ch… (k8si)
- `e426766` Merge branch 'master' into draft-llamafile-support (k8si)
- `d63aa23` Merge branch 'master' into draft-llamafile-support (Pwuts)
- `7e7037d` remove llamafile stuff from openai.py (Pwuts)
- `3c1f283` Merge branch 'master' into draft-llamafile-support (Pwuts)
- `5d0f8b0` fix linting errors (Pwuts)
- `960155a` Create `BaseOpenAIProvider` with common functionality from `OpenAIPro… (Pwuts)
- `7aed930` Merge branch 'master' into draft-llamafile-support (Pwuts)
- `02d0691` Merge branch 'master' into draft-llamafile-support (Pwuts)
- `f53c2de` move llamafile stuff into folders (Pwuts)
- `f78ad94` clean up llamafile readme (Pwuts)
- `1a00ecf` Improve llamafile model name cleaning logic (Pwuts)
- `3c8bf3c` expand setup instructions and info for llamafile (Pwuts)
- `65433ba` combine llamafile setup.sh and serve.sh into single cross-platform se… (Pwuts)
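Commit `ed1dfd0` adapts message roles, presumably because Mistral-7B-Instruct's chat template accepts only alternating `user`/`assistant` turns and has no `system` role. The snippet below is a minimal sketch of that general technique, not the PR's actual code; the helper name `adapt_roles` is made up for illustration.

```python
# Sketch of the role-adaptation idea behind commit ed1dfd0 (not the PR's code):
# fold "system" messages into user turns and merge consecutive same-role
# messages, since Mistral's instruct template only allows alternating
# user/assistant roles.
def adapt_roles(messages: list[dict[str, str]]) -> list[dict[str, str]]:
    adapted: list[dict[str, str]] = []
    for msg in messages:
        role = "user" if msg["role"] == "system" else msg["role"]
        if adapted and adapted[-1]["role"] == role:
            # Merge into the previous message instead of repeating the role.
            adapted[-1]["content"] += "\n\n" + msg["content"]
        else:
            adapted.append({"role": role, "content": msg["content"]})
    return adapted


print(adapt_roles([
    {"role": "system", "content": "You are AutoGPT."},
    {"role": "user", "content": "List three colors."},
]))
# -> [{'role': 'user', 'content': 'You are AutoGPT.\n\nList three colors.'}]
```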
## Files changed

(Some files in the diff failed to load and are not shown here.)

### `.gitignore` (new, +2)

Ignore downloaded llamafile binaries:

```
*.llamafile
*.llamafile.exe
```
### `scripts/llamafile/serve.py` (new, +65)

```python
#!/usr/bin/env python3
"""
Use llamafile to serve a (quantized) mistral-7b-instruct-v0.2 model

Usage:
    cd <repo-root>/autogpt
    ./scripts/llamafile/serve.py
"""

import os
import platform
import subprocess
from pathlib import Path

LLAMAFILE = Path(
    "mistral-7b-instruct-v0.2.Q5_K_M.llamafile"
    + (".exe" if platform.system() == "Windows" else "")
)


def report_download_progress(chunk_number: int, chunk_size: int, total_size: int):
    if total_size != -1:
        downloaded_size = chunk_number * chunk_size
        percent = min(1, downloaded_size / total_size)
        bar = "#" * int(40 * percent)
        print(
            f"\rDownloading: [{bar:<40}] {percent:.0%}"
            f" - {downloaded_size/1e6:.1f}/{total_size/1e6:.1f} MB",
            end="",
        )


def download_llamafile():
    print(f"Downloading {LLAMAFILE.name}...")
    import urllib.request

    url = "https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile"  # noqa

    urllib.request.urlretrieve(url, LLAMAFILE.name, reporthook=report_download_progress)
    print()

    LLAMAFILE.chmod(0o755)
    subprocess.run([LLAMAFILE, "--version"], check=True)

    print(
        "\n"
        "NOTE: To use other models besides mistral-7b-instruct-v0.2, "
        "download them into autogpt/scripts/llamafile/"
    )


# Go to autogpt/scripts/llamafile/
os.chdir(Path(__file__).resolve().parent)

if not LLAMAFILE.is_file():
    download_llamafile()

subprocess.run(
    [LLAMAFILE, "--server", "--nobrowser", "--ctx-size", "0", "--n-predict", "1024"],
    check=True,
)

# note: --ctx-size 0 means the prompt context size will be set directly from the
# underlying model configuration. This may cause slow response times or consume
# a lot of memory.
```
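Once `serve.py` has the server running, a quick way to check it from Python (not part of this PR) is to call the OpenAI-compatible chat completions endpoint that llamafile exposes. Port 8080 is llamafile's documented default, and the model name below is only illustrative:

```python
# Minimal smoke test for a running llamafile server (not part of this PR).
# Assumes llamafile's default port 8080 and its OpenAI-compatible API.
import json
import urllib.request

payload = {
    "model": "mistral-7b-instruct-v0.2",  # illustrative; the server serves one model
    "messages": [{"role": "user", "content": "Say hello in one word."}],
    "max_tokens": 16,
}
request = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    reply = json.load(response)
print(reply["choices"][0]["message"]["content"])
```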
### Llamafile integration README (new, +36)

````markdown
# Llamafile Integration Notes

Tested with:
* Python 3.11
* Apple M2 Pro (32 GB), macOS 14.2.1
* quantized mistral-7b-instruct-v0.2

## Setup

Download a `mistral-7b-instruct-v0.2` llamafile:
```shell
wget -nc https://huggingface.co/jartine/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.llamafile
chmod +x mistral-7b-instruct-v0.2.Q5_K_M.llamafile
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --version
```

Run the llamafile server:
```shell
LLAMAFILE="./mistral-7b-instruct-v0.2.Q5_K_M.llamafile"

"${LLAMAFILE}" \
--server \
--nobrowser \
--ctx-size 0 \
--n-predict 1024

# note: ctx-size=0 means the prompt context size will be set directly from the
# underlying model configuration. This may cause slow response times or consume
# a lot of memory.
```

## TODOs

* `SMART_LLM`/`FAST_LLM` configuration: Currently, the llamafile server only serves one model at a time. However, there is no reason you can't start multiple llamafile servers on different ports. To support using different models for `smart_llm` and `fast_llm`, you could implement config vars like `LLAMAFILE_SMART_LLM_URL` and `LLAMAFILE_FAST_LLM_URL` that point to different llamafile servers (one serving a 'big model' and one serving a 'fast model').
* Authorization: the `serve.py` script does not set up any authorization for the llamafile server; this can be enabled by adding the `--api-key <some-key>` argument to the server startup command. However, I haven't tested whether the AutoGPT integration works with this feature enabled.
* Test with other models
````
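The multi-server idea in the README's first TODO could look roughly like the sketch below. Everything here is hypothetical: the `LLAMAFILE_SMART_LLM_URL`/`LLAMAFILE_FAST_LLM_URL` variables and model filenames are not implemented by this PR; `--port` and `--api-key` are flags of llamafile's server mode.

```python
# Hypothetical sketch for the SMART_LLM/FAST_LLM TODO: run one llamafile
# server per model on separate ports. None of this is implemented in the PR.
import os
import subprocess

SERVERS = {
    "LLAMAFILE_SMART_LLM_URL": ("./big-model.llamafile", 8080),  # hypothetical file
    "LLAMAFILE_FAST_LLM_URL": ("./fast-model.llamafile", 8081),  # hypothetical file
}

processes = []
for env_var, (llamafile, port) in SERVERS.items():
    # A llamafile server serves a single model, so start one per port.
    # Add "--api-key", "<some-key>" here to cover the authorization TODO.
    proc = subprocess.Popen(
        [llamafile, "--server", "--nobrowser", "--port", str(port)]
    )
    processes.append(proc)
    os.environ[env_var] = f"http://localhost:{port}"  # hypothetical config var

for proc in processes:
    proc.wait()
```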
### Llamafile provider `__init__.py` (new, +17)

```python
from .llamafile import (
    LLAMAFILE_CHAT_MODELS,
    LLAMAFILE_EMBEDDING_MODELS,
    LlamafileCredentials,
    LlamafileModelName,
    LlamafileProvider,
    LlamafileSettings,
)

__all__ = [
    "LLAMAFILE_CHAT_MODELS",
    "LLAMAFILE_EMBEDDING_MODELS",
    "LlamafileCredentials",
    "LlamafileModelName",
    "LlamafileProvider",
    "LlamafileSettings",
]
```
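For reference, a consumer-side import might look like the following. The package path `forge.llm.providers.llamafile` is inferred from the PR title and the "move llamafile stuff into folders" commit, and may not match the final layout exactly:

```python
# Inferred import path; verify against the PR's actual package layout.
from forge.llm.providers.llamafile import (
    LLAMAFILE_CHAT_MODELS,
    LlamafileProvider,
)

# Assuming LLAMAFILE_CHAT_MODELS is a name -> model-info mapping, as in the
# repo's other providers; this assumption is not confirmed by the diff shown.
for name in LLAMAFILE_CHAT_MODELS:
    print(name)
```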
## Review comment

> If this is just an example then fine, but otherwise `LLAMAFILE` could be provided as an argument.