Merge remote-tracking branch 'origin/master' into bins

ggerganov · Jun 10, 2024 · daeaeb1 · daeaeb1
2 parents 5265c15 + fd5ea0f
commit daeaeb1
Show file tree

Hide file tree

Showing 30 changed files with 1,118 additions and 492 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
@@ -0,0 +1,5 @@
+- Self Reported Review Complexity:
+ - [ ] Review Complexity : Low
+ - [ ] Review Complexity : Medium
+ - [ ] Review Complexity : High
+- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
@@ -16,11 +16,9 @@ on:
  branches:
  - master
  paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- pull_request_target:
+ pull_request:
  types: [opened, synchronize, reopened]
  paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- schedule:
- - cron: '2 4 * * *'
 
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -115,7 +113,7 @@ jobs:
 
 
  server-windows:
- runs-on: windows-latest
+ runs-on: windows-2019
 
  steps:
  - name: Clone

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,14 @@
+# Contributing Guidelines
+
+## Checklist
+
+* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
+* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+* Execute [the full CI locally on your machine](ci/README.md) before publishing
+
+## PR formatting
+
+* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+ - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
+* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
+* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
diff --git a/README.md b/README.md
@@ -53,7 +53,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
   <li><a href="#quantization">Quantization</a></li>
   <li><a href="#interactive-mode">Interactive mode</a></li>
   <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-  <li><a href="#instruct-mode">Instruct mode</a></li>
   <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
   <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
   <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -769,34 +768,6 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
 
 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
-### Instruct mode
-
-1. First, download and place the `ggml` model into the `./models` folder
-2. Run the `main` tool like this:
-
-```
-./examples/alpaca.sh
-```
-
-Sample run:
-
-```
-== Running in interactive mode. ==
- - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMA.
- - If you want to submit another line, end your input in '\'.
-
- Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-> How many letters are there in the English alphabet?
-There 26 letters in the English Alphabet
-> What is the most common way of transportation in Amsterdam?
-The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
-> List 5 words that start with "ca".
-cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
->
-```
-
 ### Obtaining and using the Facebook LLaMA 2 model
 
 - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.

diff --git a/common/common.cpp b/common/common.cpp
@@ -200,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
  }
  params.hf_file = params.model;
  } else if (params.model.empty()) {
- std::string cache_directory = fs_get_cache_directory();
- const bool success = fs_create_directory_with_parents(cache_directory);
- if (!success) {
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
- }
- params.model = cache_directory + string_split(params.hf_file, '/').back();
+ params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
  }
  } else if (!params.model_url.empty()) {
  if (params.model.empty()) {
  auto f = string_split(params.model_url, '#').front();
  f = string_split(f, '?').front();
- f = string_split(f, '/').back();
- params.model = "models/" + f;
+ params.model = fs_get_cache_file(string_split(f, '/').back());
  }
  } else if (params.model.empty()) {
  params.model = DEFAULT_MODEL_PATH;
@@ -2279,6 +2273,16 @@ std::string fs_get_cache_directory() {
  return ensure_trailing_slash(cache_directory);
 }
 
+std::string fs_get_cache_file(const std::string & filename) {
+ GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+ std::string cache_directory = fs_get_cache_directory();
+ const bool success = fs_create_directory_with_parents(cache_directory);
+ if (!success) {
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
+ }
+ return cache_directory + filename;
+}
+
 
 //
 // Model utils

diff --git a/common/common.h b/common/common.h
@@ -277,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);
 
 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
 
 //
 // Model utils

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -47,11 +47,12 @@ class Model:
  _model_classes: dict[str, type[Model]] = {}
 
  dir_model: Path
- ftype: int
+ ftype: gguf.LlamaFileType
  is_big_endian: bool
  endianess: gguf.GGUFEndian
  use_temp_file: bool
  lazy: bool
+ model_name: str | None
  part_names: list[str]
  is_safetensors: bool
  hparams: dict[str, Any]
@@ -64,7 +65,7 @@ class Model:
  # subclasses should define this!
  model_arch: gguf.MODEL_ARCH
 
- def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
  self.dir_model = dir_model
@@ -73,10 +74,11 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
  self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
  self.use_temp_file = use_temp_file
  self.lazy = not eager
- self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+ self.model_name = model_name
+ self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
  self.is_safetensors = len(self.part_names) > 0
  if not self.is_safetensors:
- self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+ self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
  self.hparams = Model.load_hparams(self.dir_model)
  self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -94,7 +96,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
  ftype_lw: str = ftype_up.lower()
  # allow templating the file name with the output ftype, useful with the "auto" ftype
  self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
- self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
 
  @classmethod
  def __init_subclass__(cls):
@@ -182,7 +184,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
  return new_name
 
  def set_gguf_parameters(self):
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_block_count(self.block_count)
 
  if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -324,21 +326,21 @@ def write_tensors(self):
 
  def write(self):
  self.write_tensors()
- self.gguf_writer.write_header_to_file()
+ self.gguf_writer.write_header_to_file(self.fname_out)
  self.gguf_writer.write_kv_data_to_file()
  self.gguf_writer.write_tensors_to_file(progress=True)
  self.gguf_writer.close()
 
  def write_vocab(self):
- self.gguf_writer.write_header_to_file()
+ self.gguf_writer.write_header_to_file(self.fname_out)
  self.gguf_writer.write_kv_data_to_file()
  self.gguf_writer.close()
 
  @staticmethod
- def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
  part_names: list[str] = []
  for filename in os.listdir(dir_model):
- if filename.endswith(suffix):
+ if filename.startswith(prefix) and filename.endswith(suffix):
  part_names.append(filename)
 
  part_names.sort()
@@ -665,7 +667,7 @@ class GPTNeoXModel(Model):
  def set_gguf_parameters(self):
  block_count = self.hparams["num_hidden_layers"]
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
  self.gguf_writer.add_block_count(block_count)
@@ -798,7 +800,7 @@ def set_vocab(self):
 
  def set_gguf_parameters(self):
  block_count = self.hparams["n_layers"]
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
  self.gguf_writer.add_embedding_length(self.hparams["d_model"])
  self.gguf_writer.add_block_count(block_count)
@@ -850,7 +852,7 @@ def set_gguf_parameters(self):
  raise ValueError("gguf: can not find ctx length parameter.")
 
  self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_source_hf_repo(hf_repo)
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
  self.gguf_writer.add_context_length(ctx_length)
@@ -887,7 +889,7 @@ def set_gguf_parameters(self):
  else:
  raise ValueError("gguf: can not find ctx length parameter.")
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_source_hf_repo(hf_repo)
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
  self.gguf_writer.add_context_length(ctx_length)
@@ -1010,7 +1012,7 @@ def set_gguf_parameters(self):
  else:
  raise ValueError("gguf: can not find ctx length parameter.")
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_source_hf_repo(hf_repo)
  self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
  self.gguf_writer.add_context_length(ctx_length)
@@ -1206,7 +1208,7 @@ def set_gguf_parameters(self):
  hparams = self.hparams
  block_count = hparams["num_hidden_layers"]
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
  self.gguf_writer.add_block_count(block_count)
@@ -1681,7 +1683,7 @@ class GPT2Model(Model):
  model_arch = gguf.MODEL_ARCH.GPT2
 
  def set_gguf_parameters(self):
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_block_count(self.hparams["n_layer"])
  self.gguf_writer.add_context_length(self.hparams["n_ctx"])
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -2248,7 +2250,7 @@ def set_gguf_parameters(self):
  hparams = self.hparams
  block_count = hparams["num_hidden_layers"]
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
  self.gguf_writer.add_block_count(block_count)
@@ -2348,7 +2350,7 @@ def set_gguf_parameters(self):
  # Fail early for models which don't have a block expansion factor of 2
  assert d_inner == 2 * d_model
 
- self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
  self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
  self.gguf_writer.add_embedding_length(d_model)
  self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2852,7 +2854,7 @@ def main() -> None:
  logger.error(f"Model {hparams['architectures'][0]} is not supported")
  sys.exit(1)
 
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
 
  logger.info("Set model parameters")
  model_instance.set_gguf_parameters()

diff --git a/examples/alpaca.sh b/examples/alpaca.sh
diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh