
Commit

Merge branch 'ggerganov:master' into avx_iq
netrunnereve committed Jun 11, 2024
2 parents b7e1707 + 73bac2b commit 2f37328
Showing 30 changed files with 2,468 additions and 1,498 deletions.
@@ -2,4 +2,4 @@
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- - [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
+ - [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
9 changes: 5 additions & 4 deletions .github/workflows/build.yml
@@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+ paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -684,7 +684,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
- runs-on: windows-latest
+ runs-on: windows-2019

env:
OPENBLAS_VERSION: 0.3.23
@@ -829,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip

windows-latest-cmake-cuda:
- runs-on: windows-latest
+ runs-on: windows-2019

strategy:
matrix:
@@ -843,8 +843,9 @@
with:
fetch-depth: 0

- - uses: Jimver/[email protected]
+ - name: Install CUDA toolkit
  id: cuda-toolkit
+ uses: Jimver/[email protected]
with:
cuda: ${{ matrix.cuda }}
method: 'network'
6 changes: 2 additions & 4 deletions .github/workflows/server.yml
@@ -16,11 +16,9 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- pull_request_target:
+ pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- schedule:
- - cron: '2 4 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -115,7 +113,7 @@ jobs:
server-windows:
- runs-on: windows-latest
+ runs-on: windows-2019

steps:
- name: Clone
31 changes: 15 additions & 16 deletions CMakeLists.txt
@@ -402,12 +402,26 @@ if (LLAMA_CUBLAS)
endif()

if (LLAMA_CUDA)
- cmake_minimum_required(VERSION 3.17)
+ cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")

+ if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+ # 52 == lowest CUDA 12 standard
+ # 60 == f16 CUDA intrinsics
+ # 61 == integer CUDA intrinsics
+ # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+ if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+ else()
+ set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+ #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+ endif()
+ endif()
+ message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

enable_language(CUDA)

set(GGML_HEADERS_CUDA ggml-cuda.h)
@@ -472,21 +486,6 @@ if (LLAMA_CUDA)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()

- if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
- # 52 == lowest CUDA 12 standard
- # 60 == f16 CUDA intrinsics
- # 61 == integer CUDA intrinsics
- # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
- if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
- set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
- else()
- set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
- #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
- endif()
- endif()
- message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "CUDA not found")
endif()
29 changes: 0 additions & 29 deletions README.md
@@ -53,7 +53,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -769,34 +768,6 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

### Instruct mode

1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:

```
./examples/alpaca.sh
```

Sample run:

```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Obtaining and using the Facebook LLaMA 2 model

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
4 changes: 2 additions & 2 deletions common/json-schema-to-grammar.cpp
@@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}

- const std::string SPACE_RULE = "\" \"?";
+ const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";

struct BuiltinRule {
std::string content;
@@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
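Taken together, the two grammar changes above tighten string handling while relaxing whitespace: the `char` rule now rejects raw DEL (0x7F) and C0 control bytes (0x00-0x1F) inside JSON strings and drops the `/` escape, while `SPACE_RULE` now matches nothing, a single space, or a newline followed by up to 20 spaces/tabs. A minimal C++ sketch of the new literal-character class (the helper name and sample values are illustrative, not part of the diff):

```cpp
#include <cstdio>

// Mirrors only the first alternative of the updated `char` rule: any byte
// except '"', '\', DEL (0x7F) and the C0 control range 0x00-0x1F.
// Escaped forms (\", \\, \b, \f, \n, \r, \t, \uXXXX) are covered by the
// rule's second alternative and are not modelled here.
static bool is_plain_string_char(unsigned char c) {
    return c != '"' && c != '\\' && c != 0x7F && c >= 0x20;
}

int main() {
    std::printf("'a'  -> %d\n", is_plain_string_char('a'));   // 1: still allowed
    std::printf("'\\n' -> %d\n", is_plain_string_char('\n'));  // 0: must be escaped now
    std::printf("0x7F -> %d\n", is_plain_string_char(0x7F));   // 0: DEL is rejected
    return 0;
}
```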
19 changes: 0 additions & 19 deletions examples/alpaca.sh

This file was deleted.

15 changes: 0 additions & 15 deletions examples/gpt4all.sh

This file was deleted.

7 changes: 3 additions & 4 deletions examples/json_schema_to_grammar.py
@@ -29,9 +29,8 @@ def __init__(self, content: str, deps: list = None):
self.content = content
self.deps = deps or []

- # whitespace is constrained to a single space char to prevent model "running away" in
- # whitespace. Also maybe improves generation quality?
- SPACE_RULE = '" "?'
+ # Constraining spaces to prevent model "running away".
+ SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'

PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
@@ -43,7 +42,7 @@ def __init__(self, content: str, deps: list = None):
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
- 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
+ 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
21 changes: 21 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") {
return 13;
}
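The added branches above pin a fixed column width for each of the newer benchmark fields so the markdown table stays aligned. A hedged sketch of how such a per-field width is typically consumed when printing a cell (the padding helper and sample values are illustrative, not code from llama-bench):

```cpp
#include <cstdio>
#include <string>

// Right-align a value within a cell of the given width, the usual way a
// markdown table printer uses per-field widths like the ones added above.
static std::string pad(const std::string & value, size_t width) {
    if (value.size() >= width) return value;
    return std::string(width - value.size(), ' ') + value;
}

int main() {
    // e.g. "n_batch" was assigned width 7 and "flash_attn" width 2
    std::printf("| %s | %s |\n", pad("n_batch", 7).c_str(), pad("fa", 2).c_str());
    std::printf("| %s | %s |\n", pad("2048", 7).c_str(), pad("1", 2).c_str());
    return 0;
}
```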
18 changes: 0 additions & 18 deletions examples/llama2-13b.sh

This file was deleted.

18 changes: 0 additions & 18 deletions examples/llama2.sh

This file was deleted.

4 changes: 2 additions & 2 deletions examples/server/public/json-schema-to-grammar.mjs
@@ -1,5 +1,5 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
- const SPACE_RULE = '" "?';
+ const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';

function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
@@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
- char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
+ char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
30 changes: 16 additions & 14 deletions examples/server/server.cpp
@@ -147,7 +147,7 @@ struct server_slot {
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;

- json prompt;
+ std::string prompt;

// when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens;
@@ -822,13 +822,8 @@ struct server_context {
continue;
}

- // skip the slot if it does not contains prompt
- if (!slot.prompt.is_string()) {
- continue;
- }
-
// current slot's prompt
- std::string slot_prompt = slot.prompt.get<std::string>();
+ std::string slot_prompt = slot.prompt;

// length of the current slot's prompt
int slot_prompt_len = slot_prompt.size();
@@ -958,13 +953,16 @@ struct server_context {
if (!task.infill) {
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
return false;
} else {
slot.prompt = *prompt;
}
if (slot.prompt.is_array() && slot.prompt.size() == 0) {
send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);

if (prompt->is_string()) {
slot.prompt = prompt->get<std::string>();
} else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
slot.prompt = prompt->at(0).get<std::string>();
} else {
send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
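With `slot.prompt` now a plain `std::string`, the reworked check above accepts `"prompt"` only as a JSON string or a one-element array holding a string, and rejects anything else up front. A minimal, self-contained sketch of that validation (assumes the nlohmann::json header already used by the server is available; `extract_prompt` is an illustrative name, not a function in the codebase):

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Returns true and fills `out` if `prompt` is a string or a one-element
// array holding a string, mirroring the check added in server.cpp.
static bool extract_prompt(const json & prompt, std::string & out) {
    if (prompt.is_string()) {
        out = prompt.get<std::string>();
        return true;
    }
    if (prompt.is_array() && prompt.size() == 1 && prompt.at(0).is_string()) {
        out = prompt.at(0).get<std::string>();
        return true;
    }
    return false;
}

int main() {
    std::string out;
    std::cout << extract_prompt(json("hello"), out) << "\n";          // 1: accepted
    std::cout << extract_prompt(json::array({"hello"}), out) << "\n"; // 1: accepted
    std::cout << extract_prompt(json::array({1, 2}), out) << "\n";    // 0: rejected
    return 0;
}
```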
@@ -1582,14 +1580,18 @@ struct server_context {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
{
- int id_slot = json_value(task.data, "id_slot", -1);
- std::string prompt = json_value(task.data, "prompt", std::string());
+ const int id_slot = json_value(task.data, "id_slot", -1);

server_slot * slot;

if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
+ std::string prompt;
+ if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
+ prompt = json_value(task.data, "prompt", std::string());
+ }

slot = get_available_slot(prompt);
}

2 changes: 1 addition & 1 deletion ggml-alloc.c
@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
- ggml_backend_buffer_free(*buffers[i]);
+ ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
return false;
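The one-character ggml-alloc.c fix above corrects an operator-precedence bug: `buffers` is a pointer to a pointer, and `*buffers[i]` parses as `*(buffers[i])`, indexing `buffers` itself instead of the array it points to, which is only valid for `i == 0`. A stand-alone illustration with plain ints (types and values are illustrative only):

```cpp
#include <cstdio>

int main() {
    int data[3] = {10, 20, 30};
    int *ptr = data;
    int **pp = &ptr;   // same shape as the `buffers` parameter: pointer to pointer

    // *pp[i] parses as *(pp[i]): it indexes pp itself, so only i == 0 is valid.
    std::printf("*pp[0]   = %d\n", *pp[0]);    // 10; *pp[1] would be undefined behaviour
    // (*pp)[i] dereferences first, then indexes the pointed-to array -- the intended access.
    std::printf("(*pp)[2] = %d\n", (*pp)[2]);  // 30
    return 0;
}
```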
