
Commit

Merge branch 'ggerganov:master' into avx_iq
netrunnereve committed Jun 11, 2024
2 parents b7e1707 + 73bac2b commit 2f37328
Showing 30 changed files with 2,468 additions and 1,498 deletions.
@@ -2,4 +2,4 @@
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- - [ ] I have read the [contributing guidelines](CONTRIBUTING.md)
+ - [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
9 changes: 5 additions & 4 deletions .github/workflows/build.yml
@@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+ paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -684,7 +684,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
- runs-on: windows-latest
+ runs-on: windows-2019

env:
OPENBLAS_VERSION: 0.3.23
@@ -829,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip

windows-latest-cmake-cuda:
- runs-on: windows-latest
+ runs-on: windows-2019

strategy:
matrix:
@@ -843,8 +843,9 @@
with:
fetch-depth: 0

- - uses: Jimver/[email protected]
+ - name: Install CUDA toolkit
  id: cuda-toolkit
+ uses: Jimver/[email protected]
with:
cuda: ${{ matrix.cuda }}
method: 'network'
6 changes: 2 additions & 4 deletions .github/workflows/server.yml
@@ -16,11 +16,9 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- pull_request_target:
+ pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
- schedule:
- - cron: '2 4 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -115,7 +113,7 @@ jobs:
server-windows:
- runs-on: windows-latest
+ runs-on: windows-2019

steps:
- name: Clone
31 changes: 15 additions & 16 deletions CMakeLists.txt
@@ -402,12 +402,26 @@ if (LLAMA_CUBLAS)
endif()

if (LLAMA_CUDA)
- cmake_minimum_required(VERSION 3.17)
+ cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")

+ if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+ # 52 == lowest CUDA 12 standard
+ # 60 == f16 CUDA intrinsics
+ # 61 == integer CUDA intrinsics
+ # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+ if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+ else()
+ set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+ #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
+ endif()
+ endif()
+ message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

enable_language(CUDA)

set(GGML_HEADERS_CUDA ggml-cuda.h)
@@ -472,21 +486,6 @@ if (LLAMA_CUDA)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()

- if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
- # 52 == lowest CUDA 12 standard
- # 60 == f16 CUDA intrinsics
- # 61 == integer CUDA intrinsics
- # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
- if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
- set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
- else()
- set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
- #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
- endif()
- endif()
- message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "CUDA not found")
endif()
29 changes: 0 additions & 29 deletions README.md
@@ -53,7 +53,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -769,34 +768,6 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

### Instruct mode

1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:

```
./examples/alpaca.sh
```

Sample run:

```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Obtaining and using the Facebook LLaMA 2 model

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
4 changes: 2 additions & 2 deletions common/json-schema-to-grammar.cpp
@@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}

- const std::string SPACE_RULE = "\" \"?";
+ const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";

struct BuiltinRule {
std::string content;
@@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
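Taken together, the two grammar changes above tighten string handling while relaxing whitespace: the `char` rule now rejects raw DEL (0x7F) and C0 control bytes (0x00-0x1F) inside JSON strings and drops the `/` escape, while `SPACE_RULE` now matches nothing, a single space, or a newline followed by up to 20 spaces/tabs. A minimal C++ sketch of the new literal-character class (the helper name and sample values are illustrative, not part of the diff):

```cpp
#include <cstdio>

// Mirrors only the first alternative of the updated `char` rule: any byte
// except '"', '\', DEL (0x7F) and the C0 control range 0x00-0x1F.
// Escaped forms (\", \\, \b, \f, \n, \r, \t, \uXXXX) are covered by the
// rule's second alternative and are not modelled here.
static bool is_plain_string_char(unsigned char c) {
    return c != '"' && c != '\\' && c != 0x7F && c >= 0x20;
}

int main() {
    std::printf("'a'  -> %d\n", is_plain_string_char('a'));   // 1: still allowed
    std::printf("'\\n' -> %d\n", is_plain_string_char('\n'));  // 0: must be escaped now
    std::printf("0x7F -> %d\n", is_plain_string_char(0x7F));   // 0: DEL is rejected
    return 0;
}
```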
19 changes: 0 additions & 19 deletions examples/alpaca.sh

This file was deleted.

15 changes: 0 additions & 15 deletions examples/gpt4all.sh

This file was deleted.

7 changes: 3 additions & 4 deletions examples/json_schema_to_grammar.py
@@ -29,9 +29,8 @@ def __init__(self, content: str, deps: list = None):
self.content = content
self.deps = deps or []

- # whitespace is constrained to a single space char to prevent model "running away" in
- # whitespace. Also maybe improves generation quality?
- SPACE_RULE = '" "?'
+ # Constraining spaces to prevent model "running away".
+ SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'

PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
@@ -43,7 +42,7 @@ def __init__(self, content: str, deps: list = None):
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
- 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
+ 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
21 changes: 21 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") {
return 13;
}
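The added branches above pin a fixed column width for each of the newer benchmark fields so the markdown table stays aligned. A hedged sketch of how such a per-field width is typically consumed when printing a cell (the padding helper and sample values are illustrative, not code from llama-bench):

```cpp
#include <cstdio>
#include <string>

// Right-align a value within a cell of the given width, the usual way a
// markdown table printer uses per-field widths like the ones added above.
static std::string pad(const std::string & value, size_t width) {
    if (value.size() >= width) return value;
    return std::string(width - value.size(), ' ') + value;
}

int main() {
    // e.g. "n_batch" was assigned width 7 and "flash_attn" width 2
    std::printf("| %s | %s |\n", pad("n_batch", 7).c_str(), pad("fa", 2).c_str());
    std::printf("| %s | %s |\n", pad("2048", 7).c_str(), pad("1", 2).c_str());
    return 0;
}
```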
18 changes: 0 additions & 18 deletions examples/llama2-13b.sh

This file was deleted.

18 changes: 0 additions & 18 deletions examples/llama2.sh

This file was deleted.

4 changes: 2 additions & 2 deletions examples/server/public/json-schema-to-grammar.mjs
@@ -1,5 +1,5 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
- const SPACE_RULE = '" "?';
+ const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';

function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
@@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
- char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
+ char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
30 changes: 16 additions & 14 deletions examples/server/server.cpp
@@ -147,7 +147,7 @@ struct server_slot {
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;

- json prompt;
+ std::string prompt;

// when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens;
@@ -822,13 +822,8 @@ struct server_context {
continue;
}

- // skip the slot if it does not contains prompt
- if (!slot.prompt.is_string()) {
- continue;
- }
-
// current slot's prompt
- std::string slot_prompt = slot.prompt.get<std::string>();
+ std::string slot_prompt = slot.prompt;

// length of the current slot's prompt
int slot_prompt_len = slot_prompt.size();
@@ -958,13 +953,16 @@ struct server_context {
if (!task.infill) {
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
return false;
} else {
slot.prompt = *prompt;
}
if (slot.prompt.is_array() && slot.prompt.size() == 0) {
send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);

if (prompt->is_string()) {
slot.prompt = prompt->get<std::string>();
} else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
slot.prompt = prompt->at(0).get<std::string>();
} else {
send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
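With `slot.prompt` now a plain `std::string`, the reworked check above accepts `"prompt"` only as a JSON string or a one-element array holding a string, and rejects anything else up front. A minimal, self-contained sketch of that validation (assumes the nlohmann::json header already used by the server is available; `extract_prompt` is an illustrative name, not a function in the codebase):

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Returns true and fills `out` if `prompt` is a string or a one-element
// array holding a string, mirroring the check added in server.cpp.
static bool extract_prompt(const json & prompt, std::string & out) {
    if (prompt.is_string()) {
        out = prompt.get<std::string>();
        return true;
    }
    if (prompt.is_array() && prompt.size() == 1 && prompt.at(0).is_string()) {
        out = prompt.at(0).get<std::string>();
        return true;
    }
    return false;
}

int main() {
    std::string out;
    std::cout << extract_prompt(json("hello"), out) << "\n";          // 1: accepted
    std::cout << extract_prompt(json::array({"hello"}), out) << "\n"; // 1: accepted
    std::cout << extract_prompt(json::array({1, 2}), out) << "\n";    // 0: rejected
    return 0;
}
```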
@@ -1582,14 +1580,18 @@ struct server_context {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
{
- int id_slot = json_value(task.data, "id_slot", -1);
- std::string prompt = json_value(task.data, "prompt", std::string());
+ const int id_slot = json_value(task.data, "id_slot", -1);

server_slot * slot;

if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
+ std::string prompt;
+ if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
+ prompt = json_value(task.data, "prompt", std::string());
+ }

slot = get_available_slot(prompt);
}

2 changes: 1 addition & 1 deletion ggml-alloc.c
@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
- ggml_backend_buffer_free(*buffers[i]);
+ ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
return false;
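The one-character ggml-alloc.c fix above corrects an operator-precedence bug: `buffers` is a pointer to a pointer, and `*buffers[i]` parses as `*(buffers[i])`, indexing `buffers` itself instead of the array it points to, which is only valid for `i == 0`. A stand-alone illustration with plain ints (types and values are illustrative only):

```cpp
#include <cstdio>

int main() {
    int data[3] = {10, 20, 30};
    int *ptr = data;
    int **pp = &ptr;   // same shape as the `buffers` parameter: pointer to pointer

    // *pp[i] parses as *(pp[i]): it indexes pp itself, so only i == 0 is valid.
    std::printf("*pp[0]   = %d\n", *pp[0]);    // 10; *pp[1] would be undefined behaviour
    // (*pp)[i] dereferences first, then indexes the pointed-to array -- the intended access.
    std::printf("(*pp)[2] = %d\n", (*pp)[2]);  // 30
    return 0;
}
```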
