Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use thread pool #400

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ zig-out/
zig-cache/

*.dot

.gitignore/
build-clang/
build-icx/
2 changes: 2 additions & 0 deletions examples/dolly-v2/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -492,6 +493,7 @@ bool dollyv2_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/dolly-v2/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
2 changes: 2 additions & 0 deletions examples/gpt-2/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -425,6 +426,7 @@ bool gpt2_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/gpt-2/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
2 changes: 2 additions & 0 deletions examples/gpt-j/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -421,6 +422,7 @@ bool gptj_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/gpt-j/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
2 changes: 2 additions & 0 deletions examples/gpt-neox/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -472,6 +473,7 @@ bool gpt_neox_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/gpt-neox/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
1 change: 1 addition & 0 deletions examples/mnist/main-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ int mnist_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx_work = ggml_init(params);
Expand Down
1 change: 1 addition & 0 deletions examples/mnist/main-mtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ int mnist_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

struct ggml_context * ctx_work = ggml_init(params);
Expand Down
2 changes: 2 additions & 0 deletions examples/mnist/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) {
/*.mem_size =*/ ctx_size + 1024*1024,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -182,6 +183,7 @@ int mnist_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 2 additions & 0 deletions examples/mpt/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -495,6 +496,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/mpt/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = {0, NULL, false};
struct ggml_init_params params = {0, NULL, false, 1};
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
2 changes: 2 additions & 0 deletions examples/replit/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -472,6 +473,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/replit/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = {0, NULL, false};
struct ggml_init_params params = {0, NULL, false, 1};
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
2 changes: 2 additions & 0 deletions examples/starcoder/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -460,6 +461,7 @@ bool starcoder_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/starcoder/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
3 changes: 3 additions & 0 deletions examples/starcoder/starcoder-mmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -450,6 +451,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
c_params.mem_size = model.cache.buf.size;
c_params.mem_buffer = model.cache.buf.addr;
c_params.no_alloc = false;
c_params.n_threads = 1;

model.cache.ctx = ggml_init(c_params);

Expand Down Expand Up @@ -667,6 +669,7 @@ bool starcoder_eval(
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down
2 changes: 1 addition & 1 deletion examples/whisper/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ int main(int argc, char ** argv) {

// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_init_params params = { 0, NULL, false, 1 };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
Expand Down
6 changes: 6 additions & 0 deletions examples/whisper/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,7 @@ static bool kv_cache_init(
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

cache.ctx = ggml_init(params);
Expand Down Expand Up @@ -777,6 +778,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

cache.ctx = ggml_init(params);
Expand Down Expand Up @@ -1136,6 +1138,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
/*.mem_size =*/ wctx.model.buf->size(),
/*.mem_buffer =*/ wctx.model.buf->data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ 1,
};

model.ctx = ggml_init(params);
Expand Down Expand Up @@ -1456,6 +1459,7 @@ static bool whisper_encode_internal(
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down Expand Up @@ -1935,6 +1939,7 @@ static bool whisper_decode_internal(
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(params);
Expand Down Expand Up @@ -5084,6 +5089,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
/*.n_threads =*/ n_threads,
};

struct ggml_context * ctx0 = ggml_init(gparams);
Expand Down
3 changes: 2 additions & 1 deletion include/ggml/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ extern "C" {
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
int n_threads; // number of threads for the thread pool
};


Expand Down Expand Up @@ -1350,7 +1351,7 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan, void * tpool);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
Expand Down