
Commit f87e6ac
Merge branch 'feature/llama_supports_rpc' of github.com:martindevans/llama.cpp into feature/llama_supports_rpc
martindevans committed Jun 9, 2024
2 parents b67c0df + 9b15621 commit f87e6ac
Showing 1 changed file with 46 additions and 23 deletions.
llama.h
@@ -97,7 +97,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_GLM = 4,
     };
 
-    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
+    enum llama_token_type {
         LLAMA_TOKEN_TYPE_UNDEFINED = 0,
         LLAMA_TOKEN_TYPE_NORMAL = 1,
         LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -107,20 +107,6 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE = 6,
     };
 
-    enum llama_token_attr {
-        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
-        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
-        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
-        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
-        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3, // SPECIAL?
-        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
-        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
-        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
-        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
-        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
-        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
-    };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
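The removed llama_token_attr values are bit flags, one attribute per bit, so a single token may carry several at once and callers test them with bitwise AND. A minimal sketch of how the bitmask is consumed, assuming the llama_token_get_attr accessor from the same API revision (it appears on the removed side of a later hunk), with `model` and `tok` as a loaded model and a valid token:

    // Sketch: testing bit-flag token attributes (assumes `model` is a
    // loaded llama_model * and `tok` is a valid llama_token).
    enum llama_token_attr attr = llama_token_get_attr(model, tok);

    if (attr & LLAMA_TOKEN_ATTR_CONTROL) {
        // control/special token, e.g. BOS or EOS
    }
    if (attr & (LLAMA_TOKEN_ATTR_LSTRIP | LLAMA_TOKEN_ATTR_RSTRIP)) {
        // tokenizer strips adjacent whitespace on one or both sides
    }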
@@ -365,9 +351,6 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT = 6,
-
-        // any character (.)
-        LLAMA_GRETYPE_CHAR_ANY = 7,
     };
 
     typedef struct llama_grammar_element {
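For context, a grammar rule is passed to llama.cpp as a flat array of these elements. A sketch encoding the character class [a-zA] from the comment above, assuming the two llama_grammar_element fields (`type`, and `value` holding a Unicode code point for CHAR-family elements) that this hunk truncates:

    // Sketch: the character class [a-zA] as a flat element sequence.
    static const llama_grammar_element char_class[] = {
        { LLAMA_GRETYPE_CHAR,           'a' }, // match 'a' ...
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z' }, // ... through 'z'
        { LLAMA_GRETYPE_CHAR_ALT,       'A' }, // or the single character 'A'
        { LLAMA_GRETYPE_END,            0   }, // end of rule definition
    };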
@@ -430,7 +413,7 @@ extern "C" {
 
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_rpc        (void);
+    LLAMA_API bool llama_supports_rpc        (void); // TMP: https://github.com/ggerganov/llama.cpp/pull/7647#issuecomment-2140234367
     LLAMA_API bool llama_supports_gpu_offload(void);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
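These probes are simple build-capability flags, so an application can report or branch on them before loading a model. A minimal sketch using only the functions declared above:

    #include <stdio.h>
    #include "llama.h"

    // Sketch: report optional runtime capabilities at startup.
    static void print_llama_capabilities(void) {
        printf("mmap:        %s\n", llama_supports_mmap()        ? "yes" : "no");
        printf("mlock:       %s\n", llama_supports_mlock()       ? "yes" : "no");
        printf("rpc:         %s\n", llama_supports_rpc()         ? "yes" : "no");
        printf("gpu offload: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    }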
@@ -442,8 +425,8 @@ extern "C" {
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type llama_vocab_type    (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type     (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -839,7 +822,7 @@ extern "C" {
 
     LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
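A hedged sketch of the intended use: stop a sampling loop on any end-of-generation token rather than comparing against EOS alone. `sample_next` is a hypothetical application-side sampler, and `ctx`, `model`, and `n_predict` are assumed to be set up by the application:

    // Sketch: generation loop that stops on any end-of-generation token.
    for (int i = 0; i < n_predict; ++i) {
        llama_token tok = sample_next(ctx); // hypothetical sampler
        if (llama_token_is_eog(model, tok)) {
            break; // EOS, EOT, or any other end-of-generation token
        }
        // ... emit tok and decode the next batch ...
    }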
@@ -1060,9 +1043,49 @@ extern "C" {
                         llama_token   token);
 
     //
-    // Model split
+    // Beam search
     //
 
+    struct llama_beam_view {
+        const llama_token * tokens;
+
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    LLAMA_API void llama_beam_search(
+                   struct llama_context * ctx,
+         llama_beam_search_callback_fn_t callback,
+                                  void * callback_data,
+                                  size_t n_beams,
+                                 int32_t n_past,
+                                 int32_t n_predict);
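Tying the pieces together, a minimal sketch of a callback that follows the rules documented above: copy the common prefix out while the pointers are still valid, since those tokens are shifted out of every beam before the next call. The collector struct is illustrative, not part of the API:

    #include <string.h>

    // Illustrative sink for tokens that have become common to all beams.
    struct beam_collector {
        llama_token tokens[1024];
        size_t      n;
    };

    // Matches llama_beam_search_callback_fn_t. beams_state is only valid
    // during this synchronous call, so the common prefix is copied out now.
    static void on_beams(void * callback_data, struct llama_beams_state beams_state) {
        struct beam_collector * out = (struct beam_collector *) callback_data;

        const size_t n = beams_state.common_prefix_length;
        if (n > 0 && out->n + n <= sizeof(out->tokens)/sizeof(out->tokens[0])) {
            // beams[0] is as good as any: the prefix is shared by all beams
            // and is removed from every beam before the next callback.
            memcpy(out->tokens + out->n, beams_state.beam_views[0].tokens,
                   n * sizeof(llama_token));
            out->n += n;
        }
        // A caller could also set beam_views[i].eob here once a beam meets
        // its own stopping criterion.
    }

    // Usage, assuming `ctx` already holds n_past evaluated tokens:
    //   struct beam_collector out = {0};
    //   llama_beam_search(ctx, on_beams, &out, /*n_beams=*/4, n_past, /*n_predict=*/128);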

     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //  Returns the split_path length.
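Following the doc comment's own example, a short sketch; the parameter list (destination buffer, buffer size, path prefix, split index, split count) is inferred from that example, since this hunk elides the declaration itself:

    // Sketch: build the path of shard 2 of 4 for a split GGUF model.
    char split_path[512];
    int n = llama_split_path(split_path, sizeof(split_path),
                             "/models/ggml-model-q4_0", 2, 4);
    // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf";
    // n is the length of split_path.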
