
Commit f87e6ac
Merge branch 'feature/llama_supports_rpc' of github.com:martindevans/llama.cpp into feature/llama_supports_rpc
martindevans committed Jun 9, 2024
2 parents b67c0df + 9b15621 commit f87e6ac
Showing 1 changed file with 46 additions and 23 deletions.
llama.h
@@ -97,7 +97,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_GLM = 4,
     };
 
-    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
+    enum llama_token_type {
         LLAMA_TOKEN_TYPE_UNDEFINED = 0,
         LLAMA_TOKEN_TYPE_NORMAL = 1,
         LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -107,20 +107,6 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE = 6,
     };
 
-    enum llama_token_attr {
-        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
-        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
-        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
-        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
-        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3, // SPECIAL?
-        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
-        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
-        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
-        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
-        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
-        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
-    };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
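The removed llama_token_attr values are bit flags, one attribute per bit, so a single token may carry several at once and callers test them with bitwise AND. A minimal sketch of how the bitmask is consumed, assuming the llama_token_get_attr accessor from the same API revision (it appears on the removed side of a later hunk), with `model` and `tok` as a loaded model and a valid token:

    // Sketch: testing bit-flag token attributes (assumes `model` is a
    // loaded llama_model * and `tok` is a valid llama_token).
    enum llama_token_attr attr = llama_token_get_attr(model, tok);

    if (attr & LLAMA_TOKEN_ATTR_CONTROL) {
        // control/special token, e.g. BOS or EOS
    }
    if (attr & (LLAMA_TOKEN_ATTR_LSTRIP | LLAMA_TOKEN_ATTR_RSTRIP)) {
        // tokenizer strips adjacent whitespace on one or both sides
    }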
@@ -365,9 +351,6 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT = 6,
-
-        // any character (.)
-        LLAMA_GRETYPE_CHAR_ANY = 7,
     };
 
     typedef struct llama_grammar_element {
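For context, a grammar rule is passed to llama.cpp as a flat array of these elements. A sketch encoding the character class [a-zA] from the comment above, assuming the two llama_grammar_element fields (`type`, and `value` holding a Unicode code point for CHAR-family elements) that this hunk truncates:

    // Sketch: the character class [a-zA] as a flat element sequence.
    static const llama_grammar_element char_class[] = {
        { LLAMA_GRETYPE_CHAR,           'a' }, // match 'a' ...
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z' }, // ... through 'z'
        { LLAMA_GRETYPE_CHAR_ALT,       'A' }, // or the single character 'A'
        { LLAMA_GRETYPE_END,            0   }, // end of rule definition
    };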
@@ -430,7 +413,7 @@ extern "C" {
 
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_rpc        (void);
+    LLAMA_API bool llama_supports_rpc        (void); // TMP: https://github.com/ggerganov/llama.cpp/pull/7647#issuecomment-2140234367
     LLAMA_API bool llama_supports_gpu_offload(void);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
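These probes are simple build-capability flags, so an application can report or branch on them before loading a model. A minimal sketch using only the functions declared above:

    #include <stdio.h>
    #include "llama.h"

    // Sketch: report optional runtime capabilities at startup.
    static void print_llama_capabilities(void) {
        printf("mmap:        %s\n", llama_supports_mmap()        ? "yes" : "no");
        printf("mlock:       %s\n", llama_supports_mlock()       ? "yes" : "no");
        printf("rpc:         %s\n", llama_supports_rpc()         ? "yes" : "no");
        printf("gpu offload: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    }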
@@ -442,8 +425,8 @@ extern "C" {
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type llama_vocab_type    (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type     (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -839,7 +822,7 @@ extern "C" {
 
     LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
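A hedged sketch of the intended use: stop a sampling loop on any end-of-generation token rather than comparing against EOS alone. `sample_next` is a hypothetical application-side sampler, and `ctx`, `model`, and `n_predict` are assumed to be set up by the application:

    // Sketch: generation loop that stops on any end-of-generation token.
    for (int i = 0; i < n_predict; ++i) {
        llama_token tok = sample_next(ctx); // hypothetical sampler
        if (llama_token_is_eog(model, tok)) {
            break; // EOS, EOT, or any other end-of-generation token
        }
        // ... emit tok and decode the next batch ...
    }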
@@ -1060,9 +1043,49 @@ extern "C" {
                         llama_token   token);
 
     //
-    // Model split
+    // Beam search
     //
 
+    struct llama_beam_view {
+        const llama_token * tokens;
+
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    LLAMA_API void llama_beam_search(
+                   struct llama_context * ctx,
+         llama_beam_search_callback_fn_t callback,
+                                  void * callback_data,
+                                  size_t n_beams,
+                                 int32_t n_past,
+                                 int32_t n_predict);
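Tying the pieces together, a minimal sketch of a callback that follows the rules documented above: copy the common prefix out while the pointers are still valid, since those tokens are shifted out of every beam before the next call. The collector struct is illustrative, not part of the API:

    #include <string.h>

    // Illustrative sink for tokens that have become common to all beams.
    struct beam_collector {
        llama_token tokens[1024];
        size_t      n;
    };

    // Matches llama_beam_search_callback_fn_t. beams_state is only valid
    // during this synchronous call, so the common prefix is copied out now.
    static void on_beams(void * callback_data, struct llama_beams_state beams_state) {
        struct beam_collector * out = (struct beam_collector *) callback_data;

        const size_t n = beams_state.common_prefix_length;
        if (n > 0 && out->n + n <= sizeof(out->tokens)/sizeof(out->tokens[0])) {
            // beams[0] is as good as any: the prefix is shared by all beams
            // and is removed from every beam before the next callback.
            memcpy(out->tokens + out->n, beams_state.beam_views[0].tokens,
                   n * sizeof(llama_token));
            out->n += n;
        }
        // A caller could also set beam_views[i].eob here once a beam meets
        // its own stopping criterion.
    }

    // Usage, assuming `ctx` already holds n_past evaluated tokens:
    //   struct beam_collector out = {0};
    //   llama_beam_search(ctx, on_beams, &out, /*n_beams=*/4, n_past, /*n_predict=*/128);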

     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //  Returns the split_path length.
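Following the doc comment's own example, a short sketch; the parameter list (destination buffer, buffer size, path prefix, split index, split count) is inferred from that example, since this hunk elides the declaration itself:

    // Sketch: build the path of shard 2 of 4 for a split GGUF model.
    char split_path[512];
    int n = llama_split_path(split_path, sizeof(split_path),
                             "/models/ggml-model-q4_0", 2, 4);
    // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf";
    // n is the length of split_path.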
