llama : streamline embeddings from "non-embedding" models (ggerganov#…
iamlemec authored and Nexesenex committed Jul 8, 2024
1 parent dfab2f7 commit 2e0a8dc
Showing 4 changed files with 36 additions and 10 deletions.
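The commit adds an `--attention {causal,non-causal}` flag alongside the existing `--pooling` option and threads it through `gpt_params` into `llama_context_params`, so embeddings can be taken from models whose default attention is causal. A minimal command-line sketch (the `llama-embedding` binary name and the model path are illustrative assumptions, not part of this commit):

# request mean pooling and a non-causal attention mask when embedding with a decoder-style model
llama-embedding -m model.gguf --pooling mean --attention non-causal -p "some text to embed"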
common/common.cpp: 13 changes (12 additions, 1 deletion)
@@ -473,6 +473,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--attention") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+        else { invalid_param = true; }
+        return true;
+    }
     if (arg == "--defrag-thold" || arg == "-dt") {
         CHECK_ARG
         params.defrag_thold = std::stof(argv[i]);
@@ -1493,8 +1501,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
 
     options.push_back({ "embedding" });
-    options.push_back({ "embedding", " --pooling {none,mean,cls}",
+    options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
         "pooling type for embeddings, use model default if unspecified" });
+    options.push_back({ "embedding", " --attention {causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified" });
 
     options.push_back({ "context hacking" });
     options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@@ -2200,6 +2210,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
+    cparams.attention_type = params.attention_type;
     cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
common/common.h: 1 change (1 addition, 0 deletions)
@@ -95,6 +95,7 @@ struct gpt_params {
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
     // sampling parameters
     int32_t top_k = 40; // <= 0 to use vocab size
llama.cpp: 25 changes (16 additions, 9 deletions)
@@ -13998,7 +13998,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_mean);
@@ -14030,7 +14030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_cls);
@@ -14051,7 +14051,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_cls);
@@ -14339,14 +14339,15 @@ static int llama_decode_internal(
     std::vector<llama_seq_id *> seq_id_arr;
     std::vector<std::vector<llama_seq_id>> seq_id;
 
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
     // count outputs
-    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
-        n_outputs = n_tokens_all;
-    } else if (batch_all.logits) {
+    if (batch_all.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all) {
+    } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -14392,7 +14393,7 @@ static int llama_decode_internal(
         {
             int32_t n_outputs_new = 0;
 
-            if (u_batch.logits) {
+            if (u_batch.logits && !embd_pooled) {
                 for (uint32_t i = 0; i < n_tokens; i++) {
                     n_outputs_new += u_batch.logits[i] != 0;
                 }
@@ -21188,6 +21189,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+        /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
@@ -21428,7 +21430,6 @@ struct llama_context * llama_new_context_with_model(
     }
 
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
-    cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
         if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -21438,6 +21439,12 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
llama.h: 7 changes (7 additions, 0 deletions)
@@ -244,6 +244,12 @@ extern "C" {
         LLAMA_POOLING_TYPE_LAST = 3,
     };
 
+    enum llama_attention_type {
+        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+        LLAMA_ATTENTION_TYPE_CAUSAL = 0,
+        LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -361,6 +367,7 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
+        enum llama_attention_type attention_type; // attention type to use for embeddings
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
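For callers of the C API, the new `attention_type` field sits in `llama_context_params` next to `pooling_type`; when it is left at `LLAMA_ATTENTION_TYPE_UNSPECIFIED`, `cparams.causal_attn` still falls back to the model's own `hparams.causal_attn`. A minimal sketch of creating an embedding context with non-causal attention (the helper function name and the assumption that the model was loaded elsewhere are illustrative, not part of this commit):

#include "llama.h"

// Sketch: build a context that produces pooled, non-causal embeddings.
// 'model' is assumed to have been loaded via llama_load_model_from_file().
static struct llama_context * make_embedding_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();

    cparams.embeddings     = true;                            // return embeddings instead of logits
    cparams.pooling_type   = LLAMA_POOLING_TYPE_MEAN;         // pool token embeddings per sequence
    cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; // override the model's default causal mask

    // Leaving attention_type as LLAMA_ATTENTION_TYPE_UNSPECIFIED would keep the
    // model's own hparams.causal_attn, matching the previous behavior.
    return llama_new_context_with_model(model, cparams);
}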
