
Commit 6b90566

control vector api and implementation
1 parent 8030da7 commit 6b90566

4 files changed, +364 -0 lines changed


common/common.cpp

Lines changed: 217 additions & 0 deletions
@@ -562,6 +562,35 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back(std::make_tuple(argv[i], 1.0f));
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * control_vector = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back(std::make_tuple(control_vector, std::stof(argv[i])));
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int32_t start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int32_t end = std::stoi(argv[i]);
+            params.control_vector_layer_range = std::make_tuple(start, end);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
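
The three flags above fill params.control_vectors with {path, strength} tuples (plain --control-vector implies a strength of 1.0f) and set params.control_vector_layer_range. Below is a minimal standalone sketch of the state that, say, --control-vector happy.gguf --control-vector-scaled calm.gguf 0.8 --control-vector-layer-range 10 20 would leave behind; the file names are placeholders and the snippet is illustrative, not part of the commit.

#include <cstdint>
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

int main() {
    // mirrors gpt_params::control_vectors after parsing the flags above
    std::vector<std::tuple<std::string, float>> control_vectors = {
        std::make_tuple("happy.gguf", 1.0f),  // --control-vector happy.gguf
        std::make_tuple("calm.gguf",  0.8f),  // --control-vector-scaled calm.gguf 0.8
    };
    // mirrors gpt_params::control_vector_layer_range ("start and end inclusive")
    std::tuple<int32_t, int32_t> control_vector_layer_range = std::make_tuple(10, 20);

    for (const auto & cv : control_vectors) {
        printf("%s scaled by %.2f\n", std::get<0>(cv).c_str(), (double) std::get<1>(cv));
    }
    printf("applied to layers %d..%d\n",
           (int) std::get<0>(control_vector_layer_range),
           (int) std::get<1>(control_vector_layer_range));
    return 0;
}
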
@@ -1087,6 +1116,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf(" --control-vector FNAME\n");
+    printf(" add a control vector\n");
+    printf(" --control-vector-scaled FNAME S\n");
+    printf(" add a control vector with user defined scaling S\n");
+    printf(" --control-vector-layer-range START END\n");
+    printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
     printf(" -m FNAME, --model FNAME\n");
     printf(" model path (default: %s)\n", params.model.c_str());
     printf(" -md FNAME, --model-draft FNAME\n");
@@ -1351,6 +1386,35 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }

+    if (!params.control_vectors.empty()) {
+        int32_t layer_start, layer_end;
+        std::tie(layer_start, layer_end) = params.control_vector_layer_range;
+
+        if (layer_start == 0) layer_start = 1;
+        if (layer_end == 0) layer_end = 31;
+
+        std::vector<float> control_vector;
+        int n_embd;
+        std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors);
+        if (n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                control_vector.data(),
+                control_vector.size(),
+                n_embd,
+                layer_start,
+                layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
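
Note the fallback when --control-vector-layer-range is not given: a start of 0 becomes layer 1 and an end of 0 becomes layer 31, which assumes a 32-block model; other depths need the range set explicitly. The buffer handed to llama_control_vector_apply is the flat concatenation built by llama_control_vector_load below, so layer il's block presumably sits at offset (il - 1) * n_embd. A small sketch of that assumed layout follows; how llama_control_vector_apply indexes it inside llama.cpp is not shown in this commit.

#include <cstddef>
#include <vector>

// Assumed layout of the buffer returned by llama_control_vector_load:
// n_embd floats for direction.1, then direction.2, ..., zero-filled where a layer is missing.
static inline float control_vector_at(const std::vector<float> & data, int n_embd, int il, int i) {
    // il is 1-based: the loader rejects "direction.0" and starts concatenating at layer 1
    return data[(size_t) (il - 1) * n_embd + (size_t) i];
}
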
@@ -1867,3 +1931,156 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }

+//
+// Control vector utils
+//
+
+static std::tuple<std::vector<float>, int> llama_control_vector_load_one(const std::string & path, float strength) {
+    int n_tensors;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+    int n_embd = -1;
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+            ggml_free(meta_ctx);
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return std::make_tuple(std::vector<float>(), -1);
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return std::make_tuple(std::vector<float>(), -1);
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return std::make_tuple(std::vector<float>(), -1);
+            }
+            if (n_embd == -1) {
+                n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return std::make_tuple(std::vector<float>(), -1);
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str());
+        return std::make_tuple(std::vector<float>(), -1);
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str());
+        ggml_free(ctx);
+        return std::make_tuple(std::vector<float>(), -1);
+    }
+
+    std::vector<float> vector;
+    for (uint32_t i = 1; i < max_direction_layer; i++) {
+        std::string name = "direction." + std::to_string(i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            const float * data = (const float *) tensor->data;
+            for (int i = 0; i < n_embd; i++) {
+                vector.push_back(data[i] * strength);
+            }
+        } else {
+            vector.insert(vector.end(), n_embd, 0.); // as a filler
+        }
+    }
+
+    return std::make_tuple(vector, n_embd);
+}
+
+std::tuple<std::vector<float>, int> llama_control_vector_load(const std::vector<std::tuple<std::string, float>> & vectors) {
+    std::vector<float> vector;
+    int n_embd = -1;
+
+    for (const auto& pair : vectors) {
+        std::string path;
+        float strength;
+        std::tie(path, strength) = pair;
+
+        std::vector<float> v;
+        int v_n_embd;
+        std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength);
+
+        if (v_n_embd == -1) {
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+        if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str());
+            return std::make_tuple(std::vector<float>(), -1);
+        }
+
+        if (n_embd == -1) {
+            vector = std::move(v);
+            n_embd = v_n_embd;
+        } else {
+            for (size_t i = 0; i < vector.size(); i++) {
+                vector[i] += v[i];
+            }
+        }
+    }
+
+    if (n_embd == -1) {
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
+    }
+    return std::make_tuple(vector, n_embd);
+}
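
llama_control_vector_load sums the per-file vectors element-wise after llama_control_vector_load_one has already scaled each by its strength, so loading {a.gguf, 0.5} and {b.gguf, -0.3} yields 0.5*a - 0.3*b per layer. Below is a standalone toy of just that combining step, with the GGUF loading replaced by in-memory data; it is illustrative only, not part of the commit.

#include <cstdio>
#include <vector>

// Mirrors the combining logic of llama_control_vector_load: each input is already
// scaled by its strength; matching-length vectors are added element-wise.
static std::vector<float> combine(const std::vector<std::vector<float>> & scaled) {
    std::vector<float> out;
    for (const auto & v : scaled) {
        if (out.empty()) {
            out = v;                 // first vector initializes the result
        } else if (v.size() == out.size()) {
            for (size_t i = 0; i < out.size(); i++) {
                out[i] += v[i];      // element-wise sum, as in the loop above
            }
        }                            // mismatched sizes are reported as an error in the real code
    }
    return out;
}

int main() {
    // two fake "control vectors", n_embd = 4, one layer each, pre-scaled by 0.5 and -0.3
    std::vector<float> a = { 0.5f,  1.0f,  1.5f,  2.0f};
    std::vector<float> b = {-0.3f, -0.3f, -0.3f, -0.3f};
    std::vector<float> sum = combine({a, b});
    for (float x : sum) printf("%f\n", x);
    return 0;
}
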

common/common.h

Lines changed: 12 additions & 0 deletions
@@ -102,6 +102,9 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter

+    std::vector<std::tuple<std::string, float>> control_vectors; // control vector with user defined scale
+    std::tuple<int32_t, int32_t> control_vector_layer_range; // layer range for control vector
+
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                              // (which is more convenient to use for plotting)
@@ -267,3 +270,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40

 void llama_embd_normalize(const float * inp, float * out, int n);

+//
+// Control vector utils
+//
+
+// Load control vectors from a tuple of {path, strength}, scale each by strength, and add them together.
+// Returns a tuple of {concatenated vector data (n_embd x n_layer), n_embd}
+// On error, returns a tuple of {empty, -1}
+std::tuple<std::vector<float>, int> llama_control_vector_load(
+    const std::vector<std::tuple<std::string, float>> & vectors);
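
A hedged usage sketch of the helper as declared here, for a program that already builds against common; the .gguf paths are placeholders.

#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

#include "common.h"

int main() {
    // {path, strength} pairs, as filled in by --control-vector / --control-vector-scaled
    std::vector<std::tuple<std::string, float>> inputs = {
        std::make_tuple("happy.gguf",  1.0f),  // placeholder file
        std::make_tuple("calm.gguf",  -0.4f),  // placeholder file; negative strength subtracts the direction
    };

    std::vector<float> data;
    int n_embd;
    std::tie(data, n_embd) = llama_control_vector_load(inputs);
    if (n_embd == -1) {
        fprintf(stderr, "failed to load control vectors\n");
        return 1;
    }
    // data is the concatenated per-layer buffer that would be passed to llama_control_vector_apply
    printf("loaded %zu floats, n_embd = %d\n", data.size(), n_embd);
    return 0;
}
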
