Skip to content

WIP: Complete removal of f16_kv, add offload_kqv field #1019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,6 @@ def __init__(
yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0,
mul_mat_q: bool = True,
f16_kv: bool = True,
logits_all: bool = False,
embedding: bool = False,
# Sampling Params
Expand Down Expand Up @@ -817,7 +816,6 @@ def __init__(
yarn_beta_fast: YaRN low correction dim
yarn_beta_slow: YaRN high correction dim
yarn_orig_ctx: YaRN original context size
f16_kv: Use fp16 for KV cache, fp32 otherwise
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
Expand Down Expand Up @@ -904,7 +902,6 @@ def __init__(
)
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
# self.context_params.f16_kv = f16_kv
self.context_params.logits_all = logits_all
self.context_params.embedding = embedding

Expand Down Expand Up @@ -2155,7 +2152,6 @@ def __getstate__(self):
yarn_beta_slow=self.context_params.yarn_beta_slow,
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
mul_mat_q=self.context_params.mul_mat_q,
f16_kv=self.context_params.f16_kv,
logits_all=self.context_params.logits_all,
embedding=self.context_params.embedding,
# Sampling Params
Expand Down Expand Up @@ -2198,7 +2194,6 @@ def __setstate__(self, state):
yarn_beta_slow=state["yarn_beta_slow"],
yarn_orig_ctx=state["yarn_orig_ctx"],
mul_mat_q=state["mul_mat_q"],
f16_kv=state["f16_kv"],
logits_all=state["logits_all"],
embedding=state["embedding"],
# Sampling Params
Expand Down
6 changes: 3 additions & 3 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,9 +432,9 @@ class llama_context_params(Structure):
type_k (int): data type for K cache
type_v (int): data type for V cache
mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
f16_kv (bool): use fp16 for KV cache, fp32 otherwise
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embedding (bool): embedding mode only"""
embedding (bool): embedding mode only
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
_fields_ = [
("seed", c_uint32),
("n_ctx", c_uint32),
Expand All @@ -452,9 +452,9 @@ class llama_context_params(Structure):
("type_k", c_int),
("type_v", c_int),
("mul_mat_q", c_bool),
("f16_kv", c_bool),
("logits_all", c_bool),
("embedding", c_bool),
("offload_kqv", c_bool),
]


Expand Down
2 changes: 0 additions & 2 deletions llama_cpp/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ class Settings(BaseSettings):
mul_mat_q: bool = Field(
default=True, description="if true, use experimental mul_mat_q kernels"
)
f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
logits_all: bool = Field(default=True, description="Whether to return logits.")
embedding: bool = Field(default=True, description="Whether to use embeddings.")
# Sampling Params
Expand Down Expand Up @@ -408,7 +407,6 @@ def create_app(settings: Optional[Settings] = None):
yarn_beta_slow=settings.yarn_beta_slow,
yarn_orig_ctx=settings.yarn_orig_ctx,
mul_mat_q=settings.mul_mat_q,
f16_kv=settings.f16_kv,
logits_all=settings.logits_all,
embedding=settings.embedding,
# Sampling Params
Expand Down