diff --git a/ggml.c b/ggml.c
index 49f3b7aba31f5..17f0ce4877592 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16602,6 +16602,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
+    if (tensor->op == GGML_OP_NONE) {
+        return;
+    }
+
 #ifdef GGML_USE_CUBLAS
     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
     if (skip_cpu) {
diff --git a/llama.cpp b/llama.cpp
index 3653493355234..76558059c4194 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5815,6 +5815,33 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+#if 1
+    for (int i = 0; i < result->n_nodes; ++i) {
+        struct ggml_tensor * node = result->nodes[i];
+        if (getenv("SKIP_KQ_ALL")) {
+            if (
+                strcmp(node->name, "KQ")          == 0 ||
+                strcmp(node->name, "KQ_scaled")   == 0 ||
+                strcmp(node->name, "KQ_masked")   == 0 ||
+                strcmp(node->name, "KQ_soft_max") == 0 ||
+                strcmp(node->name, "KQV")         == 0 ||
+                false) {
+                //printf("skipping %s\n", node->name);
+                node->op = GGML_OP_NONE;
+            }
+        }
+        if (getenv("SKIP_KQ_KQV")) {
+            if (
+                strcmp(node->name, "KQ")  == 0 ||
+                strcmp(node->name, "KQV") == 0 ||
+                false) {
+                //printf("skipping %s\n", node->name);
+                node->op = GGML_OP_NONE;
+            }
+        }
+    }
+#endif
+
     return result;
 }
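
What the patch does: the ggml.c hunk makes ggml_compute_forward() a no-op for any node whose op is GGML_OP_NONE, and the llama.cpp hunk uses that to null out the attention-related nodes (KQ, KQ_scaled, KQ_masked, KQ_soft_max, KQV) by name when the SKIP_KQ_ALL or SKIP_KQ_KQV environment variable is set. The generated output is garbage in that case; the point is to compare timings with and without these ops. Below is a minimal, self-contained sketch of the same pattern; the node struct, op enum, and compute_forward() helper here are stand-ins for illustration, not the real ggml types or API.

    /*
     * Sketch of the env-var-gated node-skipping technique from the
     * patch above. All types and names are simplified stand-ins.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    enum op { OP_NONE, OP_MUL_MAT, OP_SOFT_MAX };

    struct node {
        const char * name;
        enum op      op;
    };

    /* mirrors the ggml.c hunk: a nulled node is skipped entirely */
    static void compute_forward(struct node * n) {
        if (n->op == OP_NONE) {
            return;
        }
        printf("computing %s\n", n->name);
    }

    int main(void) {
        struct node graph[] = {
            { "KQ",          OP_MUL_MAT  },
            { "KQ_soft_max", OP_SOFT_MAX },
            { "KQV",         OP_MUL_MAT  },
            { "result",      OP_MUL_MAT  },
        };
        const int n_nodes = (int) (sizeof(graph)/sizeof(graph[0]));

        /* mirrors the llama.cpp hunk: after building the graph, null
         * out selected nodes by name when the env var is set */
        if (getenv("SKIP_KQ_KQV")) {
            for (int i = 0; i < n_nodes; ++i) {
                if (strcmp(graph[i].name, "KQ")  == 0 ||
                    strcmp(graph[i].name, "KQV") == 0) {
                    graph[i].op = OP_NONE;
                }
            }
        }

        for (int i = 0; i < n_nodes; ++i) {
            compute_forward(&graph[i]);
        }

        return 0;
    }

With the patch applied, the intended use is presumably along the lines of running the usual binary twice, e.g. once normally and once with SKIP_KQ_ALL=1 (or SKIP_KQ_KQV=1) in the environment, and comparing the reported eval times to estimate how much of the runtime the skipped attention ops account for.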