Commit e2f222c (parent cd5e3b5)

llama : add option to skip the compute of a batch

4 files changed: 22 additions, 0 deletions

include/llama.h (2 additions, 0 deletions)

@@ -907,6 +907,8 @@ extern "C" {
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

+    LLAMA_API void llama_set_skip_compute(struct llama_context * ctx, bool val);
+
     // Wait until all computations are finished
     // This is automatically done when using one of the functions below to obtain the computation results
     // and is not necessary to call it explicitly in most cases

src/llama-context.cpp (14 additions, 0 deletions)

@@ -691,6 +691,12 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
     }
 }

+void llama_context::set_skip_compute(bool val) {
+    LLAMA_LOG_DEBUG("%s: val = %d\n", __func__, val);
+
+    skip_compute = val;
+}
+
 void llama_context::set_embeddings(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);


@@ -799,6 +805,10 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
         //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
     }

+    if (skip_compute) {
+        return res;
+    }
+
     const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
     if (status != GGML_STATUS_SUCCESS) {
         LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);

@@ -2447,6 +2457,10 @@ void llama_set_abort_callback(llama_context * ctx, bool (*abort_callback)(void *
     ctx->set_abort_callback(abort_callback, abort_callback_data);
 }

+void llama_set_skip_compute(llama_context * ctx, bool val) {
+    ctx->set_skip_compute(val);
+}
+
 void llama_set_embeddings(llama_context * ctx, bool embeddings) {
     ctx->set_embeddings(embeddings);
 }

src/llama-context.h (4 additions, 0 deletions)

@@ -76,6 +76,8 @@ struct llama_context {

     void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);

+    void set_skip_compute(bool val);
+
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);

@@ -279,6 +281,8 @@ struct llama_context {
     ggml_abort_callback abort_callback = nullptr;
     void * abort_callback_data = nullptr;

+    bool skip_compute = false; // skip the actual computation of the model (useful for benchmarking)
+
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

     // buffer types used for the compute buffer of each backend

tools/llama-bench/llama-bench.cpp (2 additions, 0 deletions)

@@ -2138,11 +2138,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                         i + 1, params.reps);
             }
+            llama_set_skip_compute(ctx, true);
             bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
             if (!res) {
                 fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                 exit(1);
             }
+            llama_set_skip_compute(ctx, false);
         }

         uint64_t t_start = get_time_ns();