context : fix init of n_outputs (#12397)

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-03-16 19:29:36 +02:00 committed by GitHub
parent 7b61bcc87c
commit dc079cfdff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -285,11 +285,15 @@ llama_context::llama_context(
// reserve worst-case graph
if (!hparams.vocab_only) {
-            uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-            uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+            const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+            const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
// restore later
// TODO: something cleaner
const auto n_outputs_save = n_outputs;
// max number of outputs
n_outputs = n_tokens;
@ -341,6 +345,8 @@ llama_context::llama_context(
}
}
+        n_outputs = n_outputs_save;
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];