diff --git a/VULKAN_BUILD_VERIFICATION.md b/VULKAN_BUILD_VERIFICATION.md new file mode 100644 index 000000000000..014d47453b14 --- /dev/null +++ b/VULKAN_BUILD_VERIFICATION.md @@ -0,0 +1,117 @@ +# Vulkan Build Verification for PR #79 + +Verifies that PR #79 (`dflash: enable Qwen3-Coder-Next on Vulkan`, commit `b788b4af1`) builds cleanly for the Vulkan backend on two independent host environments. + +--- + +## Verification 1 — AMD Strix Halo (Debian 13 / Mesa RADV) + +**Commit verified:** `b788b4af1` — `dflash: enable Qwen3-Coder-Next on Vulkan` + +### Build System + +| Component | Detail | +|---|---| +| **OS** | Debian 13 (trixie), Linux 6.12.90-amd64 | +| **CPU** | AMD Ryzen AI MAX+ 395 w/ Radeon 8060S (16C/32T, up to 5185 MHz) | +| **GPU** | AMD Strix Halo Radeon 8060S (RADV GFX1151, Mesa 25.0.7) | +| **RAM** | 30 GiB | +| **CMake** | 3.31.6 | +| **Compiler** | GCC 14.2.0 (`-march=native`) | +| **Make** | GNU Make 4.4.1, 32 parallel jobs | +| **Vulkan SDK** | 1.4.309, glslc (shaderc 2025.2, glslang 15.1.0) | +| **ccache** | Enabled (found automatically) | + +## CMake Configuration + +``` +cmake -B build_vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release +``` + +Key flags: +- `GGML_VULKAN=ON` — Vulkan backend enabled +- `GGML_NATIVE=ON` — CPU backend compiled with `-march=native` +- `GGML_CPU=ON` — CPU backend included (cooperative with Vulkan) +- `CMAKE_BUILD_TYPE=Release` + +Vulkan shader features detected: +- `GL_KHR_cooperative_matrix` — supported +- `GL_NV_cooperative_matrix2` — supported +- `GL_EXT_integer_dot_product` — supported +- `GL_EXT_bfloat16` — not supported (driver/hardware limitation) + +## Build Result + +- **Configure:** Success (rc=0, 1.8s) +- **Build:** Success (100%, 0 warnings treated as errors) +- **Binaries produced:** `llama-server`, `llama-cli`, `llama-bench`, `llama`, `llama-perplexity`, `llama-imatrix`, `llama-quantize`, `llama-llama-bench`, all test binaries +- **Working tree:** Clean, nothing to commit + +### Conclusion + +PR #79 builds cleanly for Vulkan on native AMD hardware (Strix Halo / Radeon 8060S) with Mesa RADV driver. No compilation errors or warnings. + +--- + +## Verification 2 — NVIDIA RTX 3090 host (Debian 11 / NVIDIA Vulkan ICD) + +**Commit verified:** `b788b4af1` — `dflash: enable Qwen3-Coder-Next on Vulkan` (on the updated PR head `4efe54303`) + +### Build System + +| Component | Detail | +|---|---| +| **OS** | Debian GNU/Linux 11 (bullseye), Linux 5.10.0-43-amd64 | +| **CPU** | AMD Ryzen 9 5950X 16-Core (32 threads, `-march=native`) | +| **GPU** | NVIDIA GeForce RTX 3090 (GA102) — host for the build | +| **RAM** | 62 GiB | +| **CMake** | 3.31.11 | +| **Compiler** | GCC 10.2.1 (`g++ (Debian 10.2.1-6)`) | +| **Make/Ninja** | Ninja 1.10.1, 32 parallel jobs | +| **glslc** | shaderc v2023.2 (built from source, installed to `~/.local/bin/glslc`; not packaged in Debian 11) | +| **Vulkan headers** | 1.4.309 (KhronosGroup `Vulkan-Headers` v1.4.309, header-only, installed to `~/.local`; Debian 11 ships only 1.2.162 which is too old for current `ggml-vulkan.cpp`) | +| **Vulkan loader** | libvulkan 1.2.162 (system `libvulkan-dev`) — sufficient to link; newer symbols are resolved at runtime via the loader/ICD | + +### Setup notes for Debian 11 + +Debian 11 does not ship `glslc`/`shaderc` and its Vulkan headers (1.2.162) predate symbols the current Vulkan backend requires (`vk::PhysicalDeviceMaintenance4Properties`, `vk::DriverId::eMesaTurnip`/`eMesaDozen`, `layer_setting_info`). Two host-side additions were needed, neither touching the PR source: + +1. Build and install `glslc` from Shaderc `v2023.2` source → `~/.local/bin/glslc`. +2. Install header-only `Vulkan-Headers` v1.4.309 → `~/.local/include/vulkan`. + +Then configure with the local prefix so CMake's `FindVulkan` picks up the new headers: + +```bash +export CMAKE_PREFIX_PATH="$HOME/.local:$CMAKE_PREFIX_PATH" +cmake -B build_vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release \ + -DVulkan_INCLUDE_DIR="$HOME/.local/include" +cmake --build build_vulkan -j32 +``` + +### CMake Configuration + +``` +-- Found Vulkan: /usr/lib/x86_64-linux-gnu/libvulkan.so (found version "1.4.309") found components: glslc missing components: glslangValidator +-- Vulkan found +-- GL_KHR_cooperative_matrix not supported by glslc +-- GL_NV_cooperative_matrix2 not supported by glslc +-- GL_EXT_integer_dot_product not supported by glslc +-- GL_EXT_bfloat16 not supported by glslc +-- Including Vulkan backend +``` + +(Shader feature probes are `not supported` due to the older `glslc` v2023.2 build; this only disables optional cooperative-matrix fast paths and does not affect compilation.) + +### Build Result + +- **Configure:** Success (rc=0) +- **ggml-vulkan target:** Success (rc=0, 100%) +- **Full build:** Success (rc=0, 100%) +- **Binaries produced:** `llama-server`, `llama-cli`, `llama-bench`, `llama-perplexity`, `test-dflash-plumbing` (plus shared libs `libggml-vulkan.so`, `libllama-server-impl.so`, etc.) +- **DFlash plumbing test:** `test-dflash-plumbing` → rc=0 +- **Errors:** 0 (zero `error:` lines in the full build log) +- **Warnings:** only benign — 35× `-Wdouble-promotion`, 1× `-Wmissing-field-initializers` (no `-Werror`) + +### Conclusion + +PR #79 builds cleanly for Vulkan on a Debian 11 / NVIDIA RTX 3090 host once `glslc` and current Vulkan headers are supplied locally (no PR source changes required). Combined with Verification 1, the Vulkan backend compiles end-to-end on both AMD/Mesa RADV (Debian 13) and NVIDIA (Debian 11) hosts. diff --git a/common/speculative.cpp b/common/speculative.cpp index 74850df208fe..c0493ddc365a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -2574,6 +2574,12 @@ struct common_speculative_impl_dflash : public common_speculative_impl { llama_set_dflash_gpu_capture(ctx_tgt, false); LOG_WRN("dflash: GPU cross ring unavailable; using CPU hidden capture\n"); } + + // Full-attention DFlash layers may read target context from the drafter's + // normal KV cache only when the GPU cross ring can populate it. Vulkan + // CPU hidden capture has no such cache and must project K/V freshly from + // target_hidden in the drafter graph. + llama_set_dflash_target_kv_available(ctx_dft, gpu_ring_handle != nullptr); } ~common_speculative_impl_dflash() override { @@ -3005,13 +3011,16 @@ struct common_speculative_impl_dflash : public common_speculative_impl { // build drafter batch: [id_last, mask, mask, ..., mask] // positions stay on the target model's absolute timeline so RoPE // tracks the accepted suffix instead of restarting at the window. - // batch size adapts to n_draft+1 (saves compute when n_max < block_size-1) - const int batch_len = n_draft + 1; + // DFlash attention is non-causal over the query block, so shorter + // n_draft+1 batches are not semantically equivalent to the trained + // full block. Decode the full block but consume only output_len rows. + const int batch_len = block_size; + const int output_len = n_draft + 1; const int draft_pos_base = committed_len; common_batch_clear(batch_dft); common_batch_add(batch_dft, id_last, draft_pos_base, { seq_id }, true); for (int i = 1; i < batch_len; ++i) { - common_batch_add(batch_dft, mask_token_id, draft_pos_base + i, { seq_id }, true); + common_batch_add(batch_dft, mask_token_id, draft_pos_base + i, { seq_id }, i < output_len); } const int64_t t2 = ggml_time_us(); @@ -3025,7 +3034,7 @@ struct common_speculative_impl_dflash : public common_speculative_impl { const int64_t t3 = ggml_time_us(); - // read argmax tokens for positions 1..batch_len-1 (skip position 0 = staged_first) + // read argmax tokens for output positions 1..output_len-1 (skip position 0 = staged_first) { int32_t * argmax = llama_get_logits_argmax(ctx_dft); float * argmax_probs = llama_get_logits_argmax_probs(ctx_dft); @@ -3033,7 +3042,7 @@ struct common_speculative_impl_dflash : public common_speculative_impl { const int argmax_rows = llama_get_logits_argmax_n(ctx_dft); if (argmax) { const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model_dft)); - if (!common_dflash_argmax_shape_valid(__func__, argmax_rows, batch_len, K_flat)) { + if (!common_dflash_argmax_shape_valid(__func__, argmax_rows, output_len, K_flat)) { if (dp.draft_log_probs) { dp.draft_log_probs->clear(); } @@ -3042,21 +3051,21 @@ struct common_speculative_impl_dflash : public common_speculative_impl { } // GPU argmax path - only top-k ids/probs are transferred. - for (int i = 1; i < batch_len && (int) result.size() < n_draft; ++i) { + for (int i = 1; i < output_len && (int) result.size() < n_draft; ++i) { const auto params = dp; if (argmax_probs && p_min > 0.0f && (int) result.size() >= params.n_min) { float log_prob = argmax_probs[i * K_flat]; float log_p_min = logf(p_min); if (log_prob < log_p_min) { LOG_DBG("dflash: early stop at position %d/%d (prob %.3f < p_min %.3f)\n", - i, batch_len, expf(log_prob), p_min); + i, output_len, expf(log_prob), p_min); break; } } const int32_t token_raw = argmax[i * K_flat]; if (!common_dflash_argmax_token_valid(token_raw, n_vocab)) { const float score = argmax_probs ? argmax_probs[i * K_flat] : std::numeric_limits::quiet_NaN(); - note_invalid_reduced_logits(__func__, token_raw, i, batch_len, K_flat, + note_invalid_reduced_logits(__func__, token_raw, i, output_len, K_flat, committed_len, cross_len, -1, 0, score); if (dp.draft_log_probs) { dp.draft_log_probs->clear(); @@ -3073,7 +3082,7 @@ struct common_speculative_impl_dflash : public common_speculative_impl { } else { // fallback: CPU argmax over full vocab const int n_vocab_dft = llama_vocab_n_tokens(llama_model_get_vocab(model_dft)); - for (int i = 1; i < batch_len && (int) result.size() < n_draft; ++i) { + for (int i = 1; i < output_len && (int) result.size() < n_draft; ++i) { float * logits = llama_get_logits_ith(ctx_dft, i); if (!logits) { break; @@ -4561,7 +4570,11 @@ void common_speculative_draft_batch( const llama_model * model_dft = llama_get_model(ctx_dft); const int block_size = llama_model_dflash_block_size(model_dft); const int n_draft = std::min(block_size - 1, params.n_max); - const int batch_len = n_draft + 1; + // Keep the full query block for flat/batched DFlash too. Tree drafting + // already does this; non-causal query attention makes shorter batches + // semantically different from the trained full block. + const int batch_len = block_size; + const int output_len = n_draft + 1; const llama_token mask_tok = (llama_token) llama_model_dflash_mask_token_id(model_dft); const int64_t t0 = ggml_time_us(); @@ -4641,7 +4654,7 @@ void common_speculative_draft_batch( for (const auto & rs : ready) { common_batch_add(batch, id_last_per_spec[rs.spec_idx], rs.draft_pos_base, { rs.seq_id }, true); for (int i = 1; i < batch_len; i++) { - common_batch_add(batch, mask_tok, rs.draft_pos_base + i, { rs.seq_id }, true); + common_batch_add(batch, mask_tok, rs.draft_pos_base + i, { rs.seq_id }, i < output_len); } } @@ -4664,11 +4677,11 @@ void common_speculative_draft_batch( auto & rs = ready[r]; auto & result = result_per_spec[rs.spec_idx]; std::vector * log_probs = log_probs_per_spec ? &(*log_probs_per_spec)[rs.spec_idx] : nullptr; - const int offset = r * batch_len; + const int offset = r * output_len; if (argmax) { const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model_dft)); - if (!common_dflash_argmax_shape_valid(__func__, argmax_rows, n_ready * batch_len, K_flat)) { + if (!common_dflash_argmax_shape_valid(__func__, argmax_rows, n_ready * output_len, K_flat)) { if (log_probs) { log_probs->clear(); } @@ -4676,7 +4689,7 @@ void common_speculative_draft_batch( return; } - for (int i = 1; i < batch_len && (int) result.size() < n_draft; i++) { + for (int i = 1; i < output_len && (int) result.size() < n_draft; i++) { if (argmax_probs && params.p_min > 0.0f && (int) result.size() >= params.n_min) { float log_prob = argmax_probs[(offset + i) * K_flat]; if (log_prob < logf(params.p_min)) { @@ -4687,7 +4700,7 @@ void common_speculative_draft_batch( if (!common_dflash_argmax_token_valid(token_raw, n_vocab)) { const float score = argmax_probs ? argmax_probs[(offset + i) * K_flat] : std::numeric_limits::quiet_NaN(); auto * dfl = static_cast(rs.impl); - dfl->note_invalid_reduced_logits(__func__, token_raw, i, batch_len, K_flat, + dfl->note_invalid_reduced_logits(__func__, token_raw, i, output_len, K_flat, rs.draft_pos_base, rs.cross_len, rs.spec_idx, offset, score); if (log_probs) { log_probs->clear(); @@ -4703,7 +4716,7 @@ void common_speculative_draft_batch( } } else { const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model_dft)); - for (int i = 1; i < batch_len && (int) result.size() < n_draft; i++) { + for (int i = 1; i < output_len && (int) result.size() < n_draft; i++) { float * logits = llama_get_logits_ith(ctx_dft, offset + i); if (!logits) { break; diff --git a/docs/enable-dflash-qwen3-coder-next-on-vulkan.md b/docs/enable-dflash-qwen3-coder-next-on-vulkan.md new file mode 100644 index 000000000000..6485a512a780 --- /dev/null +++ b/docs/enable-dflash-qwen3-coder-next-on-vulkan.md @@ -0,0 +1,308 @@ +# Enable DFlash Qwen3-Coder-Next on Vulkan + +This document is the canonical rationale for the Qwen3-Coder-Next DFlash Vulkan fix. It replaces the scattered investigation notes that were created while debugging the original 0% draft-token acceptance problem. + +## Summary + +Qwen3-Coder-Next DFlash originally produced near-0% accepted draft tokens on Vulkan. The failure was not caused by a generic DFlash verifier bug: Qwen3.6 DFlash worked, and Qwen3-Coder-Next could work under other implementations. The issue was a chain of Qwen3Next-specific runtime and drafter-input mismatches. + +The decisive Vulkan runtime fix is: + +- Vulkan does not have the DFlash GPU cross ring. +- Without the GPU cross ring, the drafter's normal full-attention KV cache is not populated with target-context K/V. +- The Qwen3-Coder-Next drafter was still taking the full-attention KV-cache branch, reading empty/stale context. +- The DFlash drafter graph now only builds that branch when target KV is actually available. On Vulkan it falls back to projecting K/V freshly from CPU-captured `target_hidden`. + +With the runtime fixes and a drafter trained for the live C++ position semantics, in-distribution acceptance was observed at roughly 68-72% instead of 0%. + +## User-facing command shape + +The working Vulkan launch shape is: + +```bash +build-vulkan/bin/llama-server \ + -m /crypt/models/Qwen3-Coder-Next-Q8_0-00001-of-00003.gguf \ + --spec-draft-model /crypt/tmp/zlab_finetuned_pos1_f16.gguf \ + --spec-type dflash \ + --spec-dflash-cross-ctx 512 \ + --spec-draft-n-max 4 \ + --spec-branch-budget 0 \ + -np 1 \ + --kv-unified \ + -ngl all \ + --spec-draft-ngl all \ + -b 2048 \ + --ctx-size 8192 \ + --flash-attn on \ + --jinja \ + --temp 0.0 \ + --top-k 1 +``` + +The exact drafter file matters. The runtime fix enables the correct Vulkan path, but a drafter trained against the wrong input/position semantics can still show poor acceptance. + +## Root-cause chain + +### 1. Qwen3Next conv state was not advanced during DFlash rollback + +Qwen3-Coder-Next has recurrent / convolutional state. During DFlash verification and rollback, accepted tokens must advance the target recurrent state. The tape replay path had to reconstruct both DeltaNet state and convolution state. + +The Qwen3Next graph emits the pre-conv QKV tensor as: + +```text +linear_attn_qkv_mixed- +``` + +with a legacy path named: + +```text +qkv_mixed- +``` + +The DFlash tape map did not capture these names, so `tape_replay_conv()` skipped every Qwen3Next recurrent layer. The conv state `r_l` remained frozen at the pre-draft backup state. That could produce garbled target output even when no draft tokens were accepted. + +Fix: + +```cpp +dflash_capture->tape_name_map["linear_attn_qkv_mixed-" + il_str] = {idx, DFLASH_TAPE_QKV}; +dflash_capture->tape_name_map["qkv_mixed-" + il_str] = {idx, DFLASH_TAPE_QKV}; +``` + +This made the conv replay path see the Qwen3Next pre-conv data and advance `r_l` correctly. + +### 2. Vulkan lacked a populated drafter full-attention KV cache + +DFlash keeps recent target hidden states in a cross-attention ring. On CUDA, the GPU cross ring can also populate the drafter-side full-attention KV cache with target-context K/V. + +On Vulkan, the GPU cross ring is unavailable, so DFlash uses CPU hidden capture: + +```text +gpu_ring_handle == nullptr +``` + +In that case, `update_drafter_kv_cache()` returns early and the drafter full-attention KV cache is not populated with target context. + +However, the DFlash drafter graph still built `inp_attn_kv_full` whenever the drafter had a memory context. For all-full-attention drafters, this selected the branch that reads target context from the normal KV cache: + +```cpp +if (inp_attn_kv_full && !hparams.is_swa(il)) { + // read target context from drafter KV cache +} +``` + +On Vulkan that cache was empty or stale. The drafter therefore generated from little or no real target context, producing repeated bad predictions and 0% acceptance. + +Fix: add an explicit context parameter: + +```cpp +bool dflash_target_kv_available = false; +``` + +and a public setter: + +```cpp +llama_set_dflash_target_kv_available(ctx_dft, gpu_ring_handle != nullptr); +``` + +Then build the full-attention KV input only when the target KV cache is actually populated: + +```cpp +llm_graph_input_attn_kv * inp_attn_kv_full = nullptr; +if (mctx && cparams.dflash_target_kv_available) { + inp_attn_kv_full = dflash_build_base_attn_input(...); +} +``` + +When the flag is false, the graph uses the existing fresh-projection path: + +```cpp +Kcur_ctx = wk(fused_target) +Vcur_ctx = wv(fused_target) +``` + +That path works with CPU-captured `target_hidden`, which is the correct Vulkan fallback. + +### 3. Flat DFlash must keep the full query block + +DFlash query-token attention is non-causal over the draft block. Shortening the query block can change logits, including early positions. + +The flat path previously used: + +```cpp +batch_len = n_draft + 1; +``` + +Tree DFlash already used the full block. The flat and batched flat paths now keep the trained/inferred block shape: + +```cpp +batch_len = block_size; +output_len = n_draft + 1; +``` + +Only `output_len` rows are marked for output and consumed by the sampler. This preserves full-block drafter semantics while still limiting active draft horizon. + +### 4. Runtime parity required comparing exact live C++ inputs + +The debugging path added several diagnostics that made it possible to prove where the mismatch lived: + +- `GGML_DFLASH_RX_DIAG=1` logs cross-ring / hidden-window availability. +- `GGML_DFLASH_TOKEN_TRACE=1` logs sampled token, drafted tokens, and verified tokens. +- `GGML_DFLASH_TOPK_TRACE=1` logs C++ drafter top-k predictions. +- `GGML_DFLASH_CROSS_DUMP=/path/file.txt` dumps the live cross-attention hidden window for PyTorch replay. + +The cross dump showed that after the Vulkan KV branch fix, C++ and PyTorch forward passes matched for the same live input. That narrowed the remaining 0% behavior to drafter training/input semantics rather than a C++ graph mismatch. + +### 5. Drafter position semantics matter + +C++ consumes DFlash predictions starting at row / position 1: + +```cpp +for (int i = 1; i < output_len; ++i) { + // consume draft prediction rows +} +``` + +Position 0 is the known root / bonus token. The first actual draft token is position 1. + +A drafter trained to predict from position 0 can appear valid in a standalone training loop while still producing poor live C++ acceptance, because C++ reads position 1. Training the live-consumed position fixed the in-distribution acceptance collapse once the runtime graph was correct. + +This document does not cover broader drafter generalization work; that is a separate training/data problem. The Vulkan runtime issue is the empty target-KV branch and Qwen3Next recurrent-state handling described above. + +## Ruled-out or deprioritized hypotheses + +The investigation ruled out several plausible but ultimately non-decisive causes. + +### k_norm ordering + +There was a suspected mismatch around whether target K normalization happened before or after concatenating context/noise K. Further comparison showed this was not the cause of the 0% acceptance. The attempted k_norm fix was reverted. + +### Basic cross-ring emptiness + +Diagnostics showed the cross window was not generally empty. The real problem was more specific: full-attention layers were reading target context from an unpopulated drafter KV cache on Vulkan instead of using the fresh `target_hidden` projection path. + +### Generic DFlash verifier failure + +Qwen3.6 DFlash worked, and the server verification path could reject all drafts while still producing coherent target output when recurrent state was handled correctly. This made a global verifier failure unlikely. + +### C++ vs PyTorch graph mismatch after KV-branch fix + +After dumping the exact live hidden-state window and replaying it in PyTorch, C++ and PyTorch forward outputs matched. That eliminated the C++ graph as the remaining source of the live acceptance issue after the Vulkan KV fix. + +### Hidden tensor layout/type corruption + +Earlier diagnostics did not support basic hidden-state layout or tensor-type corruption as the persistent root cause. + +## Relevant code changes + +### Public API + +`include/llama.h`: + +```cpp +LLAMA_API void llama_set_dflash_target_kv_available(struct llama_context * ctx, bool avail); +``` + +Debug-only recurrent state dump API: + +```cpp +LLAMA_API void llama_dflash_dump_recurrent_state_dbg( + struct llama_context * ctx, + llama_seq_id seq_id, + const char * tag); +``` + +### Context parameter + +`src/llama-cparams.h`: + +```cpp +bool dflash_target_kv_available = false; +``` + +### Flag propagation + +`common/speculative.cpp`: + +```cpp +llama_set_dflash_target_kv_available(ctx_dft, gpu_ring_handle != nullptr); +``` + +### Drafter graph branch gate + +`src/models/dflash_draft.cpp`: + +```cpp +if (mctx && cparams.dflash_target_kv_available) { + inp_attn_kv_full = dflash_build_base_attn_input(...); +} +``` + +### Qwen3Next tape capture + +`src/llama-context.cpp` registers the Qwen3Next pre-conv QKV names: + +```cpp +linear_attn_qkv_mixed- +qkv_mixed- +``` + +### Full-block flat drafting + +`common/speculative.cpp` uses: + +```cpp +const int batch_len = block_size; +const int output_len = n_draft + 1; +``` + +and consumes only rows `1..output_len-1`. + +## Diagnostics retained + +The following environment variables are useful for future Vulkan/Qwen3Next DFlash debugging: + +| Variable | Purpose | +|---|---| +| `GGML_DFLASH_RX_DIAG=1` | Logs cross-window, ring, and hidden-row availability. | +| `GGML_DFLASH_TOKEN_TRACE=1` | Logs sampled/drafted/verified token IDs. | +| `GGML_DFLASH_TOPK_TRACE=1` | Logs drafter top-k rows for C++ vs PyTorch comparison. | +| `GGML_DFLASH_CROSS_DUMP=/path/file.txt` | Dumps a live cross-attention hidden window for replay. | +| `GGML_DFLASH_FORCE_REDECODE=1` | Forces re-decode instead of tape replay, useful to isolate recurrent-state replay issues. | +| `GGML_DFLASH_DISABLE_KV_CACHE=1` | Disables the DFlash projection cache for isolation. | + +## Final status + +The Vulkan path is enabled by making the drafter graph choose the correct target-context source: + +- CUDA/GPU-ring path: full-attention layers may read target K/V from the populated drafter KV cache. +- Vulkan/CPU-hidden path: full-attention layers compute fresh K/V from `target_hidden`. + +The remaining low acceptance on out-of-distribution prompts is not the same runtime bug. It is a drafter training/generalization limitation and should be tracked separately from the Vulkan enablement rationale. + +## Archived investigation documents + +The following detailed investigation notes were consolidated into this document and moved under: + +```text +docs/archive/dflash-q3cn-0acceptance-investigation/ +``` + +- `dflash-q3cn-vulkan-working.md` +- `dflash-q3cn-state-corruption-confirmed.md` +- `qwen3next-drafter-runtime-investigation.md` +- `dflash-drafter-debug-findings.md` +- `dflash-drafter-retrain-status.md` +- `dflash-cpp-vs-pytorch-graph-comparison.md` +- `dflash-rx-diag-results.md` +- `dflash-k-norm-fix-analysis.md` +- `dflash-k-norm-order-mismatch.md` +- `dflash-k-norm-reverted.md` +- `dflash-glm-state-review.md` +- `dflash-acceptance-diagnostic-report.md` +- `dflash-acceptance-diagnostic-report-v2.md` +- `dflash-acceptance-intermediate-report.md` +- `dflash-drafter-rootcause.md` +- `dflash-runtime-trace-results.md` +- `dflash-next-steps.md` +- `dflash-qwen3-coder-next-diagnostic-report.md` +- `vulkan-dflash-status.md` diff --git a/docs/vulkan-cross-ring-plan.md b/docs/vulkan-cross-ring-plan.md new file mode 100644 index 000000000000..96a3c8f95a69 --- /dev/null +++ b/docs/vulkan-cross-ring-plan.md @@ -0,0 +1,368 @@ +# Vulkan DFlash GPU Cross-Ring Port Plan + +## Executive Summary + +DFlash on Vulkan currently uses a **CPU-only hidden capture path**: hidden states are read from the Vulkan GPU via `ggml_backend_tensor_get` into CPU memory, then copied into a CPU ring buffer. The drafter model also runs on GPU but reads from this CPU ring buffer via `llama_dflash_cross_ring_gpu_write` (H2D upload). + +On CUDA, the **GPU cross-ring** enables direct GPU-to-GPU hidden state transfer: the target model's hidden states are copied directly from the target GPU to the drafter GPU ring buffer without touching CPU memory. This avoids: +1. GPU→CPU memory transfer latency +2. Potential precision changes from the CPU readback path +3. CPU ring buffer memory overhead + +**Status**: The GPU cross-ring is CUDA-only. `dflash_gpu_backend_reg()` in `llama-context.cpp` only checks for "CUDA" and "ROCm" backends. Vulkan returns `nullptr` from `llama_dflash_cross_ring_gpu_init`. + +## F16 Drafter Test Results + +| Drafter Quantization | Acceptance Rate | Notes | +|---|---|---| +| Q4_K_M | 0% | Tested 2026-06-10 | +| Q8_0 | 0% | Tested 2026-06-11 | +| F16 | 0% | Tested 2026-06-12 | + +**Conclusion**: The ~0% acceptance for Qwen3-Coder-Next is **not** a quantization precision issue. It's a fundamental model-pair problem. Even the F16 drafter produces 0% acceptance. The vulkan cross-ring port will help Qwen3.6-27B performance but will not fix Qwen3-Coder-Next acceptance. + +## Architecture Overview + +### Current CUDA Cross-Ring (Reference) + +File: `ggml/src/ggml-cuda/cross-ring-interleave.cu` (956 lines) + +**Data structure** (`dflash_cross_ring_gpu`): +```cpp +struct dflash_cross_ring_gpu { + int device; // CUDA device where ring buffers are allocated + int n_layers; // Number of target layers to capture (e.g., 5) + int n_embd; // Embedding dimension (e.g., 2048) + int ring_size; // Ring buffer capacity (e.g., 1024 tokens) + + float ** d_layer_rings; // Device: array of n_layers device pointers + float * d_staging; // Device: interleaved output [ring_size * n_layers * n_embd] + float ** h_layer_ptrs; // Host: copy of per-layer device pointers +}; +``` + +**Functions** (exposed via `ggml_backend_cuda_reg_get_proc_address`): + +| Function | Purpose | CUDA API Used | +|---|---|---| +| `dflash_cross_ring_gpu_alloc` | Allocate per-layer ring buffers + staging | `cudaMalloc`, `cudaMemset` | +| `dflash_cross_ring_gpu_alloc_device` | Same but on specific device | `cudaSetDevice` + above | +| `dflash_cross_ring_gpu_free` | Free all allocations | `cudaFree` | +| `dflash_cross_ring_gpu_write` | H2D copy: CPU hidden → GPU ring | `cudaMemcpyAsync` (H2D) | +| `dflash_cross_ring_gpu_write_d2d` | D2D copy: GPU hidden → GPU ring | `cudaMemcpyAsync` (D2D), `cudaMemcpyPeerAsync` | +| `dflash_cross_ring_gpu_synchronize` | Wait for pending copies | `cudaStreamSynchronize` | +| `dflash_cross_ring_gpu_snapshot` | D2H copy: GPU ring → CPU | `cudaMemcpyAsync` (D2H) | +| `dflash_cross_ring_gpu_interleave` | Rearrange ring into interleaved staging | Compute kernel `k_cross_ring_interleave` | +| `dflash_cross_ring_gpu_set_tensor` | D2D copy with sync (for GGML graph) | `cudaMemcpyAsync` + `cudaStreamSynchronize` | + +**Key compute kernel** (`k_cross_ring_interleave`): +```cuda +__global__ static void k_cross_ring_interleave( + const float * const * __restrict__ d_rings, + float * __restrict__ d_out, + const int ring_size, + const int read_start, + const int cross_len, + const int n_layers, + const int n_embd) { + const int t = blockIdx.x; // token index + const int l = blockIdx.y; // layer index + // Copy one (token, layer) slice from ring to staging + for (int i = threadIdx.x; i < n_embd; i += blockDim.x) { + dst[i] = src[i]; + } +} +``` + +### Additional CUDA DFlash Functions + +Also in `cross-ring-interleave.cu`: + +| Function | Purpose | Used By | +|---|---|---| +| `dflash_rebuild_conv_state` | Rebuild convolution state from QKV | Tape replay | +| `dflash_cuda_copy_d2d` | Generic D2D copy with pointer check | KV cache operations | +| `dflash_cuda_copy_d2d_no_check` | Fast D2D without checks | KV cache operations | +| `dflash_cuda_prepare_ptr` | Validate device pointer | KV cache operations | +| `dflash_cuda_set_device` | Set CUDA device | Multi-GPU | +| `dflash_cuda_synchronize_ptr` | Sync stream for pointer's device | Multi-GPU | +| `dflash_cuda_ptr_device` | Get device ID from pointer | Multi-GPU | +| `dflash_cuda_synchronize_device` | Sync specific device | Multi-GPU | +| `dflash_cuda_backend_wait_for_stream` | Backend stream sync | Multi-GPU | +| `dflash_cuda_backend_wait_for_dflash_stream` | DFlash stream sync | Multi-GPU | +| `dflash_replay_gdn_state_no_check` | GPU GDN tape replay | Tape replay | +| `dflash_kv_cache_write_d2d` | KV cache D2D write | KV cache | +| `dflash_kv_cache_write_d2d_no_check` | Fast KV cache D2D | KV cache | +| `dflash_kv_cache_append_d2d` | KV cache append D2D | KV cache | +| `dflash_kv_cache_append_d2d_no_check` | Fast KV cache append D2D | KV cache | +| `dflash_kv_cache_interleave` | KV cache interleave | KV cache | + +## Vulkan Port Strategy + +### Phase 1: Core Cross-Ring (Required for GPU Hidden Capture) + +Port the 9 core cross-ring functions from CUDA to Vulkan. The `dflash_cross_ring_handle` struct (line 8774 of `llama-context.cpp`) stores 7 function pointers, plus 2 allocation functions: + +| CUDA | Vulkan | Notes | +|---|---|---| +| `cudaMalloc` | `vkCreateBuffer` + `vkAllocateMemory` | Use `VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT` | +| `cudaMemset` | `vkCmdFillBuffer` or `vkMapMemory` + `memset` | | +| `cudaMemcpyAsync` (H2D) | Staging buffer + `vkCmdCopyBuffer` | Vulkan requires staging for H2D | +| `cudaMemcpyAsync` (D2D) | `vkCmdCopyBuffer` | Direct buffer copy | +| `cudaStreamSynchronize` | `vkQueueWaitIdle` or semaphore wait | | +| Compute kernel | `vkCmdDispatch` with `.comp` shader | Port GLSL kernel | + +**New file**: `ggml/src/ggml-vulkan/vulkan-shaders/dflash_cross_ring.comp` +- Port `k_cross_ring_interleave` to GLSL compute shader +- Use storage buffers for `d_rings` and `d_out` + +**New file**: `ggml/src/ggml-vulkan/ggml-vulkan-cross-ring.cpp` +- Implement all 9 functions using Vulkan APIs +- Register via `ggml_backend_reg_get_proc_address` +- Update `CMakeLists.txt` to compile and link the new file + +### Phase 2: Vulkan Backend Registration + +Modify `dflash_gpu_backend_reg()` in `llama-context.cpp`: +```cpp +static ggml_backend_reg_t dflash_gpu_backend_reg() { + ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA"); + if (!reg) { + reg = ggml_backend_reg_by_name("ROCm"); + } + if (!reg) { + reg = ggml_backend_reg_by_name("Vulkan"); // NEW + } + return reg; +} +``` + +Add `get_proc_address` to the Vulkan backend registry (currently `NULL` in `ggml_backend_vk_reg_i` of `ggml-vulkan.cpp`): +```cpp +static void * ggml_backend_vulkan_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "dflash_cross_ring_gpu_alloc") == 0) return (void *)dflash_cross_ring_gpu_alloc; + // ... all 10 functions ... + return nullptr; +} +``` + +### Phase 3: GPU Hidden Capture for qwen3next + +Add `LLM_ARCH_QWEN3NEXT` to `llama_dflash_gpu_hidden_supported_arch()` and port the GPU hidden capture block from `qwen35.cpp` to `qwen3next.cpp`. This enables the `hidden_gpu` path on CUDA/ROCm/Vulkan. + +### Phase 4: GPU Tape Replay (Optional, High Effort) + +Port GPU GDN tape replay (`dflash_replay_gdn_state_no_check`) to Vulkan. Currently crashes on Vulkan because GDN ops are CUDA-specific. This is lower priority since CPU tape replay works correctly. + +## Implementation Details + +### Vulkan Buffer Management + +**Design decision**: Use `vk_buffer`/`ggml_vk_create_buffer` from the existing +Vulkan backend instead of raw Vulkan API calls. This ensures: +- Consistent memory accounting with the rest of the backend +- Correct alignment via `ggml_backend_vk_buffer_type_get_alignment` +- Proper cleanup via `shared_ptr` reference counting +- Access to the `vk_device` struct for queue/pipeline management + +The cross-ring needs: +1. **Per-layer ring buffers** (`n_layers` × `ring_size` × `n_embd` × 4 bytes each) + - For Qwen3.6: 5 layers × 1024 tokens × 2048 embd × 4 = ~40 MB + - Use `VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT` + - Use `VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT` for performance + - Allocate via `ggml_vk_create_buffer(device, size, {DEVICE_LOCAL}, {HOST_VISIBLE | HOST_COHERENT})` + +2. **Staging buffer** (`ring_size` × `n_layers` × `n_embd` × 4 bytes) + - For Qwen3.6: 1024 × 5 × 2048 × 4 = ~40 MB + - Used by the interleave kernel output + - Same flags as per-layer rings + +3. **Host staging buffer** (for H2D uploads) + - `n_tokens` × `n_embd` × 4 bytes per upload + - Use `VK_BUFFER_USAGE_TRANSFER_SRC_BIT` + `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` + - Required because Vulkan H2D copies need a host-visible intermediate buffer: + 1. `memcpy` hidden state data into the host staging buffer + 2. `vkCmdCopyBuffer` from staging → device-local ring + 3. Pipeline barrier to ensure copy completes before use + +**Env var check**: `dflash_cross_ring_gpu_alloc` checks `getenv("GGML_DFLASH_GPU_RING")` +and returns `nullptr` if it's `"0"`. The Vulkan port must replicate this check. + +### Command Buffer & Queue Integration + +The Vulkan backend has an existing command pool/buffer abstraction: +- `vk_command_pool` (line 227 of `ggml-vulkan.cpp`): manages `vk_command_buffer`s +- `ggml_vk_submit`: submits a context's command buffer to the queue +- `ggml_vk_dispatch_pipeline`: dispatches a compute pipeline with descriptor sets + +The cross-ring operations must integrate with this model: +- `write`: record `vkCmdCopyBuffer` into the current context's command buffer +- `synchronize`: submit and wait (or use `vk_queue_wait_idle`) +- `interleave`: dispatch the compute shader via `ggml_vk_dispatch_pipeline` +- `snapshot`: `vkCmdCopyBuffer` device→host-visible staging, then `memcpy` from mapped memory + +On CUDA, `cudaStreamPerThread` handles implicit ordering. Vulkan requires explicit +pipeline barriers (`vkCmdPipelineBarrier`) between copy and compute operations. + +### Pointer Provenance Check for `write_d2d` + +On CUDA, `cudaPointerGetAttributes` determines if a pointer is device memory. +Vulkan has no equivalent. The `write_d2d` function must verify that the source +tensor belongs to the Vulkan backend. Options: + +1. **Buffer name check**: `ggml_backend_buffer_name(t->buffer)` returns "VulkanXX" + for Vulkan-allocated tensors. Check for "Vulkan" prefix. +2. **Buffer type check**: Compare `t->buffer->buft` against the Vulkan buffer type. +3. **Side table**: Maintain a set of known Vulkan buffer pointers. + +Option 1 is simplest and consistent with how the existing code detects +CUDA buffers (see `dflash_is_cuda_compatible_tensor` in `llama-context.cpp`). + +### Shader Compilation + +The Vulkan backend compiles compute shaders to SPIR-V at runtime during +pipeline creation (`ggml_vk_create_pipeline`). The interleave shader must be: +1. Written as a `.comp` file in `vulkan-shaders/` +2. Compiled to SPIR-V (either offline or via shaderc at runtime) +3. Registered as a pipeline in the `vk_device_struct` initialization + +Cold-start impact: ~50-100ms for one additional pipeline compilation. +Negligible compared to the model loading time. + +### Interleave Compute Shader (GLSL) + +```glsl +#version 450 +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(set = 0, binding = 0, std430) buffer Rings { + float rings[]; // per-layer data, indexed by caller +}; +layout(set = 0, binding = 1) uniform RingParams { + uint ringSize; + uint readStart; + uint crossLen; + uint nLayers; + uint nEmb; + uint layerOffset; // offset within rings buffer for current layer +}; + +layout(set = 0, binding = 2, std430) buffer Output { + float out[]; // interleaved output +}; + +void main() { + uint t = gl_GlobalInvocationID.x; // token index + uint l = gl_GlobalInvocationID.y; // layer index + if (t >= crossLen || l >= nLayers) return; + + uint slot = (readStart + t) % ringSize; + uint localIdx = gl_LocalInvocationID.x; + uint srcIdx = layerOffset + slot * nEmb + localIdx; + uint dstIdx = t * nLayers * nEmb + l * nEmb + localIdx; + + for (uint i = localIdx; i < nEmb; i += 256) { + out[dstIdx + i] = rings[srcIdx + i]; + } +} +``` + +Note: The CUDA version uses an array of device pointers (`float **`). Vulkan doesn't support arrays of device pointers in storage buffers. Options: +1. **Flatten**: Single buffer with `layerOffset` computed from stride +2. **Descriptor array**: Use `vkCmdBindDescriptorSets` with per-layer descriptors +3. **Per-layer dispatch**: Launch kernel once per layer + +Option 1 (flatten) is simplest and most performant. + +### `write_d2d` on Vulkan + +The CUDA version uses `cudaPointerGetAttributes` to detect if source is device memory and on which device. Vulkan has no equivalent. The port must determine whether a source tensor's buffer belongs to the Vulkan backend. Use `ggml_backend_buffer_name(t->buffer)` and check for "Vulkan" prefix (same pattern as `dflash_is_cuda_compatible_tensor` in `llama-context.cpp`): +- If the source is a Vulkan device buffer → use `vkCmdCopyBuffer` for D2D copy +- If the source is not a Vulkan device buffer → return `false` (fallback to H2D path) +- For multi-GPU: use `vkCmdCopyBuffer` with buffers on different devices (requires device group operations) + +### Synchronization Model + +CUDA uses `cudaStreamPerThread` for implicit ordering. Vulkan requires explicit synchronization: +- Use **pipeline barriers** (`vkCmdPipelineBarrier`) between copy and compute ops +- Use `vk_queue_wait_idle` or `vkWaitForFences` for `dflash_cross_ring_gpu_synchronize` +- Integrate with the existing `vk_command_pool`/`ggml_vk_submit` model + +### `dflash_cross_ring_gpu_snapshot` (GPU→CPU read) + +This function copies ring buffer data to CPU memory for state serialization. +On Vulkan: +1. `vkCmdCopyBuffer` from device-local ring → host-visible staging buffer +2. Pipeline barrier to ensure copy completes +3. `vkMapMemory` + `memcpy` to read staging buffer data into the caller's buffer +4. `vkUnmapMemory` to release the mapping + +Alternative: If using host-visible ring buffers (Approach A), `snapshot` is a +simple `memcpy` from mapped memory — no staging or command buffer needed. + +## Effort Estimates + +| Phase | Files | Lines | Complexity | Risk | +|---|---|---|---|---| +| Phase 1: Core cross-ring | 2 new files + CMakeLists | ~500 | Medium | Medium | +| Phase 2: Registration | 2 existing files | ~30 | Low | Low | +| Phase 3: qwen3next capture | 2 existing files | ~50 | Low | Low | +| Phase 4: GPU tape replay | 1 new file | ~200 | High | High | +| **Total** | | ~780 | | | + +## Expected Impact + +| Metric | Current (CPU path) | With GPU Cross-Ring | +|---|---|---| +| Hidden capture latency | GPU→CPU transfer per batch | GPU→GPU copy per batch | +| Ring buffer memory | CPU ring only | GPU ring only (CPU ring not maintained when GPU ring active) | +| Qwen3.6-27B tok/s | ~11-12 tok/s | ~12-13 tok/s (estimated) | +| Qwen3-Coder-Next acceptance | ~0% | ~0% (unchanged — drafter quality issue) | + +The main benefit is **reduced latency per hidden capture cycle**, which matters most during verification decode (small batches). The impact on overall throughput is modest (~10-15%) because the bottleneck is the forward pass, not the hidden capture. Without a CUDA baseline benchmark for the same model, the Vulkan improvement estimate is uncertain. + +## Known Limitations + +1. **Qwen3-Coder-Next acceptance**: Cannot be fixed by cross-ring alone. Requires better drafter training or architectural changes. +2. **Multi-GPU Vulkan**: Device group operations are complex; single-GPU path is sufficient for most users. +3. **Vulkan compute shader portability**: The interleave kernel must work across RADV, Intel ANV, and NVIDIA Vulkan drivers. +4. **GDN GPU tape replay**: Separate from cross-ring; requires porting GDN CUDA kernels to Vulkan compute shaders. + +## Alternative Approaches + +### Approach A: Host-Visible GPU Buffers (UMA Systems) + +On your AMD APU (UMA system), use `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` for the ring buffer. This allows the CPU to read/write GPU memory directly without explicit transfers. + +**Pros**: Minimal code changes, no compute shader needed +**Cons**: Only works on UMA systems, potentially slower than device-local memory + +### Approach B: Enhanced CPU Path + +Optimize the existing CPU path: +- Batch `ggml_backend_tensor_get` calls across layers instead of per-layer +- Use pinned CPU memory for faster H2D uploads +- Profile the actual bottleneck (is it the readback or the upload?) + +**Pros**: Low risk, immediate benefit +**Cons**: Still has GPU→CPU→GPU round-trip + +### Approach C: Hybrid + +Implement Phase 1 (core cross-ring) for the GPU ring buffer, but keep the CPU eval callback for hidden capture. Hidden states still go GPU→CPU via eval callback, but `ring_write` uploads them to the GPU ring via `fn_write` (H2D) instead of writing to a CPU ring buffer. The drafter's cross-attention reads directly from the GPU ring via `fn_interleave` + `fn_set_tensor`, eliminating: + +1. The CPU interleave loop in `build_cross_data` (line 2360 of `speculative.cpp`) +2. The `llama_set_cross_data_seq` CPU→GPU upload of interleaved cross-data + +The target's hidden capture still goes GPU→CPU→GPU (eval callback readback + H2D upload to GPU ring), so the total data movement is similar, but the interleave computation moves from CPU to GPU. + +**Pros**: Partial benefit with lower effort, drafter cross-attention reads stay on-GPU +**Cons**: Still has GPU→CPU readback for hidden capture, H2D upload adds per-call overhead + +## Recommendations + +1. **Start with Approach B** (enhanced CPU path) for immediate, low-risk gains +2. **Implement Phase 1 + 2** (core cross-ring + registration) for the full GPU path +3. **Phase 3** (qwen3next GPU capture) is low-hanging fruit if Phase 1 is done +4. **Defer Phase 4** (GPU tape replay) unless GDN GPU replay is a proven bottleneck +5. **Do not invest in Qwen3-Coder-Next DFlash optimization** — the 0% acceptance across all quantizations indicates a fundamental model-pair incompatibility diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 2d9e85794ad4..42e2fac4026e 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -61,6 +61,7 @@ if (Vulkan_FOUND) ggml_add_backend_library(ggml-vulkan ggml-vulkan.cpp + ggml-vulkan-cross-ring.cpp ../../include/ggml-vulkan.h ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-cross-ring.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-cross-ring.cpp new file mode 100644 index 000000000000..d546255c14ab --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-cross-ring.cpp @@ -0,0 +1,51 @@ +// Vulkan UMA DFlash GPU cross-ring — Approach A (placeholder) +// +// Placeholder implementation. The Vulkan GPU ring is currently not used +// (dflash_gpu_backend_reg does not return Vulkan). This file exists to +// provide symbols for the get_proc_address resolver but is not actually +// invoked at runtime. +// +// When Approach A is fully implemented, this will use host-visible coherent +// Vulkan buffers for the ring + interleave, and ggml_backend_tensor_set for +// the final H2D upload to the drafter's target_hidden tensor. + +#include +#include +#include + +struct dflash_cross_ring_vk { + int n_layers; + int n_embd; + int ring_size; + float ** h_layer_ptrs; + float * h_staging; +}; + +extern "C" void * dflash_cross_ring_gpu_alloc(int n_layers, int n_embd, int ring_size) { + (void)n_layers; (void)n_embd; (void)ring_size; + return nullptr; // not used — Vulkan ring is disabled +} + +extern "C" void * dflash_cross_ring_gpu_alloc_device(int, int n_layers, int n_embd, int ring_size) { + return dflash_cross_ring_gpu_alloc(n_layers, n_embd, ring_size); +} + +extern "C" void dflash_cross_ring_gpu_free(void * handle) { + if (!handle) return; + auto * ring = (dflash_cross_ring_vk *)handle; + if (ring->h_staging) free(ring->h_staging); + if (ring->h_layer_ptrs) { + for (int l = 0; l < ring->n_layers; l++) { + if (ring->h_layer_ptrs[l]) free(ring->h_layer_ptrs[l]); + } + delete[] ring->h_layer_ptrs; + } + delete ring; +} + +extern "C" void dflash_cross_ring_gpu_write(void *, int, int, const float *, int, int) {} +extern "C" bool dflash_cross_ring_gpu_write_d2d(void *, int, int, const void *, int, int) { return false; } +extern "C" void dflash_cross_ring_gpu_synchronize(void *) {} +extern "C" bool dflash_cross_ring_gpu_snapshot(void *, int, int, int, float *, int, int, int) { return false; } +extern "C" const float * dflash_cross_ring_gpu_interleave(void *, int, int, int) { return nullptr; } +extern "C" void dflash_cross_ring_gpu_set_tensor(void *, const void *, size_t, size_t) {} diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 6735afbd7f6d..822a463b8885 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -17112,11 +17112,38 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, return devices[device]; } +// Forward declarations for DFlash cross-ring functions (defined in ggml-vulkan-cross-ring.cpp) +extern "C" { + void * dflash_cross_ring_gpu_alloc(int, int, int); + void * dflash_cross_ring_gpu_alloc_device(int, int, int, int); + void dflash_cross_ring_gpu_free(void *); + void dflash_cross_ring_gpu_write(void *, int, int, const float *, int, int); + bool dflash_cross_ring_gpu_write_d2d(void *, int, int, const void *, int, int); + void dflash_cross_ring_gpu_synchronize(void *); + bool dflash_cross_ring_gpu_snapshot(void *, int, int, int, float *, int, int, int); + const float * dflash_cross_ring_gpu_interleave(void *, int, int, int); + void dflash_cross_ring_gpu_set_tensor(void *, const void *, size_t, size_t); +} + +// DFlash cross-ring function pointer resolver for Vulkan UMA +static void * ggml_backend_vk_reg_get_proc_address(ggml_backend_reg_t /* reg */, const char * name) { + if (strcmp(name, "dflash_cross_ring_gpu_alloc") == 0) return (void *)(void *)dflash_cross_ring_gpu_alloc; + if (strcmp(name, "dflash_cross_ring_gpu_alloc_device") == 0) return (void *)(void *)dflash_cross_ring_gpu_alloc_device; + if (strcmp(name, "dflash_cross_ring_gpu_free") == 0) return (void *)dflash_cross_ring_gpu_free; + if (strcmp(name, "dflash_cross_ring_gpu_write") == 0) return (void *)dflash_cross_ring_gpu_write; + if (strcmp(name, "dflash_cross_ring_gpu_write_d2d") == 0) return (void *)dflash_cross_ring_gpu_write_d2d; + if (strcmp(name, "dflash_cross_ring_gpu_synchronize") == 0) return (void *)dflash_cross_ring_gpu_synchronize; + if (strcmp(name, "dflash_cross_ring_gpu_snapshot") == 0) return (void *)dflash_cross_ring_gpu_snapshot; + if (strcmp(name, "dflash_cross_ring_gpu_interleave") == 0) return (void *)dflash_cross_ring_gpu_interleave; + if (strcmp(name, "dflash_cross_ring_gpu_set_tensor") == 0) return (void *)dflash_cross_ring_gpu_set_tensor; + return nullptr; +} + static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { /* .get_name = */ ggml_backend_vk_reg_get_name, /* .get_device_count = */ ggml_backend_vk_reg_get_device_count, /* .get_device = */ ggml_backend_vk_reg_get_device, - /* .get_proc_address = */ NULL, + /* .get_proc_address = */ ggml_backend_vk_reg_get_proc_address, }; ggml_backend_reg_t ggml_backend_vk_reg() { diff --git a/include/llama.h b/include/llama.h index ba01f215c0b3..c4d8bca910cc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1159,6 +1159,11 @@ extern "C" { // cache and forces a reserve on next decode. LLAMA_API void llama_set_dflash_n_slots(struct llama_context * ctx, int n); + // DFlash: mark whether the drafter's normal KV cache is populated with + // TARGET context K/V. Vulkan CPU-hidden capture does not populate it, so + // full-attention DFlash layers must use fresh K/V from target_hidden. + LLAMA_API void llama_set_dflash_target_kv_available(struct llama_context * ctx, bool avail); + // DFlash: enable/disable tape recording for DeltaNet rollback // When enabled, the eval callback records per-token DeltaNet inputs (k, v, gate, beta) // during verification decode for efficient state replay instead of full re-evaluation @@ -1200,7 +1205,10 @@ extern "C" { // - KV cache: trims rejected draft positions (keeps accepted tokens' KV entries) // - Recurrent state: restores from backup + tape replay for accepted tokens // This replaces the manual seq_rm/seq_cp + tape_replay sequence - LLAMA_API void llama_dflash_rollback( + // Returns the number of positions that were NOT advanced by tape replay + // and need re-decoding (e.g., when tape replay is unavailable on the backend). + // A return value of 0 means the rollback was fully successful. + LLAMA_API int llama_dflash_rollback( struct llama_context * ctx, llama_seq_id seq_id, llama_seq_id seq_backup, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2dfa35538b2f..8660a52fef28 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -22,6 +22,7 @@ #include "ggml-alloc.h" #include +#include #include #include #include @@ -76,7 +77,8 @@ static bool dflash_is_cuda_compatible_tensor(const ggml_tensor * t) { return false; } const char * name = ggml_backend_buffer_name(t->buffer); - return name && (std::strncmp(name, "CUDA", 4) == 0 || std::strncmp(name, "ROCm", 4) == 0); + return name && (std::strncmp(name, "CUDA", 4) == 0 || std::strncmp(name, "ROCm", 4) == 0 || + std::strncmp(name, "Vulkan", 6) == 0); } static bool dflash_tensor_span_in_bounds(const ggml_tensor * t, size_t offset_bytes, size_t n_bytes) { @@ -1767,6 +1769,16 @@ void llama_context::set_dflash_n_slots(int n) { gf_res_prev->reset(); } +void llama_context::set_dflash_target_kv_available(bool avail) { + if (cparams.dflash_target_kv_available == avail) { + return; + } + cparams.dflash_target_kv_available = avail; + // The drafter graph attention branch depends on this flag. + sched_need_reserve = true; + gf_res_prev->reset(); +} + void llama_context::set_dflash_capture(const int32_t * layer_ids, int32_t n_layers) { if (layer_ids == nullptr || n_layers <= 0) { // Permanent deconfiguration: clear layer config and remove callback. @@ -1993,7 +2005,20 @@ void llama_context::dflash_ensure_recurrent_setup() { dflash_capture->tape_name_map["v_conv_predelta-" + il_str] = {idx, DFLASH_TAPE_V}; dflash_capture->tape_name_map["gate-" + il_str] = {idx, DFLASH_TAPE_GATE}; dflash_capture->tape_name_map["beta-" + il_str] = {idx, DFLASH_TAPE_BETA}; + // qwen3next (Qwen3-Coder-Next) names the beta tensor "b" instead of "beta" + dflash_capture->tape_name_map["b-" + il_str] = {idx, DFLASH_TAPE_BETA}; dflash_capture->tape_name_map["qkv_mixed_pretranspose-" + il_str] = {idx, DFLASH_TAPE_QKV}; + // qwen3next (Qwen3-Coder-Next) builds the pre-conv (pre-transpose) QKV via two + // code paths in build_qkvz(), both projecting the layer INPUT (pre-conv1d): + // wqkv path: "linear_attn_qkv_mixed-" (qwen3next.cpp:304) [used by Qwen3-Coder-Next] + // ssm_in legacy: "qkv_mixed-" (qwen3next.cpp:364) [concat of q/k/v flats] + // Both are the input to build_conv_state() (qwen3next.cpp:440), i.e. PRE-conv. + // Without recording one of these, tape_replay_conv skips every layer (empty + // qkv_mixed) and the conv state (r_l) is never advanced, so the conv state stays + // frozen at the backup value and the target output is garbled (0% acceptance). + // Verified: 504/504 conv layers OK after this fix; conv state matches re-decode ref. + dflash_capture->tape_name_map["linear_attn_qkv_mixed-" + il_str] = {idx, DFLASH_TAPE_QKV}; + dflash_capture->tape_name_map["qkv_mixed-" + il_str] = {idx, DFLASH_TAPE_QKV}; } } dflash_capture->tape_layers.resize(dflash_capture->recurrent_layer_ids.size()); @@ -2609,9 +2634,13 @@ void llama_context::set_active_dflash_slot(int slot_idx) { } } -void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { +int llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { if (!dflash_capture || n_accepted <= 0) { - return; + return 0; + } + + if (const char * env = std::getenv("GGML_DFLASH_FORCE_REDECODE"); env && std::atoi(env) != 0) { + return n_accepted; } // ensure any previous async replay is complete before launching a new one @@ -2624,7 +2653,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { n_accepted <= gpu_tape->n_tokens); if (!use_gpu_tape && dflash_capture->tape_layers.empty()) { - return; + return 0; } auto * mem_recurrent = dynamic_cast(memory.get()); @@ -2636,7 +2665,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { } if (!mem_recurrent) { LLAMA_LOG_WARN("%s: tape replay requires recurrent memory\n", __func__); - return; + return n_accepted; } const auto & hparams = model.hparams; @@ -2653,7 +2682,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { } if (cell_idx < 0) { LLAMA_LOG_WARN("%s: no active cell for seq %d\n", __func__, seq_id); - return; + return n_accepted; } const uint32_t n_embd_s = hparams.n_embd_s(); @@ -2671,7 +2700,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { if (!gpu_backend) { tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } // partial offload: if any recurrent layer's state lives on CPU, fall back to CPU replay @@ -2681,7 +2710,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { if (s_tensor && s_tensor->buffer && ggml_backend_buffer_is_host(s_tensor->buffer)) { tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } } @@ -2694,14 +2723,14 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { dflash_capture->replay_cell_idx = cell_idx; dflash_capture->replay_seq_id = seq_id; dflash_capture->replay_mem_recurrent = mem_recurrent; - return; + return 0; } const bool multi_gpu_target = model.n_devices() > 1; if (multi_gpu_target) { if (tape_replay_gdn_direct_from_cpu_tape(mem_recurrent, cell_idx, n_accepted)) { tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } if (!dflash_capture->multi_gpu_replay_fallback_logged) { LLAMA_LOG_WARN("%s: multi-GPU target detected (%zu devices); exact CUDA DFlash replay unavailable, using CPU recurrent replay fallback\n", @@ -2710,7 +2739,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { } tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } // GPU tape replay: build a ggml graph with GDN ops for all recurrent layers @@ -2857,7 +2886,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { if (!use_gpu_tape) { tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } goto conv_rebuild; } @@ -2885,7 +2914,7 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { ggml_free(ctx); tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); - return; + return 0; } // assign tensors within the persistent buffer @@ -2940,8 +2969,9 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { if (!use_gpu_tape) { tape_replay_cpu(mem_recurrent, cell_idx, n_accepted); tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); + return 0; } - return; + return n_accepted; } if (dflash_capture->profile) { const uint64_t elapsed = ggml_time_us() - t_replay_enqueue_us; @@ -2961,11 +2991,12 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { dflash_capture->replay_cell_idx = cell_idx; dflash_capture->replay_seq_id = seq_id; dflash_capture->replay_mem_recurrent = mem_recurrent; - return; // conv rebuild deferred to tape_replay_sync() + return 0; // conv rebuild deferred to tape_replay_sync() } conv_rebuild: tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); + return 0; } bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recurrent, int32_t cell_idx, int n_accepted) { @@ -3929,11 +3960,11 @@ void llama_context::tape_replay_cpu(llama_memory_recurrent * mem_recurrent, int3 } } -void llama_context::dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted) { +int llama_context::dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted) { auto * mem_hybrid = dynamic_cast(memory.get()); if (!mem_hybrid) { LLAMA_LOG_WARN("%s: dflash_rollback requires hybrid memory\n", __func__); - return; + return n_accepted; } const bool profile = dflash_capture && dflash_profile_has(dflash_capture->profile_flags, DFLASH_PROFILE_COPY); @@ -3982,7 +4013,7 @@ void llama_context::dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup profile_lap(recurrent_restore_us); // Replay DeltaNet state updates for accepted tokens - tape_replay(seq_id, n_accepted); + const int n_redecode = tape_replay(seq_id, n_accepted); profile_lap(tape_launch_us); if (profile) { @@ -4004,6 +4035,7 @@ void llama_context::dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup tape_launch_us / 1e3, (ggml_time_us() - t_start_us) / 1e3); } + return n_redecode; } void llama_context::dflash_prepare_branch(llama_seq_id seq_id, llama_seq_id seq_backup, int depth) { @@ -8589,6 +8621,10 @@ void llama_set_dflash_n_slots(llama_context * ctx, int n) { ctx->set_dflash_n_slots(n); } +void llama_set_dflash_target_kv_available(llama_context * ctx, bool avail) { + ctx->set_dflash_target_kv_available(avail); +} + void llama_set_tape_recording(llama_context * ctx, bool enable) { ctx->set_tape_recording(enable); } @@ -8625,8 +8661,8 @@ bool llama_dflash_memory_seq_cp_recurrent_ordered( return ctx ? ctx->dflash_memory_seq_cp_recurrent_ordered(seq_id_src, seq_id_dst, p0, p1) : false; } -void llama_dflash_rollback(llama_context * ctx, llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted) { - ctx->dflash_rollback(seq_id, seq_backup, n_past_before, n_accepted); +int llama_dflash_rollback(llama_context * ctx, llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted) { + return ctx->dflash_rollback(seq_id, seq_backup, n_past_before, n_accepted); } void llama_dflash_prepare_branch(llama_context * ctx, llama_seq_id seq_id, llama_seq_id seq_backup, int depth) { diff --git a/src/llama-context.h b/src/llama-context.h index 0c9e80297aaa..8735e9278d16 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -700,6 +700,11 @@ struct llama_context { void set_dflash_consume_reduced(bool enabled); void set_dflash_n_slots(int n); + // DFlash: mark whether the drafter's normal KV cache is populated with + // TARGET context K/V. When false, full-attention DFlash layers fall back + // to fresh K/V projection from target_hidden. + void set_dflash_target_kv_available(bool avail); + // DFlash: reset hidden-state capture for a fresh decode() call so the // eval callback accumulates across this call's ubatches void dflash_reset_hidden_capture(); @@ -733,7 +738,9 @@ struct llama_context { void set_active_dflash_slot(int slot_idx); // DFlash: replay tape data to reconstruct DeltaNet state for n_accepted tokens - void tape_replay(llama_seq_id seq_id, int n_accepted); + // Returns the number of positions that were NOT advanced by tape replay + // and need re-decoding. A return value of 0 means the rollback was fully successful. + int tape_replay(llama_seq_id seq_id, int n_accepted); void tape_replay_sync(); bool tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, int32_t cell_idx, int n_accepted); bool tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, int32_t cell_idx, int n_accepted, bool advance_pos); @@ -745,7 +752,9 @@ struct llama_context { bool dflash_memory_seq_cp_recurrent_ordered(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // DFlash: complete rollback for hybrid models (KV trim + recurrent restore + tape replay) - void dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted); + // Returns the number of positions that were NOT advanced by tape replay + // and need re-decoding. A return value of 0 means the rollback was fully successful. + int dflash_rollback(llama_seq_id seq_id, llama_seq_id seq_backup, int n_past_before, int n_accepted); // DFlash: prepare DeltaNet state for branch verification (recurrent restore + tape replay, no KV touch) void dflash_prepare_branch(llama_seq_id seq_id, llama_seq_id seq_backup, int depth); diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 1643b2b7376c..9df62da47a05 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -68,6 +68,12 @@ struct llama_cparams { int dflash_verify_topk = 1; bool dflash_reduced_consumer_active = false; + // DFlash drafter: true when the drafter's normal KV cache is populated + // with TARGET context K/V (via the GPU cross ring). When false (e.g. + // Vulkan CPU hidden capture), full-attention layers must project K/V + // freshly from target_hidden instead of reading an empty/stale KV cache. + bool dflash_target_kv_available = false; + // DFlash: cross-attention window in tokens (how many target hidden states the drafter sees) int dflash_cross_ctx = 512; diff --git a/src/models/dflash_draft.cpp b/src/models/dflash_draft.cpp index fbf34cb485f4..df960772bb45 100644 --- a/src/models/dflash_draft.cpp +++ b/src/models/dflash_draft.cpp @@ -880,8 +880,11 @@ llm_build_dflash_draft::llm_build_dflash_draft( // Full-attention draft layers consume accepted-prefix K/V from the normal // drafter KV cache. Sliding layers still read the current target-hidden // window directly because their context is already bounded by SWA. + // Only build this input when the cache is actually populated with TARGET + // context K/V. Vulkan CPU hidden capture has no GPU cross ring, so it must + // use the fresh K/V projection path below instead of reading an empty cache. llm_graph_input_attn_kv * inp_attn_kv_full = nullptr; - if (mctx) { + if (mctx && cparams.dflash_target_kv_available) { const bool rebind_base_from_iswa = hparams.swa_type != LLAMA_SWA_TYPE_NONE; const llama_kv_cache_context * base_ctx = rebind_base_from_iswa diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp index 63d609452f9f..4c93802cc5d6 100644 --- a/tests/test-dflash-plumbing.cpp +++ b/tests/test-dflash-plumbing.cpp @@ -180,6 +180,27 @@ int main(int argc, char ** argv) { cmake_root.find("CMAKE_CUDA_ARCHITECTURES=120") != std::string::npos, "CMake must fail loudly when users pass obsolete GGML_CUDA_ARCH instead of CMAKE_CUDA_ARCHITECTURES"); + ok &= expect(llama_h.find("llama_set_dflash_target_kv_available") != std::string::npos && + context_h.find("void set_dflash_target_kv_available(bool avail)") != std::string::npos && + context_cpp.find("void llama_context::set_dflash_target_kv_available(bool avail)") != std::string::npos, + "DFlash must expose a target-KV availability flag for drafter graph selection"); + ok &= expect(cparams_h.find("bool dflash_target_kv_available = false") != std::string::npos, + "DFlash target-KV availability must default false so Vulkan CPU-hidden capture avoids empty KV cache reads"); + ok &= expect(speculative.find("llama_set_dflash_target_kv_available(ctx_dft, gpu_ring_handle != nullptr)") != std::string::npos, + "DFlash must mark target KV available only when the GPU cross ring exists"); + ok &= expect(dflash_draft.find("if (mctx && cparams.dflash_target_kv_available)") != std::string::npos, + "DFlash full-attention layers must not read drafter target KV unless it was populated"); + ok &= expect(context_cpp.find("linear_attn_qkv_mixed-") != std::string::npos && + context_cpp.find("qkv_mixed-") != std::string::npos, + "Qwen3Next DFlash tape capture must include pre-conv QKV tensor aliases for conv-state replay"); + ok &= expect(speculative.find("const int batch_len = block_size;") != std::string::npos && + speculative.find("const int batch_len = block_size;") != std::string::npos && + speculative.find("const int output_len = n_draft + 1;") != std::string::npos && + speculative.find("const int output_len = n_draft + 1;") != std::string::npos, + "Flat DFlash drafting must decode the full block while consuming only the requested output rows"); + ok &= expect(speculative.find("const int offset = r * output_len;") != std::string::npos, + "Batched flat DFlash reduced-logits offsets must use compact output rows"); + { const size_t zero_reuse_reset = server_context.find("n_past == 0"); const size_t stale_ring_reset = server_context.find("common_speculative_discard_dflash_state(slot.get_spec(), nullptr)"); @@ -590,7 +611,7 @@ int main(int argc, char ** argv) { arg_cpp.find("drafter doesn't need the full main ctx") != std::string::npos, "DFlash default -cd must stay at the production 256-token drafter context unless the user overrides it"); ok &= expect(speculative.find("float * logits = llama_get_logits_ith(ctx_dft, i);") != std::string::npos, "DFlash flat draft fallback rows must preserve seed-token offset"); - ok &= expect(speculative.find("const int offset = r * batch_len;") != std::string::npos, "DFlash batched draft argmax row offsets must preserve seed-token output rows"); + ok &= expect(speculative.find("const int offset = r * output_len;") != std::string::npos, "DFlash batched draft argmax row offsets must use compact output rows after full-block drafting"); ok &= expect(speculative.find("graph_reuse=%d") != std::string::npos, "DFlash draft profile must report drafter graph reuse"); ok &= expect(context_cpp.find("output_reorder();\n if (logits_argmax_buf.empty())") != std::string::npos, "argmax row access must honor output reordering"); ok &= expect(context_cpp.find("std::swap(logits_argmax_buf") != std::string::npos, "output reordering must include reduced logits ids"); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index df782c76729b..a0de12133ecc 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1809,6 +1809,12 @@ struct server_context_impl { llama_context * ctx_tgt = nullptr; + // DFlash: if reduced verify (TOPK-based argmax) failed on first attempt, + // the backend likely lacks GGML_OP_TOPK support (e.g., Vulkan). Set to true + // to disable reduced verify permanently for this model session. Resets when + // the model is reloaded (since server_context_impl is destroyed/recreated). + bool dflash_reduced_verify_broken = false; + // DFlash: one drafter context shared across all slots' // common_speculative states (non-owning refs). Must outlive all specs — the // destroy() order below (specs first, then this) enforces that; when destroy() @@ -5750,6 +5756,17 @@ struct server_context_impl { int32_t dflash_reduced_verify_view_start = 0; bool dflash_verify_graph_enabled = false; + // If reduced verify has been attempted but never succeeded, the backend + // likely doesn't support the TOPK compute op (e.g. Vulkan). Disable + // it permanently so we fall back to full logits. + const bool dflash_use_reduced_verify = dflash_verify_plan.enabled && !dflash_reduced_verify_broken; + + // Set to true when reduced verify fails on the current cycle. Skips all + // speculative token processing (KV rollback, accept, output) because we + // have no logits or argmax to work with. The draft state is cleared and + // the slot continues generating with a fresh single-token decode. + bool dflash_reduced_verify_recovery = false; + if (dflash_verify_plan.enabled) { for (int32_t j = 0; j < batch.n_tokens; j += std::min(n_batch, batch.n_tokens - j)) { const int32_t n_tokens_probe = std::min(n_batch, batch.n_tokens - j); @@ -5891,7 +5908,7 @@ struct server_context_impl { dflash_reduced_verify_ready = false; const char * dflash_reduce_reason = dflash_verify_plan.reason; bool dflash_reduce_this_view = false; - if (dflash_verify_plan.enabled) { + if (dflash_use_reduced_verify) { dflash_reduce_this_view = dflash_batch_view_is_reduced_verify(slots, params_base.sampling, params_base.speculative, use_rejection_sampling, ddtree_batch_active, i, n_tokens, @@ -6034,10 +6051,21 @@ struct server_context_impl { dflash_reduced_verify_ready = true; dflash_reduced_verify_top_k = dflash_verify_plan.top_k; dflash_reduced_verify_view_start = i; - } else if (dflash_server_profile_enabled(DFLASH_PROFILE_VERIFY)) { - SRV_INF("dflash compact-output mismatch: view_start=%d n_tokens=%d expected_top_k=%d got_ptr=%d got_n=%d got_k=%d\n", - i, n_tokens, dflash_verify_plan.top_k, - compact_argmax != nullptr ? 1 : 0, compact_n, compact_k); + } else { + // Backend doesn't produce argmax/TOPK output (e.g. Vulkan + // doesn't support GGML_OP_TOPK). Disable reduced verify + // permanently so subsequent cycles fall back to full logits. + if (!dflash_reduced_verify_broken) { + SRV_WRN("DFlash reduced verify failed (no argmax output, top_k=%d); " + "disabling reduced verify for this session (backend may not support TOPK)\n", + dflash_verify_plan.top_k); + } + dflash_reduced_verify_broken = true; + if (dflash_server_profile_enabled(DFLASH_PROFILE_VERIFY)) { + SRV_INF("dflash compact-output mismatch: view_start=%d n_tokens=%d expected_top_k=%d got_ptr=%d got_n=%d got_k=%d\n", + i, n_tokens, dflash_verify_plan.top_k, + compact_argmax != nullptr ? 1 : 0, compact_n, compact_k); + } } } const int64_t t_verify_elapsed = ggml_time_us() - t_verify_start; @@ -6397,7 +6425,18 @@ struct server_context_impl { GGML_ABORT("DFlash reduced verifier output missing; falling back is unsafe because raw logits were not copied\n"); } } else { - prefetched.ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx_tgt, slot.spec_i_batch, slot.spec_draft, false, on_accept); + if (dflash_reduce_this_view && !dflash_reduced_verify_ready) { + // Reduced verify was active but failed to produce argmax results, + // and the logits buffer was not allocated. Skip all speculative + // token processing; clear the draft state and let the slot + // continue with a fresh single-token decode. + SLT_WRN(slot, "DFlash reduced verify failed and logits unavailable; " + "discarding %d draft tokens and recovering\n", + (int) slot.spec_draft.size()); + dflash_reduced_verify_recovery = true; + } else { + prefetched.ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx_tgt, slot.spec_i_batch, slot.spec_draft, false, on_accept); + } } } prefetched.sample_us = ggml_time_us() - t_sample_start; @@ -6765,7 +6804,26 @@ struct server_context_impl { GGML_ABORT("DFlash reduced verifier output missing; falling back is unsafe because raw logits were not copied\n"); } } else { - ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx_tgt, slot.spec_i_batch, slot.spec_draft, false, on_accept); + // When DFlash reduced verify was active (dflash_reduce_this_view=true), + // output_reserve skips the logits buffer allocation. If the reduced + // verify path then fails to produce argmax output + // (reduced_verify_ready=false), we have no logits and no argmax. + // Accept 0 draft tokens and recover on the next cycle. + if (dflash_reduce_this_view && !dflash_reduced_verify_ready) { + // Reduced verify was active but failed to produce argmax results, + // and the logits buffer was not allocated because output_reserve + // saw dflash_reduced_consumer_active. We have no logits and no + // argmax output. Skip all speculative token processing; clear + // the draft state and let the slot continue with a fresh + // single-token decode on the next cycle (which will use full + // logits since dflash_reduced_verify_broken is now true). + SLT_WRN(slot, "DFlash reduced verify failed and logits unavailable; " + "discarding %d draft tokens and recovering\n", + (int) slot.spec_draft.size()); + dflash_reduced_verify_recovery = true; + } else { + ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx_tgt, slot.spec_i_batch, slot.spec_draft, false, on_accept); + } } } profile_accept_lap(profile_accept_sample_us); @@ -6793,6 +6851,32 @@ struct server_context_impl { profile_accept_lap(profile_accept_update_us); } + // When reduced verify failed, we have no logits or argmax output. + // Skip all speculative token processing (KV rollback, accept, output) + // but still reset the draft model's state so the next cycle is consistent. + if (dflash_reduced_verify_recovery) { + if (params_base.speculative.type() == COMMON_SPECULATIVE_TYPE_DFLASH) { + llama_dflash_set_active_slot(ctx_tgt, slot.id); + common_speculative_accept(slot.get_spec(), 0); + } + // Clean up draft tokens from KV cache and prompt so the next cycle + // starts from a consistent position. The draft was already decoded + // into the KV cache, but we have no logits to verify against. + { + auto * mem = llama_get_memory(ctx_tgt); + llama_memory_seq_rm(mem, slot.id, slot.n_pos_before_draft, -1); + if (ctx_dft) { + auto * mem_dft = llama_get_memory(ctx_dft.get()); + llama_memory_seq_rm(mem_dft, slot.id, slot.n_pos_before_draft, -1); + } + // Remove draft tokens + target token from prompt + slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - (size_t) slot.spec_draft.size() - 1); + } + dflash_skip_tg_slot_for_next_cycle(slot); + slot.has_draft_backup = false; + continue; + } + GGML_ASSERT(slot.remaining_generation_budget(params_base) == -1 || (int32_t) ids.size() <= slot.remaining_generation_budget(params_base)); @@ -7044,12 +7128,29 @@ struct server_context_impl { llama_memory_seq_rm(mem, slot.id, slot.prompt.tokens.pos_next(), -1); } else { llama_clear_tree_parent_ids(ctx_tgt); - llama_dflash_rollback(ctx_tgt, slot.id, seq_backup, slot.n_pos_before_draft, n_hidden_keep); + const int n_reeval = llama_dflash_rollback(ctx_tgt, slot.id, seq_backup, slot.n_pos_before_draft, n_hidden_keep); if (n_slots_drafted > 1) { // Multi-slot accept can immediately roll back another seq; make this // seq's async recurrent replay visible before the next mutation. llama_tape_replay_sync(ctx_tgt); } + // When tape replay was unavailable (e.g., Vulkan), re-decode + // the accepted positions to advance the recurrent state. + // logits=false: we only need the state advance, not the output. + if (n_reeval > 0) { + llama_batch batch_reeval = llama_batch_init(n_reeval, 0, 1); + for (int j = 0; j < n_reeval; ++j) { + const llama_pos pos = slot.n_pos_before_draft + j; + common_batch_add(batch_reeval, slot.prompt.tokens[slot.n_tokens_before_draft + j], pos, { slot.id }, false); + } + const int ret_reeval = llama_decode(ctx_tgt, batch_reeval); + llama_batch_free(batch_reeval); + if (ret_reeval != 0) { + SLT_WRN(slot, "re-decode of %d accepted tokens failed (ret=%d), " + "recurrent state may be inconsistent\n", + n_reeval, ret_reeval); + } + } } } else { auto * mem = llama_get_memory(ctx_tgt);