Neverdecel · Neverdecel · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import logging
+import time
 from typing import TYPE_CHECKING, Dict, List, Optional
 
 from coderag.config import Config
@@ -32,7 +33,17 @@ def __init__(
         self.store = store
         self.reranker = reranker
 
-    def search(self, query: str, top_k: int) -> List[SearchHit]:
+    def search(
+        self,
+        query: str,
+        top_k: int,
+        *,
+        timings: Optional[Dict[str, float]] = None,
+    ) -> List[SearchHit]:
+        """Hybrid search. Pass a ``timings`` dict to receive a per-phase latency
+        breakdown (``embed_ms``/``dense_ms``/``lexical_ms``/``hydrate_ms``/``rerank_ms``)
+        in milliseconds; a key is set only for a phase that actually ran (no reranker,
+        no ``rerank_ms``). Used by the demo UI to show where retrieval time goes."""
         if not query or not query.strip():
             return []
 
@@ -42,16 +53,26 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
             pool = max(self.config.rerank_candidates, top_k)
         fetch_k = max(self.config.fetch_k, pool)
 
-        # Dense retrieval (vector ANN over the store).
+        # Dense retrieval (vector ANN over the store). The query embedding is the model
+        # inference; on a busy/throttled host it can dwarf the store ops, so time it apart.
+        t0 = time.perf_counter()
         qvec = self.provider.embed_query(query)
+        if timings is not None:
+            timings["embed_ms"] = (time.perf_counter() - t0) * 1000.0
+        t0 = time.perf_counter()
         dense = self.store.vector_search(qvec, fetch_k)
+        if timings is not None:
+            timings["dense_ms"] = (time.perf_counter() - t0) * 1000.0
         similarity: Dict[int, float] = {
             cid: float(max(0.0, min(1.0, s))) for cid, s in dense
         }
         dense_ranked = [cid for cid, _ in dense]
 
         # Lexical retrieval (BM25 over the store).
+        t0 = time.perf_counter()
         lexical_ranked = [cid for cid, _ in self.store.lexical_search(query, fetch_k)]
+        if timings is not None:
+            timings["lexical_ms"] = (time.perf_counter() - t0) * 1000.0
 
         # Fuse, then trim to the candidate pool (top_k, or deeper when reranking).
         # Weights may adapt to the query type (dense-up for NL, BM25-up for identifiers).
@@ -79,7 +100,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
             return []
 
         ids = [cid for cid, _ in fused]
+        t0 = time.perf_counter()
         rows = self.store.hydrate(ids)
+        if timings is not None:
+            timings["hydrate_ms"] = (time.perf_counter() - t0) * 1000.0
 
         hits: List[SearchHit] = []
         for cid, score in fused:
@@ -102,7 +126,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
             )
 
         if self.reranker is not None:
+            t0 = time.perf_counter()
             hits = self._rerank(query, hits)
+            if timings is not None:
+                timings["rerank_ms"] = (time.perf_counter() - t0) * 1000.0
         return hits[:top_k]
 
     def _graph_neighbors(

diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py
@@ -146,6 +146,11 @@ def run_server(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8000) -> None
             "can reach this port. Set CODERAG_API_KEY to require authentication.",
             host,
         )
-    # Warm the index/provider so the first request isn't slow.
-    cr.status()
+    # Warm the index/provider AND the embedding model (loads the model + JITs the query
+    # path) so the first request isn't slow — matches the UI. status() alone builds the
+    # store/provider but never embeds, leaving the first query to pay the cold model load.
+    try:
+        cr.warm()
+    except Exception:  # pragma: no cover - warm-up is best-effort
+        logger.exception("HTTP API warm-up failed (continuing).")
     uvicorn.run(create_app(cr), host=host, port=port)
diff --git a/coderag/surfaces/static/app.css b/coderag/surfaces/static/app.css
@@ -317,6 +317,7 @@ fieldset.field legend { padding: 0 0.3rem; color: var(--ink-3); font-family: var
 /* demo-only: show how fast local retrieval was (separate from the AI answer) */
 .speed-badge { margin-left: auto; display: inline-flex; align-items: baseline; gap: 0.5rem; font-family: var(--mono); font-variant-numeric: tabular-nums; }
 .speed-badge .speed-ms { font-size: 0.78rem; font-weight: 700; color: var(--accent-strong); background: var(--accent-soft); border-radius: 999px; padding: 0.1rem 0.55rem; white-space: nowrap; }
+.speed-badge .speed-split { font-size: 0.72rem; color: var(--ink-3); font-variant-numeric: tabular-nums; }
 .speed-badge .speed-corpus { font-size: 0.72rem; color: var(--ink-3); }
 .results { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 0.9rem; }
 .hit {

diff --git a/coderag/surfaces/templates/index.html b/coderag/surfaces/templates/index.html
@@ -66,8 +66,9 @@
       <h2>{{ hits | length }} result{{ '' if hits | length == 1 else 's' }}</h2>
       {% if demo and search_ms is defined %}
       <span class="speed-badge" data-search-ms="{{ '%.0f' | format(search_ms) }}"
-            title="Local hybrid retrieval time — separate from the optional AI answer">
+            title="Local hybrid retrieval time — separate from the optional AI answer.{% if embed_ms is defined %} Breakdown: model embedding {{ '%.0f' | format(embed_ms) }} ms + store (vector + BM25 + hydrate) {{ '%.0f' | format(store_ms) }} ms.{% endif %}">
         <span class="speed-ms">⚡ {{ '%.0f' | format(search_ms) }} ms</span>
+        {% if embed_ms is defined %}<span class="speed-split">embed {{ '%.0f' | format(embed_ms) }} · store {{ '%.0f' | format(store_ms) }} ms</span>{% endif %}
         {% if status %}<span class="speed-corpus">over {{ status.total_chunks }} chunks · {{ status.total_files }} files</span>{% endif %}
       </span>
       {% endif %}

diff --git a/coderag/surfaces/webui.py b/coderag/surfaces/webui.py
@@ -193,20 +193,22 @@ def _run_search(
     langs: List[str],
     kinds: List[str],
     path: Optional[str],
-) -> Tuple[List[SearchHit], float]:
+) -> Tuple[List[SearchHit], float, Dict[str, float]]:
     """Search, then post-filter. Fetches extra candidates when filters are active.
 
     ``search`` has no server-side filtering, so to keep filtered results useful we pull a
     larger candidate set and narrow it down to ``k`` here. Also returns the wall-clock
     retrieval time in milliseconds — timed around the ``.search()`` call only (not
-    filtering or highlighting) — so the demo UI can show how fast the index answers.
+    filtering or highlighting) — plus a per-phase ``timings`` breakdown (embed vs store),
+    so the demo UI can show how fast the index answers and where the time goes.
     """
     filtering = bool(langs or kinds or path)
     fetch = max(k, 50) if filtering else k
+    timings: Dict[str, float] = {}
     t0 = time.perf_counter()
-    hits = _searcher_for(cr, dense, lexical).search(query, fetch)
+    hits = _searcher_for(cr, dense, lexical).search(query, fetch, timings=timings)
     elapsed_ms = (time.perf_counter() - t0) * 1000.0
-    return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms
+    return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms, timings
 
 
 # --- app factory ---
@@ -382,7 +384,7 @@ def home(
             }
         )
         if q and q.strip():
-            hits, search_ms = _run_search(
+            hits, search_ms, timings = _run_search(
                 cr,
                 q.strip(),
                 k,
@@ -394,6 +396,14 @@ def home(
             )
             ctx["hits"] = _hit_views(hits)
             ctx["search_ms"] = search_ms
+            # Split the badge into embedding (model inference) vs store (vector + BM25 +
+            # hydrate; rerank_ms is folded in too when a reranker ran) so a slow result is
+            # attributable — on a busy host the embedding usually dominates, not LanceDB.
+            ctx["embed_ms"] = timings.get("embed_ms", 0.0)
+            ctx["store_ms"] = sum(
+                timings.get(key, 0.0)
+                for key in ("dense_ms", "lexical_ms", "hydrate_ms", "rerank_ms")
+            )
             ctx["answer_qs"] = urlencode({"q": q.strip(), "k": k})
         resp = templates.TemplateResponse(request, "index.html", ctx)
         if demo: