diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py index 13717ce..b8e7f55 100644 --- a/coderag/retrieval/search.py +++ b/coderag/retrieval/search.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import time from typing import TYPE_CHECKING, Dict, List, Optional from coderag.config import Config @@ -32,7 +33,17 @@ def __init__( self.store = store self.reranker = reranker - def search(self, query: str, top_k: int) -> List[SearchHit]: + def search( + self, + query: str, + top_k: int, + *, + timings: Optional[Dict[str, float]] = None, + ) -> List[SearchHit]: + """Hybrid search. Pass a ``timings`` dict to receive a per-phase latency + breakdown (``embed_ms``/``dense_ms``/``lexical_ms``/``hydrate_ms``/``rerank_ms``) + in milliseconds — used by the demo UI to show where retrieval time actually goes. + """ if not query or not query.strip(): return [] @@ -42,16 +53,26 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: pool = max(self.config.rerank_candidates, top_k) fetch_k = max(self.config.fetch_k, pool) - # Dense retrieval (vector ANN over the store). + # Dense retrieval (vector ANN over the store). The query embedding is the model + # inference; on a busy/throttled host it can dwarf the store ops, so time it apart. + t0 = time.perf_counter() qvec = self.provider.embed_query(query) + if timings is not None: + timings["embed_ms"] = (time.perf_counter() - t0) * 1000.0 + t0 = time.perf_counter() dense = self.store.vector_search(qvec, fetch_k) + if timings is not None: + timings["dense_ms"] = (time.perf_counter() - t0) * 1000.0 similarity: Dict[int, float] = { cid: float(max(0.0, min(1.0, s))) for cid, s in dense } dense_ranked = [cid for cid, _ in dense] # Lexical retrieval (BM25 over the store). 
+ t0 = time.perf_counter() lexical_ranked = [cid for cid, _ in self.store.lexical_search(query, fetch_k)] + if timings is not None: + timings["lexical_ms"] = (time.perf_counter() - t0) * 1000.0 # Fuse, then trim to the candidate pool (top_k, or deeper when reranking). # Weights may adapt to the query type (dense-up for NL, BM25-up for identifiers). @@ -79,7 +100,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: return [] ids = [cid for cid, _ in fused] + t0 = time.perf_counter() rows = self.store.hydrate(ids) + if timings is not None: + timings["hydrate_ms"] = (time.perf_counter() - t0) * 1000.0 hits: List[SearchHit] = [] for cid, score in fused: @@ -102,7 +126,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: ) if self.reranker is not None: + t0 = time.perf_counter() hits = self._rerank(query, hits) + if timings is not None: + timings["rerank_ms"] = (time.perf_counter() - t0) * 1000.0 return hits[:top_k] def _graph_neighbors( diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py index d235901..83f7aef 100644 --- a/coderag/surfaces/http_api.py +++ b/coderag/surfaces/http_api.py @@ -146,6 +146,11 @@ def run_server(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8000) -> None "can reach this port. Set CODERAG_API_KEY to require authentication.", host, ) - # Warm the index/provider so the first request isn't slow. - cr.status() + # Warm the index/provider AND the embedding model (loads the model + JITs the query + # path) so the first request isn't slow — matches the UI. status() alone builds the + # store/provider but never embeds, leaving the first query to pay the cold model load. 
+ try: + cr.warm() + except Exception: # pragma: no cover - warm-up is best-effort + logger.exception("HTTP API warm-up failed (continuing).") uvicorn.run(create_app(cr), host=host, port=port) diff --git a/coderag/surfaces/static/app.css b/coderag/surfaces/static/app.css index e8d4849..7b7f4e0 100644 --- a/coderag/surfaces/static/app.css +++ b/coderag/surfaces/static/app.css @@ -317,6 +317,7 @@ fieldset.field legend { padding: 0 0.3rem; color: var(--ink-3); font-family: var /* demo-only: show how fast local retrieval was (separate from the AI answer) */ .speed-badge { margin-left: auto; display: inline-flex; align-items: baseline; gap: 0.5rem; font-family: var(--mono); font-variant-numeric: tabular-nums; } .speed-badge .speed-ms { font-size: 0.78rem; font-weight: 700; color: var(--accent-strong); background: var(--accent-soft); border-radius: 999px; padding: 0.1rem 0.55rem; white-space: nowrap; } +.speed-badge .speed-split { font-size: 0.72rem; color: var(--ink-3); font-variant-numeric: tabular-nums; } .speed-badge .speed-corpus { font-size: 0.72rem; color: var(--ink-3); } .results { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 0.9rem; } .hit { diff --git a/coderag/surfaces/templates/index.html b/coderag/surfaces/templates/index.html index cb6beb3..dd725e1 100644 --- a/coderag/surfaces/templates/index.html +++ b/coderag/surfaces/templates/index.html @@ -66,8 +66,9 @@

{{ hits | length }} result{{ '' if hits | length == 1 else 's' }}

{% if demo and search_ms is defined %} + title="Local hybrid retrieval time — separate from the optional AI answer.{% if embed_ms is defined %} Breakdown: model embedding {{ '%.0f' | format(embed_ms) }} ms + store (vector + BM25 + hydrate) {{ '%.0f' | format(store_ms) }} ms.{% endif %}"> ⚡ {{ '%.0f' | format(search_ms) }} ms + {% if embed_ms is defined %}embed {{ '%.0f' | format(embed_ms) }} · store {{ '%.0f' | format(store_ms) }} ms{% endif %} {% if status %}over {{ status.total_chunks }} chunks · {{ status.total_files }} files{% endif %} {% endif %} diff --git a/coderag/surfaces/webui.py b/coderag/surfaces/webui.py index fca0641..4190771 100644 --- a/coderag/surfaces/webui.py +++ b/coderag/surfaces/webui.py @@ -193,20 +193,22 @@ def _run_search( langs: List[str], kinds: List[str], path: Optional[str], -) -> Tuple[List[SearchHit], float]: +) -> Tuple[List[SearchHit], float, Dict[str, float]]: """Search, then post-filter. Fetches extra candidates when filters are active. ``search`` has no server-side filtering, so to keep filtered results useful we pull a larger candidate set and narrow it down to ``k`` here. Also returns the wall-clock retrieval time in milliseconds — timed around the ``.search()`` call only (not - filtering or highlighting) — so the demo UI can show how fast the index answers. + filtering or highlighting) — plus a per-phase ``timings`` breakdown (embed vs store), + so the demo UI can show how fast the index answers and where the time goes. 
""" filtering = bool(langs or kinds or path) fetch = max(k, 50) if filtering else k + timings: Dict[str, float] = {} t0 = time.perf_counter() - hits = _searcher_for(cr, dense, lexical).search(query, fetch) + hits = _searcher_for(cr, dense, lexical).search(query, fetch, timings=timings) elapsed_ms = (time.perf_counter() - t0) * 1000.0 - return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms + return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms, timings # --- app factory --- @@ -382,7 +384,7 @@ def home( } ) if q and q.strip(): - hits, search_ms = _run_search( + hits, search_ms, timings = _run_search( cr, q.strip(), k, @@ -394,6 +396,14 @@ def home( ) ctx["hits"] = _hit_views(hits) ctx["search_ms"] = search_ms + # Split the badge into embedding (model inference) vs store (vector + BM25 + + # hydrate, plus rerank when a reranker is configured) so a slow result is + # attributable — on a busy host the embedding usually dominates, not the LanceDB retrieval. + ctx["embed_ms"] = timings.get("embed_ms", 0.0) + ctx["store_ms"] = sum( + timings.get(key, 0.0) + for key in ("dense_ms", "lexical_ms", "hydrate_ms", "rerank_ms") + ) ctx["answer_qs"] = urlencode({"q": q.strip(), "k": k}) resp = templates.TemplateResponse(request, "index.html", ctx) if demo: