diff --git a/coderag/retrieval/search.py b/coderag/retrieval/search.py index 13717ce..b8e7f55 100644 --- a/coderag/retrieval/search.py +++ b/coderag/retrieval/search.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import time from typing import TYPE_CHECKING, Dict, List, Optional from coderag.config import Config @@ -32,7 +33,17 @@ def __init__( self.store = store self.reranker = reranker - def search(self, query: str, top_k: int) -> List[SearchHit]: + def search( + self, + query: str, + top_k: int, + *, + timings: Optional[Dict[str, float]] = None, + ) -> List[SearchHit]: + """Hybrid search. Pass a ``timings`` dict to receive a per-phase latency + breakdown (``embed_ms``/``dense_ms``/``lexical_ms``/``hydrate_ms``/``rerank_ms``) + in milliseconds — used by the demo UI to show where retrieval time actually goes. + """ if not query or not query.strip(): return [] @@ -42,16 +53,26 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: pool = max(self.config.rerank_candidates, top_k) fetch_k = max(self.config.fetch_k, pool) - # Dense retrieval (vector ANN over the store). + # Dense retrieval (vector ANN over the store). The query embedding is the model + # inference; on a busy/throttled host it can dwarf the store ops, so time it apart. + t0 = time.perf_counter() qvec = self.provider.embed_query(query) + if timings is not None: + timings["embed_ms"] = (time.perf_counter() - t0) * 1000.0 + t0 = time.perf_counter() dense = self.store.vector_search(qvec, fetch_k) + if timings is not None: + timings["dense_ms"] = (time.perf_counter() - t0) * 1000.0 similarity: Dict[int, float] = { cid: float(max(0.0, min(1.0, s))) for cid, s in dense } dense_ranked = [cid for cid, _ in dense] # Lexical retrieval (BM25 over the store). + t0 = time.perf_counter() lexical_ranked = [cid for cid, _ in self.store.lexical_search(query, fetch_k)] + if timings is not None: + timings["lexical_ms"] = (time.perf_counter() - t0) * 1000.0 # Fuse, then trim to the candidate pool (top_k, or deeper when reranking). # Weights may adapt to the query type (dense-up for NL, BM25-up for identifiers). @@ -79,7 +100,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: return [] ids = [cid for cid, _ in fused] + t0 = time.perf_counter() rows = self.store.hydrate(ids) + if timings is not None: + timings["hydrate_ms"] = (time.perf_counter() - t0) * 1000.0 hits: List[SearchHit] = [] for cid, score in fused: @@ -102,7 +126,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]: ) if self.reranker is not None: + t0 = time.perf_counter() hits = self._rerank(query, hits) + if timings is not None: + timings["rerank_ms"] = (time.perf_counter() - t0) * 1000.0 return hits[:top_k] def _graph_neighbors( diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py index d235901..83f7aef 100644 --- a/coderag/surfaces/http_api.py +++ b/coderag/surfaces/http_api.py @@ -146,6 +146,11 @@ def run_server(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8000) -> None "can reach this port. Set CODERAG_API_KEY to require authentication.", host, ) - # Warm the index/provider so the first request isn't slow. - cr.status() + # Warm the index/provider AND the embedding model (loads the model + JITs the query + # path) so the first request isn't slow — matches the UI. status() alone builds the + # store/provider but never embeds, leaving the first query to pay the cold model load. + try: + cr.warm() + except Exception: # pragma: no cover - warm-up is best-effort + logger.exception("HTTP API warm-up failed (continuing).") uvicorn.run(create_app(cr), host=host, port=port) diff --git a/coderag/surfaces/static/app.css b/coderag/surfaces/static/app.css index e8d4849..7b7f4e0 100644 --- a/coderag/surfaces/static/app.css +++ b/coderag/surfaces/static/app.css @@ -317,6 +317,7 @@ fieldset.field legend { padding: 0 0.3rem; color: var(--ink-3); font-family: var /* demo-only: show how fast local retrieval was (separate from the AI answer) */ .speed-badge { margin-left: auto; display: inline-flex; align-items: baseline; gap: 0.5rem; font-family: var(--mono); font-variant-numeric: tabular-nums; } .speed-badge .speed-ms { font-size: 0.78rem; font-weight: 700; color: var(--accent-strong); background: var(--accent-soft); border-radius: 999px; padding: 0.1rem 0.55rem; white-space: nowrap; } +.speed-badge .speed-split { font-size: 0.72rem; color: var(--ink-3); font-variant-numeric: tabular-nums; } .speed-badge .speed-corpus { font-size: 0.72rem; color: var(--ink-3); } .results { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 0.9rem; } .hit { diff --git a/coderag/surfaces/templates/index.html b/coderag/surfaces/templates/index.html index cb6beb3..dd725e1 100644 --- a/coderag/surfaces/templates/index.html +++ b/coderag/surfaces/templates/index.html @@ -66,8 +66,9 @@