Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions coderag/retrieval/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import logging
import time
from typing import TYPE_CHECKING, Dict, List, Optional

from coderag.config import Config
Expand Down Expand Up @@ -32,7 +33,17 @@ def __init__(
self.store = store
self.reranker = reranker

def search(self, query: str, top_k: int) -> List[SearchHit]:
def search(
self,
query: str,
top_k: int,
*,
timings: Optional[Dict[str, float]] = None,
) -> List[SearchHit]:
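"""Hybrid search. Pass a ``timings`` dict to receive a per-phase latency
breakdown (``embed_ms``/``dense_ms``/``lexical_ms``/``hydrate_ms``/``rerank_ms``)
in milliseconds — used by the demo UI to show where retrieval time actually goes.
A phase that does not run (e.g. no reranker configured, or an early return) leaves
its key unset, so read the dict with ``.get()``.
"""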
if not query or not query.strip():
return []

Expand All @@ -42,16 +53,26 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
pool = max(self.config.rerank_candidates, top_k)
fetch_k = max(self.config.fetch_k, pool)

# Dense retrieval (vector ANN over the store).
# Dense retrieval (vector ANN over the store). The query embedding is the model
# inference; on a busy/throttled host it can dwarf the store ops, so time it apart.
t0 = time.perf_counter()
qvec = self.provider.embed_query(query)
if timings is not None:
timings["embed_ms"] = (time.perf_counter() - t0) * 1000.0
t0 = time.perf_counter()
dense = self.store.vector_search(qvec, fetch_k)
if timings is not None:
timings["dense_ms"] = (time.perf_counter() - t0) * 1000.0
similarity: Dict[int, float] = {
cid: float(max(0.0, min(1.0, s))) for cid, s in dense
}
dense_ranked = [cid for cid, _ in dense]

# Lexical retrieval (BM25 over the store).
t0 = time.perf_counter()
lexical_ranked = [cid for cid, _ in self.store.lexical_search(query, fetch_k)]
if timings is not None:
timings["lexical_ms"] = (time.perf_counter() - t0) * 1000.0

# Fuse, then trim to the candidate pool (top_k, or deeper when reranking).
# Weights may adapt to the query type (dense-up for NL, BM25-up for identifiers).
Expand Down Expand Up @@ -79,7 +100,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
return []

ids = [cid for cid, _ in fused]
t0 = time.perf_counter()
rows = self.store.hydrate(ids)
if timings is not None:
timings["hydrate_ms"] = (time.perf_counter() - t0) * 1000.0

hits: List[SearchHit] = []
for cid, score in fused:
Expand All @@ -102,7 +126,10 @@ def search(self, query: str, top_k: int) -> List[SearchHit]:
)

if self.reranker is not None:
t0 = time.perf_counter()
hits = self._rerank(query, hits)
if timings is not None:
timings["rerank_ms"] = (time.perf_counter() - t0) * 1000.0
return hits[:top_k]

def _graph_neighbors(
Expand Down
9 changes: 7 additions & 2 deletions coderag/surfaces/http_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ def run_server(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8000) -> None
"can reach this port. Set CODERAG_API_KEY to require authentication.",
host,
)
# Warm the index/provider so the first request isn't slow.
cr.status()
# Warm the index/provider AND the embedding model (loads the model + JITs the query
# path) so the first request isn't slow — matches the UI. status() alone builds the
# store/provider but never embeds, leaving the first query to pay the cold model load.
try:
cr.warm()
except Exception: # pragma: no cover - warm-up is best-effort
logger.exception("HTTP API warm-up failed (continuing).")
uvicorn.run(create_app(cr), host=host, port=port)
1 change: 1 addition & 0 deletions coderag/surfaces/static/app.css
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ fieldset.field legend { padding: 0 0.3rem; color: var(--ink-3); font-family: var
/* demo-only: show how fast local retrieval was (separate from the AI answer) */
.speed-badge { margin-left: auto; display: inline-flex; align-items: baseline; gap: 0.5rem; font-family: var(--mono); font-variant-numeric: tabular-nums; }
.speed-badge .speed-ms { font-size: 0.78rem; font-weight: 700; color: var(--accent-strong); background: var(--accent-soft); border-radius: 999px; padding: 0.1rem 0.55rem; white-space: nowrap; }
.speed-badge .speed-split { font-size: 0.72rem; color: var(--ink-3); font-variant-numeric: tabular-nums; }
.speed-badge .speed-corpus { font-size: 0.72rem; color: var(--ink-3); }
.results { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 0.9rem; }
.hit {
Expand Down
3 changes: 2 additions & 1 deletion coderag/surfaces/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@
<h2>{{ hits | length }} result{{ '' if hits | length == 1 else 's' }}</h2>
{% if demo and search_ms is defined %}
<span class="speed-badge" data-search-ms="{{ '%.0f' | format(search_ms) }}"
title="Local hybrid retrieval time — separate from the optional AI answer">
title="Local hybrid retrieval time — separate from the optional AI answer.{% if embed_ms is defined %} Breakdown: model embedding {{ '%.0f' | format(embed_ms) }} ms + store (vector + BM25 + hydrate) {{ '%.0f' | format(store_ms) }} ms.{% endif %}">
<span class="speed-ms">⚡ {{ '%.0f' | format(search_ms) }} ms</span>
{% if embed_ms is defined %}<span class="speed-split">embed {{ '%.0f' | format(embed_ms) }} · store {{ '%.0f' | format(store_ms) }} ms</span>{% endif %}
{% if status %}<span class="speed-corpus">over {{ status.total_chunks }} chunks · {{ status.total_files }} files</span>{% endif %}
</span>
{% endif %}
Expand Down
20 changes: 15 additions & 5 deletions coderag/surfaces/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,20 +193,22 @@ def _run_search(
langs: List[str],
kinds: List[str],
path: Optional[str],
) -> Tuple[List[SearchHit], float]:
) -> Tuple[List[SearchHit], float, Dict[str, float]]:
"""Search, then post-filter. Fetches extra candidates when filters are active.

``search`` has no server-side filtering, so to keep filtered results useful we pull a
larger candidate set and narrow it down to ``k`` here. Also returns the wall-clock
retrieval time in milliseconds — timed around the ``.search()`` call only (not
filtering or highlighting) — so the demo UI can show how fast the index answers.
filtering or highlighting) — plus a per-phase ``timings`` breakdown (embed vs store),
so the demo UI can show how fast the index answers and where the time goes.
"""
filtering = bool(langs or kinds or path)
fetch = max(k, 50) if filtering else k
timings: Dict[str, float] = {}
t0 = time.perf_counter()
hits = _searcher_for(cr, dense, lexical).search(query, fetch)
hits = _searcher_for(cr, dense, lexical).search(query, fetch, timings=timings)
elapsed_ms = (time.perf_counter() - t0) * 1000.0
return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms
return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms, timings


# --- app factory ---
Expand Down Expand Up @@ -382,7 +384,7 @@ def home(
}
)
if q and q.strip():
hits, search_ms = _run_search(
hits, search_ms, timings = _run_search(
cr,
q.strip(),
k,
Expand All @@ -394,6 +396,14 @@ def home(
)
ctx["hits"] = _hit_views(hits)
ctx["search_ms"] = search_ms
# Split the badge into embedding (model inference) vs store (vector + BM25 +
# hydrate, plus rerank when a reranker is configured) so a slow result is
# attributable — on a busy host the embedding usually dominates, not the
# LanceDB retrieval.
ctx["embed_ms"] = timings.get("embed_ms", 0.0)
ctx["store_ms"] = sum(
timings.get(key, 0.0)
for key in ("dense_ms", "lexical_ms", "hydrate_ms", "rerank_ms")
)
ctx["answer_qs"] = urlencode({"q": q.strip(), "k": k})
resp = templates.TemplateResponse(request, "index.html", ctx)
if demo:
Expand Down
Loading