diff --git a/.dockerignore b/.dockerignore index f0c86b7c..8bccea2d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,8 @@ -.git -**/node_modules -**/npm-debug.log **/.env* **/.next +**/node_modules +**/npm-debug.log +.worktrees +.git +target +tmp diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 56c34954..0c1c728c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,14 +5,10 @@ updates: schedule: interval: 'daily' time: '00:00' - labels: - - 'bot' - - 'dependencies' + labels: [] - package-ecosystem: 'github-actions' directory: '/' schedule: interval: 'daily' time: '00:00' - labels: - - 'bot' - - 'dependencies' + labels: [] diff --git a/.github/fixtures/trace_gate/config.toml b/.github/fixtures/trace_gate/config.toml new file mode 100644 index 00000000..9768c018 --- /dev/null +++ b/.github/fixtures/trace_gate/config.toml @@ -0,0 +1,186 @@ +[service] +admin_bind = "127.0.0.1:0" +http_bind = "127.0.0.1:0" +log_level = "info" +mcp_bind = "127.0.0.1:0" + +[storage.postgres] +dsn = "postgres://postgres:postgres@127.0.0.1:5432/elf" +pool_max_conns = 5 + +[storage.qdrant] +collection = "ci_trace_gate" +docs_collection = "ci_trace_gate_docs" +url = "http://127.0.0.1:6334" +vector_dim = 4 + +[providers.embedding] +api_base = "http://127.0.0.1" +api_key = "ci" +default_headers = {} +dimensions = 4 +model = "disabled" +path = "/embeddings" +provider_id = "ci" +timeout_ms = 1_000 + +[providers.rerank] +api_base = "http://127.0.0.1" +api_key = "ci" +default_headers = {} +model = "disabled" +path = "/rerank" +provider_id = "ci" +timeout_ms = 1_000 + +[providers.llm_extractor] +api_base = "http://127.0.0.1" +api_key = "ci" +default_headers = {} +model = "disabled" +path = "/chat/completions" +provider_id = "ci" +temperature = 0.0 +timeout_ms = 1_000 + +[scopes] +allowed = ["agent_private"] + +[scopes.read_profiles] +all_scopes = ["agent_private"] +private_only = ["agent_private"] +private_plus_project = ["agent_private"] + +[scopes.precedence] +agent_private = 10 +org_shared = 0 +project_shared = 0 + +[scopes.write_allowed] +agent_private = true +org_shared = false +project_shared = false + +[memory] +candidate_k = 10 +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = 3 +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.0 +min_importance = 0.0 + +[chunking] +enabled = true +max_tokens = 256 +overlap_tokens = 64 +tokenizer_repo = "gpt2" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "off" + +[search.dynamic] +min_candidates = 1 +min_top_score = 0.0 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = false +expansion_ttl_days = 7 +max_payload_bytes = 262_144 +rerank_ttl_days = 7 + +[search.explain] +candidate_retention_days = 1 +capture_candidates = false +retention_days = 2 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 0.0 +tie_breaker_weight = 0.0 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.0 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.0 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.0 + +[ranking.blend] +enabled = false +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.5 + +[ranking.diversity] +enabled = false +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 0.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_keys = [] +auth_mode = "off" +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true diff --git a/.github/fixtures/trace_gate/fixture.sql b/.github/fixtures/trace_gate/fixture.sql new file mode 100644 index 00000000..5f457c38 --- /dev/null +++ b/.github/fixtures/trace_gate/fixture.sql @@ -0,0 +1,769 @@ +BEGIN; + +INSERT INTO search_traces ( + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at, + expires_at +) +VALUES + ( + '11111111-1111-1111-1111-111111111111', + 't', + 'p', + 'a', + 'private_only', + 'alpha trace gate query', + 'off', + '[]'::jsonb, + '["agent_private"]'::jsonb, + 5, + 3, + '{}'::jsonb, + 3, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + ( + '22222222-2222-2222-2222-222222222222', + 't', + 'p', + 'a', + 'private_only', + 'beta trace gate query', + 'off', + '[]'::jsonb, + '["agent_private"]'::jsonb, + 5, + 3, + '{}'::jsonb, + 3, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + ( + '33333333-3333-3333-3333-333333333333', + 't', + 'p', + 'a', + 'private_only', + 'gamma trace gate query', + 'off', + '[]'::jsonb, + '["agent_private"]'::jsonb, + 6, + 3, + '{}'::jsonb, + 3, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + '44444444-4444-4444-4444-444444444444', + 't', + 'p', + 'a', + 'private_only', + 'delta trace gate query', + 'off', + '[]'::jsonb, + '["agent_private"]'::jsonb, + 6, + 3, + '{}'::jsonb, + 3, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + '55555555-5555-5555-5555-555555555555', + 't', + 'p', + 'a', + 'private_only', + 'epsilon trace gate query', + 'off', + '[]'::jsonb, + '["agent_private"]'::jsonb, + 5, + 3, + '{}'::jsonb, + 3, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ); + +INSERT INTO search_trace_candidates ( + candidate_id, + trace_id, + note_id, + chunk_id, + chunk_index, + snippet, + candidate_snapshot, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at, + created_at, + expires_at +) +VALUES + -- Trace 1 + ( + 'aaaaaaaa-0000-0000-0000-000000000001', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111111', + 'aaaaaaaa-2222-2222-2222-222222222221', + 0, + 'alpha candidate 1', + '{}'::jsonb, + 1, + 0.90, + 'agent_private', + 0.50, + '2026-01-31T00:00:00Z'::timestamptz, + 10, + NULL, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + ( + 'aaaaaaaa-0000-0000-0000-000000000002', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111112', + 'aaaaaaaa-2222-2222-2222-222222222222', + 0, + 'alpha candidate 2', + '{}'::jsonb, + 2, + 0.80, + 'agent_private', + 0.40, + '2026-01-31T00:00:00Z'::timestamptz, + 9, + NULL, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + ( + 'aaaaaaaa-0000-0000-0000-000000000003', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111113', + 'aaaaaaaa-2222-2222-2222-222222222223', + 0, + 'alpha candidate 3', + '{}'::jsonb, + 3, + 0.70, + 'agent_private', + 0.30, + '2026-01-31T00:00:00Z'::timestamptz, + 8, + NULL, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + ( + 'aaaaaaaa-0000-0000-0000-000000000004', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111114', + 'aaaaaaaa-2222-2222-2222-222222222224', + 0, + 'alpha candidate 4', + '{}'::jsonb, + 4, + 0.10, + 'agent_private', + 0.20, + '2026-01-31T00:00:00Z'::timestamptz, + 7, + NULL, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + ( + 'aaaaaaaa-0000-0000-0000-000000000005', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111115', + 'aaaaaaaa-2222-2222-2222-222222222225', + 0, + 'alpha candidate 5', + '{}'::jsonb, + 5, + 0.05, + 'agent_private', + 0.10, + '2026-01-31T00:00:00Z'::timestamptz, + 6, + NULL, + '2026-02-01T00:00:00Z'::timestamptz, + '2027-02-01T00:00:00Z'::timestamptz + ), + -- Trace 2 + ( + 'bbbbbbbb-0000-0000-0000-000000000001', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111111', + 'bbbbbbbb-2222-2222-2222-222222222221', + 0, + 'beta candidate 1', + '{}'::jsonb, + 1, + 0.95, + 'agent_private', + 0.50, + '2026-02-01T00:00:00Z'::timestamptz, + 10, + NULL, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + ( + 'bbbbbbbb-0000-0000-0000-000000000002', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111112', + 'bbbbbbbb-2222-2222-2222-222222222222', + 0, + 'beta candidate 2', + '{}'::jsonb, + 2, + 0.60, + 'agent_private', + 0.40, + '2026-02-01T00:00:00Z'::timestamptz, + 9, + NULL, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + ( + 'bbbbbbbb-0000-0000-0000-000000000003', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111113', + 'bbbbbbbb-2222-2222-2222-222222222223', + 0, + 'beta candidate 3', + '{}'::jsonb, + 3, + 0.50, + 'agent_private', + 0.30, + '2026-02-01T00:00:00Z'::timestamptz, + 8, + NULL, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + ( + 'bbbbbbbb-0000-0000-0000-000000000004', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111114', + 'bbbbbbbb-2222-2222-2222-222222222224', + 0, + 'beta candidate 4', + '{}'::jsonb, + 4, + 0.20, + 'agent_private', + 0.20, + '2026-02-01T00:00:00Z'::timestamptz, + 7, + NULL, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + ( + 'bbbbbbbb-0000-0000-0000-000000000005', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111115', + 'bbbbbbbb-2222-2222-2222-222222222225', + 0, + 'beta candidate 5', + '{}'::jsonb, + 5, + 0.10, + 'agent_private', + 0.10, + '2026-02-01T00:00:00Z'::timestamptz, + 6, + NULL, + '2026-02-02T00:00:00Z'::timestamptz, + '2027-02-02T00:00:00Z'::timestamptz + ), + -- Trace 3 + ( + 'eeeeeeee-0000-0000-0000-000000000001', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111111', + 'eeeeeeee-2222-2222-2222-222222222221', + 0, + 'gamma candidate 1', + '{"note_id":"eeeeeeee-1111-1111-1111-111111111111","chunk_id":"eeeeeeee-2222-2222-2222-222222222221","chunk_index":0,"snippet":"gamma candidate 1","retrieval_rank":1,"rerank_score":0.95,"note_scope":"agent_private","note_importance":0.60,"note_updated_at":"2026-02-02T00:00:00Z","note_hit_count":11,"note_last_hit_at":"2026-02-02T12:00:00Z"}'::jsonb, + 1, + 0.95, + 'agent_private', + 0.60, + '2026-02-02T00:00:00Z'::timestamptz, + 11, + '2026-02-02T12:00:00Z'::timestamptz, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + 'eeeeeeee-0000-0000-0000-000000000002', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111112', + 'eeeeeeee-2222-2222-2222-222222222222', + 0, + 'gamma candidate 2', + '{}'::jsonb, + 2, + 0.85, + 'agent_private', + 0.50, + '2026-02-02T00:00:00Z'::timestamptz, + 10, + NULL, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + 'eeeeeeee-0000-0000-0000-000000000003', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111113', + 'eeeeeeee-2222-2222-2222-222222222223', + 0, + 'gamma candidate 3', + '{}'::jsonb, + 3, + 0.75, + 'agent_private', + 0.40, + '2026-02-02T00:00:00Z'::timestamptz, + 9, + NULL, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + 'eeeeeeee-0000-0000-0000-000000000004', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111114', + 'eeeeeeee-2222-2222-2222-222222222224', + 0, + 'gamma candidate 4', + '{}'::jsonb, + 4, + 0.65, + 'agent_private', + 0.30, + '2026-02-02T00:00:00Z'::timestamptz, + 8, + NULL, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + 'eeeeeeee-0000-0000-0000-000000000005', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111115', + 'eeeeeeee-2222-2222-2222-222222222225', + 0, + 'gamma candidate 5', + '{}'::jsonb, + 5, + 0.55, + 'agent_private', + 0.20, + '2026-02-02T00:00:00Z'::timestamptz, + 7, + NULL, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + ( + 'eeeeeeee-0000-0000-0000-000000000006', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111116', + 'eeeeeeee-2222-2222-2222-222222222226', + 0, + 'gamma candidate 6', + '{}'::jsonb, + 6, + 0.45, + 'agent_private', + 0.10, + '2026-02-02T00:00:00Z'::timestamptz, + 6, + NULL, + '2026-02-03T00:00:00Z'::timestamptz, + '2027-02-03T00:00:00Z'::timestamptz + ), + -- Trace 4 + ( + 'ffffffff-0000-0000-0000-000000000001', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111111', + 'ffffffff-2222-2222-2222-222222222221', + 1, + 'delta candidate 1', + '{}'::jsonb, + 1, + 0.92, + 'agent_private', + 0.55, + '2026-02-03T00:00:00Z'::timestamptz, + 10, + '2026-02-03T12:00:00Z'::timestamptz, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + 'ffffffff-0000-0000-0000-000000000002', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111112', + 'ffffffff-2222-2222-2222-222222222222', + 1, + 'delta candidate 2', + '{}'::jsonb, + 2, + 0.82, + 'agent_private', + 0.45, + '2026-02-03T00:00:00Z'::timestamptz, + 9, + NULL, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + 'ffffffff-0000-0000-0000-000000000003', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111113', + 'ffffffff-2222-2222-2222-222222222223', + 1, + 'delta candidate 3', + '{}'::jsonb, + 3, + 0.72, + 'agent_private', + 0.35, + '2026-02-03T00:00:00Z'::timestamptz, + 8, + NULL, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + 'ffffffff-0000-0000-0000-000000000004', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111114', + 'ffffffff-2222-2222-2222-222222222224', + 1, + 'delta candidate 4', + '{}'::jsonb, + 4, + 0.62, + 'agent_private', + 0.25, + '2026-02-03T00:00:00Z'::timestamptz, + 7, + NULL, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + 'ffffffff-0000-0000-0000-000000000005', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111115', + 'ffffffff-2222-2222-2222-222222222225', + 1, + 'delta candidate 5', + '{}'::jsonb, + 5, + 0.52, + 'agent_private', + 0.15, + '2026-02-03T00:00:00Z'::timestamptz, + 6, + NULL, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + ( + 'ffffffff-0000-0000-0000-000000000006', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111116', + 'ffffffff-2222-2222-2222-222222222226', + 1, + 'delta candidate 6', + '{}'::jsonb, + 6, + 0.42, + 'agent_private', + 0.05, + '2026-02-03T00:00:00Z'::timestamptz, + 5, + NULL, + '2026-02-04T00:00:00Z'::timestamptz, + '2027-02-04T00:00:00Z'::timestamptz + ), + -- Trace 5 + ( + '55555555-0000-0000-0000-000000000001', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111111', + '55555555-2222-2222-2222-222222222221', + 0, + 'epsilon candidate 1', + '{}'::jsonb, + 1, + 0.82, + 'agent_private', + 0.55, + '2026-02-04T00:00:00Z'::timestamptz, + 10, + '2026-02-04T12:00:00Z'::timestamptz, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ), + ( + '55555555-0000-0000-0000-000000000002', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111112', + '55555555-2222-2222-2222-222222222222', + 0, + 'epsilon candidate 2', + '{"note_id":"55555555-1111-1111-1111-111111111112","chunk_id":"55555555-2222-2222-2222-222222222222","chunk_index":0,"snippet":"epsilon candidate 2","retrieval_rank":2,"rerank_score":0.72,"note_scope":"agent_private","note_importance":0.45,"note_updated_at":"2026-02-04T00:00:00Z","note_hit_count":9,"note_last_hit_at":null}'::jsonb, + 2, + 0.72, + 'agent_private', + 0.45, + '2026-02-04T00:00:00Z'::timestamptz, + 9, + NULL, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ), + ( + '55555555-0000-0000-0000-000000000003', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111113', + '55555555-2222-2222-2222-222222222223', + 0, + 'epsilon candidate 3', + '{}'::jsonb, + 3, + 0.92, + 'agent_private', + 0.35, + '2026-02-04T00:00:00Z'::timestamptz, + 8, + NULL, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ), + ( + '55555555-0000-0000-0000-000000000004', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111114', + '55555555-2222-2222-2222-222222222224', + 0, + 'epsilon candidate 4', + '{}'::jsonb, + 4, + 0.62, + 'agent_private', + 0.25, + '2026-02-04T00:00:00Z'::timestamptz, + 7, + NULL, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ), + ( + '55555555-0000-0000-0000-000000000005', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111115', + '55555555-2222-2222-2222-222222222225', + 0, + 'epsilon candidate 5', + '{}'::jsonb, + 5, + 0.52, + 'agent_private', + 0.15, + '2026-02-04T00:00:00Z'::timestamptz, + 6, + NULL, + '2026-02-05T00:00:00Z'::timestamptz, + '2027-02-05T00:00:00Z'::timestamptz + ); + +INSERT INTO search_trace_items ( + item_id, + trace_id, + note_id, + chunk_id, + rank, + final_score, + explain +) +VALUES + -- Trace 1 baseline top_k = 3 (ordered by rerank_score desc) + ( + 'cccccccc-0000-0000-0000-000000000001', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111111', + 'aaaaaaaa-2222-2222-2222-222222222221', + 1, + 1.00, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":1.0,"terms":[]}}'::jsonb + ), + ( + 'cccccccc-0000-0000-0000-000000000002', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111112', + 'aaaaaaaa-2222-2222-2222-222222222222', + 2, + 0.80, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.8,"terms":[]}}'::jsonb + ), + ( + 'cccccccc-0000-0000-0000-000000000003', + '11111111-1111-1111-1111-111111111111', + 'aaaaaaaa-1111-1111-1111-111111111113', + 'aaaaaaaa-2222-2222-2222-222222222223', + 3, + 0.60, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.6,"terms":[]}}'::jsonb + ), + -- Trace 2 baseline top_k = 3 (ordered by rerank_score desc) + ( + 'dddddddd-0000-0000-0000-000000000001', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111111', + 'bbbbbbbb-2222-2222-2222-222222222221', + 1, + 1.00, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":1.0,"terms":[]}}'::jsonb + ), + ( + 'dddddddd-0000-0000-0000-000000000002', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111112', + 'bbbbbbbb-2222-2222-2222-222222222222', + 2, + 0.75, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.75,"terms":[]}}'::jsonb + ), + ( + 'dddddddd-0000-0000-0000-000000000003', + '22222222-2222-2222-2222-222222222222', + 'bbbbbbbb-1111-1111-1111-111111111113', + 'bbbbbbbb-2222-2222-2222-222222222223', + 3, + 0.60, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.6,"terms":[]}}'::jsonb + ), + -- Trace 3 baseline top_k = 3 (ordered by rerank_score desc) + ( + 'eeeeeeee-9999-0000-0000-000000000001', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111111', + 'eeeeeeee-2222-2222-2222-222222222221', + 1, + 1.00, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":1.0,"terms":[]}}'::jsonb + ), + ( + 'eeeeeeee-9999-0000-0000-000000000002', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111112', + 'eeeeeeee-2222-2222-2222-222222222222', + 2, + 0.85, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.85,"terms":[]}}'::jsonb + ), + ( + 'eeeeeeee-9999-0000-0000-000000000003', + '33333333-3333-3333-3333-333333333333', + 'eeeeeeee-1111-1111-1111-111111111113', + 'eeeeeeee-2222-2222-2222-222222222223', + 3, + 0.75, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.75,"terms":[]}}'::jsonb + ), + -- Trace 4 baseline top_k = 3 (ordered by rerank_score desc) + ( + 'ffffffff-9999-0000-0000-000000000001', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111111', + 'ffffffff-2222-2222-2222-222222222221', + 1, + 1.00, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":1.0,"terms":[]}}'::jsonb + ), + ( + 'ffffffff-9999-0000-0000-000000000002', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111112', + 'ffffffff-2222-2222-2222-222222222222', + 2, + 0.82, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.82,"terms":[]}}'::jsonb + ), + ( + 'ffffffff-9999-0000-0000-000000000003', + '44444444-4444-4444-4444-444444444444', + 'ffffffff-1111-1111-1111-111111111113', + 'ffffffff-2222-2222-2222-222222222223', + 3, + 0.72, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.72,"terms":[]}}'::jsonb + ), + -- Trace 5 baseline top_k = 3 (ordered by rerank_score desc) + ( + '55555555-9999-0000-0000-000000000001', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111113', + '55555555-2222-2222-2222-222222222223', + 1, + 1.00, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":1.0,"terms":[]}}'::jsonb + ), + ( + '55555555-9999-0000-0000-000000000002', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111111', + '55555555-2222-2222-2222-222222222221', + 2, + 0.82, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.82,"terms":[]}}'::jsonb + ), + ( + '55555555-9999-0000-0000-000000000003', + '55555555-5555-5555-5555-555555555555', + '55555555-1111-1111-1111-111111111112', + '55555555-2222-2222-2222-222222222222', + 3, + 0.72, + '{"match":{"matched_terms":[],"matched_fields":[]},"ranking":{"schema":"search_ranking_explain/v2","policy_id":"baseline","final_score":0.72,"terms":[]}}'::jsonb + ); + +COMMIT; diff --git a/.github/fixtures/trace_gate/gate.json b/.github/fixtures/trace_gate/gate.json new file mode 100644 index 00000000..42250ab9 --- /dev/null +++ b/.github/fixtures/trace_gate/gate.json @@ -0,0 +1,16 @@ +{ + "defaults": { + "max_positional_churn_at_k": 0.0, + "max_set_churn_at_k": 0.0, + "min_retrieval_top_rank_retention": 1.0 + }, + "top_k": 3, + "retrieval_retention_rank": 3, + "traces": [ + { "trace_id": "11111111-1111-1111-1111-111111111111" }, + { "trace_id": "22222222-2222-2222-2222-222222222222" }, + { "trace_id": "33333333-3333-3333-3333-333333333333" }, + { "trace_id": "44444444-4444-4444-4444-444444444444" }, + { "trace_id": "55555555-5555-5555-5555-555555555555" } + ] +} diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..96d0589d --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,137 @@ +name: E2E Harness (Context Misranking) + +permissions: + contents: read + +on: + push: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + pull_request: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + merge_group: + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + e2e: + name: Run E2E harness + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + ELF_PG_DSN: postgres://postgres:postgres@127.0.0.1:5432/postgres + ELF_QDRANT_HTTP_URL: http://127.0.0.1:6333 + ELF_QDRANT_GRPC_URL: http://127.0.0.1:6334 + ELF_QDRANT_URL: http://127.0.0.1:6334 + ELF_HARNESS_RUN_ID: gha-${{ github.run_id }} + ELF_HARNESS_VECTOR_DIM: 256 + RUST_BACKTRACE: full + + services: + postgres: + image: pgvector/pgvector:pg18 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + POSTGRES_DB: postgres + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U postgres -d postgres" + --health-interval 10s + --health-timeout 5s + --health-retries 10 + qdrant: + image: qdrant/qdrant:v1.16.3 + ports: + - 6333:6333 + - 6334:6334 + + steps: + - name: Fetch latest code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 + + - name: Set up Rust toolchain + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 + with: + cache: true + rustflags: "" + + - name: Install OS tools (psql, jq) + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends postgresql-client jq + + - name: Install taplo + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: taplo + + - name: Install cargo-make + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: cargo-make + + - name: Wait for Postgres + run: | + for i in {1..60}; do + pg_isready -h 127.0.0.1 -p 5432 -U postgres -d postgres >/dev/null && exit 0 + sleep 1 + done + echo "Postgres did not become ready in time." + exit 1 + + - name: Wait for Qdrant + run: | + for i in {1..60}; do + curl -sSf http://127.0.0.1:6333/collections >/dev/null && exit 0 + sleep 1 + done + echo "Qdrant did not become ready in time." + exit 1 + + - name: Run context misranking harness + run: | + mkdir -p tmp + cargo make test-e2e + + - name: Upload harness outputs + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: e2e-context-misranking-${{ github.run_id }} + if-no-files-found: warn + retention-days: 14 + path: | + tmp/elf.harness.out.base.json + tmp/elf.harness.out.context.json + + - name: Upload harness logs (on failure) + if: failure() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: e2e-context-misranking-${{ github.run_id }}-logs + if-no-files-found: warn + retention-days: 7 + path: | + tmp/elf.harness.worker.log + tmp/elf.harness.api.log + tmp/elf.harness.base.toml + tmp/elf.harness.context.toml + tmp/elf.harness.dataset.json diff --git a/.github/workflows/external-memory-pattern-radar.yml b/.github/workflows/external-memory-pattern-radar.yml new file mode 100644 index 00000000..a0be11af --- /dev/null +++ b/.github/workflows/external-memory-pattern-radar.yml @@ -0,0 +1,47 @@ +name: External Memory Pattern Radar + +permissions: + contents: read + +on: + workflow_dispatch: + schedule: + # Weekly on Wednesday at 04:20 UTC. + - cron: "20 4 * * 3" + +concurrency: + group: external-memory-pattern-radar + cancel-in-progress: true + +jobs: + radar: + name: Run read-only radar artifact refresh + runs-on: ubuntu-latest + steps: + - name: Fetch latest code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 + + - name: Set up Rust toolchain + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 + with: + cache: true + rustflags: "" + + - name: Install cargo-make + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: cargo-make + + - name: Run radar artifact refresh + run: cargo make external-memory-radar-artifact + + - name: Upload radar artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: external-memory-pattern-radar-${{ github.run_id }} + if-no-files-found: error + retention-days: 30 + path: | + tmp/external-memory-pattern-radar/cursor.json + tmp/external-memory-pattern-radar/latest.md diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 0a739641..3c2dab8c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -4,10 +4,33 @@ permissions: contents: read on: + push: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + pull_request: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + merge_group: + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" workflow_dispatch: schedule: # Daily at 00:00 UTC. Manual runs use workflow_dispatch. - - cron: "0 0 * * *" + - cron: '0 0 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: integration: @@ -15,11 +38,13 @@ jobs: runs-on: ubuntu-latest env: ELF_PG_DSN: postgres://postgres:postgres@127.0.0.1:5432/postgres + ELF_QDRANT_HTTP_URL: http://127.0.0.1:6333 + ELF_QDRANT_GRPC_URL: http://127.0.0.1:6334 ELF_QDRANT_URL: http://127.0.0.1:6334 RUST_BACKTRACE: full services: postgres: - image: postgres:16 + image: pgvector/pgvector:pg18 env: POSTGRES_PASSWORD: postgres POSTGRES_USER: postgres @@ -38,19 +63,24 @@ jobs: - 6334:6334 steps: - name: Fetch latest code - uses: actions/checkout@v6 + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 - name: Set up Rust toolchain - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 with: cache: true rustflags: '' - name: Install nextest - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 with: tool: nextest + - name: Install cargo-make + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: cargo-make + - name: Wait for Qdrant run: | for i in {1..30}; do @@ -61,4 +91,4 @@ jobs: exit 1 - name: Run integration tests - run: cargo nextest run --workspace --all-targets --all-features --run-ignored=only + run: cargo make test-rust-all diff --git a/.github/workflows/language.yml b/.github/workflows/language.yml index 2ef7b5bb..7a40f346 100644 --- a/.github/workflows/language.yml +++ b/.github/workflows/language.yml @@ -6,13 +6,15 @@ permissions: on: push: - branches: [main] + branches: + - main paths-ignore: - '**/*.md' - '.gitignore' - 'docs/**' pull_request: - branches: [main] + branches: + - main paths-ignore: - '**/*.md' - '.gitignore' @@ -28,15 +30,15 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: - rust: - name: Rust checks + repo: + name: Repository checks runs-on: ubuntu-latest steps: - name: Fetch latest code - uses: actions/checkout@v6 + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 - name: Set up Rust toolchain - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 with: cache: true rustflags: '' @@ -46,28 +48,34 @@ jobs: run: rustup toolchain install nightly --component rustfmt - name: Install cargo-make - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 with: tool: cargo-make - - name: Run lint - run: cargo make lint-rust + - name: Install vibe-style (latest release) + run: | + set -euo pipefail + VERSION="$(curl -fsSL https://api.github.com/repos/hack-ink/vibe-style/releases/latest | grep -oE '"tag_name": "v[^"]+"' | cut -d'"' -f4)" + TARGET="x86_64-unknown-linux-gnu" + ASSET="vibe-style-${TARGET}-${VERSION}.tgz" - - name: Run Rust format checks - run: cargo make fmt-rust-check + curl -fsSLO "https://github.com/hack-ink/vibe-style/releases/download/${VERSION}/${ASSET}" + tar -xzf "${ASSET}" - - name: Install taplo - uses: taiki-e/install-action@v2 - with: - tool: taplo - - - name: Run TOML format checks - run: cargo make fmt-toml-check + mkdir -p "$HOME/.cargo/bin" + install -m 0755 "vibe-style-${TARGET}-${VERSION}/vstyle" "$HOME/.cargo/bin/vstyle" + install -m 0755 "vibe-style-${TARGET}-${VERSION}/cargo-vstyle" "$HOME/.cargo/bin/cargo-vstyle" + echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - name: Install nextest - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 with: tool: nextest - - name: Run tests - run: cargo make test-rust + - name: Install taplo + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: taplo + + - name: Run repository checks + run: cargo make check diff --git a/.github/workflows/nightly-harness-signals.yml b/.github/workflows/nightly-harness-signals.yml new file mode 100644 index 00000000..d7ebe21a --- /dev/null +++ b/.github/workflows/nightly-harness-signals.yml @@ -0,0 +1,127 @@ +name: Nightly Harness Signals + +permissions: + contents: read + +on: + workflow_dispatch: + schedule: + # Nightly at 02:30 UTC. + - cron: "30 2 * * *" + +concurrency: + group: nightly-harness-signals + cancel-in-progress: true + +jobs: + harness: + name: Run harness scripts + runs-on: ubuntu-latest + timeout-minutes: 60 + env: + ELF_PG_DSN: postgres://postgres:postgres@127.0.0.1:5432/postgres + ELF_QDRANT_HTTP_URL: http://127.0.0.1:6333 + ELF_QDRANT_GRPC_URL: http://127.0.0.1:6334 + ELF_HARNESS_RUN_ID: gha-${{ github.run_id }} + ELF_HARNESS_VECTOR_DIM: 256 + RUST_BACKTRACE: full + + services: + postgres: + image: pgvector/pgvector:pg18 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + POSTGRES_DB: postgres + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U postgres -d postgres" + --health-interval 10s + --health-timeout 5s + --health-retries 10 + qdrant: + image: qdrant/qdrant:v1.16.3 + ports: + - 6333:6333 + - 6334:6334 + + steps: + - name: Fetch latest code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 + + - name: Set up Rust toolchain + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 + with: + cache: true + rustflags: "" + + - name: Install OS tools (psql, jq) + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends postgresql-client jq + + - name: Install taplo + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: taplo + + - name: Wait for Postgres + run: | + for i in {1..60}; do + pg_isready -h 127.0.0.1 -p 5432 -U postgres -d postgres >/dev/null && exit 0 + sleep 1 + done + echo "Postgres did not become ready in time." + exit 1 + + - name: Wait for Qdrant + run: | + for i in {1..60}; do + curl -sSf http://127.0.0.1:6333/collections >/dev/null && exit 0 + sleep 1 + done + echo "Qdrant did not become ready in time." + exit 1 + + - name: Run context misranking harness + run: | + mkdir -p tmp + bash scripts/context-misranking-harness.sh + + - name: Run ranking stability harness + run: | + mkdir -p tmp + bash scripts/ranking-stability-harness.sh + + - name: Upload harness outputs + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: nightly-harness-signals-${{ github.run_id }} + if-no-files-found: warn + retention-days: 14 + path: | + tmp/elf.harness.out.base.json + tmp/elf.harness.out.context.json + tmp/elf.stability.out.json + + - name: Upload harness logs (on failure) + if: failure() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: nightly-harness-signals-${{ github.run_id }}-logs + if-no-files-found: warn + retention-days: 7 + path: | + tmp/elf.harness.worker.log + tmp/elf.harness.api.log + tmp/elf.stability.worker.log + tmp/elf.stability.api.log + tmp/elf.harness.base.toml + tmp/elf.harness.context.toml + tmp/elf.stability.base.toml + tmp/elf.stability.det.toml + tmp/elf.stability.dataset.json + tmp/elf.stability.worker.log + tmp/elf.stability.api.log diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml new file mode 100644 index 00000000..16bace80 --- /dev/null +++ b/.github/workflows/quality.yml @@ -0,0 +1,91 @@ +name: Quality Gates + +permissions: + contents: read + +on: + push: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + pull_request: + branches: + - main + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + merge_group: + paths-ignore: + - "**/*.md" + - ".gitignore" + - "docs/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + trace-regression-gate: + name: Trace regression gate + runs-on: ubuntu-latest + env: + PG_DSN: postgres://postgres:postgres@127.0.0.1:5432/elf + RUST_BACKTRACE: full + services: + postgres: + image: pgvector/pgvector:pg18 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_USER: postgres + POSTGRES_DB: elf + ports: + - 5432:5432 + options: >- + --health-cmd "pg_isready -U postgres -d elf" + --health-interval 10s + --health-timeout 5s + --health-retries 10 + steps: + - name: Fetch latest code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 + + - name: Set up Rust toolchain + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 + with: + cache: true + rustflags: '' + + - name: Install cargo-make + uses: taiki-e/install-action@b8cecb83565409bcc297b2df6e77f030b2a468d5 + with: + tool: cargo-make + + - name: Install Postgres client + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends postgresql-client + + - name: Wait for Postgres + run: | + for i in {1..30}; do + pg_isready -h 127.0.0.1 -p 5432 -U postgres -d elf >/dev/null && exit 0 + sleep 1 + done + echo "Postgres did not become ready in time." + exit 1 + + - name: Run trace regression gate + run: TRACE_GATE_REPORT_PATH=trace_gate.report.json cargo make check-trace-gate + + - name: Upload trace gate report + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a + with: + name: trace_gate_report + path: trace_gate.report.json + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0514f324..603d3a7e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,6 +2,7 @@ name: Release permissions: contents: write + discussions: write env: CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse @@ -12,7 +13,7 @@ env: on: push: tags: - - "v[0-9]+.[0-9]+.[0-9]+" + - 'v[0-9]+.[0-9]+.[0-9]+' concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -32,10 +33,10 @@ jobs: ] steps: - name: Fetch latest code - uses: actions/checkout@v6 + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 - name: Set up Rust toolchain - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@46268bd060767258de96ed93c1251119784f2ab6 with: cache: true components: rustfmt, clippy @@ -44,31 +45,43 @@ jobs: run: rustup target add ${{ matrix.target.name }} - name: Build - run: cargo build --profile final-release --locked --target ${{ matrix.target.name }} + run: cargo build --release --locked --target ${{ matrix.target.name }} -p elf-api -p elf-worker -p elf-mcp -p elf-eval - - name: Pack (macOS) - if: matrix.target.os == 'macos-latest' + - name: Pack (macOS, Linux) + if: matrix.target.os != 'windows-latest' run: | - mv target/${{ matrix.target.name }}/final-release/vibe-mono . - zip vibe-mono-${{ matrix.target.name }}.zip vibe-mono + mkdir -p dist + for bin in elf-api elf-worker elf-mcp elf-eval; do + cp "target/${{ matrix.target.name }}/release/${bin}" dist/ + done - - name: Pack (Windows) - if: matrix.target.os == 'windows-latest' + - name: Archive (macOS) + if: matrix.target.os == 'macos-latest' run: | - mv target/${{ matrix.target.name }}/final-release/vibe-mono.exe . - Compress-Archive -Path vibe-mono.exe -DestinationPath vibe-mono-${{ matrix.target.name }}.zip + cd dist + zip "../elf-${{ matrix.target.name }}.zip" ./* - - name: Pack (Linux) + - name: Archive (Linux) if: matrix.target.os == 'ubuntu-latest' run: | - mv target/${{ matrix.target.name }}/final-release/vibe-mono . - tar -czvf vibe-mono-${{ matrix.target.name }}.tar.gz vibe-mono + tar -czvf "elf-${{ matrix.target.name }}.tar.gz" -C dist . + + - name: Pack (Windows) + if: matrix.target.os == 'windows-latest' + shell: pwsh + run: | + New-Item -ItemType Directory -Force dist | Out-Null + Copy-Item "target/${{ matrix.target.name }}/release/elf-api.exe" dist/ + Copy-Item "target/${{ matrix.target.name }}/release/elf-worker.exe" dist/ + Copy-Item "target/${{ matrix.target.name }}/release/elf-mcp.exe" dist/ + Copy-Item "target/${{ matrix.target.name }}/release/elf-eval.exe" dist/ + Compress-Archive -Path dist/* -DestinationPath "elf-${{ matrix.target.name }}.zip" - name: Upload artifact - uses: actions/upload-artifact@v6 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a with: - name: vibe-mono-${{ matrix.target.name }} - path: vibe-mono-${{ matrix.target.name }}.* + name: elf-${{ matrix.target.name }} + path: elf-${{ matrix.target.name }}.* retention-days: 1 # release: @@ -87,40 +100,40 @@ jobs: needs: [build] steps: - name: Download artifacts - uses: actions/download-artifact@v7 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c - name: Hash run: | mkdir -p artifacts - mv vibe-mono-*/* artifacts/ + mv -- elf-*/* artifacts/ cd artifacts - sha256sum * | tee ../SHA256 - md5sum * | tee ../MD5 + sha256sum -- * | tee ../SHA256 + md5sum -- * | tee ../MD5 mv ../SHA256 . mv ../MD5 . - name: Publish - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda with: discussion_category_name: Announcements generate_release_notes: true files: artifacts/* - publish-on-crates-io: - name: Publish on crates.io - runs-on: ubuntu-latest - steps: - - name: Fetch latest code - uses: actions/checkout@v6 - - - name: Set up Rust toolchain - uses: actions-rust-lang/setup-rust-toolchain@v1 - with: - cache: true - components: rustfmt, clippy - - - name: Login - run: cargo login ${{ secrets.CARGO_REGISTRY_TOKEN }} - - - name: Publish - run: cargo publish --locked + # publish-on-crates-io: + # name: Publish on crates.io + # runs-on: ubuntu-latest + # steps: + # - name: Fetch latest code + # uses: actions/checkout@v4 + # + # - name: Set up Rust toolchain + # uses: actions-rust-lang/setup-rust-toolchain@v1 + # with: + # cache: true + # components: rustfmt, clippy + # + # - name: Login + # run: cargo login ${{ secrets.CARGO_REGISTRY_TOKEN }} + # + # - name: Publish + # run: cargo publish --locked diff --git a/.gitignore b/.gitignore index 83bfb29f..cd4bbf10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,29 @@ # AI +.agent .codex -.specify -.worktrees # Editor .vscode .zed +# General Ignores +*.bak +*.log +.env* +.turbo +/backups/ +/elf.toml +/elf.*.toml +!/elf.example.toml +model +tmp + +# Kubernetes +.kube + # Language Specifics ## JavaScript/TypeScript -!.yarn/patches -!.yarn/plugins -!.yarn/releases -!.yarn/versions *.tsbuildinfo .next .pnp @@ -22,6 +32,10 @@ .vercel .vite .yarn/* +!.yarn/patches +!.yarn/plugins +!.yarn/releases +!.yarn/versions build coverage dist @@ -45,16 +59,9 @@ target .build xcuserdata -# General Ignores -*.bak -*.log -.env* -.turbo -model -tmp - -# Kubernetes -.kube - # System .DS_Store + +# Work Dirs +.workspaces +.worktrees diff --git a/.prettierignore b/.prettierignore index 2298e2e5..f4f8122c 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,4 +1,4 @@ -node_modules +.next .pnp .pnp.* .yarn/* @@ -6,16 +6,16 @@ node_modules !.yarn/plugins !.yarn/releases !.yarn/versions -dist -build -.next -out -coverage -apps/ui/src/ui/Svg -apps/ui/public -apps/ui/**/*.ttf -apps/ui/**/*.png +apps/ui/**/*.gif apps/ui/**/*.jpg +apps/ui/**/*.png apps/ui/**/*.svg -apps/ui/**/*.gif +apps/ui/**/*.ttf apps/ui/README.md +apps/ui/public +apps/ui/src/ui/Svg +build +coverage +dist +node_modules +out diff --git a/.prettierrc b/.prettierrc index 4cbc711c..c9ae50d7 100644 --- a/.prettierrc +++ b/.prettierrc @@ -2,6 +2,6 @@ "semi": true, "singleQuote": true, "trailingComma": "all", - "printWidth": 100, + "printWidth": 80, "tabWidth": 2 } diff --git a/.taplo.toml b/.taplo.toml index 3c5cf781..1324aa64 100644 --- a/.taplo.toml +++ b/.taplo.toml @@ -1,6 +1,65 @@ exclude = [ + "**/Makefile.toml", + "*.bak", + "*.log", + "*.tsbuildinfo", + ".build", + ".build/**", + ".codex", + ".codex/**", + ".env*", + ".kube", + ".kube/**", + ".next", + ".next/**", + ".pnp.*", + ".pnpm-debug.log*", + ".pytest_cache", + ".pytest_cache/**", + ".ruff_cache", + ".ruff_cache/**", + ".turbo", + ".turbo/**", + ".venv", + ".venv/**", + ".vercel", + ".vercel/**", + ".vite", + ".vite/**", + ".vscode", + ".vscode/**", + ".workspaces", + ".workspaces/**", ".worktrees", + ".worktrees/**", + ".yarn", + ".yarn/**", + ".zed", + ".zed/**", "Makefile.toml", + "__pycache__", + "__pycache__/**", + "build", + "build/**", + "coverage", + "coverage/**", + "dist", + "dist/**", + "model", + "model/**", + "node_modules", + "node_modules/**", + "npm-debug.log*", + "out", + "out/**", + "target", + "target/**", + "tmp", + "tmp/**", + "xcuserdata", + "xcuserdata/**", + "yarn-debug.log*", + "yarn-error.log*", ] [formatting] diff --git a/AGENTS.md b/AGENTS.md index 31e35edf..e6b30502 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,136 +1,14 @@ -# AGENTS.md — Repository Rules for Automated Agents +# AGENTS.md — Repository-Specific Rules for Automated Agents -These instructions define repository-specific **execution rules**, **scope limits**, **language requirements**, -and **hard prohibitions** for automated agents operating in this repository. - -They supplement the global agent rules and override local patterns when conflicting with any rule below. - ---- - -# 0. Prime Directives - -- **Strict compliance:** Follow every rule in this document exactly. -- **Scope lock:** Modify only what is strictly necessary for the explicit user request. - -If unrelated issues are noticed: - -1. Do not modify them. -2. Finish the requested task. -3. Optionally list them under _Future suggestions_. - ---- - -## 0.1 Repository Language & Tone Rules (repository content) - -These requirements apply to **repository artifacts** generated or modified by agents, including: - -- Code comments -- Documentation and README content -- Log messages and tracing output -- Error messages, panic text, diagnostics -- User-facing strings stored in the codebase (CLI, UI, HTTP responses) -- Commit messages, summaries, and explanations written into repository files - -They **do not** constrain interactive chat responses outside the repository. For chat, use the language requested or implied by the user (for example, Chinese when the user is speaking Chinese). - -Requirements for repository artifacts: - -- Use **clear, grammatically correct English**. -- Start sentences with a capital letter and end with proper punctuation. -- Avoid slang, shorthand, and mixed languages. -- Avoid ambiguous abbreviations (`u`, `tho`, `w/`, etc.). -- Ignore poor style in surrounding text; follow these rules instead. - -**These language rules override any conflicting rules elsewhere for repository artifacts.** - -### Commit Message Schema - -Commit messages are exempt from the English language and punctuation rules above. -All commit messages must follow the schema below exactly. - -Schema (single line JSON with fixed key order): -`{"schema":"cmsg/1","type":"feat|fix|refactor|docs|chore|build|ci|perf|revert","scope":"global|","summary":"...","intent":"...","impact":"...","breaking":false,"risk":"low|medium|high","refs":[]}` - -Rules: - -- The JSON object must be a single line with no extra whitespace. -- Keys must appear in the exact order shown. -- Only the keys shown are allowed. -- `schema` must be `cmsg/1`. -- `type` must be one of `feat`, `fix`, `refactor`, `docs`, `chore`, `build`, `ci`, `perf`, or `revert`. -- `scope` must be `global` or a lowercase kebab-case component name. -- `summary`, `intent`, and `impact` must be short text without double quotes, backslashes, or newlines. -- `breaking` must be `true` or `false`. -- `risk` must be `low`, `medium`, or `high`. -- `refs` must be an array of strings. Each string must use one of the following forms: `gh:/#`, `pr:`, `doc:`, `url:`. Use an empty array when there are no references. - -Commenting guidance: - -- Avoid redundant comments that restate the code in different words. -- Prefer clear, descriptive names for variables, functions, and types as the primary form of documentation. -- Add comments only when intent, constraints, or trade-offs are not clear from the code and naming. +These instructions define repository-specific execution rules and scope limits for this repository. --- -## 0.2 Conflict Precedence - -If these rules conflict with higher-priority instructions (system, developer, or user), follow the higher-priority instruction and briefly note the conflict in your response. - ---- - -# 1. Execution Model - -Language- or stack-specific execution rules live in `docs/guide/development/languages/`. -Language- or stack-specific rules must be documented under `docs/guide/development/languages/` and linked from `docs/guide/index.md`. - -Run verification commands only when requested or when you need evidence before claiming completion. +## 1. Execution Model ## 1.1 Workspace Automation (cargo make) -- Use `cargo make` tasks from `Makefile.toml` when they are the best fit for the job. -- Treat `Makefile.toml` as the source of truth for task names and behavior. Do not invent task names. -- Preferred tasks for common workflows are listed below. - - Formatting: `cargo make fmt` or `cargo make fmt-check`. - - Linting: `cargo make lint` for full workspace, or `cargo make lint-rust` for Rust-only. - - Tests: `cargo make test` for full workspace, or `cargo make test-rust` for Rust-only. - - SQLx metadata: `scripts/sqlx-prepare.sh`. - - Full validation: `cargo make checks`. - -# 2. Implementation Scope - -- Implement exactly what the user asks. -- Maintain clarity and correctness. -- Add tests only when logically required by the change. -- Allow minimal adjacent edits required for compilation or consistent behavior. - ---- - -# 3. Editing Constraints - -- Prefer `apply_patch` for edits unless generation or scripting is more appropriate. -- Never revert user-made changes. - ---- - -# 4. Hard Prohibitions - -Violating any of these invalidates the output: - -## 4.1 File Boundaries - -Never modify: - -- Generated files, unless they are regenerated by their tooling instead of edited by hand. -- `target/` -- Vendored/third-party code -- Files outside the repository root. -- Treat any file with a “Generated by” or “Do not edit” header, or any file under directories named target/, dist/, build/, gen/, or .next/ as generated. - ---- - -# 5. Language-Specific Rules Reference - -Rust development and style rules live in `docs/guide/development/languages/rust.md`. -These rules apply **only** when editing Rust code and do **not** override -the global behavior and language rules in this file. -Async and runtime safety rules are defined in the language guides. +- `Makefile.toml` is the source of truth for task names and behavior. +- Run `cargo make` from the repository root, and use it whenever an equivalent task exists. +- Run standalone commands only when `Makefile.toml` does not cover the capability or cannot produce the required effect for the current task. +- When task details are needed, inspect `Makefile.toml` directly or run `cargo make --list-all-steps`. diff --git a/Cargo.lock b/Cargo.lock index 1ec16734..f4df4963 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,20 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -43,9 +57,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -58,15 +72,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -93,9 +107,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "arrayref" @@ -159,9 +173,31 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.5.0" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "aws-lc-rs" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] [[package]] name = "axum" @@ -176,8 +212,6 @@ dependencies = [ "http", "http-body", "http-body-util", - "hyper", - "hyper-util", "itoa", "matchit 0.7.3", "memchr", @@ -186,22 +220,17 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", "sync_wrapper", - "tokio", "tower 0.5.3", "tower-layer", "tower-service", - "tracing", ] [[package]] name = "axum" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" dependencies = [ "axum-core 0.5.6", "bytes", @@ -248,7 +277,6 @@ dependencies = [ "sync_wrapper", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -297,33 +325,27 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "base64ct" -version = "1.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" - [[package]] name = "bitflags" -version = "2.10.0" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" dependencies = [ "serde_core", ] [[package]] name = "blake3" -version = "1.8.3" +version = "1.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "cpufeatures", + "cpufeatures 0.3.0", ] [[package]] @@ -336,63 +358,75 @@ dependencies = [ ] [[package]] -name = "bumpalo" -version = "3.19.1" +name = "block-buffer" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" +dependencies = [ + "hybrid-array", +] [[package]] -name = "byteorder" -version = "1.5.0" +name = "bon" +version = "3.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +checksum = "a602c73c7b0148ec6d12af6fd5cc7a46e2eacc8878271a999abac56eed12f561" +dependencies = [ + "bon-macros", + "rustversion", +] [[package]] -name = "bytes" -version = "1.11.0" +name = "bon-macros" +version = "3.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "6dee98b0db6a962de883bf5d20362dee4d7ca0d12fe39a7c6c73c844e1cd7c1f" +dependencies = [ + "darling 0.23.0", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn", +] [[package]] -name = "camino" -version = "1.2.2" +name = "bumpalo" +version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" -dependencies = [ - "serde_core", -] +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" [[package]] -name = "cargo-platform" -version = "0.3.2" +name = "byteorder" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87a0c0e6148f11f01f32650a2ea02d532b2ad4e81d8bd41e6e565b5adc5e6082" -dependencies = [ - "serde", - "serde_core", -] +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] -name = "cargo_metadata" -version = "0.23.1" +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "castaway" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef987d17b0a113becdd19d3d0022d04d7ef41f9efe4f3fb63ac44ba61df3ade9" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" dependencies = [ - "camino", - "cargo-platform", - "semver", - "serde", - "serde_json", - "thiserror 2.0.18", + "rustversion", ] [[package]] name = "cc" -version = "1.2.55" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -408,25 +442,34 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + [[package]] name = "chrono" -version = "0.4.43" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", - "js-sys", "num-traits", "serde", - "wasm-bindgen", "windows-link", ] [[package]] name = "clap" -version = "4.5.56" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75ca66430e33a14957acc24c5077b503e7d374151b2b4b3a10c83b4ceb4be0e" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -434,9 +477,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.56" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793207c7fa6300a0608d1080b858e5fdbe713cdc1c8db9fb17777d8a13e63df0" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -446,9 +489,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.55" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -458,9 +501,24 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.7" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "cmake" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + +[[package]] +name = "cmov" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9ea0ac24bc397ab3c98583a3c9ba74fa56b09a4449bbe172b9b1ddb016027a" [[package]] name = "color-eyre" @@ -491,9 +549,34 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "compact_str" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] [[package]] name = "concurrent-queue" @@ -518,10 +601,16 @@ dependencies = [ ] [[package]] -name = "const-oid" -version = "0.9.6" +name = "console" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" +dependencies = [ + "encode_unicode", + "libc", + "unicode-width", + "windows-sys 0.61.2", +] [[package]] name = "constant_time_eq" @@ -529,16 +618,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation" version = "0.10.1" @@ -564,6 +643,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "3.4.0" @@ -575,9 +663,9 @@ dependencies = [ [[package]] name = "crc-catalog" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" [[package]] name = "crc32fast" @@ -624,14 +712,38 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crypto-common" -version = "0.1.7" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", +] + +[[package]] +name = "daachorse" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2" + [[package]] name = "darling" version = "0.20.11" @@ -702,23 +814,20 @@ dependencies = [ ] [[package]] -name = "der" -version = "0.7.10" +name = "dary_heap" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" dependencies = [ - "const-oid", - "pem-rfc7468", - "zeroize", + "serde", ] [[package]] name = "deranged" -version = "0.5.5" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ - "powerfmt", "serde_core", ] @@ -759,38 +868,47 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "const-oid", - "crypto-common", - "subtle", + "block-buffer 0.10.4", + "crypto-common 0.1.6", +] + +[[package]] +name = "digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer 0.12.1", + "crypto-common 0.2.2", + "ctutils", ] [[package]] name = "dirs" -version = "5.0.1" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" dependencies = [ "dirs-sys", ] [[package]] name = "dirs-sys" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -803,6 +921,12 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" @@ -811,39 +935,56 @@ checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "either" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" dependencies = [ "serde", ] +[[package]] +name = "elf" +version = "0.2.0" +dependencies = [ + "clap", + "color-eyre", + "elf-cli", + "reqwest 0.13.4", + "serde_json", + "tokio", +] + [[package]] name = "elf-api" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "axum 0.7.9", + "axum 0.8.9", "clap", "color-eyre", "elf-cli", "elf-config", + "elf-domain", "elf-service", "elf-storage", "elf-testkit", + "qdrant-client", "serde", "serde_json", "sqlx", + "time", "tokio", "tower 0.5.3", "tracing", "tracing-subscriber", + "utoipa", + "utoipa-scalar", "uuid", "vergen-gitcl", ] [[package]] name = "elf-chunking" -version = "0.1.0" +version = "0.2.0" dependencies = [ "tokenizers", "tracing", @@ -852,7 +993,7 @@ dependencies = [ [[package]] name = "elf-cli" -version = "0.1.0" +version = "0.2.0" dependencies = [ "clap", "vergen-gitcl", @@ -860,36 +1001,49 @@ dependencies = [ [[package]] name = "elf-config" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "color-eyre", "serde", "serde_json", + "thiserror 2.0.18", "toml", ] [[package]] name = "elf-domain" -version = "0.1.0" +version = "0.2.0" dependencies = [ "elf-config", "regex", + "serde", "serde_json", "time", + "unicode-normalization", + "unicode-script", + "uuid", + "whatlang", ] [[package]] name = "elf-eval" -version = "0.1.0" +version = "0.2.0" dependencies = [ + "blake3", "clap", "color-eyre", + "elf-chunking", "elf-cli", "elf-config", + "elf-domain", "elf-service", "elf-storage", + "elf-testkit", + "elf-worker", + "reqwest 0.13.4", "serde", "serde_json", + "sqlx", + "time", "tokio", "tracing", "tracing-subscriber", @@ -899,67 +1053,68 @@ dependencies = [ [[package]] name = "elf-mcp" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "axum 0.7.9", + "axum 0.8.9", "clap", "color-eyre", "elf-cli", "elf-config", - "reqwest", + "reqwest 0.13.4", "rmcp", "serde_json", "tokio", + "uuid", "vergen-gitcl", ] [[package]] name = "elf-providers" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "color-eyre", + "blake3", "elf-config", - "reqwest", - "serde", + "reqwest 0.13.4", "serde_json", - "tokio", + "thiserror 2.0.18", ] [[package]] name = "elf-service" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "axum 0.7.9", + "ahash", + "axum 0.8.9", "blake3", - "color-eyre", "elf-chunking", "elf-config", "elf-domain", "elf-providers", "elf-storage", "elf-testkit", + "elf-worker", "qdrant-client", "serde", "serde_json", "sqlx", + "thiserror 2.0.18", "time", "tokenizers", "tokio", "tracing", - "unicode-segmentation", "uuid", ] [[package]] name = "elf-storage" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "color-eyre", "elf-config", "elf-testkit", "qdrant-client", "serde_json", "sqlx", + "thiserror 2.0.18", "time", "tokio", "uuid", @@ -967,29 +1122,32 @@ dependencies = [ [[package]] name = "elf-testkit" -version = "0.1.0" +version = "0.2.0" dependencies = [ - "color-eyre", + "qdrant-client", "sqlx", + "thiserror 2.0.18", "tokio", "uuid", ] [[package]] name = "elf-worker" -version = "0.1.0" +version = "0.2.0" dependencies = [ "clap", "color-eyre", "elf-chunking", "elf-cli", "elf-config", + "elf-domain", "elf-providers", "elf-storage", "qdrant-client", "serde", "serde_json", "sqlx", + "thiserror 2.0.18", "time", "tokio", "tracing", @@ -1005,30 +1163,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] -name = "encoding_rs" -version = "0.8.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "equivalent" -version = "1.0.2" +name = "equivalent" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" -[[package]] -name = "errno" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" -dependencies = [ - "libc", - "windows-sys 0.61.2", -] - [[package]] name = "esaxx-rs" version = "0.1.10" @@ -1040,13 +1179,12 @@ dependencies = [ [[package]] name = "etcetera" -version = "0.8.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96" dependencies = [ "cfg-if", - "home", - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] @@ -1070,12 +1208,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "fastrand" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -1084,9 +1216,9 @@ checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "flate2" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", @@ -1094,9 +1226,9 @@ dependencies = [ [[package]] name = "flume" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +checksum = "5e139bc46ca777eb5efaf62df0ab8cc5fd400866427e56c68b22e414e53bd3be" dependencies = [ "futures-core", "futures-sink", @@ -1116,19 +1248,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" +name = "foldhash" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" [[package]] name = "form_urlencoded" @@ -1139,11 +1262,17 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -1156,9 +1285,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -1166,15 +1295,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ "futures-core", "futures-task", @@ -1194,15 +1323,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", @@ -1211,21 +1340,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -1235,15 +1364,14 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", @@ -1271,11 +1399,25 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", +] + [[package]] name = "gimli" version = "0.32.3" @@ -1284,9 +1426,9 @@ checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" [[package]] name = "h2" -version = "0.4.13" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -1294,7 +1436,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.13.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -1315,7 +1457,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -1323,14 +1465,25 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" [[package]] name = "hashlink" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +checksum = "824e001ac4f3012dd16a264bec811403a67ca9deb6c102fc5049b32c4574b35f" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.1", ] [[package]] @@ -1347,53 +1500,46 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hf-hub" -version = "0.3.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" +checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", - "indicatif", + "http", + "indicatif 0.17.11", + "libc", "log", - "native-tls", - "rand 0.8.5", + "rand 0.9.4", "serde", "serde_json", - "thiserror 1.0.69", + "thiserror 2.0.18", "ureq", + "windows-sys 0.60.2", ] [[package]] name = "hkdf" -version = "0.12.4" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +checksum = "4aaa26c720c68b866f2c96ef5c1264b3e6f473fe5d4ce61cd44bbe913e553018" dependencies = [ "hmac", ] [[package]] name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - -[[package]] -name = "home" -version = "0.5.12" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" dependencies = [ - "windows-sys 0.61.2", + "digest 0.11.3", ] [[package]] name = "http" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -1434,11 +1580,20 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + [[package]] name = "hyper" -version = "1.8.1" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498" dependencies = [ "atomic-waker", "bytes", @@ -1451,7 +1606,6 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -1459,19 +1613,18 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ "http", "hyper", "hyper-util", "rustls", - "rustls-pki-types", "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.5", + "webpki-roots 1.0.7", ] [[package]] @@ -1487,32 +1640,15 @@ dependencies = [ "tower-service", ] -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - "native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - [[package]] name = "hyper-util" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", - "futures-core", "futures-util", "http", "http-body", @@ -1521,12 +1657,10 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", - "system-configuration", + "socket2 0.6.4", "tokio", "tower-service", "tracing", - "windows-registry", ] [[package]] @@ -1555,12 +1689,13 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -1568,9 +1703,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -1581,9 +1716,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -1595,15 +1730,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ "icu_collections", "icu_locale_core", @@ -1615,15 +1750,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", @@ -1634,6 +1769,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -1653,9 +1794,9 @@ dependencies = [ [[package]] name = "idna_adapter" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" dependencies = [ "icu_normalizer", "icu_properties", @@ -1679,12 +1820,14 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.1", + "serde", + "serde_core", ] [[package]] @@ -1693,7 +1836,7 @@ version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ - "console", + "console 0.15.11", "number_prefix", "portable-atomic", "unicode-width", @@ -1701,20 +1844,23 @@ dependencies = [ ] [[package]] -name = "ipnet" -version = "2.11.0" +name = "indicatif" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" +dependencies = [ + "console 0.16.3", + "portable-atomic", + "unicode-width", + "unit-prefix", + "web-time", +] [[package]] -name = "iri-string" -version = "0.7.10" +name = "ipnet" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" -dependencies = [ - "memchr", - "serde", -] +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "is_terminal_polyfill" @@ -1724,44 +1870,86 @@ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" -version = "0.11.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] [[package]] -name = "itertools" -version = "0.12.1" +name = "itoa" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jni" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498" dependencies = [ - "either", + "cfg-if", + "combine", + "jni-macros", + "jni-sys", + "log", + "simd_cesu8", + "thiserror 2.0.18", + "walkdir", + "windows-link", ] [[package]] -name = "itertools" -version = "0.14.0" +name = "jni-macros" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +checksum = "a00109accc170f0bdb141fed3e393c565b6f5e072365c3bd58f5b062591560a3" dependencies = [ - "either", + "proc-macro2", + "quote", + "rustc_version", + "simd_cesu8", + "syn", ] [[package]] -name = "itoa" -version = "1.0.17" +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "jobserver" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] [[package]] name = "js-sys" -version = "0.3.85" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ - "once_cell", + "cfg-if", + "futures-util", "wasm-bindgen", ] @@ -1770,54 +1958,43 @@ name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -dependencies = [ - "spin", -] [[package]] -name = "libc" -version = "0.2.180" +name = "leb128fmt" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] -name = "libm" -version = "0.2.16" +name = "libc" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" dependencies = [ - "bitflags", "libc", - "redox_syscall 0.7.0", ] [[package]] name = "libsqlite3-sys" -version = "0.30.1" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" +checksum = "b1f111c8c41e7c61a49cd34e44c7619462967221a6443b0ec299e0ac30cfb9b1" dependencies = [ "pkg-config", "vcpkg", ] -[[package]] -name = "linux-raw-sys" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" - [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "lock_api" @@ -1830,9 +2007,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.29" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "lru-slab" @@ -1879,19 +2056,19 @@ checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" -version = "0.10.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" dependencies = [ "cfg-if", - "digest", + "digest 0.11.3", ] [[package]] name = "memchr" -version = "2.7.6" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "mime" @@ -1917,9 +2094,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda" dependencies = [ "libc", "wasi", @@ -1948,23 +2125,6 @@ dependencies = [ "syn", ] -[[package]] -name = "native-tls" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" -dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe 0.1.6", - "openssl-sys", - "schannel", - "security-framework 2.11.1", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nom" version = "7.1.3" @@ -1984,47 +2144,11 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num-bigint-dig" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" -dependencies = [ - "lazy_static", - "libm", - "num-integer", - "num-iter", - "num-traits", - "rand 0.8.5", - "smallvec", - "zeroize", -] - [[package]] name = "num-conv" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.45" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] +checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" [[package]] name = "num-traits" @@ -2033,7 +2157,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2062,9 +2185,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -2074,9 +2197,9 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "onig" -version = "6.5.1" +version = "6.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" dependencies = [ "bitflags", "libc", @@ -2086,64 +2209,20 @@ dependencies = [ [[package]] name = "onig_sys" -version = "69.9.1" +version = "69.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7f86c6eef3d6df15f23bcfb6af487cbd2fed4e5581d58d5bf1f5f8b7f6727dc" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" dependencies = [ "cc", "pkg-config", ] -[[package]] -name = "openssl" -version = "0.10.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "openssl-probe" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" - [[package]] name = "openssl-probe" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" -[[package]] -name = "openssl-sys" -version = "0.9.111" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "option-ext" version = "0.2.0" @@ -2152,9 +2231,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "owo-colors" -version = "4.2.3" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" +checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d" [[package]] name = "parking" @@ -2180,7 +2259,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.18", + "redox_syscall", "smallvec", "windows-link", ] @@ -2193,18 +2272,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pastey" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" - -[[package]] -name = "pem-rfc7468" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" -dependencies = [ - "base64ct", -] +checksum = "2ee67f1008b1ba2321834326597b8e186293b049a023cdef258527550b9935b4" [[package]] name = "percent-encoding" @@ -2214,62 +2284,35 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkcs1" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" -dependencies = [ - "der", - "pkcs8", - "spki", + "quote", + "syn", ] [[package]] -name = "pkcs8" -version = "0.10.2" +name = "pin-project-lite" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" -dependencies = [ - "der", - "spki", -] +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pkg-config" -version = "0.3.32" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "portable-atomic" @@ -2279,9 +2322,9 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -2301,6 +2344,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -2327,7 +2380,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools", "proc-macro2", "quote", "syn", @@ -2344,9 +2397,9 @@ dependencies = [ [[package]] name = "qdrant-client" -version = "1.16.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a76499f3e8385dae785d65a0216e0dfa8fadaddd18038adf04f438631683b26a" +checksum = "82cef4e669bcf9c07471463adab5ee080dd9bc9381f3652ea4981f6030b2c309" dependencies = [ "anyhow", "derive_builder", @@ -2355,7 +2408,7 @@ dependencies = [ "parking_lot", "prost", "prost-types", - "reqwest", + "reqwest 0.12.28", "semver", "serde", "serde_json", @@ -2377,7 +2430,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.2", + "socket2 0.6.4", "thiserror 2.0.18", "tokio", "tracing", @@ -2386,14 +2439,15 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.13" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ + "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -2414,16 +2468,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.2", + "socket2 0.6.4", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -2434,11 +2488,17 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", "rand_chacha 0.3.1", @@ -2447,14 +2507,25 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -2493,11 +2564,17 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rayon" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" dependencies = [ "either", "rayon-core", @@ -2505,12 +2582,12 @@ dependencies = [ [[package]] name = "rayon-cond" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools 0.11.0", + "itertools", "rayon", ] @@ -2533,24 +2610,15 @@ dependencies = [ "bitflags", ] -[[package]] -name = "redox_syscall" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" -dependencies = [ - "bitflags", -] - [[package]] name = "redox_users" -version = "0.4.6" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 1.0.69", + "thiserror 2.0.18", ] [[package]] @@ -2575,9 +2643,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.12.2" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -2587,9 +2655,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -2598,9 +2666,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "reqwest" @@ -2610,7 +2678,6 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64 0.22.1", "bytes", - "encoding_rs", "futures-core", "futures-util", "h2", @@ -2619,12 +2686,9 @@ dependencies = [ "http-body-util", "hyper", "hyper-rustls", - "hyper-tls", "hyper-util", "js-sys", "log", - "mime", - "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -2635,7 +2699,6 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-native-tls", "tokio-rustls", "tokio-util", "tower 0.5.3", @@ -2646,7 +2709,45 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.5", + "webpki-roots 1.0.7", +] + +[[package]] +name = "reqwest" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "rustls-platform-verifier", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tower 0.5.3", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", ] [[package]] @@ -2665,12 +2766,11 @@ dependencies = [ [[package]] name = "rmcp" -version = "0.13.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1815dbc06c414d720f8bc1951eccd66bc99efc6376331f1e7093a119b3eb508" +checksum = "0810a9f717d9828f475fe1f629f4c305c8464b7f496c3a854b58d29e65f4058e" dependencies = [ "async-trait", - "axum 0.8.8", "base64 0.22.1", "bytes", "chrono", @@ -2680,7 +2780,7 @@ dependencies = [ "http-body-util", "pastey", "pin-project-lite", - "rand 0.9.2", + "rand 0.10.1", "rmcp-macros", "schemars", "serde", @@ -2697,9 +2797,9 @@ dependencies = [ [[package]] name = "rmcp-macros" -version = "0.13.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f0bc7008fa102e771a76c6d2c9b253be3f2baa5964e060464d038ae1cbc573" +checksum = "6aefac48c364756e97f04c0401ba3231e8607882c7c1d92da0437dc16307904d" dependencies = [ "darling 0.23.0", "proc-macro2", @@ -2708,26 +2808,6 @@ dependencies = [ "syn", ] -[[package]] -name = "rsa" -version = "0.9.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" -dependencies = [ - "const-oid", - "digest", - "num-bigint-dig", - "num-integer", - "num-traits", - "pkcs1", - "pkcs8", - "rand_core 0.6.4", - "signature", - "spki", - "subtle", - "zeroize", -] - [[package]] name = "rustc-demangle" version = "0.1.27" @@ -2736,29 +2816,26 @@ checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] -name = "rustix" -version = "1.1.3" +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.61.2", + "semver", ] [[package]] name = "rustls" -version = "0.23.36" +version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -2770,14 +2847,14 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +checksum = "dab5152771c58876a2146916e53e35057e1a4dfa2b9df0f0305b07f611fdea4d" dependencies = [ - "openssl-probe 0.2.1", + "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.5.1", + "security-framework", ] [[package]] @@ -2791,20 +2868,48 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" dependencies = [ "web-time", "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0" +dependencies = [ + "core-foundation", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" -version = "0.103.9" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -2818,24 +2923,33 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.22" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] [[package]] name = "schannel" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ "windows-sys 0.61.2", ] [[package]] name = "schemars" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e910108742c57a770f492731f99be216a52fadd361b06c8fb59d74ccc267d2" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "chrono", "dyn-clone", @@ -2847,9 +2961,9 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4908ad288c5035a8eb12cfdf0d49270def0a268ee162b75eeee0f85d155a7c45" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" dependencies = [ "proc-macro2", "quote", @@ -2865,25 +2979,12 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" -dependencies = [ - "bitflags", - "core-foundation 0.9.4", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework" -version = "3.5.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", - "core-foundation 0.10.1", + "core-foundation", "core-foundation-sys", "libc", "security-framework-sys", @@ -2891,9 +2992,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -2901,13 +3002,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" -dependencies = [ - "serde", - "serde_core", -] +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -2952,9 +3049,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "itoa", "memchr", @@ -2976,11 +3073,11 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.9" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -2997,13 +3094,13 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.3.0", + "digest 0.11.3", ] [[package]] @@ -3019,8 +3116,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.3", ] [[package]] @@ -3034,25 +3142,31 @@ dependencies = [ [[package]] name = "shlex" -version = "1.3.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" [[package]] -name = "signature" -version = "2.2.0" +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simd_cesu8" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +checksum = "94f90157bb87cddf702797c5dadfa0be7d266cdf49e22da2fcaa32eff75b2c33" dependencies = [ - "digest", - "rand_core 0.6.4", + "rustc_version", + "simdutf8", ] [[package]] -name = "simd-adler32" -version = "0.3.8" +name = "simdutf8" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "slab" @@ -3062,9 +3176,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" dependencies = [ "serde", ] @@ -3081,31 +3195,32 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.2" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] -name = "spin" -version = "0.9.8" +name = "socks" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" dependencies = [ - "lock_api", + "byteorder", + "libc", + "winapi", ] [[package]] -name = "spki" -version = "0.7.3" +name = "spin" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" dependencies = [ - "base64ct", - "der", + "lock_api", ] [[package]] @@ -3122,9 +3237,9 @@ dependencies = [ [[package]] name = "sqlx" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fefb893899429669dcdd979aff487bd78f4064e5e7907e4269081e0ef7d97dc" +checksum = "378620ccc25c62c89d8be1c819e76a88d59bdcc3304733330788948e619bfd71" dependencies = [ "sqlx-core", "sqlx-macros", @@ -3135,12 +3250,13 @@ dependencies = [ [[package]] name = "sqlx-core" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6" +checksum = "05b44e85bf579a8eeb4ceaa77a3a523baf2bf0e9bac7e40f405d537b5d2d5ccb" dependencies = [ "base64 0.22.1", "bytes", + "cfg-if", "crc", "crossbeam-queue", "either", @@ -3149,17 +3265,16 @@ dependencies = [ "futures-intrusive", "futures-io", "futures-util", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "hashlink", - "indexmap 2.13.0", + "indexmap 2.14.0", "log", "memchr", - "once_cell", "percent-encoding", "rustls", "serde", "serde_json", - "sha2", + "sha2 0.10.9", "smallvec", "thiserror 2.0.18", "time", @@ -3168,14 +3283,14 @@ dependencies = [ "tracing", "url", "uuid", - "webpki-roots 0.26.11", + "webpki-roots 1.0.7", ] [[package]] name = "sqlx-macros" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2d452988ccaacfbf5e0bdbc348fb91d7c8af5bee192173ac3636b5fb6e6715d" +checksum = "bd2b84f2bc39a5705ef27ec785a11c934a41bbd4a24941e257927cddc26b60bf" dependencies = [ "proc-macro2", "quote", @@ -3186,78 +3301,63 @@ dependencies = [ [[package]] name = "sqlx-macros-core" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19a9c1841124ac5a61741f96e1d9e2ec77424bf323962dd894bdb93f37d5219b" +checksum = "fb8d96de5fdc85a5c4ec813432b523ec637e80ba98f046555f75f7908ddac7c3" dependencies = [ + "cfg-if", "dotenvy", "either", "heck", "hex", - "once_cell", "proc-macro2", "quote", "serde", "serde_json", - "sha2", + "sha2 0.10.9", "sqlx-core", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", "syn", + "thiserror 2.0.18", "tokio", "url", ] [[package]] name = "sqlx-mysql" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526" +checksum = "90b8020fe17c5f2c245bfa2505d7ef59c5604839527c740266ad2214acebea27" dependencies = [ - "atoi", - "base64 0.22.1", "bitflags", "byteorder", "bytes", "crc", - "digest", + "digest 0.11.3", "dotenvy", "either", - "futures-channel", "futures-core", - "futures-io", "futures-util", "generic-array", - "hex", - "hkdf", - "hmac", - "itoa", "log", - "md-5", - "memchr", - "once_cell", "percent-encoding", - "rand 0.8.5", - "rsa", "serde", "sha1", - "sha2", - "smallvec", + "sha2 0.11.0", "sqlx-core", - "stringprep", "thiserror 2.0.18", "time", "tracing", "uuid", - "whoami", ] [[package]] name = "sqlx-postgres" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46" +checksum = "87a2bdd6e83f6b3ea525ca9fee568030508b58355a43d0b2c1674d5f79dcd65e" dependencies = [ "atoi", "base64 0.22.1", @@ -3272,16 +3372,14 @@ dependencies = [ "hex", "hkdf", "hmac", - "home", "itoa", "log", "md-5", "memchr", - "once_cell", - "rand 0.8.5", + "rand 0.10.1", "serde", "serde_json", - "sha2", + "sha2 0.11.0", "smallvec", "sqlx-core", "stringprep", @@ -3294,12 +3392,13 @@ dependencies = [ [[package]] name = "sqlx-sqlite" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2d12fe70b2c1b4401038055f90f151b78208de1f9f89a7dbfd41587a10c3eea" +checksum = "488e99c397a62007e4229aec669a179816339afc6d2620ca6fa420dbee2e982c" dependencies = [ "atoi", "flume", + "form_urlencoded", "futures-channel", "futures-core", "futures-executor", @@ -3309,7 +3408,6 @@ dependencies = [ "log", "percent-encoding", "serde", - "serde_urlencoded", "sqlx-core", "thiserror 2.0.18", "time", @@ -3320,9 +3418,9 @@ dependencies = [ [[package]] name = "sse-stream" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb4dc4d33c68ec1f27d386b5610a351922656e1fdf5c05bbaad930cd1519479a" +checksum = "f3962b63f038885f15bce2c6e02c0e7925c072f1ac86bb60fd44c5c6b762fb72" dependencies = [ "bytes", "futures-util", @@ -3337,6 +3435,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stringprep" version = "0.1.5" @@ -3362,9 +3466,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.114" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -3387,42 +3491,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "tempfile" -version = "3.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" -dependencies = [ - "fastrand", - "getrandom 0.3.4", - "once_cell", - "rustix", - "windows-sys 0.61.2", + "quote", + "syn", ] [[package]] @@ -3476,12 +3546,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.46" +version = "0.3.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9da98b7d9b7dad93488a84b8248efc35352b0b2657397d4167e7ad67e5d535e5" +checksum = "711a53c2d47bbd818258c498c8dbfe186a2526c631495cfe7e078567f86b8469" dependencies = [ "deranged", - "itoa", "libc", "num-conv", "num_threads", @@ -3493,15 +3562,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" [[package]] name = "time-macros" -version = "0.2.26" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cc610bac2dcee56805c99642447d4c5dbde4d01f752ffea0199aee1f601dc4" +checksum = "71c652a3727a9cbb9a02f707f530b618ce00d0ccd762009c8c23bd191df3c17d" dependencies = [ "num-conv", "time-core", @@ -3509,9 +3578,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -3519,9 +3588,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ "tinyvec_macros", ] @@ -3534,24 +3603,26 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokenizers" -version = "0.20.4" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" +checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112" dependencies = [ - "aho-corasick", + "ahash", + "compact_str", + "daachorse", + "dary_heap", "derive_builder", "esaxx-rs", - "getrandom 0.2.17", + "getrandom 0.3.4", "hf-hub", - "indicatif", - "itertools 0.12.1", - "lazy_static", + "indicatif 0.18.4", + "itertools", "log", "macro_rules_attribute", "monostate", "onig", "paste", - "rand 0.8.5", + "rand 0.9.4", "rayon", "rayon-cond", "regex", @@ -3559,7 +3630,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror 1.0.69", + "thiserror 2.0.18", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -3567,40 +3638,30 @@ dependencies = [ [[package]] name = "tokio" -version = "1.49.0" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", "mio", "pin-project-lite", - "socket2 0.6.2", + "socket2 0.6.4", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", "syn", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4" @@ -3637,44 +3698,42 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.23" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" dependencies = [ - "serde", + "indexmap 2.14.0", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.11" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.27" +name = "toml_parser" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "indexmap 2.13.0", - "serde", - "serde_spanned", - "toml_datetime", - "toml_write", "winnow", ] [[package]] -name = "toml_write" -version = "0.1.2" +name = "toml_writer" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" [[package]] name = "tonic" @@ -3721,7 +3780,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand 0.8.5", + "rand 0.8.6", "slab", "tokio", "tokio-util", @@ -3748,20 +3807,20 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.8" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "bitflags", "bytes", "futures-util", "http", "http-body", - "iri-string", "pin-project-lite", "tower 0.5.3", "tower-layer", "tower-service", + "url", ] [[package]] @@ -3832,9 +3891,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ "matchers", "nu-ansi-term", @@ -3856,9 +3915,9 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" [[package]] name = "unicode-bidi" @@ -3868,9 +3927,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-normalization" @@ -3896,11 +3955,17 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" +[[package]] +name = "unicode-script" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "383ad40bb927465ec0ce7720e033cb4ca06912855fc35db31b5755d0de75b1ee" + [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" [[package]] name = "unicode-width" @@ -3908,12 +3973,24 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "unicode_categories" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "untrusted" version = "0.9.0" @@ -3929,12 +4006,12 @@ dependencies = [ "base64 0.22.1", "flate2", "log", - "native-tls", "once_cell", "rustls", "rustls-pki-types", "serde", "serde_json", + "socks", "url", "webpki-roots 0.26.11", ] @@ -3963,13 +4040,50 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "utoipa" +version = "5.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bde15df68e80b16c7d16b9616e80770ad158988daa56a27dccd1e55558b0160" +dependencies = [ + "indexmap 2.14.0", + "serde", + "serde_json", + "utoipa-gen", +] + +[[package]] +name = "utoipa-gen" +version = "5.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba0b99ee52df3028635d93840c797102da61f8a7bb3cf751032455895b52ef8" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn", + "uuid", +] + +[[package]] +name = "utoipa-scalar" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719" +dependencies = [ + "axum 0.8.9", + "serde", + "serde_json", + "utoipa", +] + [[package]] name = "uuid" -version = "1.20.0" +version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", "js-sys", "serde_core", "sha1_smol", @@ -3990,26 +4104,24 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vergen" -version = "9.1.0" +version = "10.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b849a1f6d8639e8de261e81ee0fc881e3e3620db1af9f2e0da015d4382ceaf75" +checksum = "7bdf18a54cf91b4d98a8e8b67f6321606539fbcdcac02536286ad1de37b53fd2" dependencies = [ "anyhow", - "cargo_metadata", - "derive_builder", - "regex", + "bon", "rustversion", "vergen-lib", ] [[package]] name = "vergen-gitcl" -version = "9.1.0" +version = "10.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ff3b5300a085d6bcd8fc96a507f706a28ae3814693236c9b409db71a1d15b9" +checksum = "4961429ed12888cb3c6dd20f7dc9508c821091a3ba5fec0156ed5a654c1c4572" dependencies = [ "anyhow", - "derive_builder", + "bon", "rustversion", "time", "vergen", @@ -4018,12 +4130,12 @@ dependencies = [ [[package]] name = "vergen-lib" -version = "9.1.0" +version = "10.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34a29ba7e9c59e62f229ae1932fb1b8fb8a6fdcc99215a641913f5f5a59a569" +checksum = "910e8471e27130bbc019e9bfa6bda16dfc4c6dd7c5d0793da70a9256caeae984" dependencies = [ "anyhow", - "derive_builder", + "bon", "rustversion", ] @@ -4033,6 +4145,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -4050,24 +4172,27 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.2+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", ] [[package]] -name = "wasite" -version = "0.1.0" +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] [[package]] name = "wasm-bindgen" -version = "0.2.108" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if", "once_cell", @@ -4078,23 +4203,19 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.58" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ - "cfg-if", - "futures-util", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.108" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4102,9 +4223,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.108" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", @@ -4115,13 +4236,35 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.108" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.14.0", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -4135,11 +4278,23 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.14.0", + "semver", +] + [[package]] name = "web-sys" -version = "0.3.85" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -4155,34 +4310,79 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-root-certs" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "webpki-roots" version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.5", + "webpki-roots 1.0.7", ] [[package]] name = "webpki-roots" -version = "1.0.5" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" dependencies = [ "rustls-pki-types", ] +[[package]] +name = "whatlang" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5e8f38b596e2a359b755342473520a99421e43658548c79489ee221b728c107" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "whoami" -version = "1.6.1" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998767ef88740d1f5b0682a9c53c24431453923962269c2db68ee43788c5a40d" + +[[package]] +name = "winapi" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4a4db5077702ca3015d3d02d74974948aba2ad9e12ab7df718ee64ccd7e97d" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ - "libredox", - "wasite", + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -4224,17 +4424,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link", - "windows-result", - "windows-strings", -] - [[package]] name = "windows-result" version = "0.4.1" @@ -4253,15 +4442,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - [[package]] name = "windows-sys" version = "0.52.0" @@ -4298,21 +4478,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -4346,12 +4511,6 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -4364,12 +4523,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -4382,12 +4535,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -4412,12 +4559,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -4430,12 +4571,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -4448,12 +4583,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -4466,12 +4595,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -4486,30 +4609,115 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.14" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" dependencies = [ - "memchr", + "wit-bindgen-rust-macro", ] [[package]] name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.14.0", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.14.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.14.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -4518,9 +4726,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", @@ -4530,18 +4738,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.37" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7456cf00f0685ad319c5b1693f291a650eaf345e941d082fc4e03df8a03996ac" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.37" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1328722bbf2115db7e19d69ebcc15e795719e2d66b60827c6a69a117365e37a0" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", @@ -4550,18 +4758,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", @@ -4571,15 +4779,15 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -4588,9 +4796,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -4599,9 +4807,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", @@ -4610,6 +4818,6 @@ dependencies = [ [[package]] name = "zmij" -version = "1.0.18" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1966f8ac2c1f76987d69a74d0e0f929241c10e78136434e3be70ff7f58f64214" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index df2d65c0..faca7f20 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,28 +9,48 @@ resolver = "3" authors = ["Xavier Lau "] description = "Evidence-linked fact memory for agents." edition = "2024" -homepage = "https://github.com/hack-ink/ELF" +homepage = "https://hack.ink/elf" license = "GPL-3.0" readme = "README.md" -repository = "https://github.com/hack-ink/ELF" -version = "0.1.0" +repository = "https://github.com/hack-ink/elf" +version = "0.2.0" [workspace.dependencies] -axum = { version = "0.7" } -blake3 = { version = "1.5" } -clap = { version = "4.5", features = ["derive"] } -color-eyre = { version = "0.6" } -qdrant-client = { version = "1.0" } -reqwest = { version = "0.12", features = ["json", "rustls-tls"] } -serde = { version = "1.0", features = ["derive"] } -serde_json = { version = "1.0" } -sqlx = { version = "0.8", features = ["json", "postgres", "runtime-tokio", "time", "tls-rustls", "uuid"] } -time = { version = "0.3", features = ["macros", "serde"] } -tokenizers = { version = "0.20", features = ["http"] } -tokio = { version = "1.0", features = ["macros", "rt-multi-thread", "time"] } -toml = { version = "0.8" } -tracing = { version = "0.1" } -tracing-subscriber = { version = "0.3", features = ["env-filter"] } -unicode-segmentation = { version = "1.11" } -uuid = { version = "1.0", features = ["serde", "v4", "v5"] } -vergen-gitcl = { version = "9.1", features = ["cargo"] } +ahash = { version = "0.8" } +axum = { version = "0.8" } +blake3 = { version = "1.8" } +clap = { version = "4.6", features = ["derive", "env"] } +color-eyre = { version = "0.6" } +qdrant-client = { version = "1.18.0" } +regex = { version = "1.12" } +reqwest = { version = "0.13", default-features = false, features = ["json", "query", "rustls"] } +rmcp = { version = "1.7", features = ["transport-streamable-http-server"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = { version = "1.0" } +sqlx = { version = "0.9", features = ["json", "postgres", "runtime-tokio", "time", "tls-rustls", "uuid"] } +thiserror = { version = "2.0" } +time = { version = "0.3", features = ["macros", "serde"] } +tokenizers = { version = "0.23", features = ["http"] } +tokio = { version = "1.52", features = ["macros", "rt-multi-thread", "time"] } +toml = { version = "1.1" } +tower = { version = "0.5" } +tracing = { version = "0.1" } +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +unicode-normalization = { version = "0.1" } +unicode-script = { version = "0.5" } +unicode-segmentation = { version = "1.13" } +utoipa = { version = "5.5", features = ["axum_extras", "time", "uuid"] } +utoipa-scalar = { version = "0.3", features = ["axum"] } +uuid = { version = "1.23", features = ["serde", "v4", "v5"] } +vergen-gitcl = { version = "10.0", features = ["cargo"] } +whatlang = { version = "0.18" } + +elf-chunking = { version = "0.2", path = "packages/elf-chunking" } +elf-cli = { version = "0.2", path = "packages/elf-cli" } +elf-config = { version = "0.2", path = "packages/elf-config" } +elf-domain = { version = "0.2", path = "packages/elf-domain" } +elf-providers = { version = "0.2", path = "packages/elf-providers" } +elf-service = { version = "0.2", path = "packages/elf-service" } +elf-storage = { version = "0.2", path = "packages/elf-storage" } +elf-testkit = { version = "0.2", path = "packages/elf-testkit" } +elf-worker = { version = "0.2", path = "apps/elf-worker" } diff --git a/Makefile.toml b/Makefile.toml index e4dc8d8a..02654763 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -1,74 +1,940 @@ # Rust workspace tasks. -# Lint -# | task | type | cwd | -# | ------------- | --------- | --- | -# | lint | composite | | -# | lint-fix | composite | | -# | lint-rust | command | | -# | lint-fix-rust | extend | | +# Benchmark +# | task | type | cwd | +# | ------------------------------------------ | --------- | --- | +# | baseline-backfill-100k-docker | command | | +# | baseline-backfill-10k-docker | command | | +# | baseline-backfill-docker | command | | +# | baseline-live-docker | command | | +# | baseline-live-report | command | | +# | baseline-production-private | command | | +# | baseline-production-private-addendum | command | | +# | baseline-production-synthetic | command | | +# | baseline-soak-docker | command | | +# | openmemory-ui-export-readback | command | | +# | parity-docker | command | | +# | real-world-first-generation-oss | composite | | +# | real-world-first-generation-oss-json | command | | +# | real-world-first-generation-oss-report | command | | +# | real-world-job-operator-ux | composite | | +# | real-world-job-operator-ux-json | command | | +# | real-world-job-operator-ux-live-adapters | command | | +# | real-world-job-operator-ux-report | command | | +# | real-world-memory | composite | | +# | real-world-memory-consolidation | composite | | +# | real-world-memory-consolidation-json | command | | +# | real-world-memory-consolidation-report | command | | +# | real-world-memory-core-archival | composite | | +# | real-world-memory-core-archival-json | command | | +# | real-world-memory-core-archival-report | command | | +# | real-world-memory-evolution | composite | | +# | real-world-memory-evolution-json | command | | +# | real-world-memory-evolution-report | command | | +# | real-world-memory-graph-rag | composite | | +# | real-world-memory-graph-rag-json | command | | +# | real-world-memory-graph-rag-report | command | | +# | real-world-memory-json | command | | +# | real-world-memory-knowledge | composite | | +# | real-world-memory-knowledge-json | command | | +# | real-world-memory-knowledge-report | command | | +# | real-world-memory-live-adapters | command | | +# | real-world-memory-live-consolidation | command | | +# | real-world-memory-proactive-brief | composite | | +# | real-world-memory-proactive-brief-json | command | | +# | real-world-memory-proactive-brief-report | command | | +# | real-world-memory-production-ops | composite | | +# | real-world-memory-production-ops-json | command | | +# | real-world-memory-production-ops-report | command | | +# | real-world-memory-project-decisions | composite | | +# | real-world-memory-project-decisions-json | command | | +# | real-world-memory-project-decisions-report | command | | +# | real-world-memory-report | command | | +# | real-world-memory-retrieval | composite | | +# | real-world-memory-retrieval-json | command | | +# | real-world-memory-retrieval-report | command | | +# | real-world-memory-scheduled | composite | | +# | real-world-memory-scheduled-json | command | | +# | real-world-memory-scheduled-report | command | | +# | real-world-memory-summary | composite | | +# | real-world-memory-summary-json | command | | +# | real-world-memory-summary-report | command | | + +[tasks.baseline-backfill-100k-docker] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "backfill-100k", +] + +[tasks.baseline-backfill-10k-docker] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "backfill-10k", +] + +[tasks.baseline-backfill-docker] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "backfill", +] + +[tasks.baseline-live-docker] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "live", +] + +[tasks.baseline-live-report] +workspace = false +command = "bash" +args = [ + "scripts/live-baseline-report-to-md.sh", +] + +[tasks.baseline-production-private] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "production-private", +] + +[tasks.baseline-production-private-addendum] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "production-private-addendum", +] + +[tasks.baseline-production-synthetic] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "production-synthetic", +] + +[tasks.baseline-soak-docker] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "soak", +] + +[tasks.openmemory-ui-export-readback] +workspace = false +command = "bash" +args = [ + "scripts/baseline-docker.sh", + "openmemory-ui-export-readback", +] + +[tasks.parity-docker] +workspace = false +command = "docker" +args = [ + "compose", + "-f", + "docker-compose.parity.yml", + "run", + "--build", + "--rm", + "parity-runner", +] + +[tasks.real-world-first-generation-oss] +workspace = false +dependencies = [ + "real-world-first-generation-oss-report", +] + +[tasks.real-world-first-generation-oss-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss", + "--out", + "tmp/real-world-memory/first-generation-oss/report.json", + "--run-id", + "first-generation-oss-continuity-source-store", + "--adapter-id", + "fixture_first_generation_oss", + "--adapter-name", + "First-generation OSS fixture coverage", +] + +[tasks.real-world-first-generation-oss-report] +workspace = false +dependencies = [ + "real-world-first-generation-oss-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/first-generation-oss/report.json", + "--out", + "tmp/real-world-memory/first-generation-oss/report.md", +] + +[tasks.real-world-job-operator-ux] +workspace = false +dependencies = [ + "real-world-job-operator-ux-report", +] + +[tasks.real-world-job-operator-ux-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.json", + "--run-id", + "real-world-job-operator-ux", + "--adapter-id", + "fixture_operator_ux", + "--adapter-name", + "ELF operator UX fixture", +] + +[tasks.real-world-job-operator-ux-live-adapters] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "job-operator-ux-live-adapters", +] + +[tasks.real-world-job-operator-ux-report] +workspace = false +dependencies = [ + "real-world-job-operator-ux-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-job/real-world-job-operator-ux-report.json", + "--out", + "tmp/real-world-job/real-world-job-operator-ux-report.md", +] + +[tasks.real-world-memory] +workspace = false +dependencies = [ + "real-world-memory-report", +] + +[tasks.real-world-memory-consolidation] +workspace = false +dependencies = [ + "real-world-memory-consolidation-report", +] + +[tasks.real-world-memory-consolidation-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/consolidation", + "--out", + "tmp/real-world-memory/consolidation/report.json", + "--run-id", + "real-world-memory-consolidation", + "--adapter-id", + "fixture_consolidation", + "--adapter-name", + "ELF consolidation fixture", +] + +[tasks.real-world-memory-consolidation-report] +workspace = false +dependencies = [ + "real-world-memory-consolidation-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/consolidation/report.json", + "--out", + "tmp/real-world-memory/consolidation/report.md", +] + +[tasks.real-world-memory-core-archival] +workspace = false +dependencies = [ + "real-world-memory-core-archival-report", +] + +[tasks.real-world-memory-core-archival-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "--out", + "tmp/real-world-memory/core-archival/report.json", + "--run-id", + "real-world-memory-core-archival", + "--adapter-id", + "fixture_core_archival_memory", + "--adapter-name", + "ELF core and archival memory fixture", +] + +[tasks.real-world-memory-core-archival-report] +workspace = false +dependencies = [ + "real-world-memory-core-archival-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/core-archival/report.json", + "--out", + "tmp/real-world-memory/core-archival/report.md", +] + +[tasks.real-world-memory-evolution] +workspace = false +dependencies = [ + "real-world-memory-evolution-report", +] + +[tasks.real-world-memory-evolution-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/evolution", + "--out", + "tmp/real-world-memory/evolution-report.json", + "--run-id", + "real-world-memory-evolution", + "--adapter-id", + "fixture_memory_evolution", + "--adapter-name", + "ELF fixture memory evolution", +] + +[tasks.real-world-memory-evolution-report] +workspace = false +dependencies = [ + "real-world-memory-evolution-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/evolution-report.json", + "--out", + "tmp/real-world-memory/evolution-report.md", +] + +[tasks.real-world-memory-graph-rag] +workspace = false +dependencies = [ + "real-world-memory-graph-rag-report", +] + +[tasks.real-world-memory-graph-rag-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag", + "--out", + "tmp/real-world-memory/graph-rag/report.json", + "--run-id", + "real-world-memory-graph-rag", + "--adapter-id", + "fixture_graph_rag_external_adapters", + "--adapter-name", + "Graph/RAG representative external-adapter fixtures", +] + +[tasks.real-world-memory-graph-rag-report] +workspace = false +dependencies = [ + "real-world-memory-graph-rag-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/graph-rag/report.json", + "--out", + "tmp/real-world-memory/graph-rag/report.md", +] + +[tasks.real-world-memory-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory", + "--out", + "tmp/real-world-memory/real-world-memory-report.json", + "--run-id", + "real-world-memory", + "--adapter-id", + "elf_real_world_memory_fixture", + "--adapter-name", + "ELF real-world memory fixture", +] + +[tasks.real-world-memory-knowledge] +workspace = false +dependencies = [ + "real-world-memory-knowledge-report", +] + +[tasks.real-world-memory-knowledge-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/knowledge", + "--out", + "tmp/real-world-memory/knowledge-report.json", + "--run-id", + "real-world-memory-knowledge", + "--adapter-id", + "fixture_knowledge", + "--adapter-name", + "ELF knowledge fixture", +] + +[tasks.real-world-memory-knowledge-report] +workspace = false +dependencies = [ + "real-world-memory-knowledge-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/knowledge-report.json", + "--out", + "tmp/real-world-memory/knowledge-report.md", +] + +[tasks.real-world-memory-live-adapters] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-adapters", +] + +[tasks.real-world-memory-live-consolidation] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-consolidation", +] + +[tasks.real-world-memory-proactive-brief] +workspace = false +dependencies = [ + "real-world-memory-proactive-brief-report", +] + +[tasks.real-world-memory-proactive-brief-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/proactive_brief", + "--out", + "tmp/real-world-memory/proactive-brief/report.json", + "--run-id", + "real-world-memory-proactive-brief", + "--adapter-id", + "fixture_proactive_brief", + "--adapter-name", + "ELF proactive brief fixture", +] + +[tasks.real-world-memory-proactive-brief-report] +workspace = false +dependencies = [ + "real-world-memory-proactive-brief-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/proactive-brief/report.json", + "--out", + "tmp/real-world-memory/proactive-brief/report.md", +] + +[tasks.real-world-memory-production-ops] +workspace = false +dependencies = [ + "real-world-memory-production-ops-report", +] + +[tasks.real-world-memory-production-ops-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/production_ops", + "--run-id", + "real-world-memory-production-ops", + "--adapter-id", + "fixture_production_ops", + "--adapter-name", + "ELF production-ops fixture", + "--out", + "tmp/real-world-memory/production-ops-report.json", +] + +[tasks.real-world-memory-production-ops-report] +workspace = false +dependencies = [ + "real-world-memory-production-ops-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/production-ops-report.json", + "--out", + "tmp/real-world-memory/production-ops-report.md", +] + +[tasks.real-world-memory-project-decisions] +workspace = false +dependencies = [ + "real-world-memory-project-decisions-report", +] + +[tasks.real-world-memory-project-decisions-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/project_decisions", + "--out", + "tmp/real-world-memory/project-decisions/report.json", + "--run-id", + "real-world-memory-project-decisions", + "--adapter-id", + "fixture_project_decisions", + "--adapter-name", + "ELF project decision fixture", +] + +[tasks.real-world-memory-project-decisions-report] +workspace = false +dependencies = [ + "real-world-memory-project-decisions-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/project-decisions/report.json", + "--out", + "tmp/real-world-memory/project-decisions/report.md", +] + +[tasks.real-world-memory-report] +workspace = false +dependencies = [ + "real-world-memory-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/real-world-memory-report.json", + "--out", + "tmp/real-world-memory/real-world-memory-report.md", +] + +[tasks.real-world-memory-retrieval] +workspace = false +dependencies = [ + "real-world-memory-retrieval-report", +] + +[tasks.real-world-memory-retrieval-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/retrieval", + "--run-id", + "real-world-memory-retrieval", + "--adapter-id", + "fixture_retrieval", + "--adapter-name", + "ELF fixture retrieval cases", + "--out", + "tmp/real-world-memory/retrieval-report.json", +] + +[tasks.real-world-memory-retrieval-report] +workspace = false +dependencies = [ + "real-world-memory-retrieval-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/retrieval-report.json", + "--out", + "tmp/real-world-memory/retrieval-report.md", +] + +[tasks.real-world-memory-scheduled] +workspace = false +dependencies = [ + "real-world-memory-scheduled-report", +] + +[tasks.real-world-memory-scheduled-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/scheduled_memory", + "--out", + "tmp/real-world-memory/scheduled/report.json", + "--run-id", + "real-world-memory-scheduled", + "--adapter-id", + "fixture_scheduled_memory", + "--adapter-name", + "ELF scheduled memory fixture", +] + +[tasks.real-world-memory-scheduled-report] +workspace = false +dependencies = [ + "real-world-memory-scheduled-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/scheduled/report.json", + "--out", + "tmp/real-world-memory/scheduled/report.md", +] + +[tasks.real-world-memory-summary] +workspace = false +dependencies = [ + "real-world-memory-summary-report", +] + +[tasks.real-world-memory-summary-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/memory_summary", + "--out", + "tmp/real-world-memory/memory-summary/report.json", + "--run-id", + "real-world-memory-summary", + "--adapter-id", + "fixture_memory_summary", + "--adapter-name", + "ELF memory summary fixture", +] -[tasks.lint] +[tasks.real-world-memory-summary-report] workspace = false dependencies = [ - "lint-rust", + "real-world-memory-summary-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/memory-summary/report.json", + "--out", + "tmp/real-world-memory/memory-summary/report.md", ] -[tasks.lint-fix] +# Check +# | task | type | cwd | +# | ---------------- | --------- | --- | +# | check | composite | | +# | check-docs | command | | +# | check-rust | command | | +# | check-trace-gate | command | | + +[tasks.check] +clear = true workspace = false dependencies = [ - "lint-fix-rust", + "fmt-check", + "check-docs", + "check-rust", + "lint", + "test", ] -[tasks.lint-rust] +[tasks.check-docs] workspace = false -command = "cargo" +command = "python3" args = [ - "clippy", - "--workspace", - "--all-targets", - "--all-features", - "--", - "-D", - "warnings", + "scripts/check-docs.py", ] -[tasks.lint-fix-rust] -extend = "lint-rust" +[tasks.check-rust] +workspace = false +command = "cargo" args = [ - "clippy", - "--fix", - "--allow-dirty", + "check", "--workspace", "--all-targets", "--all-features", ] +[tasks.check-trace-gate] +workspace = false +command = "bash" +args = [ + "scripts/trace-gate.sh", +] -# Test -# | task | type | cwd | -# | --------- | --------- | --- | -# | test | composite | | -# | test-rust | command | | +# Clean +# | task | type | cwd | +# | -------------------------- | ------- | --- | +# | clean-baseline-live-docker | command | | +# | clean-parity-docker | command | | -[tasks.test] +[tasks.clean-baseline-live-docker] workspace = false -dependencies = [ - "test-rust", +command = "docker" +args = [ + "compose", + "-f", + "docker-compose.baseline.yml", + "down", + "-v", + "--remove-orphans", ] -[tasks.test-rust] +[tasks.clean-parity-docker] workspace = false -command = "cargo" +command = "docker" args = [ - "nextest", - "run", - "--workspace", - "--all-targets", - "--all-features", + "compose", + "-f", + "docker-compose.parity.yml", + "down", + "-v", + "--remove-orphans", ] - # Format # | task | type | cwd | # | -------------- | --------- | --- | @@ -115,16 +981,393 @@ args = [ "--check", ] +# Lint +# | task | type | cwd | +# | ----------- | --------- | --- | +# | lint | composite | | +# | lint-rust | command | | +# | lint-vstyle | command | | + +[tasks.lint] +workspace = false +dependencies = [ + "lint-rust", + "lint-vstyle", +] + +[tasks.lint-rust] +workspace = false +command = "cargo" +args = [ + "clippy", + "--all-features", + "--all-targets", + "--workspace", + "--", + "-D", + "clippy::all", + "-D", + "clippy::too_many_lines", + "-D", + "clippy::unwrap_used", + "-D", + "clippy::use_self", + "-D", + "clippy::wildcard_imports", + "-D", + "missing-docs", + "-D", + "unused-crate-dependencies", + "-D", + "warnings", +] + +[tasks.lint-vstyle] +workspace = false +command = "cargo" +args = [ + "vstyle", + "curate", + "--language", + "rust", + "--workspace", + "--all-features", +] -# Meta -# | task | type | cwd | -# | ------ | --------- | --- | -# | checks | composite | | +# Lint Fix +# | task | type | cwd | +# | --------------- | --------- | --- | +# | lint-fix | composite | | +# | lint-fix-rust | command | | +# | lint-fix-vstyle | command | | -[tasks.checks] +[tasks.lint-fix] workspace = false dependencies = [ - "lint", - "test", - "fmt-check", + "lint-fix-rust", + "lint-fix-vstyle", +] + +[tasks.lint-fix-rust] +workspace = false +command = "cargo" +args = [ + "clippy", + "--fix", + "--allow-dirty", + "--all-features", + "--all-targets", + "--workspace", + "--", + "-D", + "clippy::all", + "-D", + "clippy::too_many_lines", + "-D", + "clippy::unwrap_used", + "-D", + "clippy::use_self", + "-D", + "clippy::wildcard_imports", + "-D", + "missing-docs", + "-D", + "unused-crate-dependencies", + "-D", + "warnings", +] + +[tasks.lint-fix-vstyle] +workspace = false +command = "cargo" +args = [ + "vstyle", + "tune", + "--language", + "rust", + "--workspace", + "--all-features", + "--strict", +] + +# Research +# | task | type | cwd | +# | --------------------------------------- | --------- | --- | +# | external-memory-radar | command | | +# | external-memory-radar-artifact | composite | | +# | external-memory-radar-artifact-json | command | | +# | external-memory-radar-artifact-validate | command | | +# | external-memory-radar-dry-run | composite | | +# | external-memory-radar-dry-run-json | command | | +# | external-memory-radar-dry-run-validate | command | | +# | external-memory-radar-validate | command | | + +[tasks.external-memory-radar] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "run", + "--cursor", + "docs/research/external_memory_pattern_radar/cursor.json", + "--summary", + "docs/research/external_memory_pattern_radar/latest.md", +] + +[tasks.external-memory-radar-artifact] +workspace = false +dependencies = [ + "external-memory-radar-artifact-json", + "external-memory-radar-artifact-validate", +] + +[tasks.external-memory-radar-artifact-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "run", + "--cursor", + "docs/research/external_memory_pattern_radar/cursor.json", + "--out-cursor", + "tmp/external-memory-pattern-radar/cursor.json", + "--summary", + "tmp/external-memory-pattern-radar/latest.md", +] + +[tasks.external-memory-radar-artifact-validate] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "validate", + "--cursor", + "tmp/external-memory-pattern-radar/cursor.json", +] + +[tasks.external-memory-radar-dry-run] +workspace = false +dependencies = [ + "external-memory-radar-dry-run-json", + "external-memory-radar-dry-run-validate", +] + +[tasks.external-memory-radar-dry-run-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "run", + "--mode", + "offline", + "--cursor", + "docs/research/external_memory_pattern_radar/cursor.json", + "--out-cursor", + "tmp/external-memory-pattern-radar/cursor.json", + "--summary", + "tmp/external-memory-pattern-radar/latest.md", +] + +[tasks.external-memory-radar-dry-run-validate] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "validate", + "--cursor", + "tmp/external-memory-pattern-radar/cursor.json", +] + +[tasks.external-memory-radar-validate] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "external_memory_pattern_radar", + "--", + "validate", + "--cursor", + "docs/research/external_memory_pattern_radar/cursor.json", +] + +# Smoke +# | task | type | cwd | +# | ---------------------------------- | --------- | --- | +# | smoke-graphify-docker-graph-report | command | | +# | smoke-graphiti-zep-docker-temporal | command | | +# | smoke-graphrag-docker | command | | +# | smoke-lightrag-docker-context | command | | +# | smoke-ragflow-docker | command | | +# | smoke-real-world-job | composite | | +# | smoke-real-world-job-json | command | | +# | smoke-real-world-job-report | command | | + +[tasks.smoke-graphify-docker-graph-report] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphify-docker-graph-report", +] + +[tasks.smoke-graphiti-zep-docker-temporal] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphiti-zep-docker-temporal", +] + +[tasks.smoke-graphrag-docker] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "graphrag-docker", +] + +[tasks.smoke-lightrag-docker-context] +workspace = false +command = "bash" +args = [ + "scripts/smoke-docker.sh", + "lightrag-docker-context", +] + +[tasks.smoke-ragflow-docker] +workspace = false +command = "bash" +args = [ + "scripts/ragflow-docker-evidence-smoke.sh", +] + +[tasks.smoke-real-world-job] +workspace = false +dependencies = [ + "smoke-real-world-job-report", +] + +[tasks.smoke-real-world-job-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/work_resume", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.json", +] + +[tasks.smoke-real-world-job-report] +workspace = false +dependencies = [ + "smoke-real-world-job-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-job/real-world-job-smoke-report.json", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.md", +] + +# Test +# | task | type | cwd | +# | --------------------- | --------- | --- | +# | test | composite | | +# | test-e2e | command | | +# | test-rust | command | | +# | test-rust-all | command | | +# | test-rust-integration | command | | + +[tasks.test] +clear = true +workspace = false +dependencies = [ + "test-rust", +] + +[tasks.test-e2e] +workspace = false +command = "bash" +args = [ + "scripts/context-misranking-harness.sh", +] + +[tasks.test-rust] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", +] + +[tasks.test-rust-all] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", + "--run-ignored", + "all", +] + +[tasks.test-rust-integration] +workspace = false +command = "cargo" +args = [ + "nextest", + "run", + "--workspace", + "--all-targets", + "--all-features", + "--run-ignored", + "only", ] diff --git a/README.md b/README.md index a74f4d98..5649d0d6 100644 --- a/README.md +++ b/README.md @@ -15,18 +15,52 @@ Evidence-linked fact memory for agents. ## What Is ELF? -ELF is a memory service that stores short, evidence-linked facts for agents. It separates deterministic writes from LLM extraction, enforces evidence binding, and provides hybrid retrieval with configurable quality and cost controls. Postgres with pgvector is the source of truth; Qdrant is a derived index for fast candidate retrieval. ELF exposes HTTP and MCP interfaces for agent integrations. +ELF is a memory service for LLM agents that stores short, evidence-linked facts and retrieves them with chunk-first hybrid search. Postgres with pgvector is the source of truth for notes and embeddings. Qdrant is a derived, rebuildable index for fast candidate retrieval. ELF can also persist evidence-bound entity/relation facts and optionally attach them as `relation_context` in search explain output. ELF exposes both HTTP and MCP interfaces. -## Why ELF +## Project Goals -- Evidence-linked memory. Every extracted note includes verbatim evidence quotes. -- Deterministic ingestion. `add_note` never calls an LLM; `add_event` always does. -- Source-of-truth storage. Postgres is authoritative; Qdrant can be rebuilt at any time. -- Hybrid retrieval. Dense + BM25 candidate retrieval with optional reranking. -- Query expansion modes. `off`, `always`, or `dynamic` to balance recall and latency. -- Multi-tenant scoping. Tenant, project, agent, and scope boundaries are enforced. -- MCP integration. A dedicated `elf-mcp` server for Claude and other MCP clients. -- Evaluation-ready. `elf-eval` lets you measure retrieval quality quickly. +- Improve effective context usage with compact memory retrieval instead of replaying long history. +- Preserve correctness over time with update and lifecycle semantics, not append-only memory. +- Keep memory behavior auditable with deterministic boundaries, evidence, and replayable traces. +- Enable safe multi-agent collaboration through explicit scopes and sharing controls. +- Make quality measurable with repeatable evaluation and regression checks. + +## Why Choose ELF + +- Evidence-linked memory with strict provenance requirements. +- Deterministic `add_note` and LLM-driven `add_event` separation. +- Postgres source-of-truth plus rebuildable retrieval index. +- Chunk-first hybrid retrieval with expansion and rerank controls. +- Multi-tenant scoped APIs for service-style integration. +- Evaluation tooling (`elf-eval`) for retrieval quality and replay analysis. + +## Quickstart + +Use the canonical setup guide: + +- `docs/guide/getting_started.md` +- For single-user production operation, backup, restore, and Qdrant rebuild, use + [docs/guide/single_user_production.md](docs/guide/single_user_production.md). + +Fast path: + +```sh +docker compose -f docker-compose.yml up -d postgres qdrant + +# Terminal 1 +cargo run -p elf-api -- -c config/local/elf.docker.toml + +# Terminal 2 +cargo run -p elf-worker -- -c config/local/elf.docker.toml + +# Terminal 3 +curl -fsS http://127.0.0.1:51892/health +``` + +For provider-backed development, copy `elf.example.toml` to `elf.toml` and fill the provider blocks. +For production use, do not rely on these quickstart commands; follow the single-user +production runbook linked above so backup, restore, rollback, and provider config +handling are explicit. ## Architecture @@ -45,8 +79,8 @@ flowchart TB end subgraph Storage - PG[(Postgres + pgvector\nsource of truth)] - Qdrant[(Qdrant\nrebuildable index)] + PG[(Postgres with pgvector
source of truth)] + Qdrant[(Qdrant
rebuildable index)] end subgraph Providers @@ -61,13 +95,14 @@ flowchart TB Eval -->|HTTP| API API -->|add_note| PG + API -->|memory_ingest_decisions| PG API -->|add_event| Extractor Extractor -->|evidence-bound notes| API API -->|persist| PG PG -->|outbox| Worker - Worker -->|index dense + BM25| Qdrant + Worker -->|index chunks, dense and BM25| Qdrant - API -->|search| Expand{Expand?\noff/always/dynamic} + API -->|search| Expand{Expand mode
off, always, dynamic} Expand -->|original| Embed Expand -->|LLM variants| Extractor Extractor -->|expanded queries| Embed @@ -76,119 +111,319 @@ flowchart TB Qdrant -->|RRF fusion candidates| API API -->|scope/TTL filter| PG PG -->|notes| API - API -->|rerank + recency| Rerank + API -->|rerank and recency| Rerank Rerank -->|scores| API API -->|top-k| Agent ``` -## Comparison (qmd, claude-mem) - -Comparison focuses on shared capabilities plus ELF strengths. - -### Interfaces And Integration - -| Capability | ELF | qmd | claude-mem | -| ------------------------------- | --- | --- | ---------- | -| Local-first, self-hosted memory | ✅ | ✅ | ✅ | -| MCP integration | ✅ | ✅ | ✅ | -| HTTP API service | ✅ | — | ✅ | -| CLI-first workflow | — | ✅ | — | -| Web UI viewer | — | — | ✅ | - -### Retrieval Pipeline - -| Capability | ELF | qmd | claude-mem | -| ------------------------------- | --- | --- | ---------- | -| Full-text search (BM25 or FTS) | ✅ | ✅ | ✅ | -| Vector semantic search | ✅ | ✅ | ✅ | -| Hybrid dense + sparse fusion | ✅ | ✅ | ✅ | -| LLM reranking stage | ✅ | ✅ | — | -| Query expansion | ✅ | ✅ | — | -| Progressive disclosure workflow | — | — | ✅ | - -### Quality, Safety, And Memory Semantics - -| Capability | ELF | qmd | claude-mem | -| ----------------------------------------- | --- | --- | ---------- | -| Evidence-bound notes (verbatim quotes) | ✅ | — | — | -| Deterministic vs LLM ingestion separation | ✅ | — | — | -| Source-of-truth DB with rebuildable index | ✅ | — | — | -| Multi-tenant scoping | ✅ | — | — | -| TTL and lifecycle policies | ✅ | — | — | -| English-only boundary enforcement | ✅ | — | — | -| Redaction on write | ✅ | — | — | - -### Operations And Evaluation - -| Capability | ELF | qmd | claude-mem | -| ------------------------ | --- | --- | ---------- | -| Retrieval evaluation CLI | ✅ | — | — | -| Structured JSON outputs | ✅ | ✅ | — | +## Comparison + +### Checked-In Live Benchmark Snapshot + +The June 9, 2026 Docker-only live baseline and production adoption gate, plus the +June 10 post-adapter adoption refresh, use generated corpus/query manifests across ELF +and the external memory projects below. ELF was run with the production embedding +provider path, `Qwen3-Embedding-8B`, and 4096-dimensional embeddings where +provider-backed ELF evidence was required. + +- Production adoption gate verdict: ELF is ready for personal production use with + bounded caveats. The private production corpus profile was not run because no + operator-owned private manifest was available; the task failed closed at the missing + manifest guard, so no private-corpus pass is claimed. +- Post-adapter production adoption refresh verdict: keep adopting ELF for personal + production use with bounded caveats. The full live real-world sweep, OpenViking + dependency refresh, and RAG/graph research gates sharpen the limits but do not + create a new production blocker. +- ELF production-provider synthetic run: 8 documents, 6 queries, `8/8` encoded checks, + `retrieval_pass`, and `pass` in 59 seconds. +- ELF production-provider stress run: 480 documents, 16 queries, `9/9` encoded checks, + `retrieval_pass`, and `pass` in 779 seconds. +- ELF production-provider backfill run: 2,000 documents, 16 queries, `9/9` encoded + checks, resume from 1,000 to 2,000 imported documents, zero duplicate source notes, + and `pass` in 2,804 seconds. +- Single-user production restore proof: Docker Compose backup/restore plus Qdrant + rebuild returned `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0`, and + search recovered the restored note. +- Fresh all-project smoke run: ELF and qmd passed every encoded check. agentmemory + passed same-corpus retrieval but failed lifecycle/cold-start coverage. mem0/OpenMemory + and memsearch now pass their scoped local baseline smokes, while OpenMemory + UI/export, hosted mem0 Platform, optional graph memory, and broader memsearch prompt + and TTL coverage remain blocked, unsupported, or not encoded. OpenViking now reaches + its pinned Docker local embedding path and is reported as `wrong_result` when + same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval + coverage remain typed non-pass states. +- Real-world agent memory aggregate after XY-954: 60 fixture-backed + jobs across 16 suites, 53 pass, 0 incomplete, 7 blocked, 0 wrong-result, + 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are + production-ops operator boundaries plus blocked OpenViking staged trajectory, + hierarchy selection, recursive/context expansion measurement gates, and the + private-corpus/private-provider scheduler blockers tied to XY-930, not hidden benchmark wins. The + `scheduled_memory` suite contributes four passing source-linked scheduled task + readbacks plus one typed private/provider scheduler blocker tied to XY-930. The + `core_archival_memory` suite passes 6 fixture jobs for core block attachment, scope, + provenance, stale-core detection, archival fallback, and project-decision recovery; + it does not create an ELF-over-Letta claim. The + `memory_summary` fixture passes 1 source-trace job for reviewable top-of-mind, + background, stale, superseded, tombstoned, and derived project-profile entries; it + does not create a managed-memory parity claim. The new `proactive_brief` fixture + scores 5 jobs, with 4 pass and 1 blocked private-corpus case; it does not create + Pulse or hosted managed-memory parity. +- Full-suite live real-world adapter sweep after XY-926: ELF and qmd emit + Docker-isolated `live_real_world` records for all 55 checked-in jobs across 13 suites + through `cargo make real-world-memory-live-adapters`. Both keep the original + targeted `work_resume`, `retrieval`, and `project_decisions` slice passing, but the + full sweep is not a full-suite pass. ELF now live-scores capture/write-policy, + consolidation proposal review, knowledge-page rebuild/lint, and operator-debugging + fixtures. The remaining ELF non-pass boundaries are production-ops operator + boundaries, the core/archival live adapter gap, and blocked context-trajectory + measurement. qmd remains the local retrieval-debug UX reference; + it keeps consolidation, knowledge, capture, and core/archival typed non-pass states + and is `wrong_result` for operator-debug trace hydration, so no broad ELF-over-qmd + claim is allowed. +- Live temporal reconciliation after XY-905: `cargo make real-world-memory-live-adapters` + now reports ELF live `memory_evolution` as 6/6 pass, score mean `1.000`, + conflict detection count `5`, update rationale count `6`, and zero + selected-but-not-narrated conflict evidence. The report adds current, historical, + rationale, tombstone, invalidation, selected, dropped, and lifecycle-demoted + evidence fields. qmd remains `wrong_result` on the same slice, but this is not a + broad qmd, Graphiti/Zep, mem0/OpenMemory, Letta, hosted-memory, or private-corpus + superiority claim. +- Live consolidation proposal scoring after XY-934: `cargo make + real-world-memory-live-consolidation` runs the consolidation fixture slice through + `ElfService` consolidation run creation, worker proposal materialization, and + apply/defer/discard review audit transitions. ELF passes 4/4 live consolidation jobs + with complete lineage, one unsupported-claim flag preserved, and zero source + mutations. Managed dreaming and Always-On Memory Agent patterns remain product + references, not direct live competitors, because no contained runner emits comparable + artifacts. +- Live operator-debugging slice after XY-932: `cargo make + real-world-job-operator-ux-live-adapters` emits narrow Docker-isolated + `live_real_world` records for ELF and qmd over the operator-debugging fixtures. + ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated + evidence, replay-command availability, and repair-action clarity. qmd ties replay + command and repair-action clarity but is `wrong_result` for trace hydration and + candidate-drop stage visibility. OpenMemory UI/export remains blocked, and + claude-mem viewer flows remain blocked until Docker-contained hook/viewer evidence + exists, so this is not a broad viewer-product claim. +- First-generation OSS continuity/source-store follow-up after XY-925: `cargo make + real-world-first-generation-oss` emits a fixture-backed external-adapter slice for + agentmemory, memsearch, and claude-mem with 6 jobs, 4 pass, 2 blocked, and full + evidence/source-ref/quote coverage. It selects agentmemory's durable local path, + adds memsearch canonical Markdown source-store and retrieval-debug prompt coverage, + and records claude-mem progressive-disclosure/retrieval-repair coverage while + keeping hook and viewer/operator workflows blocked. +- Expanded adapter-pack coverage after XY-834: the real-world external adapter + manifest now includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, + Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and deeper + qmd/OpenViking profiles, while graphify now has a scored tiny Docker smoke record. + These records carry source/setup/runtime/resource/retry metadata and typed + `blocked`, `incomplete`, `wrong_result`, or `not_encoded` states; they are not + fixture-backed or live adapter pass evidence. +- Graph/RAG scored-smoke promotion after XY-900 and representative slice after XY-929: + RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify smokes now emit scored or + typed non-pass `real_world_job` adapter reports when run. `cargo make + real-world-memory-graph-rag` adds representative graph/RAG citation, summary, + temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures: + RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with + comparison blocked; graphify is `wrong_result`; llm-wiki is not_tested; gbrain is + blocked; private and hosted graph/RAG profiles are non_goal. These reports preserve + the smoke and typed non-pass boundaries and do not create an ELF win claim against + graph/RAG strengths. +- mem0/OpenMemory history follow-up after XY-924 and XY-931: the local OSS mem0 + adapter now passes encoded preference correction history, entity-scoped + personalization, local `get_all` export-style readback, and deletion audit history. + The separate OpenMemory export-helper setup probe in `live-baseline-20260611122416` + records `blocked` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, so SDK `get_all` + is still not UI/export evidence. The comparison records ELF as a loss on preference + correction history, ties on scoped personalization and delete audit, `not_tested` + for local SDK export-style parity, `blocked` for OpenMemory UI/export, and + `non_goal` for hosted Platform export and optional graph memory in the local OSS + lane. +- Capture/write-policy live follow-up after XY-933: ELF now passes 4/4 live + `capture_integration` jobs with zero redaction leaks, source ids preserved in + source refs, write-policy redaction audit counts, evidence binding, and no secret + leakage. qmd remains `not_encoded` for this suite. agentmemory capture comparison is + blocked by mocked/in-memory storage, and claude-mem hook/viewer capture remains + blocked until Docker-contained hook/viewer capture evidence exists, so no broad + capture-breadth superiority claim is allowed. +- The benchmark runner and report publisher are checked in and Docker-isolated: + `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, + `cargo make baseline-production-private-addendum`, + `cargo make baseline-backfill-10k-docker`, + `cargo make baseline-backfill-100k-docker`, + `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, + `cargo make real-world-memory-live-adapters`, + `cargo make real-world-first-generation-oss`, and + `cargo make clean-baseline-live-docker`. Expensive 100k and long-soak profiles + are opt-in and do not run in normal checks. + +Detailed evidence and interpretation: + +- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) +- [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) +- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) +- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) +- [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) +- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) +- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) +- [Single-User Production Runbook](docs/guide/single_user_production.md) +- Benchmark contract: + [Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md). + This contract defines job-level suites for agent work. `cargo make real-world-memory` + now reports fixture-backed ELF evidence plus the external adapter coverage manifest + for the first memory-project set plus expanded RAG and graph-memory research gates. + The report still distinguishes fixture-backed, live-baseline-only, research-gate, + and true live real-world adapter evidence; ELF and qmd now execute a full encoded + live sweep, but that sweep still contains typed non-pass states and is not + full-suite parity. + +Evidence-backed position after the June 16 temporal reconciliation report: + +- ELF is better evidenced than the tested alternatives on evidence-bound writes, + deterministic ingestion boundaries, Postgres source-of-truth plus rebuildable Qdrant + indexing, scoped service APIs, and fixture-backed provenance/resume/evolution checks. +- ELF and qmd are both strong in the current encoded retrieval evidence: qmd remains + the local retrieval-debug baseline and now has full-suite live sweep evidence with + typed non-pass states, while ELF has the stronger service and provenance contract. +- ELF is still behind or not yet proven on full-suite live real-world pass parity, + private-corpus production quality, credentialed production-ops gates, + qmd-style local debug knobs, agentmemory/claude-mem/OpenMemory-style capture and + continuity UX, + OpenViking-style context trajectory, and hosted managed memory. + +Quick comparison snapshot (objective/high-level). +This table compares capability coverage, not overall project quality. + +| Capability | ELF | agentmemory | OpenViking | mem0 | qmd | claude-mem | memsearch | +| ---------- | --- | ----------- | ---------- | ---- | --- | ---------- | --------- | +| Local-first self-hosted workflow | ✅ | ✅ | ✅ | ✅ (OpenMemory) | ✅ | ✅ | ✅ | +| MCP integration | ✅ | ✅ | — | ✅ (OpenMemory) | ✅ | ✅ | ⚠️ | +| CLI-first developer workflow | — | ✅ | ✅ | — | ✅ | ⚠️ | ✅ | +| HTTP API service surface | ✅ | ✅ | ✅ | ✅ | ⚠️ (MCP Streamable HTTP) | ✅ | — | +| Query expansion or query rewriting | ✅ | ⚠️ | ✅ | ⚠️ | ✅ | — | — | +| LLM reranking stage | ✅ | ⚠️ | ⚠️ | ⚠️ | ✅ | — | — | +| Hybrid dense + sparse retrieval | ✅ | ✅ | ✅ | ⚠️ | ✅ | ✅ | ✅ | +| Progressive disclosure style retrieval | ✅ | ⚠️ | ✅ | — | — | ✅ | ⚠️ | +| Evidence-bound memory writes | ✅ | — | — | — | — | — | — | +| Deterministic and LLM-ingestion boundary | ✅ | ⚠️ | ⚠️ | ⚠️ | — | — | — | +| Source-of-truth + rebuildable derived index | ✅ | ⚠️ | ✅ | ⚠️ | ⚠️ | ⚠️ | ✅ | +| Hierarchical/recursive retrieval strategy | ⚠️ (in progress) | ⚠️ | ✅ | ⚠️ | ⚠️ | ⚠️ | ⚠️ | +| Progressive context loading (L0/L1/L2 style) | ⚠️ (in progress) | ⚠️ | ✅ | ⚠️ | — | ⚠️ | — | +| Built-in web memory inspector/viewer | ✅ | ✅ | — | ✅ (OpenMemory) | — | ✅ | — | +| Hosted managed option | — | — | — | ✅ | — | — | — | +| Multi-tenant scope semantics | ✅ | ⚠️ | ⚠️ | ✅ | — | — | — | +| TTL/lifecycle policy controls | ✅ | ⚠️ | ⚠️ | ✅ | — | ⚠️ | — | +| Graph memory mode | ⚠️ (graph-lite: structured relations persisted; optional search `relation_context`) | ⚠️ | ⚠️ (URI-link relations) | ✅ (optional) | — | — | — | + +Legend: `✅` built-in and documented; `⚠️` partial, optional, or in-progress; `—` not a first-class documented capability. + +Project signature strengths (what each does especially well): + +| Project | Signature strengths | Potential ELF adoption value | +| ------- | ------------------- | ---------------------------- | +| ELF | Evidence-bound writes, deterministic ingestion boundary, SoT + rebuildable index, eval tooling | Keep as core differentiators while extending retrieval and UX | +| agentmemory | Cross-agent hooks, MCP/REST packaging, local viewer, iii console observability, coding-agent continuity benchmarks | Use as adapter/baseline and UX reference, not a replacement for ELF provenance semantics | +| OpenViking | Filesystem-like context model (`viking://`), hierarchical retrieval, staged retrieval trajectory | Improve query planning, recursive retrieval, and explainable stage outputs | +| mem0 | Broad ecosystem (SDK + hosted + OpenMemory), multi-entity scope, lifecycle + optional graph memory | Strengthen event/history APIs and additive graph context channel | +| qmd | High-quality local retrieval pipeline (query expansion + weighted fusion + rerank), strong CLI/MCP workflow | Borrow transparent routing/fusion knobs and local debugging ergonomics | +| claude-mem | Progressive disclosure UX, automatic capture loop, practical local viewer/inspection workflow | Add operator-facing viewer/status/trace surfaces for faster tuning | +| memsearch | Markdown-first canonical store, incremental reindex, practical hybrid retrieval | Reinforce ingest/index consistency and developer-friendly local workflows | + +Detailed comparison, mechanism-level analysis, and source map: + +- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) +- [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) +- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) +- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) +- [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md) +- [Temporal History Competitor Gap Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) +- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) +- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) +- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) +- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) +- [Detailed External Comparison](docs/guide/research/comparison_external_projects.md) +- [Research Projects Inventory](docs/guide/research/research_projects_inventory.md) +- [Agent Memory Selection Research Run](docs/research/2026-06-08-agent-memory-selection.json) +- [Real-World Benchmark Dimension Research Run](docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json) +- [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json) + +Latest real-world benchmark report: June 16, 2026. Latest external research refresh: +June 11, 2026; June 16 adds live temporal reconciliation, live consolidation +self-check evidence, and fixture-backed scheduled-memory task scoring. + +## Documentation + +- Start here: `docs/index.md` +- Operational guide index: `docs/guide/index.md` +- Single-user production runbook: + [docs/guide/single_user_production.md](docs/guide/single_user_production.md) +- Benchmarking guides and reports: `docs/guide/benchmarking/index.md` +- Research index: `docs/guide/research/index.md` +- Specifications: `docs/spec/index.md` +- System contract: `docs/spec/system_elf_memory_service_v2.md` +- Ingest policy: `policy_decision` values (`remember`, `update`, `ignore`, `reject`) are returned for each note result in `add_note` and `add_event`. +- All ingest decisions are also written to `memory_ingest_decisions` with policy inputs and thresholds for auditability. +- Evaluation guide: `docs/guide/evaluation.md` +- Integration testing: `docs/guide/integration-testing.md` -### ELF-Only Advantages - -- Evidence binding with verbatim quote checks. -- Postgres is the source of truth; vector index is fully rebuildable. -- Deterministic `add_note` and LLM-only `add_event` semantics. -- Query expansion modes (`off`, `always`, `dynamic`) for cost/latency control. -- Dedicated evaluation CLI to measure retrieval quality. - -### Learnings Integrated - -- Hybrid retrieval + rerank as a first-class pipeline, inspired by qmd's local hybrid stack. -- Progressive cost control for retrieval, informed by claude-mem's progressive disclosure approach. - -## Quickstart - -### Requirements - -- Postgres with pgvector -- Qdrant -- Provider endpoints for embeddings, rerank, and extraction - -### Run +## Development ```sh -cp elf.example.toml elf.toml -# Fill in providers and storage values in elf.toml - -cargo run -p elf-worker -- -c elf.toml -cargo run -p elf-api -- -c elf.toml -cargo run -p elf-mcp -- -c elf.toml +cargo make fmt +cargo make check +cargo make test-rust ``` -### Evaluate +For integration and E2E workflows, use `docs/guide/getting_started.md` and `docs/guide/integration-testing.md`. -```sh -cargo run -p elf-eval -- -c elf.toml -i path/to/eval.json -``` +## Support Me -## Configuration +If you find this project helpful and would like to support its development, you can buy me a coffee! -See `elf.example.toml` and `docs/spec/system_elf_memory_service_v1.md` for the full contract. All config is explicit and required; no environment defaults are allowed. Embedding dimensions must match the Qdrant vector dimension. +Your support is greatly appreciated and motivates me to keep improving this project. -## Development +- **Fiat** + - [Ko-fi](https://ko-fi.com/hack_ink) + - [Afdian](https://afdian.com/a/hack_ink) +- **Crypto** + - **Bitcoin** + - `bc1pedlrf67ss52md29qqkzr2avma6ghyrt4jx9ecp9457qsl75x247sqcp43c` + - **Ethereum** + - `0x3e25247CfF03F99a7D83b28F207112234feE73a6` + - **Polkadot** + - `156HGo9setPcU2qhFMVWLkcmtCEGySLwNqa3DaEiYSWtte4Y` -```sh -cargo make fmt -cargo make lint -cargo make test -``` +Thank you for your support! -## Support +## Appreciation -If you find this project helpful and want to support its development: +We would like to extend our heartfelt gratitude to the following projects and contributors: -- Ko-fi: https://ko-fi.com/hack_ink -- Afdian: https://afdian.com/a/hack_ink +- The Rust community for their continuous support and development of the Rust ecosystem. -- Bitcoin: `bc1pedlrf67ss52md29qqkzr2avma6ghyrt4jx9ecp9457qsl75x247sqcp43c` -- Ethereum: `0x3e25247CfF03F99a7D83b28F207112234feE73a6` -- Polkadot: `156HGo9setPcU2qhFMVWLkcmtCEGySLwNqa3DaEiYSWtte4Y` +## Additional Acknowledgements -## Appreciation - -- The Rust community for their continuous support and development of the Rust ecosystem. +- None.
diff --git a/apps/elf-api/Cargo.toml b/apps/elf-api/Cargo.toml index 854d8c15..6c198ff2 100644 --- a/apps/elf-api/Cargo.toml +++ b/apps/elf-api/Cargo.toml @@ -2,27 +2,34 @@ build = "../../build.rs" edition = "2024" name = "elf-api" -version = "0.1.0" +version = "0.2.0" [dependencies] axum = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } -elf-cli = { path = "../../packages/elf-cli" } -elf-config = { path = "../../packages/elf-config" } -elf-service = { path = "../../packages/elf-service" } -elf-storage = { path = "../../packages/elf-storage" } serde = { workspace = true } serde_json = { workspace = true } +time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +utoipa = { workspace = true } +utoipa-scalar = { workspace = true } uuid = { workspace = true } +elf-cli = { workspace = true } +elf-config = { workspace = true } +elf-domain = { workspace = true } +elf-service = { workspace = true } +elf-storage = { workspace = true } + [build-dependencies] vergen-gitcl = { workspace = true } [dev-dependencies] -elf-testkit = { path = "../../packages/elf-testkit" } -sqlx = { workspace = true } -tower = { version = "0.5" } +qdrant-client = { workspace = true } +sqlx = { workspace = true } +tower = { workspace = true } + +elf-testkit = { workspace = true } diff --git a/apps/elf-api/src/lib.rs b/apps/elf-api/src/lib.rs index 6e07f592..46eadb56 100644 --- a/apps/elf-api/src/lib.rs +++ b/apps/elf-api/src/lib.rs @@ -1,18 +1,21 @@ +#![cfg_attr(test, allow(unused_crate_dependencies))] + +//! HTTP API application bootstrap for ELF. + pub mod routes; pub mod state; -// std use std::{net::SocketAddr, path::PathBuf}; -// crates.io use clap::Parser; -use color_eyre::eyre; +use color_eyre::{Result, eyre}; use tokio::net::TcpListener; use tracing_subscriber::EnvFilter; -// self use crate::state::AppState; +use elf_config::Config; +/// CLI arguments for launching the ELF API service. #[derive(Debug, Parser)] #[command( version = elf_cli::VERSION, @@ -20,42 +23,68 @@ use crate::state::AppState; styles = elf_cli::styles(), )] pub struct Args { + /// Path to the ELF configuration file. #[arg(long, short = 'c', value_name = "FILE")] pub config: PathBuf, } -pub async fn run(args: Args) -> color_eyre::Result<()> { +/// Starts the public and admin HTTP servers. +pub async fn run(args: Args) -> Result<()> { let config = elf_config::load(&args.config)?; - init_tracing(&config)?; let http_addr: SocketAddr = config.service.http_bind.parse()?; let admin_addr: SocketAddr = config.service.admin_bind.parse()?; + + init_tracing(&config)?; + if config.security.bind_localhost_only && !http_addr.ip().is_loopback() { return Err(eyre::eyre!( "http_bind must be a loopback address when bind_localhost_only is true." )); } + + let auth_mode = config.security.auth_mode.trim(); + + if !http_addr.ip().is_loopback() { + match auth_mode { + "off" => { + return Err(eyre::eyre!( + "security.auth_mode=off is only allowed when http_bind is a loopback address." + )); + }, + "static_keys" => {}, + _ => { + return Err(eyre::eyre!("security.auth_mode must be one of off or static_keys.")); + }, + } + } if !admin_addr.ip().is_loopback() { return Err(eyre::eyre!("admin_bind must be a loopback address.")); } + let state = AppState::new(config).await?; let app = routes::router(state.clone()); let admin_app = routes::admin_router(state); - let http_listener = TcpListener::bind(http_addr).await?; + tracing::info!(%http_addr, "HTTP server listening."); - let http_server = axum::serve(http_listener, app); + let http_server = axum::serve(http_listener, app); let admin_listener = TcpListener::bind(admin_addr).await?; + tracing::info!(%admin_addr, "Admin server listening."); + let admin_server = axum::serve(admin_listener, admin_app); tokio::try_join!(http_server, admin_server)?; + Ok(()) } -fn init_tracing(config: &elf_config::Config) -> color_eyre::Result<()> { +fn init_tracing(config: &Config) -> Result<()> { let filter = EnvFilter::try_new(&config.service.log_level).unwrap_or_else(|_| EnvFilter::new("info")); + tracing_subscriber::fmt().with_env_filter(filter).init(); + Ok(()) } diff --git a/apps/elf-api/src/main.rs b/apps/elf-api/src/main.rs index f2e73ba2..db9d1b52 100644 --- a/apps/elf-api/src/main.rs +++ b/apps/elf-api/src/main.rs @@ -1,11 +1,17 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Binary entrypoint for the ELF HTTP API app. + use clap::Parser; -// self +use color_eyre::Result; + use elf_api::Args; #[tokio::main] -async fn main() -> color_eyre::Result<()> { +async fn main() -> Result<()> { color_eyre::install()?; + let args = Args::parse(); + elf_api::run(args).await } diff --git a/apps/elf-api/src/routes.rs b/apps/elf-api/src/routes.rs index 6aa418cb..d255d6cf 100644 --- a/apps/elf-api/src/routes.rs +++ b/apps/elf-api/src/routes.rs @@ -1,177 +1,538 @@ -// crates.io +//! HTTP route builders and request handlers. + use axum::{ Json, Router, + body::{self, Body}, extract::{ - Path, Query, State, + DefaultBodyLimit, Extension, Path, Query, State, rejection::{JsonRejection, QueryRejection}, }, - http::StatusCode, + http::{ + HeaderMap, HeaderValue, Request, StatusCode, + header::{CACHE_CONTROL, CONTENT_LENGTH, CONTENT_TYPE}, + }, + middleware::{self, Next}, response::{IntoResponse, Response}, - routing::{get, post}, + routing, }; -use serde::Serialize; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use utoipa::{OpenApi, ToSchema}; +use utoipa_scalar::{Scalar, Servable}; +use uuid::Uuid; -// self use crate::state::AppState; -use elf_service::ServiceError; +use elf_config::{SecurityAuthKey, SecurityAuthRole}; +use elf_domain::{ + consolidation::{ + ConsolidationInputRef, ConsolidationLineage, ConsolidationReviewAction, + ConsolidationReviewState, + }, + english_gate, + knowledge::KnowledgePageKind, + writegate::WritePolicy, +}; +use elf_service::{ + AddEventRequest, AddEventResponse, AddNoteInput, AddNoteRequest, AddNoteResponse, + AdminGraphPredicateAliasAddRequest, AdminGraphPredicateAliasesListRequest, + AdminGraphPredicateAliasesResponse, AdminGraphPredicatePatchRequest, + AdminGraphPredicateResponse, AdminGraphPredicatesListRequest, AdminGraphPredicatesListResponse, + AdminIngestionProfileCreateRequest, AdminIngestionProfileDefaultGetRequest, + AdminIngestionProfileDefaultResponse, AdminIngestionProfileDefaultSetRequest, + AdminIngestionProfileGetRequest, AdminIngestionProfileListRequest, + AdminIngestionProfileResponse, AdminIngestionProfileVersionsListRequest, + AdminIngestionProfileVersionsListResponse, AdminIngestionProfilesListResponse, + ConsolidationProposalGetRequest, ConsolidationProposalInput, ConsolidationProposalResponse, + ConsolidationProposalReviewRequest, ConsolidationProposalsListRequest, + ConsolidationProposalsListResponse, ConsolidationRunCreateRequest, + ConsolidationRunCreateResponse, ConsolidationRunGetRequest, ConsolidationRunResponse, + ConsolidationRunsListRequest, ConsolidationRunsListResponse, CoreBlockAttachRequest, + CoreBlockAttachResponse, CoreBlockDetachRequest, CoreBlockDetachResponse, + CoreBlockUpsertRequest, CoreBlockUpsertResponse, CoreBlocksGetRequest, CoreBlocksResponse, + DeleteRequest, DeleteResponse, DocType, DocsExcerptResponse, DocsExcerptsGetRequest, + DocsGetRequest, DocsGetResponse, DocsPutRequest, DocsPutResponse, DocsSearchL0Request, + DocsSearchL0Response, Error, EventMessage, GranteeKind, GraphQueryEntityRef, + GraphQueryPredicateRef, GraphQueryRequest, GraphQueryResponse, IngestionProfileSelector, + KnowledgePageGetRequest, KnowledgePageLintRequest, KnowledgePageLintResponse, + KnowledgePageRebuildRequest, KnowledgePageRebuildResponse, KnowledgePageResponse, + KnowledgePageSearchRequest, KnowledgePageSearchResponse, KnowledgePagesListRequest, + KnowledgePagesListResponse, ListRequest, ListResponse, MemoryHistoryGetRequest, + MemoryHistoryResponse, NoteFetchRequest, NoteFetchResponse, NoteProvenanceBundleResponse, + NoteProvenanceGetRequest, PayloadLevel, PublishNoteRequest, QueryPlan, RankingRequestOverride, + RebuildReport, SearchDetailsRequest, SearchDetailsResult, SearchExplainRequest, + SearchExplainResponse, SearchIndexItem, SearchRequest, SearchResponse, SearchSessionGetRequest, + SearchTimelineGroup, SearchTimelineRequest, SearchTrajectoryResponse, SearchTrajectorySummary, + ShareScope, SpaceGrantRevokeRequest, SpaceGrantRevokeResponse, SpaceGrantUpsertRequest, + SpaceGrantsListRequest, TextPositionSelector, TextQuoteSelector, TraceBundleGetRequest, + TraceBundleResponse, TraceGetRequest, TraceGetResponse, TraceRecentListRequest, + TraceRecentListResponse, TraceTrajectoryGetRequest, UnpublishNoteRequest, UpdateRequest, + UpdateResponse, search::TraceBundleMode, +}; -pub fn router(state: AppState) -> Router { - Router::new() - .route("/health", get(health)) - .route("/v1/memory/add_note", post(add_note)) - .route("/v1/memory/add_event", post(add_event)) - .route("/v1/memory/search", post(search)) - .route("/v1/memory/search/explain", get(search_explain)) - .route("/v1/memory/notes/:note_id", get(get_note)) - .route("/v1/memory/list", get(list)) - .route("/v1/memory/update", post(update)) - .route("/v1/memory/delete", post(delete)) - .with_state(state) +/// JSON OpenAPI contract route. +pub const OPENAPI_JSON_PATH: &str = "/openapi.json"; +/// Scalar API reference route. +pub const SCALAR_DOCS_PATH: &str = "/docs"; +/// Local read-only admin viewer route. +pub const ADMIN_VIEWER_PATH: &str = "/viewer"; + +const HEADER_TENANT_ID: &str = "X-ELF-Tenant-Id"; +const HEADER_PROJECT_ID: &str = "X-ELF-Project-Id"; +const HEADER_AGENT_ID: &str = "X-ELF-Agent-Id"; +const HEADER_REQUEST_ID: &str = "X-ELF-Request-Id"; +const HEADER_READ_PROFILE: &str = "X-ELF-Read-Profile"; +const HEADER_AUTHORIZATION: &str = "Authorization"; +const HEADER_TRUSTED_TOKEN_ID: &str = "X-ELF-Trusted-Token-Id"; +const MAX_CONTEXT_HEADER_CHARS: usize = 128; +const MAX_REQUEST_BYTES: usize = 1_048_576; +const MAX_DOC_REQUEST_BYTES: usize = 4 * 1_024 * 1_024; +const MAX_NOTES_PER_INGEST: usize = 256; +const MAX_MESSAGES_PER_EVENT: usize = 256; +const MAX_MESSAGE_CHARS: usize = 16_384; +const MAX_QUERY_CHARS: usize = 2_048; +const DOC_STATUSES: [&str; 2] = ["active", "deleted"]; +const MAX_NOTE_IDS_PER_DETAILS: usize = 256; +const MAX_TOP_K: u32 = 100; +const MAX_CANDIDATE_K: u32 = 1_000; +const MAX_ERROR_LOG_CHARS: usize = 1_024; +const VIEWER_HTML: &str = include_str!("../static/viewer.html"); + +/// Generated OpenAPI document for the ELF HTTP API. +#[derive(OpenApi)] +#[openapi( + info( + title = "ELF API", + version = env!("CARGO_PKG_VERSION"), + description = "Evidence-linked fact memory HTTP and admin API." + ), + paths( + health, + notes_ingest, + events_ingest, + docs_put, + docs_get, + docs_search_l0, + docs_excerpts_get, + core_blocks_get, + admin_core_block_upsert, + admin_core_block_attach, + admin_core_block_detach, + graph_query, + searches_create, + searches_get, + searches_timeline, + searches_notes, + notes_list, + notes_get, + notes_patch, + notes_delete, + notes_publish, + notes_unpublish, + space_grants_list, + space_grant_upsert, + space_grant_revoke, + admin_ingestion_profiles_list, + admin_ingestion_profile_create, + admin_ingestion_profile_get, + admin_ingestion_profile_versions_list, + admin_ingestion_profile_default_get, + admin_ingestion_profile_default_set, + consolidation_run_create, + consolidation_runs_list, + consolidation_run_get, + consolidation_proposals_list, + consolidation_proposal_get, + consolidation_proposal_review, + knowledge_page_rebuild, + knowledge_pages_list, + knowledge_pages_search, + knowledge_page_get, + knowledge_page_lint, + rebuild_qdrant, + searches_raw, + trace_recent_list, + trace_get, + trace_bundle_get, + trace_trajectory_get, + trace_item_get, + admin_graph_predicates_list, + admin_graph_predicate_patch, + admin_graph_predicate_alias_add, + admin_graph_predicate_aliases_list, + admin_note_provenance_get, + admin_note_history_get, + ), + components(schemas( + AdminIngestionProfileDefaultResponseV2, + AdminIngestionProfileDefaultSetBody, + ErrorBody, + )), + tags( + (name = "health", description = "Health and process liveness."), + (name = "notes", description = "Memory note ingestion, listing, mutation, and sharing."), + (name = "events", description = "Event ingestion through the extractor pipeline."), + (name = "docs", description = "Document extension ingestion, search, and excerpt retrieval."), + (name = "search", description = "Progressive search sessions and raw search diagnostics."), + (name = "graph", description = "Graph query and predicate administration."), + (name = "consolidation", description = "Reviewable derived consolidation proposals."), + (name = "knowledge", description = "Derived knowledge page rebuild and lint readback."), + (name = "admin", description = "Local admin and operator inspection routes."), + ) +)] +pub struct ApiDoc; + +#[derive(Clone, Debug)] +struct RequestContext { + tenant_id: String, + project_id: String, + agent_id: String, } +impl RequestContext { + fn from_headers(headers: &HeaderMap) -> Result { + let tenant_id = required_header(headers, HEADER_TENANT_ID)?; + let project_id = required_header(headers, HEADER_PROJECT_ID)?; + let agent_id = required_header(headers, HEADER_AGENT_ID)?; -pub fn admin_router(state: AppState) -> Router { - Router::new().route("/v1/admin/rebuild_qdrant", post(rebuild_qdrant)).with_state(state) + Ok(Self { tenant_id, project_id, agent_id }) + } } -async fn health() -> StatusCode { - StatusCode::OK +#[derive(Clone, Debug, Deserialize)] +struct NotesIngestRequest { + scope: String, + notes: Vec, } -async fn add_note( - State(state): State, - payload: Result, JsonRejection>, -) -> Result, ApiError> { - let Json(payload) = payload.map_err(|err| { - tracing::warn!(error = %err, "Invalid request payload."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid request payload.".to_string(), - None, - ) - })?; - let response = state.service.add_note(payload).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct EventsIngestRequest { + scope: Option, + dry_run: Option, + ingestion_profile: Option, + messages: Vec, } -async fn add_event( - State(state): State, - payload: Result, JsonRejection>, -) -> Result, ApiError> { - let Json(payload) = payload.map_err(|err| { - tracing::warn!(error = %err, "Invalid request payload."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid request payload.".to_string(), - None, - ) - })?; - let response = state.service.add_event(payload).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct DocsPutBody { + scope: String, + doc_type: Option, + title: Option, + #[serde(default)] + source_ref: Value, + + write_policy: Option, + content: String, } -async fn search( - State(state): State, - payload: Result, JsonRejection>, -) -> Result, ApiError> { - let Json(payload) = payload.map_err(|err| { - tracing::warn!(error = %err, "Invalid request payload."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid request payload.".to_string(), - None, - ) - })?; - let response = state.service.search(payload).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct CoreBlockUpsertBody { + block_id: Option, + scope: String, + key: String, + title: String, + content: String, + #[serde(default)] + source_ref: Value, + reason: Option, } -async fn search_explain( - State(state): State, - query: Result, QueryRejection>, -) -> Result, ApiError> { - let Query(query) = query.map_err(|err| { - tracing::warn!(error = %err, "Invalid query parameters."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid query parameters.".to_string(), - None, - ) - })?; - let response = state.service.search_explain(query).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct CoreBlockAttachBody { + target_agent_id: String, + read_profile: String, + reason: Option, } -async fn get_note( - State(state): State, - Path(note_id): Path, -) -> Result, ApiError> { - let response = state.service.get_note(elf_service::NoteFetchRequest { note_id }).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct DocsSearchL0Body { + query: String, + scope: Option, + status: Option, + doc_type: Option, + sparse_mode: Option, + domain: Option, + repo: Option, + agent_id: Option, + thread_id: Option, + updated_after: Option, + updated_before: Option, + ts_gte: Option, + ts_lte: Option, + top_k: Option, + candidate_k: Option, + explain: Option, } -async fn list( - State(state): State, - query: Result, QueryRejection>, -) -> Result, ApiError> { - let Query(query) = query.map_err(|err| { - tracing::warn!(error = %err, "Invalid query parameters."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid query parameters.".to_string(), - None, - ) - })?; - let response = state.service.list(query).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct DocsExcerptsGetBody { + doc_id: Uuid, + level: String, + chunk_id: Option, + quote: Option, + position: Option, + explain: Option, } -async fn update( - State(state): State, - payload: Result, JsonRejection>, -) -> Result, ApiError> { - let Json(payload) = payload.map_err(|err| { - tracing::warn!(error = %err, "Invalid request payload."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid request payload.".to_string(), - None, - ) - })?; - let response = state.service.update(payload).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct GraphQueryBody { + subject: GraphQueryEntityRef, + predicate: Option, + scopes: Option>, + as_of: Option, + limit: Option, + explain: Option, } -async fn delete( - State(state): State, - payload: Result, JsonRejection>, -) -> Result, ApiError> { - let Json(payload) = payload.map_err(|err| { - tracing::warn!(error = %err, "Invalid request payload."); - json_error( - StatusCode::BAD_REQUEST, - "INVALID_REQUEST", - "Invalid request payload.".to_string(), - None, - ) - })?; - let response = state.service.delete(payload).await?; - Ok(Json(response)) +#[derive(Clone, Debug, Deserialize)] +struct SearchCreateRequest { + mode: SearchMode, + query: String, + top_k: Option, + candidate_k: Option, + + filter: Option, + payload_level: Option, + ranking: Option, } -async fn rebuild_qdrant( - State(state): State, -) -> Result, ApiError> { - let response = state.service.rebuild_qdrant().await?; - Ok(Json(response)) +#[derive(Clone, Debug, Serialize)] +struct SearchIndexResponseV2 { + mode: SearchMode, + trace_id: Uuid, + search_id: Uuid, + #[serde(with = "elf_service::time_serde")] + expires_at: OffsetDateTime, + items: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + trajectory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + query_plan: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct SearchCreateResponseV2 { + mode: SearchMode, + trace_id: Uuid, + search_id: Uuid, + #[serde(with = "elf_service::time_serde")] + expires_at: OffsetDateTime, + items: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + trajectory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + query_plan: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct SearchSessionGetQuery { + payload_level: Option, + top_k: Option, + touch: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct SearchTimelineQuery { + payload_level: Option, + group_by: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct SearchTimelineResponseV2 { + search_id: Uuid, + #[serde(with = "elf_service::time_serde")] + expires_at: OffsetDateTime, + groups: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct SearchDetailsBody { + note_ids: Vec, + payload_level: Option, + record_hits: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdminIngestionProfileCreateBody { + profile_id: String, + version: Option, + profile: Value, + created_by: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdminIngestionProfileGetQuery { + version: Option, +} + +#[derive(Clone, Debug, Deserialize, ToSchema)] +struct AdminIngestionProfileDefaultSetBody { + profile_id: String, + version: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationRunCreateBody { + job_kind: String, + input_refs: Vec, + #[serde(default = "empty_json_object")] + source_snapshot: Value, + lineage: ConsolidationLineage, + #[serde(default)] + proposals: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationRunsListQuery { + limit: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationProposalsListQuery { + run_id: Option, + review_state: Option, + limit: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationProposalReviewBody { + action: ConsolidationReviewAction, + review_comment: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct KnowledgePageRebuildBody { + page_kind: KnowledgePageKind, + page_key: String, + title: Option, + #[serde(default)] + note_ids: Vec, + #[serde(default)] + event_ids: Vec, + #[serde(default)] + relation_ids: Vec, + #[serde(default)] + proposal_ids: Vec, + #[serde(default = "empty_json_object")] + provider_metadata: Value, +} + +#[derive(Clone, Debug, Deserialize)] +struct KnowledgePagesListQuery { + page_kind: Option, + limit: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct KnowledgePagesSearchBody { + query: String, + page_kind: Option, + limit: Option, +} + +#[derive(Clone, Debug, Serialize, ToSchema)] +struct AdminIngestionProfileDefaultResponseV2 { + profile_id: String, + version: Option, + updated_at: String, +} + +#[derive(Clone, Debug, Serialize)] +struct SearchDetailsResponseV2 { + search_id: Uuid, + #[serde(with = "elf_service::time_serde")] + expires_at: OffsetDateTime, + results: Vec, } -#[derive(Debug, Serialize)] +#[derive(Clone, Debug, Deserialize)] +struct NotesListQuery { + scope: Option, + status: Option, + r#type: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct NotePatchRequest { + text: Option, + importance: Option, + confidence: Option, + ttl_days: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdminGraphPredicatesListQuery { + scope: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdminGraphPredicatePatchBody { + status: Option, + cardinality: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdminGraphPredicateAliasAddBody { + alias: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct TraceRecentListQuery { + limit: Option, + cursor_created_at: Option, + cursor_trace_id: Option, + agent_id: Option, + read_profile: Option, + created_after: Option, + created_before: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct TraceBundleGetQuery { + mode: Option, + stage_items_limit: Option, + candidates_limit: Option, +} + +#[derive(Clone, Debug, Deserialize)] +struct ShareScopeBody { + space: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct SpaceGrantUpsertBody { + grantee_kind: GranteeKind, + grantee_agent_id: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct PublishResponseV2 { + note_id: Uuid, + space: String, +} + +#[derive(Clone, Debug, Serialize)] +struct SpaceGrantUpsertResponseV2 { + space: String, + grantee_kind: GranteeKind, + grantee_agent_id: Option, + granted: bool, +} + +#[derive(Clone, Debug, Serialize)] +struct SpaceGrantItemV2 { + space: String, + grantee_kind: GranteeKind, + grantee_agent_id: Option, + granted_by_agent_id: String, + granted_at: OffsetDateTime, +} + +#[derive(Clone, Debug, Serialize)] +struct SpaceGrantsListResponseV2 { + grants: Vec, +} + +#[derive(Debug, Serialize, ToSchema)] struct ErrorBody { error_code: String, message: String, @@ -179,13 +540,12 @@ struct ErrorBody { } #[derive(Debug)] -pub struct ApiError { +struct ApiError { status: StatusCode, error_code: String, message: String, fields: Option>, } - impl ApiError { fn new( status: StatusCode, @@ -197,30 +557,28 @@ impl ApiError { } } -pub fn json_error( - status: StatusCode, - code: &str, - message: impl Into, - fields: Option>, -) -> ApiError { - ApiError::new(status, code, message, fields) -} - -impl From for ApiError { - fn from(err: ServiceError) -> Self { +impl From for ApiError { + fn from(err: Error) -> Self { match err { - ServiceError::NonEnglishInput { field } => json_error( + Error::NonEnglishInput { field } => json_error( StatusCode::UNPROCESSABLE_ENTITY, "NON_ENGLISH_INPUT", - "CJK detected; upstream must canonicalize to English before calling ELF.", + "Non-English input detected; upstream must canonicalize to English before calling ELF.", Some(vec![field]), ), - ServiceError::InvalidRequest { message } => + Error::InvalidRequest { message } => json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", message, None), - ServiceError::ScopeDenied { message } => + Error::ScopeDenied { message } => json_error(StatusCode::FORBIDDEN, "SCOPE_DENIED", message, None), - ServiceError::Provider { message } => { - tracing::error!(error = %message, "Provider error."); + Error::NotFound { message } => + json_error(StatusCode::NOT_FOUND, "NOT_FOUND", message, None), + Error::Conflict { message } => + json_error(StatusCode::CONFLICT, "CONFLICT", message, None), + Error::Provider { message } => { + let sanitized = sanitize_log_text(message.as_str()); + + tracing::error!(error = %sanitized, "Provider error."); + json_error( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", @@ -228,8 +586,11 @@ impl From for ApiError { None, ) }, - ServiceError::Storage { message } => { - tracing::error!(error = %message, "Storage error."); + Error::Storage { message } => { + let sanitized = sanitize_log_text(message.as_str()); + + tracing::error!(error = %sanitized, "Storage error."); + json_error( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", @@ -237,8 +598,11 @@ impl From for ApiError { None, ) }, - ServiceError::Qdrant { message } => { - tracing::error!(error = %message, "Qdrant error."); + Error::Qdrant { message } => { + let sanitized = sanitize_log_text(message.as_str()); + + tracing::error!(error = %sanitized, "Qdrant error."); + json_error( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", @@ -254,6 +618,3358 @@ impl IntoResponse for ApiError { fn into_response(self) -> Response { let body = ErrorBody { error_code: self.error_code, message: self.message, fields: self.fields }; + (self.status, Json(body)).into_response() } } + +#[derive(Clone, Copy, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum SearchMode { + QuickFind, + PlannedSearch, +} + +/// Builds the authenticated public API router. +pub fn router(state: AppState) -> Router { + let auth_state = state.clone(); + let api_router = Router::new() + .route("/health", routing::get(health)) + .route("/v2/notes/ingest", routing::post(notes_ingest)) + .route("/v2/events/ingest", routing::post(events_ingest)) + .route("/v2/core-blocks", routing::get(core_blocks_get)) + .route("/v2/searches", routing::post(searches_create)) + .route("/v2/searches/{search_id}", routing::get(searches_get)) + .route("/v2/searches/{search_id}/timeline", routing::get(searches_timeline)) + .route("/v2/searches/{search_id}/notes", routing::post(searches_notes)) + .route("/v2/graph/query", routing::post(graph_query)) + .route("/v2/notes", routing::get(notes_list)) + .route( + "/v2/notes/{note_id}", + routing::get(notes_get).patch(notes_patch).delete(notes_delete), + ) + .route("/v2/notes/{note_id}/publish", routing::post(notes_publish)) + .route("/v2/notes/{note_id}/unpublish", routing::post(notes_unpublish)) + .route( + "/v2/spaces/{space}/grants", + routing::get(space_grants_list).post(space_grant_upsert), + ) + .route("/v2/spaces/{space}/grants/revoke", routing::post(space_grant_revoke)) + .with_state(state.clone()) + .layer(DefaultBodyLimit::max(MAX_REQUEST_BYTES)); + let docs_router = Router::new() + .route("/v2/docs", routing::post(docs_put)) + .route("/v2/docs/{doc_id}", routing::get(docs_get)) + .route("/v2/docs/search/l0", routing::post(docs_search_l0)) + .route("/v2/docs/excerpts", routing::post(docs_excerpts_get)) + .with_state(state) + .layer(DefaultBodyLimit::max(MAX_DOC_REQUEST_BYTES)); + + Router::new() + .merge(contract_router()) + .merge(api_router) + .merge(docs_router) + .layer(middleware::from_fn_with_state(auth_state, api_auth_middleware)) +} + +/// Builds the authenticated admin API router. +pub fn admin_router(state: AppState) -> Router { + let auth_state = state.clone(); + let protected_router = Router::new() + .route("/v2/admin/searches", routing::post(searches_create)) + .route("/v2/admin/searches/{search_id}", routing::get(searches_get)) + .route("/v2/admin/searches/{search_id}/timeline", routing::get(searches_timeline)) + .route("/v2/admin/searches/{search_id}/notes", routing::post(searches_notes)) + .route("/v2/admin/core-blocks", routing::post(admin_core_block_upsert)) + .route( + "/v2/admin/core-blocks/{block_id}/attachments", + routing::post(admin_core_block_attach), + ) + .route( + "/v2/admin/core-blocks/attachments/{attachment_id}", + routing::delete(admin_core_block_detach), + ) + .route("/v2/admin/notes", routing::get(notes_list)) + .route("/v2/admin/notes/{note_id}", routing::get(notes_get)) + .route( + "/v2/admin/events/ingestion-profiles/default", + routing::get(admin_ingestion_profile_default_get) + .put(admin_ingestion_profile_default_set), + ) + .route( + "/v2/admin/events/ingestion-profiles/{profile_id}/versions", + routing::get(admin_ingestion_profile_versions_list), + ) + .route( + "/v2/admin/events/ingestion-profiles/{profile_id}", + routing::get(admin_ingestion_profile_get), + ) + .route( + "/v2/admin/events/ingestion-profiles", + routing::get(admin_ingestion_profiles_list).post(admin_ingestion_profile_create), + ) + .route( + "/v2/admin/consolidation/runs", + routing::get(consolidation_runs_list).post(consolidation_run_create), + ) + .route("/v2/admin/consolidation/runs/{run_id}", routing::get(consolidation_run_get)) + .route("/v2/admin/consolidation/proposals", routing::get(consolidation_proposals_list)) + .route( + "/v2/admin/consolidation/proposals/{proposal_id}", + routing::get(consolidation_proposal_get), + ) + .route( + "/v2/admin/consolidation/proposals/{proposal_id}/review", + routing::post(consolidation_proposal_review), + ) + .route("/v2/admin/knowledge/pages", routing::get(knowledge_pages_list)) + .route("/v2/admin/knowledge/pages/rebuild", routing::post(knowledge_page_rebuild)) + .route("/v2/admin/knowledge/pages/search", routing::post(knowledge_pages_search)) + .route("/v2/admin/knowledge/pages/{page_id}", routing::get(knowledge_page_get)) + .route("/v2/admin/knowledge/pages/{page_id}/lint", routing::post(knowledge_page_lint)) + .route("/v2/admin/qdrant/rebuild", routing::post(rebuild_qdrant)) + .route("/v2/admin/searches/raw", routing::post(searches_raw)) + .route("/v2/admin/traces/recent", routing::get(trace_recent_list)) + .route("/v2/admin/traces/{trace_id}", routing::get(trace_get)) + .route("/v2/admin/traces/{trace_id}/bundle", routing::get(trace_bundle_get)) + .route("/v2/admin/trajectories/{trace_id}", routing::get(trace_trajectory_get)) + .route("/v2/admin/trace-items/{item_id}", routing::get(trace_item_get)) + .route("/v2/admin/graph/predicates", routing::get(admin_graph_predicates_list)) + .route( + "/v2/admin/graph/predicates/{predicate_id}", + routing::patch(admin_graph_predicate_patch), + ) + .route( + "/v2/admin/graph/predicates/{predicate_id}/aliases", + routing::post(admin_graph_predicate_alias_add).get(admin_graph_predicate_aliases_list), + ) + .route("/v2/admin/notes/{note_id}/provenance", routing::get(admin_note_provenance_get)) + .route("/v2/admin/notes/{note_id}/history", routing::get(admin_note_history_get)) + .with_state(state) + .layer(DefaultBodyLimit::max(MAX_REQUEST_BYTES)) + .layer(middleware::from_fn_with_state(auth_state, admin_auth_middleware)); + + Router::new() + .route(ADMIN_VIEWER_PATH, routing::get(admin_viewer)) + .route("/", routing::get(admin_viewer)) + .merge(protected_router) +} + +/// Builds the API contract router. +pub fn contract_router() -> Router +where + S: Clone + Send + Sync + 'static, +{ + Router::new() + .route(OPENAPI_JSON_PATH, routing::get(openapi_json)) + .merge(Scalar::with_url(SCALAR_DOCS_PATH, ::openapi())) +} + +fn empty_json_object() -> Value { + Value::Object(Map::new()) +} + +fn json_error( + status: StatusCode, + code: &str, + message: impl Into, + fields: Option>, +) -> ApiError { + ApiError::new(status, code, message, fields) +} + +fn sanitize_log_text(text: &str) -> String { + let mut parts = Vec::new(); + let mut redact_next = false; + + for raw in text.split_whitespace() { + let mut word = raw.to_string(); + + if redact_next { + word = "[REDACTED]".to_string(); + redact_next = false; + } + if raw.eq_ignore_ascii_case("bearer") { + redact_next = true; + } + + let lowered = raw.to_ascii_lowercase(); + + for key in ["api_key", "apikey", "password", "secret", "token"] { + if lowered.contains(key) && (lowered.contains('=') || lowered.contains(':')) { + let sep = if raw.contains('=') { '=' } else { ':' }; + let prefix = match raw.split(sep).next() { + Some(prefix) => prefix, + None => raw, + }; + + word = format!("{prefix}{sep}[REDACTED]"); + + break; + } + } + + parts.push(word); + } + + let mut out = parts.join(" "); + + if out.chars().count() > MAX_ERROR_LOG_CHARS { + out = out.chars().take(MAX_ERROR_LOG_CHARS).collect(); + + out.push_str("..."); + } + + out +} + +fn required_header(headers: &HeaderMap, name: &'static str) -> Result { + let raw = headers.get(name).ok_or_else(|| { + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{name} header is required."), + Some(vec![format!("$.headers.{name}")]), + ) + })?; + let value = raw.to_str().map_err(|_| { + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{name} header must be a valid string."), + Some(vec![format!("$.headers.{name}")]), + ) + })?; + let trimmed = value.trim(); + + if trimmed.is_empty() { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{name} header must be non-empty."), + Some(vec![format!("$.headers.{name}")]), + )); + } + if trimmed.chars().count() > MAX_CONTEXT_HEADER_CHARS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{name} header is too long."), + Some(vec![format!("$.headers.{name}")]), + )); + } + if !english_gate::is_english_identifier(trimmed) { + return Err(json_error( + StatusCode::UNPROCESSABLE_ENTITY, + "NON_ENGLISH_INPUT", + "Non-English input detected; upstream must canonicalize to English before calling ELF." + .to_string(), + Some(vec![format!("$.headers.{name}")]), + )); + } + + Ok(trimmed.to_string()) +} + +fn required_read_profile(headers: &HeaderMap) -> Result { + required_header(headers, HEADER_READ_PROFILE) +} + +fn parse_space(scope: &str) -> Result { + match scope { + "team_shared" | "project_shared" => Ok(ShareScope::ProjectShared), + "org_shared" => Ok(ShareScope::OrgShared), + _ => Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid space.".to_string(), + Some(vec!["$.space".to_string()]), + )), + } +} + +fn format_space(scope: ShareScope) -> &'static str { + match scope { + ShareScope::ProjectShared => "team_shared", + ShareScope::OrgShared => "org_shared", + } +} + +fn format_scope(scope: &str) -> Result<&'static str, ApiError> { + match scope { + "project_shared" => Ok("team_shared"), + "org_shared" => Ok("org_shared"), + "agent_private" => Ok("agent_private"), + _ => Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid space.".to_string(), + Some(vec!["$.space".to_string()]), + )), + } +} + +fn parse_request_id_from_headers(headers: &HeaderMap) -> Result { + if let Some(raw) = headers.get(HEADER_REQUEST_ID) { + let raw = raw.to_str().map_err(|_| { + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{HEADER_REQUEST_ID} header must be a valid string."), + Some(vec![format!("$.headers.{HEADER_REQUEST_ID}")]), + ) + })?; + let trimmed = raw.trim(); + + if trimmed.is_empty() { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{HEADER_REQUEST_ID} header must be non-empty."), + Some(vec![format!("$.headers.{HEADER_REQUEST_ID}")]), + )); + } + + Uuid::parse_str(trimmed).map_err(|_| { + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{HEADER_REQUEST_ID} header must be a valid UUID."), + Some(vec![format!("$.headers.{HEADER_REQUEST_ID}")]), + ) + }) + } else { + Ok(Uuid::new_v4()) + } +} + +fn inject_request_id_into_json_body(body: &[u8], request_id: &Uuid) -> Option> { + let mut response_body: Value = serde_json::from_slice(body).ok()?; + let object = response_body.as_object_mut()?; + + object.insert("request_id".to_string(), Value::String(request_id.to_string())); + + serde_json::to_vec(&response_body).ok() +} + +fn trusted_token_id(headers: &HeaderMap) -> Option { + let raw = headers.get(HEADER_TRUSTED_TOKEN_ID)?; + let value = raw.to_str().ok()?.trim(); + + if value.is_empty() { None } else { Some(value.to_string()) } +} + +fn sanitize_trusted_token_header(headers: &mut HeaderMap) { + headers.remove(HEADER_TRUSTED_TOKEN_ID); +} + +fn effective_token_id(auth_mode: &str, headers: &HeaderMap) -> Option { + match auth_mode.trim() { + "static_keys" => trusted_token_id(headers), + _ => None, + } +} + +fn bearer_token(headers: &HeaderMap) -> Option { + let raw = headers.get(HEADER_AUTHORIZATION)?; + let value = raw.to_str().ok()?.trim(); + let token = value.strip_prefix("Bearer ")?; + let token = token.trim(); + + if token.is_empty() { None } else { Some(token.to_string()) } +} + +fn resolve_auth_key<'a>( + headers: &HeaderMap, + auth_keys: &'a [SecurityAuthKey], +) -> Result<&'a SecurityAuthKey, ApiError> { + let token = bearer_token(headers).ok_or_else(|| { + json_error(StatusCode::UNAUTHORIZED, "UNAUTHORIZED", "Authentication required.", None) + })?; + + auth_keys.iter().find(|key| key.token == token).ok_or_else(|| { + json_error(StatusCode::UNAUTHORIZED, "UNAUTHORIZED", "Authentication required.", None) + }) +} + +fn set_context_header( + headers: &mut HeaderMap, + name: &'static str, + value: &str, +) -> Result<(), ApiError> { + let header_value = value.parse().map_err(|_| { + json_error( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + format!("Invalid configured auth context for {name}."), + None, + ) + })?; + + headers.insert(name, header_value); + + Ok(()) +} + +fn apply_auth_key_context(headers: &mut HeaderMap, key: &SecurityAuthKey) -> Result<(), ApiError> { + let agent_id = key.agent_id.as_deref().ok_or_else(|| { + json_error(StatusCode::FORBIDDEN, "FORBIDDEN", "Token is not scoped to an agent_id.", None) + })?; + + set_context_header(headers, HEADER_TENANT_ID, key.tenant_id.as_str())?; + set_context_header(headers, HEADER_PROJECT_ID, key.project_id.as_str())?; + set_context_header(headers, HEADER_AGENT_ID, agent_id)?; + set_context_header(headers, HEADER_READ_PROFILE, key.read_profile.as_str())?; + set_context_header(headers, HEADER_TRUSTED_TOKEN_ID, key.token_id.as_str())?; + + Ok(()) +} + +fn require_admin_for_org_shared_writes( + auth_mode: &str, + role: Option, +) -> Result<(), ApiError> { + if auth_mode.trim() != "static_keys" { + return Ok(()); + } + if matches!(role, Some(SecurityAuthRole::Admin | SecurityAuthRole::SuperAdmin)) { + return Ok(()); + } + + Err(json_error(StatusCode::FORBIDDEN, "FORBIDDEN", "Admin token required.", None)) +} + +fn parse_optional_rfc3339( + raw: Option<&String>, + path: &str, +) -> Result, ApiError> { + let Some(raw) = raw else { + return Ok(None); + }; + let raw = raw.trim(); + + if raw.is_empty() { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{path} must be non-empty."), + Some(vec![path.to_string()]), + )); + } + + OffsetDateTime::parse(raw, &Rfc3339).map(Some).map_err(|_| { + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + format!("{path} must be an RFC3339 datetime string."), + Some(vec![path.to_string()]), + ) + }) +} + +async fn openapi_json() -> Response { + let mut response = Json(::openapi()).into_response(); + + response + .headers_mut() + .insert(CONTENT_TYPE, HeaderValue::from_static("application/vnd.oai.openapi+json")); + + response +} + +async fn admin_viewer() -> Response { + let mut response = VIEWER_HTML.into_response(); + + response + .headers_mut() + .insert(CONTENT_TYPE, HeaderValue::from_static("text/html; charset=utf-8")); + response.headers_mut().insert(CACHE_CONTROL, HeaderValue::from_static("no-store")); + + response +} + +async fn with_request_id(response: Response, request_id: Uuid) -> Response { + let (mut parts, body) = response.into_parts(); + + parts.headers.insert( + HEADER_REQUEST_ID, + request_id.to_string().parse().expect("request_id is valid uuid string"), + ); + + let is_json_response = parts + .headers + .get(CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .map(|content_type| content_type.starts_with("application/json")) + .unwrap_or(false); + + if !is_json_response { + return Response::from_parts(parts, body); + } + + let body_bytes = match body::to_bytes(body, usize::MAX).await { + Ok(bytes) => bytes, + Err(_) => return Response::from_parts(parts, Body::empty()), + }; + + if let Some(response_body) = inject_request_id_into_json_body(&body_bytes, &request_id) { + parts.headers.remove(CONTENT_LENGTH); + + Response::from_parts(parts, Body::from(response_body)) + } else { + Response::from_parts(parts, Body::from(body_bytes)) + } +} + +async fn api_auth_middleware( + State(state): State, + req: Request, + next: Next, +) -> Response { + let security = &state.service.cfg.security; + let request_id = match parse_request_id_from_headers(req.headers()) { + Ok(request_id) => request_id, + Err(err) => return with_request_id(err.into_response(), Uuid::new_v4()).await, + }; + let mut req = req; + + sanitize_trusted_token_header(req.headers_mut()); + + let response = match security.auth_mode.trim() { + "off" => next.run(req).await, + "static_keys" => { + let key = match resolve_auth_key(req.headers(), &security.auth_keys) { + Ok(key) => key, + Err(err) => return with_request_id(err.into_response(), request_id).await, + }; + + req.extensions_mut().insert(key.role); + + if let Err(err) = apply_auth_key_context(req.headers_mut(), key) { + return with_request_id(err.into_response(), request_id).await; + } + + next.run(req).await + }, + _ => json_error( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Invalid security.auth_mode configuration.", + None, + ) + .into_response(), + }; + + with_request_id(response, request_id).await +} + +async fn admin_auth_middleware( + State(state): State, + req: Request, + next: Next, +) -> Response { + let security = &state.service.cfg.security; + let request_id = match parse_request_id_from_headers(req.headers()) { + Ok(request_id) => request_id, + Err(err) => return with_request_id(err.into_response(), Uuid::new_v4()).await, + }; + let mut req = req; + + sanitize_trusted_token_header(req.headers_mut()); + + let response = match security.auth_mode.trim() { + "off" => next.run(req).await, + "static_keys" => { + let key = match resolve_auth_key(req.headers(), &security.auth_keys) { + Ok(key) => key, + Err(err) => return with_request_id(err.into_response(), request_id).await, + }; + + req.extensions_mut().insert(key.role); + + if !matches!(key.role, SecurityAuthRole::Admin | SecurityAuthRole::SuperAdmin) { + return with_request_id( + json_error(StatusCode::FORBIDDEN, "FORBIDDEN", "Admin token required.", None) + .into_response(), + request_id, + ) + .await; + } + + if let Err(err) = apply_auth_key_context(req.headers_mut(), key) { + return with_request_id(err.into_response(), request_id).await; + } + + next.run(req).await + }, + _ => json_error( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Invalid security.auth_mode configuration.", + None, + ) + .into_response(), + }; + + with_request_id(response, request_id).await +} + +#[utoipa::path( + get, + path = "/health", + tag = "health", + responses((status = 200, description = "API process is healthy.")) +)] +async fn health() -> StatusCode { + StatusCode::OK +} + +#[utoipa::path( + post, + path = "/v2/notes/ingest", + tag = "notes", + request_body = Value, + responses( + (status = 200, description = "Notes were processed.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_ingest( + State(state): State, + headers: HeaderMap, + role: Option>, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let role = role.map(|Extension(role)| role); + + if payload.scope.trim() == "org_shared" { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + if payload.notes.len() > MAX_NOTES_PER_INGEST { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Notes list is too large.", + Some(vec!["$.notes".to_string()]), + )); + } + + let response = state + .service + .add_note(AddNoteRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: payload.scope, + notes: payload.notes, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/events/ingest", + tag = "events", + request_body = Value, + responses( + (status = 200, description = "Event messages were processed.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn events_ingest( + State(state): State, + headers: HeaderMap, + role: Option>, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let role = role.map(|Extension(role)| role); + + if payload.scope.as_deref().map(str::trim) == Some("org_shared") { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + if payload.messages.len() > MAX_MESSAGES_PER_EVENT { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Messages list is too large.", + Some(vec!["$.messages".to_string()]), + )); + } + + for (idx, msg) in payload.messages.iter().enumerate() { + if msg.content.chars().count() > MAX_MESSAGE_CHARS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Message content is too long.", + Some(vec![format!("$.messages[{idx}].content")]), + )); + } + } + + let response = state + .service + .add_event(AddEventRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: payload.scope, + dry_run: payload.dry_run, + ingestion_profile: payload.ingestion_profile, + messages: payload.messages, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/docs", + tag = "docs", + request_body = Value, + responses( + (status = 200, description = "Document was stored.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn docs_put( + State(state): State, + headers: HeaderMap, + role: Option>, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let role = role.map(|Extension(role)| role); + + if payload.scope.trim() == "org_shared" { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .docs_put(DocsPutRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: payload.scope, + doc_type: payload.doc_type.map(|doc_type| doc_type.as_str().to_string()), + title: payload.title, + source_ref: payload.source_ref, + write_policy: payload.write_policy, + content: payload.content, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/core-blocks", + tag = "core_blocks", + responses( + (status = 200, description = "Attached core memory blocks.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn core_blocks_get( + State(state): State, + headers: HeaderMap, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let response = state + .service + .core_blocks_get(CoreBlocksGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + read_profile, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/core-blocks", + tag = "core_blocks", + request_body = Value, + responses( + (status = 200, description = "Core block was stored.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 409, description = "Core block conflict.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_core_block_upsert( + State(state): State, + headers: HeaderMap, + role: Option>, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let role = role.map(|Extension(role)| role); + + if payload.scope.trim() == "org_shared" { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .core_block_upsert(CoreBlockUpsertRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + block_id: payload.block_id, + scope: payload.scope, + key: payload.key, + title: payload.title, + content: payload.content, + source_ref: payload.source_ref, + reason: payload.reason, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/core-blocks/{block_id}/attachments", + tag = "core_blocks", + params(("block_id" = Uuid, Path, description = "Core block ID.")), + request_body = Value, + responses( + (status = 200, description = "Core block was attached.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Core block was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_core_block_attach( + State(state): State, + headers: HeaderMap, + Path(block_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .core_block_attach(CoreBlockAttachRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + block_id, + target_agent_id: payload.target_agent_id, + read_profile: payload.read_profile, + reason: payload.reason, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + delete, + path = "/v2/admin/core-blocks/attachments/{attachment_id}", + tag = "core_blocks", + params(("attachment_id" = Uuid, Path, description = "Core block attachment ID.")), + responses( + (status = 200, description = "Core block attachment was detached.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_core_block_detach( + State(state): State, + headers: HeaderMap, + Path(attachment_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .core_block_detach(CoreBlockDetachRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + attachment_id, + reason: None, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/docs/{doc_id}", + tag = "docs", + params(("doc_id" = Uuid, Path, description = "Document ID.")), + responses( + (status = 200, description = "Document was fetched.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Document was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn docs_get( + State(state): State, + headers: HeaderMap, + Path(doc_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let response = state + .service + .docs_get(DocsGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + read_profile, + doc_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/docs/search/l0", + tag = "docs", + request_body = Value, + responses( + (status = 200, description = "L0 document search results.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn docs_search_l0( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let Json(mut payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let status = payload.status.as_deref().map(str::trim).filter(|status| !status.is_empty()); + + if let Some(status) = status { + let status = status.to_lowercase(); + + if !DOC_STATUSES.contains(&status.as_str()) { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "status must be one of: active|deleted.", + Some(vec!["$.status".to_string()]), + )); + } + + payload.status = Some(status); + } + + let updated_after = parse_optional_rfc3339(payload.updated_after.as_ref(), "$.updated_after")?; + let updated_before = + parse_optional_rfc3339(payload.updated_before.as_ref(), "$.updated_before")?; + let ts_gte = parse_optional_rfc3339(payload.ts_gte.as_ref(), "$.ts_gte")?; + let ts_lte = parse_optional_rfc3339(payload.ts_lte.as_ref(), "$.ts_lte")?; + + if let (Some(ts_gte), Some(ts_lte)) = (ts_gte, ts_lte) + && ts_gte >= ts_lte + { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "ts_gte must be earlier than ts_lte.", + Some(vec!["$.ts_gte".to_string(), "$.ts_lte".to_string()]), + )); + } + if let (Some(updated_after), Some(updated_before)) = (updated_after, updated_before) + && updated_after >= updated_before + { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "updated_after must be earlier than updated_before.", + Some(vec!["$.updated_after".to_string(), "$.updated_before".to_string()]), + )); + } + + if payload.query.chars().count() > MAX_QUERY_CHARS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Query is too long.", + Some(vec!["$.query".to_string()]), + )); + } + + let response = state + .service + .docs_search_l0(DocsSearchL0Request { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + caller_agent_id: ctx.agent_id, + read_profile, + query: payload.query, + scope: payload.scope, + status: payload.status, + doc_type: payload.doc_type.map(|doc_type| doc_type.as_str().to_string()), + sparse_mode: payload.sparse_mode, + domain: payload.domain, + repo: payload.repo, + agent_id: payload.agent_id, + thread_id: payload.thread_id, + updated_after: payload.updated_after, + updated_before: payload.updated_before, + ts_gte: payload.ts_gte, + ts_lte: payload.ts_lte, + top_k: payload.top_k, + candidate_k: payload.candidate_k, + explain: payload.explain, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/docs/excerpts", + tag = "docs", + request_body = Value, + responses( + (status = 200, description = "Document excerpt result.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Document or excerpt was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn docs_excerpts_get( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + read_profile, + doc_id: payload.doc_id, + level: payload.level, + chunk_id: payload.chunk_id, + quote: payload.quote, + position: payload.position, + explain: payload.explain, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/graph/query", + tag = "graph", + request_body = Value, + responses( + (status = 200, description = "Graph facts matching the query.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn graph_query( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let as_of = parse_optional_rfc3339(payload.as_of.as_ref(), "$.as_of")?; + let response = state + .service + .graph_query(GraphQueryRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + read_profile, + subject: payload.subject, + predicate: payload.predicate, + scopes: payload.scopes, + as_of, + limit: payload.limit, + explain: payload.explain, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/searches", + tag = "search", + request_body = Value, + responses( + (status = 200, description = "Search session was created.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn searches_create( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + + if payload.query.chars().count() > MAX_QUERY_CHARS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Query is too long.", + Some(vec!["$.query".to_string()]), + )); + } + if payload.top_k.unwrap_or(state.service.cfg.memory.top_k) > MAX_TOP_K { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "top_k is too large.", + Some(vec!["$.top_k".to_string()]), + )); + } + if payload.candidate_k.unwrap_or(state.service.cfg.memory.candidate_k) > MAX_CANDIDATE_K { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "candidate_k is too large.", + Some(vec!["$.candidate_k".to_string()]), + )); + } + if payload.ranking.is_some() { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Ranking overrides are only supported on admin endpoints.".to_string(), + None, + )); + } + + let mode = payload.mode; + let token_id = effective_token_id(state.service.cfg.security.auth_mode.as_str(), &headers); + let build_request = || SearchRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + token_id: token_id.clone(), + read_profile, + query: payload.query.clone(), + top_k: payload.top_k, + candidate_k: payload.candidate_k, + filter: payload.filter.clone(), + payload_level: payload.payload_level.unwrap_or_default(), + record_hits: Some(false), + ranking: None, + }; + let response = match mode { + SearchMode::QuickFind => { + let response = state.service.search_quick(build_request()).await?; + + SearchCreateResponseV2 { + mode, + trace_id: response.trace_id, + search_id: response.search_session_id, + expires_at: response.expires_at, + items: response.items, + trajectory_summary: response.trajectory_summary, + query_plan: None, + } + }, + SearchMode::PlannedSearch => { + let response = state.service.search_planned(build_request()).await?; + + SearchCreateResponseV2 { + mode, + trace_id: response.trace_id, + search_id: response.search_session_id, + expires_at: response.expires_at, + items: response.items, + trajectory_summary: response.trajectory_summary, + query_plan: Some(response.query_plan), + } + }, + }; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/searches/{search_id}", + tag = "search", + params( + ("search_id" = Uuid, Path, description = "Search session ID."), + ("payload_level" = Option, Query, description = "Optional payload level."), + ("top_k" = Option, Query, description = "Optional result limit override."), + ("touch" = Option, Query, description = "Whether to extend the session TTL."), + ), + responses( + (status = 200, description = "Search session index view.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Search session was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn searches_get( + State(state): State, + headers: HeaderMap, + Path(search_id): Path, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .search_session_get(SearchSessionGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + search_session_id: search_id, + payload_level: query.payload_level.unwrap_or_default(), + top_k: query.top_k, + touch: query.touch, + }) + .await?; + let mode = if response.query_plan.is_some() { + SearchMode::PlannedSearch + } else { + SearchMode::QuickFind + }; + + Ok(Json(SearchIndexResponseV2 { + mode, + trace_id: response.trace_id, + search_id: response.search_session_id, + expires_at: response.expires_at, + items: response.items, + trajectory_summary: response.trajectory_summary, + query_plan: response.query_plan, + })) +} + +#[utoipa::path( + get, + path = "/v2/searches/{search_id}/timeline", + tag = "search", + params( + ("search_id" = Uuid, Path, description = "Search session ID."), + ("payload_level" = Option, Query, description = "Optional payload level."), + ("group_by" = Option, Query, description = "Timeline grouping mode."), + ), + responses( + (status = 200, description = "Search session timeline.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Search session was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn searches_timeline( + State(state): State, + headers: HeaderMap, + Path(search_id): Path, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .search_timeline(SearchTimelineRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + search_session_id: search_id, + payload_level: query.payload_level.unwrap_or_default(), + group_by: query.group_by, + }) + .await?; + + Ok(Json(SearchTimelineResponseV2 { + search_id: response.search_session_id, + expires_at: response.expires_at, + groups: response.groups, + })) +} + +#[utoipa::path( + post, + path = "/v2/searches/{search_id}/notes", + tag = "search", + params(("search_id" = Uuid, Path, description = "Search session ID.")), + request_body = Value, + responses( + (status = 200, description = "Hydrated search note details.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Search session was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn searches_notes( + State(state): State, + headers: HeaderMap, + Path(search_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + + if payload.note_ids.len() > MAX_NOTE_IDS_PER_DETAILS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "note_ids list is too large.", + Some(vec!["$.note_ids".to_string()]), + )); + } + + let response = state + .service + .search_details(SearchDetailsRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + search_session_id: search_id, + payload_level: payload.payload_level.unwrap_or_default(), + note_ids: payload.note_ids, + record_hits: payload.record_hits, + }) + .await?; + + Ok(Json(SearchDetailsResponseV2 { + search_id: response.search_session_id, + expires_at: response.expires_at, + results: response.results, + })) +} + +#[utoipa::path( + get, + path = "/v2/notes", + tag = "notes", + params( + ("scope" = Option, Query, description = "Optional note scope filter."), + ("status" = Option, Query, description = "Optional note status filter."), + ("type" = Option, Query, description = "Optional note type filter."), + ), + responses( + (status = 200, description = "Notes visible to the caller.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .list(ListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: Some(ctx.agent_id), + scope: query.scope, + status: query.status, + r#type: query.r#type, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/notes/{note_id}", + tag = "notes", + params(("note_id" = Uuid, Path, description = "Note ID.")), + responses( + (status = 200, description = "Note details.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_get( + State(state): State, + headers: HeaderMap, + Path(note_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .get_note(NoteFetchRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + note_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + patch, + path = "/v2/notes/{note_id}", + tag = "notes", + params(("note_id" = Uuid, Path, description = "Note ID.")), + request_body = Value, + responses( + (status = 200, description = "Note was updated.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_patch( + State(state): State, + headers: HeaderMap, + Path(note_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .update(UpdateRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + note_id, + text: payload.text, + importance: payload.importance, + confidence: payload.confidence, + ttl_days: payload.ttl_days, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + delete, + path = "/v2/notes/{note_id}", + tag = "notes", + params(("note_id" = Uuid, Path, description = "Note ID.")), + responses( + (status = 200, description = "Note was deleted.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_delete( + State(state): State, + headers: HeaderMap, + Path(note_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .delete(DeleteRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + note_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/notes/{note_id}/publish", + tag = "notes", + params(("note_id" = Uuid, Path, description = "Note ID.")), + request_body = Value, + responses( + (status = 200, description = "Note was published to a shared space.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_publish( + State(state): State, + headers: HeaderMap, + role: Option>, + Path(note_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let scope = parse_space(payload.space.as_str())?; + let role = role.map(|Extension(role)| role); + + if matches!(scope, ShareScope::OrgShared) { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .publish_note(PublishNoteRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + note_id, + scope, + }) + .await?; + + Ok(Json(PublishResponseV2 { + note_id: response.note_id, + space: format_scope(response.scope.as_str())?.to_string(), + })) +} + +#[utoipa::path( + post, + path = "/v2/notes/{note_id}/unpublish", + tag = "notes", + params(("note_id" = Uuid, Path, description = "Note ID.")), + request_body = Value, + responses( + (status = 200, description = "Note was returned to private scope.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn notes_unpublish( + State(state): State, + headers: HeaderMap, + role: Option>, + Path(note_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let scope = parse_space(payload.space.as_str())?; + let role = role.map(|Extension(role)| role); + + if matches!(scope, ShareScope::OrgShared) { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .unpublish_note(UnpublishNoteRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + note_id, + }) + .await?; + + Ok(Json(PublishResponseV2 { + note_id: response.note_id, + space: format_scope(response.scope.as_str())?.to_string(), + })) +} + +#[utoipa::path( + get, + path = "/v2/spaces/{space}/grants", + tag = "notes", + params(("space" = String, Path, description = "Shared space name.")), + responses( + (status = 200, description = "Space grants.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn space_grants_list( + State(state): State, + headers: HeaderMap, + Path(space): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let scope = parse_space(space.as_str())?; + let response = state + .service + .space_grants_list(SpaceGrantsListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope, + }) + .await?; + + Ok(Json(SpaceGrantsListResponseV2 { + grants: response + .grants + .into_iter() + .map(|item| SpaceGrantItemV2 { + space: format_space(item.scope).to_string(), + grantee_kind: item.grantee_kind, + grantee_agent_id: item.grantee_agent_id, + granted_by_agent_id: item.granted_by_agent_id, + granted_at: item.granted_at, + }) + .collect(), + })) +} + +#[utoipa::path( + post, + path = "/v2/spaces/{space}/grants", + tag = "notes", + params(("space" = String, Path, description = "Shared space name.")), + request_body = Value, + responses( + (status = 200, description = "Space grant was upserted.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn space_grant_upsert( + State(state): State, + headers: HeaderMap, + role: Option>, + Path(space): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let scope = parse_space(space.as_str())?; + let role = role.map(|Extension(role)| role); + + if matches!(scope, ShareScope::OrgShared) { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .space_grant_upsert(SpaceGrantUpsertRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope, + grantee_kind: payload.grantee_kind, + grantee_agent_id: payload.grantee_agent_id, + }) + .await?; + + Ok(Json(SpaceGrantUpsertResponseV2 { + space: format_scope(response.scope.as_str())?.to_string(), + grantee_kind: response.grantee_kind, + grantee_agent_id: response.grantee_agent_id, + granted: response.granted, + })) +} + +#[utoipa::path( + post, + path = "/v2/spaces/{space}/grants/revoke", + tag = "notes", + params(("space" = String, Path, description = "Shared space name.")), + request_body = Value, + responses( + (status = 200, description = "Space grant was revoked.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Scope denied.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn space_grant_revoke( + State(state): State, + headers: HeaderMap, + role: Option>, + Path(space): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let scope = parse_space(space.as_str())?; + let role = role.map(|Extension(role)| role); + + if matches!(scope, ShareScope::OrgShared) { + require_admin_for_org_shared_writes(state.service.cfg.security.auth_mode.as_str(), role)?; + } + + let response = state + .service + .space_grant_revoke(SpaceGrantRevokeRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope, + grantee_kind: payload.grantee_kind, + grantee_agent_id: payload.grantee_agent_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/graph/predicates", + tag = "graph", + params(("scope" = Option, Query, description = "Predicate scope filter.")), + responses( + (status = 200, description = "Graph predicates.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_graph_predicates_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .admin_graph_predicates_list(AdminGraphPredicatesListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: query.scope, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + patch, + path = "/v2/admin/graph/predicates/{predicate_id}", + tag = "graph", + params(("predicate_id" = Uuid, Path, description = "Predicate ID.")), + request_body = Value, + responses( + (status = 200, description = "Graph predicate was updated.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Predicate was not found.", body = ErrorBody), + (status = 409, description = "Predicate update conflicted.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_graph_predicate_patch( + State(state): State, + headers: HeaderMap, + Path(predicate_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let token_id = effective_token_id(state.service.cfg.security.auth_mode.as_str(), &headers); + let response = state + .service + .admin_graph_predicate_patch(AdminGraphPredicatePatchRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + token_id, + predicate_id, + status: payload.status, + cardinality: payload.cardinality, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/graph/predicates/{predicate_id}/aliases", + tag = "graph", + params(("predicate_id" = Uuid, Path, description = "Predicate ID.")), + request_body = Value, + responses( + (status = 200, description = "Graph predicate alias was added.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Predicate was not found.", body = ErrorBody), + (status = 409, description = "Predicate update conflicted.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_graph_predicate_alias_add( + State(state): State, + headers: HeaderMap, + Path(predicate_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let token_id = effective_token_id(state.service.cfg.security.auth_mode.as_str(), &headers); + let response = state + .service + .admin_graph_predicate_alias_add(AdminGraphPredicateAliasAddRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + token_id, + predicate_id, + alias: payload.alias, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/graph/predicates/{predicate_id}/aliases", + tag = "graph", + params(("predicate_id" = Uuid, Path, description = "Predicate ID.")), + responses( + (status = 200, description = "Graph predicate aliases.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Predicate was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_graph_predicate_aliases_list( + State(state): State, + headers: HeaderMap, + Path(predicate_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .admin_graph_predicate_aliases_list(AdminGraphPredicateAliasesListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + predicate_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/notes/{note_id}/provenance", + tag = "admin", + params(("note_id" = Uuid, Path, description = "Note ID.")), + responses( + (status = 200, description = "Note provenance bundle.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_note_provenance_get( + State(state): State, + headers: HeaderMap, + Path(note_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .note_provenance_get(NoteProvenanceGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + note_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/notes/{note_id}/history", + tag = "admin", + params(("note_id" = Uuid, Path, description = "Note ID.")), + responses( + (status = 200, description = "Memory history timeline.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Note was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_note_history_get( + State(state): State, + headers: HeaderMap, + Path(note_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .memory_history_get(MemoryHistoryGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + note_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/consolidation/runs", + tag = "consolidation", + request_body = Value, + responses( + (status = 200, description = "Consolidation run was created.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_run_create( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .consolidation_run_create(ConsolidationRunCreateRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + job_kind: payload.job_kind, + input_refs: payload.input_refs, + source_snapshot: payload.source_snapshot, + lineage: payload.lineage, + proposals: payload.proposals, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/consolidation/runs", + tag = "consolidation", + params(("limit" = Option, Query, description = "Maximum runs to return.")), + responses( + (status = 200, description = "Consolidation runs.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_runs_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .consolidation_runs_list(ConsolidationRunsListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + limit: query.limit, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/consolidation/runs/{run_id}", + tag = "consolidation", + params(("run_id" = Uuid, Path, description = "Consolidation run ID.")), + responses( + (status = 200, description = "Consolidation run.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Consolidation run was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_run_get( + State(state): State, + headers: HeaderMap, + Path(run_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .consolidation_run_get(ConsolidationRunGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + run_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/consolidation/proposals", + tag = "consolidation", + params( + ("run_id" = Option, Query, description = "Optional run filter."), + ("review_state" = Option, Query, description = "Optional review-state filter."), + ("limit" = Option, Query, description = "Maximum proposals to return."), + ), + responses( + (status = 200, description = "Consolidation proposals.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_proposals_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .consolidation_proposals_list(ConsolidationProposalsListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + run_id: query.run_id, + review_state: query.review_state, + limit: query.limit, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/consolidation/proposals/{proposal_id}", + tag = "consolidation", + params(("proposal_id" = Uuid, Path, description = "Consolidation proposal ID.")), + responses( + (status = 200, description = "Consolidation proposal.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Consolidation proposal was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_proposal_get( + State(state): State, + headers: HeaderMap, + Path(proposal_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .consolidation_proposal_get(ConsolidationProposalGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + proposal_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/consolidation/proposals/{proposal_id}/review", + tag = "consolidation", + params(("proposal_id" = Uuid, Path, description = "Consolidation proposal ID.")), + request_body = Value, + responses( + (status = 200, description = "Consolidation proposal review action was applied.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Consolidation proposal was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn consolidation_proposal_review( + State(state): State, + headers: HeaderMap, + Path(proposal_id): Path, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + reviewer_agent_id: ctx.agent_id, + proposal_id, + review_action: payload.action, + review_comment: payload.review_comment, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/knowledge/pages/rebuild", + tag = "knowledge", + request_body = Value, + responses( + (status = 200, description = "Knowledge page was rebuilt.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn knowledge_page_rebuild( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .knowledge_page_rebuild(KnowledgePageRebuildRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + page_kind: payload.page_kind, + page_key: payload.page_key, + title: payload.title, + note_ids: payload.note_ids, + event_ids: payload.event_ids, + relation_ids: payload.relation_ids, + proposal_ids: payload.proposal_ids, + provider_metadata: payload.provider_metadata, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/knowledge/pages", + tag = "knowledge", + params( + ("page_kind" = Option, Query, description = "Optional page-kind filter."), + ("limit" = Option, Query, description = "Maximum pages to return."), + ), + responses( + (status = 200, description = "Knowledge pages.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn knowledge_pages_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .knowledge_pages_list(KnowledgePagesListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + page_kind: query.page_kind, + limit: query.limit, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/knowledge/pages/search", + tag = "knowledge", + request_body = Value, + responses( + (status = 200, description = "Knowledge page section search results.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn knowledge_pages_search( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .knowledge_pages_search(KnowledgePageSearchRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + query: payload.query, + page_kind: payload.page_kind, + limit: payload.limit, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/knowledge/pages/{page_id}", + tag = "knowledge", + params(("page_id" = Uuid, Path, description = "Knowledge page ID.")), + responses( + (status = 200, description = "Knowledge page.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Knowledge page was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn knowledge_page_get( + State(state): State, + headers: HeaderMap, + Path(page_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .knowledge_page_get(KnowledgePageGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + page_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/knowledge/pages/{page_id}/lint", + tag = "knowledge", + params(("page_id" = Uuid, Path, description = "Knowledge page ID.")), + responses( + (status = 200, description = "Knowledge page lint findings.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Knowledge page was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn knowledge_page_lint( + State(state): State, + headers: HeaderMap, + Path(page_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .knowledge_page_lint(KnowledgePageLintRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + page_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/events/ingestion-profiles", + tag = "admin", + responses( + (status = 200, description = "Ingestion profile versions.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profiles_list( + State(state): State, + headers: HeaderMap, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .admin_ingestion_profiles_list(AdminIngestionProfileListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/events/ingestion-profiles", + tag = "admin", + request_body = Value, + responses( + (status = 200, description = "Ingestion profile version was created.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profile_create( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .admin_ingestion_profile_create(AdminIngestionProfileCreateRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + profile_id: payload.profile_id, + version: payload.version, + profile: payload.profile, + created_by: payload.created_by, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/events/ingestion-profiles/{profile_id}", + tag = "admin", + params( + ("profile_id" = String, Path, description = "Ingestion profile ID."), + ("version" = Option, Query, description = "Optional profile version."), + ), + responses( + (status = 200, description = "Ingestion profile version.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Profile was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profile_get( + State(state): State, + headers: HeaderMap, + Path(profile_id): Path, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .admin_ingestion_profile_get(AdminIngestionProfileGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + profile_id, + version: query.version, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/events/ingestion-profiles/{profile_id}/versions", + tag = "admin", + params(("profile_id" = String, Path, description = "Ingestion profile ID.")), + responses( + (status = 200, description = "Versions for one ingestion profile.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profile_versions_list( + State(state): State, + headers: HeaderMap, + Path(profile_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .admin_ingestion_profile_versions_list(AdminIngestionProfileVersionsListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + profile_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/events/ingestion-profiles/default", + tag = "admin", + responses( + ( + status = 200, + description = "Default add_event ingestion profile pointer.", + body = AdminIngestionProfileDefaultResponseV2, + ), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profile_default_get( + State(state): State, + headers: HeaderMap, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .admin_ingestion_profile_default_get(AdminIngestionProfileDefaultGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + put, + path = "/v2/admin/events/ingestion-profiles/default", + tag = "admin", + request_body = AdminIngestionProfileDefaultSetBody, + responses( + ( + status = 200, + description = "Default add_event ingestion profile pointer was updated.", + body = AdminIngestionProfileDefaultResponseV2, + ), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Profile was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn admin_ingestion_profile_default_set( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + let response = state + .service + .admin_ingestion_profile_default_set(AdminIngestionProfileDefaultSetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + profile_id: payload.profile_id, + version: payload.version, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/qdrant/rebuild", + tag = "admin", + responses( + (status = 200, description = "Qdrant rebuild report.", body = Value), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn rebuild_qdrant(State(state): State) -> Result, ApiError> { + let response = state.service.rebuild_qdrant().await?; + + Ok(Json(response)) +} + +#[utoipa::path( + post, + path = "/v2/admin/searches/raw", + tag = "search", + request_body = Value, + responses( + (status = 200, description = "Raw admin search response.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 422, description = "Non-English input rejected.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn searches_raw( + State(state): State, + headers: HeaderMap, + payload: Result, JsonRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let read_profile = required_read_profile(&headers)?; + let Json(payload) = payload.map_err(|err| { + tracing::warn!(error = %err, "Invalid request payload."); + + json_error(StatusCode::BAD_REQUEST, "INVALID_REQUEST", "Invalid request payload.", None) + })?; + + if payload.query.chars().count() > MAX_QUERY_CHARS { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Query is too long.", + Some(vec!["$.query".to_string()]), + )); + } + if payload.top_k.unwrap_or(state.service.cfg.memory.top_k) > MAX_TOP_K { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "top_k is too large.", + Some(vec!["$.top_k".to_string()]), + )); + } + if payload.candidate_k.unwrap_or(state.service.cfg.memory.candidate_k) > MAX_CANDIDATE_K { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "candidate_k is too large.", + Some(vec!["$.candidate_k".to_string()]), + )); + } + + let request = SearchRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + token_id: effective_token_id(state.service.cfg.security.auth_mode.as_str(), &headers), + read_profile, + query: payload.query, + filter: payload.filter, + payload_level: payload.payload_level.unwrap_or_default(), + top_k: payload.top_k, + candidate_k: payload.candidate_k, + record_hits: Some(false), + ranking: payload.ranking, + }; + let response = match payload.mode { + SearchMode::QuickFind => state.service.search_raw_quick(request).await?, + SearchMode::PlannedSearch => { + let response = state.service.search_raw_planned(request).await?; + + SearchResponse { + trace_id: response.trace_id, + items: response.items, + trajectory_summary: response.trajectory_summary, + } + }, + }; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/traces/{trace_id}", + tag = "admin", + params(("trace_id" = Uuid, Path, description = "Search trace ID.")), + responses( + (status = 200, description = "Search trace bundle without full stage internals.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Trace was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn trace_get( + State(state): State, + headers: HeaderMap, + Path(trace_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .trace_get(TraceGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + trace_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/traces/recent", + tag = "admin", + params( + ("limit" = Option, Query, description = "Page size."), + ("cursor_created_at" = Option, Query, description = "Created-at page cursor."), + ("cursor_trace_id" = Option, Query, description = "Trace ID page cursor."), + ("agent_id" = Option, Query, description = "Optional trace creator filter."), + ("read_profile" = Option, Query, description = "Optional read profile filter."), + ("created_after" = Option, Query, description = "Strict lower created_at bound."), + ("created_before" = Option, Query, description = "Strict upper created_at bound."), + ), + responses( + (status = 200, description = "Recent search traces.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn trace_recent_list( + State(state): State, + headers: HeaderMap, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let cursor_created_at = + parse_optional_rfc3339(query.cursor_created_at.as_ref(), "$.cursor_created_at")?; + let cursor_trace_id = query.cursor_trace_id; + let created_after = parse_optional_rfc3339(query.created_after.as_ref(), "$.created_after")?; + let created_before = parse_optional_rfc3339(query.created_before.as_ref(), "$.created_before")?; + + if cursor_created_at.is_some() != cursor_trace_id.is_some() { + return Err(json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "cursor_created_at and cursor_trace_id must be both set or both omitted.".to_string(), + Some(vec!["$.cursor_created_at".to_string(), "$.cursor_trace_id".to_string()]), + )); + } + + let response = state + .service + .trace_recent_list(TraceRecentListRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + limit: query.limit, + cursor_created_at, + cursor_trace_id, + agent_id_filter: query.agent_id, + read_profile: query.read_profile, + created_after, + created_before, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/trajectories/{trace_id}", + tag = "admin", + params(("trace_id" = Uuid, Path, description = "Search trace ID.")), + responses( + (status = 200, description = "Search trace retrieval trajectory.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Trace was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn trace_trajectory_get( + State(state): State, + headers: HeaderMap, + Path(trace_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .trace_trajectory_get(TraceTrajectoryGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + trace_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/trace-items/{item_id}", + tag = "admin", + params(("item_id" = Uuid, Path, description = "Trace item/result handle ID.")), + responses( + (status = 200, description = "Search trace item explain payload.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Trace item was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn trace_item_get( + State(state): State, + headers: HeaderMap, + Path(item_id): Path, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let response = state + .service + .search_explain(SearchExplainRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + result_handle: item_id, + }) + .await?; + + Ok(Json(response)) +} + +#[utoipa::path( + get, + path = "/v2/admin/traces/{trace_id}/bundle", + tag = "admin", + params( + ("trace_id" = Uuid, Path, description = "Search trace ID."), + ("mode" = Option, Query, description = "bounded or full."), + ("stage_items_limit" = Option, Query, description = "Maximum stage items."), + ("candidates_limit" = Option, Query, description = "Maximum candidate snapshot items."), + ), + responses( + (status = 200, description = "Search trace bundle.", body = Value), + (status = 400, description = "Invalid request.", body = ErrorBody), + (status = 401, description = "Authentication required.", body = ErrorBody), + (status = 403, description = "Admin access required.", body = ErrorBody), + (status = 404, description = "Trace was not found.", body = ErrorBody), + (status = 500, description = "Internal error.", body = ErrorBody), + ) +)] +async fn trace_bundle_get( + State(state): State, + headers: HeaderMap, + Path(trace_id): Path, + query: Result, QueryRejection>, +) -> Result, ApiError> { + let ctx = RequestContext::from_headers(&headers)?; + let Query(query) = query.map_err(|err| { + tracing::warn!(error = %err, "Invalid query parameters."); + + json_error( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "Invalid query parameters.".to_string(), + None, + ) + })?; + let response = state + .service + .trace_bundle_get(TraceBundleGetRequest { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + trace_id, + mode: query.mode.unwrap_or_default(), + stage_items_limit: query.stage_items_limit, + candidates_limit: query.candidates_limit, + }) + .await?; + + Ok(Json(response)) +} + +#[cfg(test)] +mod tests { + use axum::http::HeaderMap; + use uuid::Uuid; + + use crate::routes::{ + self, ADMIN_VIEWER_PATH, HEADER_AGENT_ID, HEADER_AUTHORIZATION, HEADER_PROJECT_ID, + HEADER_READ_PROFILE, HEADER_REQUEST_ID, HEADER_TENANT_ID, HEADER_TRUSTED_TOKEN_ID, + }; + use elf_config::{SecurityAuthKey, SecurityAuthRole}; + + #[test] + fn require_admin_for_org_shared_writes_denies_user_in_static_keys_mode() { + let err = routes::require_admin_for_org_shared_writes( + "static_keys", + Some(SecurityAuthRole::User), + ) + .expect_err("Expected forbidden error for non-admin role."); + + assert_eq!(err.status, axum::http::StatusCode::FORBIDDEN); + } + + #[test] + fn require_admin_for_org_shared_writes_allows_admin_in_static_keys_mode() { + routes::require_admin_for_org_shared_writes("static_keys", Some(SecurityAuthRole::Admin)) + .expect("Expected admin role to be allowed."); + } + + #[test] + fn require_admin_for_org_shared_writes_allows_superadmin_in_static_keys_mode() { + routes::require_admin_for_org_shared_writes( + "static_keys", + Some(SecurityAuthRole::SuperAdmin), + ) + .expect("Expected superadmin role to be allowed."); + } + + #[test] + fn require_admin_for_org_shared_writes_allows_non_static_keys_auth_mode() { + routes::require_admin_for_org_shared_writes("off", None) + .expect("Expected auth_mode != static_keys."); + } + + #[test] + fn admin_viewer_is_admin_prefixed_and_read_only() { + let html = routes::VIEWER_HTML; + + assert_eq!(ADMIN_VIEWER_PATH, "/viewer"); + assert!(html.contains("/v2/admin/searches")); + assert!(html.contains("/v2/admin/traces/recent")); + assert!(html.contains("/v2/admin/traces/${encodeURIComponent(traceId)}/bundle")); + assert!(html.contains("/v2/admin/notes/")); + assert!(html.contains("/v2/admin/knowledge/pages/search")); + assert!(html.contains("mode: \"full\"")); + assert!(html.contains("candidates_limit: 200")); + assert!(html.contains("Replay Candidates")); + assert!(html.contains("Selected Final Results")); + assert!(html.contains("Providers And Ranking")); + assert!(html.contains("Relation Context")); + assert!(html.contains("Knowledge Page Snippets")); + assert!(html.contains("Derived page: source notes")); + assert!(html.contains("directTraceId")); + assert!(html.contains("trace_id")); + assert!(html.contains("loadInitialTrace")); + assert!(!html.contains("method: \"PATCH\"")); + assert!(!html.contains("method: \"PUT\"")); + assert!(!html.contains("method: \"DELETE\"")); + assert!(!html.contains("/v2/notes/ingest")); + assert!(!html.contains("/v2/events/ingest")); + assert!(!html.contains("/publish")); + } + + #[test] + fn resolve_auth_key_requires_bearer_header() { + let headers = HeaderMap::new(); + let keys = vec![SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }]; + let err = + routes::resolve_auth_key(&headers, &keys).expect_err("Expected unauthorized error."); + + assert_eq!(err.status, axum::http::StatusCode::UNAUTHORIZED); + } + + #[test] + fn resolve_auth_key_rejects_unknown_token() { + let keys = vec![SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }]; + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_AUTHORIZATION, "Bearer wrong".parse().expect("invalid header")); + + let err = routes::resolve_auth_key(&headers, &keys) + .expect_err("Expected unauthorized error for bad key."); + + assert_eq!(err.status, axum::http::StatusCode::UNAUTHORIZED); + } + + #[test] + fn resolve_auth_key_rejects_non_bearer_authorization() { + let keys = vec![SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }]; + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_AUTHORIZATION, "Token secret".parse().expect("invalid header")); + + let err = routes::resolve_auth_key(&headers, &keys) + .expect_err("Expected unauthorized error for non-bearer authorization."); + + assert_eq!(err.status, axum::http::StatusCode::UNAUTHORIZED); + } + + #[test] + fn resolve_auth_key_rejects_lowercase_bearer_prefix() { + let keys = vec![SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }]; + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_AUTHORIZATION, "bearer secret".parse().expect("invalid header")); + + let err = routes::resolve_auth_key(&headers, &keys) + .expect_err("Expected unauthorized error for lowercase bearer prefix."); + + assert_eq!(err.status, axum::http::StatusCode::UNAUTHORIZED); + } + + #[test] + fn apply_auth_key_context_overrides_headers() { + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_AUTHORIZATION, "Bearer old".parse().expect("invalid header")); + headers.insert(HEADER_TENANT_ID, "bad-tenant".parse().expect("invalid header")); + headers.insert(HEADER_PROJECT_ID, "bad-project".parse().expect("invalid header")); + headers.insert(HEADER_AGENT_ID, "bad-agent".parse().expect("invalid header")); + headers.insert(HEADER_READ_PROFILE, "private_only".parse().expect("invalid header")); + headers.insert(HEADER_TRUSTED_TOKEN_ID, "old-id".parse().expect("invalid header")); + + let key = SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "all_scopes".to_string(), + role: SecurityAuthRole::Admin, + }; + + routes::apply_auth_key_context(&mut headers, &key).expect("Expected context injection."); + + assert_eq!( + headers.get(HEADER_TENANT_ID).and_then(|v| v.to_str().ok()).expect("missing tenant"), + "t" + ); + assert_eq!( + headers.get(HEADER_PROJECT_ID).and_then(|v| v.to_str().ok()).expect("missing project"), + "p" + ); + assert_eq!( + headers.get(HEADER_AGENT_ID).and_then(|v| v.to_str().ok()).expect("missing agent"), + "a" + ); + assert_eq!( + headers + .get(HEADER_READ_PROFILE) + .and_then(|v| v.to_str().ok()) + .expect("missing read profile"), + "all_scopes" + ); + assert_eq!( + headers + .get(HEADER_TRUSTED_TOKEN_ID) + .and_then(|v| v.to_str().ok()) + .expect("missing trusted token_id"), + "k1" + ); + } + + #[test] + fn apply_auth_key_context_requires_agent_scope() { + let mut headers = HeaderMap::new(); + let key = SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: None, + read_profile: "all_scopes".to_string(), + role: SecurityAuthRole::User, + }; + let err = routes::apply_auth_key_context(&mut headers, &key) + .expect_err("Expected forbidden error for missing agent_id."); + + assert_eq!(err.status, axum::http::StatusCode::FORBIDDEN); + } + + #[test] + fn effective_token_id_ignores_header_when_auth_mode_off() { + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_TRUSTED_TOKEN_ID, "user-supplied".parse().expect("invalid header")); + + assert_eq!(routes::effective_token_id("off", &headers), None); + } + + #[test] + fn effective_token_id_uses_header_when_auth_mode_static_keys() { + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_TRUSTED_TOKEN_ID, "k1".parse().expect("invalid header")); + + assert_eq!(routes::effective_token_id("static_keys", &headers), Some("k1".to_string())); + } + + #[test] + fn sanitize_trusted_token_header_removes_header() { + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_TRUSTED_TOKEN_ID, "user-supplied".parse().expect("invalid header")); + + routes::sanitize_trusted_token_header(&mut headers); + + assert!(headers.get(HEADER_TRUSTED_TOKEN_ID).is_none()); + } + + #[test] + fn parse_request_id_from_headers_generates_when_missing() { + let headers = HeaderMap::new(); + let request_id = routes::parse_request_id_from_headers(&headers) + .expect("Expected a generated request ID when header is missing."); + + assert_ne!(request_id.to_string(), Uuid::nil().to_string()); + } + + #[test] + fn parse_request_id_from_headers_rejects_invalid() { + let mut headers = HeaderMap::new(); + + headers.insert(HEADER_REQUEST_ID, "not-a-uuid".parse().expect("invalid request_id")); + + let err = routes::parse_request_id_from_headers(&headers) + .expect_err("Expected invalid request_id to be rejected."); + + assert_eq!(err.status, axum::http::StatusCode::BAD_REQUEST); + assert_eq!(err.error_code, "INVALID_REQUEST"); + assert_eq!(err.fields, Some(vec![format!("$.headers.{HEADER_REQUEST_ID}")])); + } + + #[test] + fn inject_request_id_into_json_body_adds_request_id_to_object() { + let request_id = + Uuid::parse_str("123e4567-e89b-12d3-a456-426614174000").expect("valid uuid"); + let body = serde_json::json!({"note_id":"abc","status":"ok"}).to_string(); + let response_body = routes::inject_request_id_into_json_body(body.as_bytes(), &request_id) + .expect("Expected request_id field to be injected."); + let response_value = serde_json::from_slice::(&response_body) + .expect("Expected valid JSON"); + + assert_eq!(response_value["request_id"], request_id.to_string()); + } + + #[test] + fn inject_request_id_into_json_body_skips_non_object() { + let request_id = + Uuid::parse_str("123e4567-e89b-12d3-a456-426614174000").expect("valid uuid"); + let body = serde_json::json!(["a", "b", "c"]).to_string(); + + assert!(routes::inject_request_id_into_json_body(body.as_bytes(), &request_id).is_none()); + } +} diff --git a/apps/elf-api/src/state.rs b/apps/elf-api/src/state.rs index 2fc3b6d5..bc9ec40d 100644 --- a/apps/elf-api/src/state.rs +++ b/apps/elf-api/src/state.rs @@ -1,21 +1,43 @@ -// std +//! Shared application state bootstrap and backend wiring. + use std::sync::Arc; -// self +use color_eyre::Result; + +use elf_config::Config; use elf_service::ElfService; -use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_storage::{ + db::Db, + qdrant::{DOCS_SEARCH_FILTER_INDEXES, QdrantStore}, +}; +/// Shared state for API handlers. #[derive(Clone)] pub struct AppState { + /// The service instance serving API requests. pub service: Arc, } - impl AppState { - pub async fn new(config: elf_config::Config) -> color_eyre::Result { + /// Builds application state and ensures storage backends are ready. + pub async fn new(config: Config) -> Result { let db = Db::connect(&config.storage.postgres).await?; + db.ensure_schema(config.storage.qdrant.vector_dim).await?; + let qdrant = QdrantStore::new(&config.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + let docs_qdrant = QdrantStore::new_with_collection( + &config.storage.qdrant, + &config.storage.qdrant.docs_collection, + )?; + + docs_qdrant.ensure_collection().await?; + docs_qdrant.ensure_payload_indexes(&DOCS_SEARCH_FILTER_INDEXES).await?; + let service = ElfService::new(config, db, qdrant); + Ok(Self { service: Arc::new(service) }) } } diff --git a/apps/elf-api/static/viewer.html b/apps/elf-api/static/viewer.html new file mode 100644 index 00000000..83e555bc --- /dev/null +++ b/apps/elf-api/static/viewer.html @@ -0,0 +1,1748 @@ + + + + + + ELF Viewer + + + +
+ + +
+
+
Ready.
+
+ +
+
+ +
+
+
+
+

Search Session

+ +
+
+ +
+ + + + +
+
+ +
+
+ +
+
+
+ +
+
+
+
+

Index

+
+
+
+
No session loaded.
+
+
+
+
+

Timeline

+ +
+
+
No timeline loaded.
+
+
+
+
+

Knowledge Page Snippets

+ +
+
+
Run a search to load derived page snippets.
+
+
+
+
+
+

Note Detail

+
Select a note.
+
+
+

Knowledge Page Detail

+
Select a derived page snippet.
+
+
+

Trace Explain

+
Run or load a session.
+
+
+
+
+ +
+
+
+

Notes

+ +
+
+
+ + + +
+
+
+
+
+

Note List

+
No notes loaded.
+
+
+

Note Metadata

+
Select a note.
+
+
+
+ +
+
+
+

Recent Traces

+ +
+
+
+ + + +
+
+ +
+
+ +
+
+
+
+
+

Trace List

+
No traces loaded.
+
+
+

Trace Bundle

+
Select a trace.
+
+
+
+
+
+
+ + + + diff --git a/apps/elf-api/tests/http.rs b/apps/elf-api/tests/http.rs index 35d4b31e..a59acdba 100644 --- a/apps/elf-api/tests/http.rs +++ b/apps/elf-api/tests/http.rs @@ -1,62 +1,120 @@ -// std -use std::env; +#![allow(unused_crate_dependencies)] + +//! End-to-end HTTP integration tests for the ELF API app. + +use std::{collections::HashMap, env}; -// crates.io use axum::{ + Router, body::{self, Body}, - http::{Request, StatusCode}, + http::{Request, Response, StatusCode}, +}; +use qdrant_client::{ + Payload, + qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, }; use serde_json::Map; -use tower::util::ServiceExt; +use tower::util::ServiceExt as _; +use tracing::Level; +use uuid::Uuid; -// self -use elf_api::{routes, state::AppState}; +use elf_api::{ + routes::{self, OPENAPI_JSON_PATH, SCALAR_DOCS_PATH}, + state::AppState, +}; +use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, MemoryPolicy, + Postgres, ProviderConfig, Providers, Qdrant, Ranking, RankingBlend, RankingBlendSegment, + RankingDeterministic, RankingDeterministicDecay, RankingDeterministicHits, + RankingDeterministicLexical, RankingDiversity, RankingRetrievalSources, ReadProfiles, + ScopePrecedence, ScopeWriteAllowed, Scopes, Search, SearchCache, SearchDynamic, + SearchExpansion, SearchExplain, SearchGraphContext, SearchPrefilter, SearchRecursive, Security, + SecurityAuthKey, SecurityAuthRole, Service, Storage, TtlDays, +}; +use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; use elf_testkit::TestDatabase; -async fn test_env() -> Option<(elf_testkit::TestDatabase, String, String)> { - let base_dsn = match elf_testkit::env_dsn() { - Some(value) => value, - None => { - eprintln!("Skipping HTTP tests; set ELF_PG_DSN to run this test."); - return None; +const TEST_TENANT_ID: &str = "tenant_alpha"; +const TEST_PROJECT_ID: &str = "project_alpha"; +const TEST_PROJECT_ID_B: &str = "project_beta"; +const TEST_AGENT_A: &str = "a"; +const TEST_AGENT_B: &str = "b"; + +fn test_ranking() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, }, - }; - let qdrant_url = match env::var("ELF_QDRANT_URL") { - Ok(value) => value, - Err(_) => { - eprintln!("Skipping HTTP tests; set ELF_QDRANT_URL to run this test."); - return None; + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], }, - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let collection = test_db.collection_name("elf_http"); - Some((test_db, qdrant_url, collection)) + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } } -fn test_config(dsn: String, qdrant_url: String, collection: String) -> elf_config::Config { - elf_config::Config { - service: elf_config::Service { +fn test_config(dsn: String, qdrant_url: String, collection: String) -> Config { + Config { + service: Service { http_bind: "127.0.0.1:0".to_string(), mcp_bind: "127.0.0.1:0".to_string(), admin_bind: "127.0.0.1:0".to_string(), log_level: "info".to_string(), }, - storage: elf_config::Storage { - postgres: elf_config::Postgres { dsn, pool_max_conns: 1 }, - qdrant: elf_config::Qdrant { url: qdrant_url, collection, vector_dim: 3 }, + storage: Storage { + postgres: Postgres { dsn, pool_max_conns: 4 }, + qdrant: Qdrant { + url: qdrant_url, + collection: collection.clone(), + docs_collection: format!("{collection}_docs"), + vector_dim: 4_096, + }, }, - providers: elf_config::Providers { + providers: Providers { embedding: dummy_embedding_provider(), rerank: dummy_provider(), llm_extractor: dummy_llm_provider(), }, - scopes: elf_config::Scopes { + scopes: Scopes { allowed: vec![ "agent_private".to_string(), "project_shared".to_string(), "org_shared".to_string(), ], - read_profiles: elf_config::ReadProfiles { + read_profiles: ReadProfiles { private_only: vec!["agent_private".to_string()], private_plus_project: vec![ "agent_private".to_string(), @@ -68,46 +126,26 @@ fn test_config(dsn: String, qdrant_url: String, collection: String) -> elf_confi "org_shared".to_string(), ], }, - precedence: elf_config::ScopePrecedence { - agent_private: 30, - project_shared: 20, - org_shared: 10, - }, - write_allowed: elf_config::ScopeWriteAllowed { + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { agent_private: true, project_shared: true, org_shared: true, }, }, - memory: elf_config::Memory { + memory: Memory { max_notes_per_add_event: 3, max_note_chars: 240, dup_sim_threshold: 0.92, update_sim_threshold: 0.85, candidate_k: 60, top_k: 12, + policy: MemoryPolicy { rules: vec![] }, }, - search: elf_config::Search { - expansion: elf_config::SearchExpansion { - mode: "off".to_string(), - max_queries: 4, - include_original: true, - }, - dynamic: elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, - prefilter: elf_config::SearchPrefilter { max_candidates: 0 }, - cache: elf_config::SearchCache { - enabled: true, - expansion_ttl_days: 7, - rerank_ttl_days: 7, - max_payload_bytes: Some(262_144), - expansion_version: "v1".to_string(), - rerank_version: "v1".to_string(), - }, - explain: elf_config::SearchExplain { retention_days: 7 }, - }, - ranking: elf_config::Ranking { recency_tau_days: 60.0, tie_breaker_weight: 0.1 }, - lifecycle: elf_config::Lifecycle { - ttl_days: elf_config::TtlDays { + search: test_search(), + ranking: test_ranking(), + lifecycle: Lifecycle { + ttl_days: TtlDays { plan: 14, fact: 180, preference: 0, @@ -118,211 +156,2599 @@ fn test_config(dsn: String, qdrant_url: String, collection: String) -> elf_confi purge_deleted_after_days: 30, purge_deprecated_after_days: 180, }, - security: elf_config::Security { + security: Security { bind_localhost_only: true, - reject_cjk: true, + reject_non_english: true, redact_secrets_on_write: true, evidence_min_quotes: 1, evidence_max_quotes: 2, evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], }, - chunking: elf_config::Chunking { + chunking: Chunking { enabled: true, max_tokens: 512, overlap_tokens: 128, - tokenizer_repo: None, + tokenizer_repo: "gpt2".to_string(), }, + context: None, + mcp: None, } } -fn dummy_embedding_provider() -> elf_config::EmbeddingProviderConfig { - elf_config::EmbeddingProviderConfig { - provider_id: "test".to_string(), +fn test_search() -> Search { + Search { + expansion: SearchExpansion { + mode: "off".to_string(), + max_queries: 4, + include_original: true, + }, + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { + enabled: true, + expansion_ttl_days: 7, + rerank_ttl_days: 7, + max_payload_bytes: Some(262_144), + }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, + } +} + +fn dummy_embedding_provider() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { + provider_id: "local".to_string(), api_base: "http://127.0.0.1:1".to_string(), api_key: "test-key".to_string(), path: "/".to_string(), - model: "test".to_string(), - dimensions: 3, - timeout_ms: 1000, + model: "local-hash".to_string(), + dimensions: 4_096, + timeout_ms: 1_000, default_headers: Map::new(), } } -fn dummy_provider() -> elf_config::ProviderConfig { - elf_config::ProviderConfig { - provider_id: "test".to_string(), +fn dummy_provider() -> ProviderConfig { + ProviderConfig { + provider_id: "local".to_string(), api_base: "http://127.0.0.1:1".to_string(), api_key: "test-key".to_string(), path: "/".to_string(), - model: "test".to_string(), - timeout_ms: 1000, + model: "local-token-overlap".to_string(), + timeout_ms: 1_000, default_headers: Map::new(), } } -fn dummy_llm_provider() -> elf_config::LlmProviderConfig { - elf_config::LlmProviderConfig { +fn dummy_llm_provider() -> LlmProviderConfig { + LlmProviderConfig { provider_id: "test".to_string(), api_base: "http://127.0.0.1:1".to_string(), api_key: "test-key".to_string(), path: "/".to_string(), model: "test".to_string(), temperature: 0.1, - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: Map::new(), } } -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn health_ok() { - let Some((test_db, qdrant_url, collection)) = test_env().await else { - return; +fn assert_openapi_method(spec: &serde_json::Value, path: &str, method: &str) { + let operation = spec + .get("paths") + .and_then(|paths| paths.get(path)) + .and_then(|path_item| path_item.get(method)); + + assert!(operation.is_some(), "Missing OpenAPI operation {method} {path}"); +} + +fn init_test_tracing() { + let _ = tracing_subscriber::fmt().with_max_level(Level::ERROR).with_test_writer().try_init(); +} + +fn context_request( + method: &str, + uri: impl AsRef, + agent_id: &str, + read_profile: &str, +) -> Request { + Request::builder() + .method(method) + .uri(uri.as_ref()) + .header("content-type", "application/json") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", agent_id) + .header("X-ELF-Read-Profile", read_profile) + .body(Body::empty()) + .expect("Failed to build context request.") +} + +async fn test_env() -> Option<(TestDatabase, String, String)> { + let base_dsn = match elf_testkit::env_dsn() { + Some(value) => value, + None => { + eprintln!("Skipping HTTP tests; set ELF_PG_DSN to run this test."); + + return None; + }, }; - let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let qdrant_url = match env::var("ELF_QDRANT_GRPC_URL").or_else(|_| env::var("ELF_QDRANT_URL")) { + Ok(value) => value, + Err(_) => { + eprintln!( + "Skipping HTTP tests; set ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run this test." + ); + + return None; + }, + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let collection = test_db.collection_name("elf_http"); + + Some((test_db, qdrant_url, collection)) +} + +async fn insert_note( + state: &AppState, + note_id: Uuid, + note_scope: &str, + note_agent: &str, + note_text: &str, +) { + sqlx::query( + "INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, now(), now(), NULL, $12, $13)", + ) + .bind(note_id) + .bind(TEST_TENANT_ID) + .bind(TEST_PROJECT_ID) + .bind(note_agent) + .bind(note_scope) + .bind("fact") + .bind(None::) + .bind(note_text) + .bind(0.7_f32) + .bind(0.9_f32) + .bind("active") + .bind("v2-test") + .bind(serde_json::json!({ "source": "integration-test" })) + .execute(&state.service.db.pool) + .await + .expect("Failed to seed memory note."); +} + +async fn insert_project_scope_grant( + state: &AppState, + owner_agent_id: &str, + granter_agent_id: &str, +) { + sqlx::query( + "INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)", + ) + .bind(Uuid::new_v4()) + .bind(TEST_TENANT_ID) + .bind(TEST_PROJECT_ID) + .bind("project_shared") + .bind(owner_agent_id) + .bind("project") + .bind(None::) + .bind(granter_agent_id) + .execute(&state.service.db.pool) + .await + .expect("Failed to seed project scope grant."); +} + +async fn search_session_count(state: &AppState) -> i64 { + sqlx::query_scalar("SELECT COUNT(*) FROM search_sessions") + .fetch_one(&state.service.db.pool) + .await + .expect("Failed to count search sessions.") +} + +async fn post_admin_json( + app: &Router, + uri: impl AsRef, + agent_id: &str, + body: serde_json::Value, +) -> (StatusCode, serde_json::Value) { + let request = Request::builder() + .method("POST") + .uri(uri.as_ref()) + .header("content-type", "application/json") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", agent_id) + .body(Body::from(body.to_string())) + .expect("Failed to build admin JSON request."); + let response = app.clone().oneshot(request).await.expect("Failed to call admin route."); + let status = response.status(); + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read admin response body."); + + (status, serde_json::from_slice(&body).expect("Failed to parse admin response.")) +} + +async fn create_core_block(admin_app: &Router, scope: &str, key: &str, content: &str) -> Uuid { + let payload = serde_json::json!({ + "scope": scope, + "key": key, + "title": "Operating context", + "content": content, + "source_ref": { + "schema": "core_block_source/v1", + "ref": { "issue": "XY-832" } + } + }); + let (status, body) = + post_admin_json(admin_app, "/v2/admin/core-blocks", TEST_AGENT_A, payload).await; + + assert_eq!(status, StatusCode::OK); + + Uuid::parse_str( + body.pointer("/block/block_id") + .and_then(serde_json::Value::as_str) + .expect("Missing core block id."), + ) + .expect("Invalid core block id.") +} + +async fn attach_core_block( + admin_app: &Router, + block_id: Uuid, + target_agent_id: &str, + read_profile: &str, +) -> (StatusCode, serde_json::Value) { + let payload = serde_json::json!({ + "target_agent_id": target_agent_id, + "read_profile": read_profile, + "reason": "Attach fixture block." + }); + let uri = format!("/v2/admin/core-blocks/{block_id}/attachments"); + + post_admin_json(admin_app, uri, TEST_AGENT_A, payload).await +} + +async fn get_core_blocks(app: &Router, agent_id: &str, read_profile: &str) -> serde_json::Value { + let response = app + .clone() + .oneshot(context_request("GET", "/v2/core-blocks", agent_id, read_profile)) + .await + .expect("Failed to fetch core blocks."); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read core blocks response body."); + + serde_json::from_slice(&body).expect("Failed to parse core blocks response.") +} + +async fn active_project_grant_count(state: &AppState, owner_agent_id: &str) -> i64 { + sqlx::query_scalar( + "SELECT COUNT(*) FROM memory_space_grants \ + WHERE tenant_id = $1 AND project_id = $2 AND scope = 'project_shared' \ + AND space_owner_agent_id = $3 AND grantee_kind = 'project' AND revoked_at IS NULL", + ) + .bind(TEST_TENANT_ID) + .bind(TEST_PROJECT_ID) + .bind(owner_agent_id) + .fetch_one(&state.service.db.pool) + .await + .expect("Failed to query project grant count.") +} + +async fn note_scope_and_project_id(state: &AppState, note_id: Uuid) -> (String, String) { + let row: (String, String) = sqlx::query_as( + "SELECT scope, project_id FROM memory_notes WHERE tenant_id = $1 AND note_id = $2", + ) + .bind(TEST_TENANT_ID) + .bind(note_id) + .fetch_one(&state.service.db.pool) + .await + .expect("Failed to query note scope and project id."); + + row +} + +async fn active_org_shared_project_grant_count(state: &AppState, owner_agent_id: &str) -> i64 { + sqlx::query_scalar( + "SELECT COUNT(*) FROM memory_space_grants \ + WHERE tenant_id = $1 AND project_id = '__org__' AND scope = 'org_shared' \ + AND space_owner_agent_id = $2 AND grantee_kind = 'project' AND revoked_at IS NULL", + ) + .bind(TEST_TENANT_ID) + .bind(owner_agent_id) + .fetch_one(&state.service.db.pool) + .await + .expect("Failed to query org_shared project grant count.") +} + +async fn active_org_shared_project_grant_count_for_project( + state: &AppState, + project_id: &str, + owner_agent_id: &str, +) -> i64 { + sqlx::query_scalar( + "SELECT COUNT(*) FROM memory_space_grants \ + WHERE tenant_id = $1 AND project_id = $2 AND scope = 'org_shared' \ + AND space_owner_agent_id = $3 AND grantee_kind = 'project' AND revoked_at IS NULL", + ) + .bind(TEST_TENANT_ID) + .bind(project_id) + .bind(owner_agent_id) + .fetch_one(&state.service.db.pool) + .await + .expect("Failed to query org_shared project grant count for project.") +} + +async fn org_shared_note_is_visible_across_projects_fixture() +-> Option<(TestDatabase, Router, AppState, Uuid)> { + let (test_db, qdrant_url, collection) = test_env().await?; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "admin-token-id".to_string(), + token: "admin-token".to_string(), + tenant_id: TEST_TENANT_ID.to_string(), + project_id: TEST_PROJECT_ID.to_string(), + agent_id: Some("admin-agent".to_string()), + read_profile: "all_scopes".to_string(), + role: SecurityAuthRole::Admin, + }, + SecurityAuthKey { + token_id: "reader-token-id".to_string(), + token: "reader-token".to_string(), + tenant_id: TEST_TENANT_ID.to_string(), + project_id: TEST_PROJECT_ID_B.to_string(), + agent_id: Some("reader-agent".to_string()), + read_profile: "all_scopes".to_string(), + role: SecurityAuthRole::User, + }, + ]; + let state = AppState::new(config).await.expect("Failed to initialize app state."); let app = routes::router(state.clone()); - let _ = routes::admin_router(state); + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "agent_private", + "admin-agent", + "Fact: org_shared cross-project visibility.", + ) + .await; + + Some((test_db, app, state, note_id)) +} + +async fn list_org_shared_notes_as_reader(app: &Router) -> serde_json::Value { let response = app + .clone() .oneshot( Request::builder() - .uri("/health") + .method("GET") + .uri("/v2/notes?scope=org_shared") + .header("Authorization", "Bearer reader-token") .body(Body::empty()) - .expect("Failed to build request."), + .expect("Failed to build list request."), ) .await - .expect("Failed to call /health."); + .expect("Failed to call notes list."); + assert_eq!(response.status(), StatusCode::OK); - test_db.cleanup().await.expect("Failed to cleanup test database."); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + + serde_json::from_slice(&body).expect("Failed to parse list response.") } -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_add_note() { - let Some((test_db, qdrant_url, collection)) = test_env().await else { - return; - }; - let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); - let state = AppState::new(config).await.expect("Failed to initialize app state."); - let app = routes::router(state); +async fn publish_org_shared_note_as_reader_can_see(scope_app: &Router, note_id: Uuid) { + let payload = serde_json::json!({ "space": "org_shared" }).to_string(); + let response = scope_app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/v2/notes/{note_id}/publish")) + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload)) + .expect("Failed to build note publish request."), + ) + .await + .expect("Failed to call notes publish."); + + assert_eq!(response.status(), StatusCode::OK); +} + +async fn assert_note_visible_to_project_reader( + scope_app: &Router, + state: &AppState, + note_id: Uuid, +) { + let (scope, project_id) = note_scope_and_project_id(state, note_id).await; + + assert_eq!(scope, "org_shared"); + // org_shared note rows live in the synthetic org project, not the request project. + assert_eq!(project_id, "__org__"); + + let org_grant_count = active_org_shared_project_grant_count(state, "admin-agent").await; + + assert!(org_grant_count > 0); + + // org_shared grant rows live in '__org__' as well; they should not be written into the request + // project. + let request_project_grant_count = + active_org_shared_project_grant_count_for_project(state, TEST_PROJECT_ID, "admin-agent") + .await; + + assert_eq!(request_project_grant_count, 0); + + let list_after_json = list_org_shared_notes_as_reader(scope_app).await; + let items = list_after_json["items"].as_array().expect("Missing items array."); + let ids: Vec<&str> = items.iter().filter_map(|item| item["note_id"].as_str()).collect(); + let note_id_str = note_id.to_string(); + + assert!(ids.contains(¬e_id_str.as_str())); +} + +async fn post_with_authorization_and_json_body( + app: &Router, + uri: &str, + auth: &str, + payload: &str, + build_expect: &str, + call_expect: &str, +) -> Response { + app.clone() + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .header("Authorization", auth) + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect(build_expect), + ) + .await + .expect(call_expect) +} + +async fn create_note_for_payload_level_tests( + app: &Router, + state: &AppState, + text: &str, + source_ref: serde_json::Value, +) -> Uuid { + init_test_tracing(); + let payload = serde_json::json!({ - "tenant_id": "t", - "project_id": "p", - "agent_id": "a", "scope": "agent_private", "notes": [{ "type": "fact", "key": null, - "text": "你好", - "importance": 0.5, + "text": text, + "importance": 0.8, "confidence": 0.9, "ttl_days": null, - "source_ref": {} + "source_ref": source_ref, }] }); - let response = app + .clone() .oneshot( Request::builder() .method("POST") - .uri("/v1/memory/add_note") + .uri("/v2/notes/ingest") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) .header("content-type", "application/json") .body(Body::from(payload.to_string())) - .expect("Failed to build request."), + .expect("Failed to build note ingest request."), ) .await - .expect("Failed to call add_note."); - - assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + .expect("Failed to call note ingest."); + let status = response.status(); let body = body::to_bytes(response.into_body(), usize::MAX) .await - .expect("Failed to read response body."); - let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); - assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); - assert_eq!(json["fields"][0], "$.notes[0].text"); - test_db.cleanup().await.expect("Failed to cleanup test database."); + .expect("Failed to read note ingest response body."); + + assert_eq!( + status, + StatusCode::OK, + "Unexpected note ingest status with body: {}", + String::from_utf8_lossy(&body) + ); + + let json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse note ingest response."); + let note_id = json["results"] + .as_array() + .expect("Missing results array in note ingest response.") + .first() + .and_then(|result| result["note_id"].as_str()) + .expect("Missing note_id in note ingest response."); + let note_id = Uuid::parse_str(note_id).expect("Invalid note_id in note ingest response."); + + index_note_for_payload_level_tests(state, note_id, text).await; + + note_id } -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_add_event() { - let Some((test_db, qdrant_url, collection)) = test_env().await else { - return; - }; - let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); - let state = AppState::new(config).await.expect("Failed to initialize app state."); - let app = routes::router(state); +async fn index_note_for_payload_level_tests(state: &AppState, note_id: Uuid, text: &str) { + let chunk_id = Uuid::new_v4(); + let embedding_version = format!( + "{}:{}:{}", + state.service.cfg.providers.embedding.provider_id, + state.service.cfg.providers.embedding.model, + state.service.cfg.storage.qdrant.vector_dim + ); + + sqlx::query( + "INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version + ) VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(chunk_id) + .bind(note_id) + .bind(0_i32) + .bind(0_i32) + .bind(i32::try_from(text.len()).expect("Payload-level test text fits i32 offsets.")) + .bind(text) + .bind(embedding_version.as_str()) + .execute(&state.service.db.pool) + .await + .expect("Failed to seed memory note chunk."); + + let mut payload = Payload::new(); + + payload.insert("note_id", note_id.to_string()); + payload.insert("chunk_id", chunk_id.to_string()); + payload.insert("chunk_index", 0_i64); + payload.insert("start_offset", 0_i64); + payload.insert("end_offset", i64::try_from(text.len()).expect("Test text fits i64 offsets.")); + payload.insert("tenant_id", TEST_TENANT_ID); + payload.insert("project_id", TEST_PROJECT_ID); + payload.insert("agent_id", TEST_AGENT_A); + payload.insert("scope", "agent_private"); + payload.insert("type", "fact"); + payload.insert("status", "active"); + payload.insert("embedding_version", embedding_version); + + let mut vectors = HashMap::new(); + + vectors.insert( + DENSE_VECTOR_NAME.to_string(), + Vector::from(vec![0.0_f32; state.service.qdrant.vector_dim as usize]), + ); + vectors.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(text.to_string(), BM25_MODEL)), + ); + + let point = PointStruct::new(chunk_id.to_string(), vectors, payload); + + state + .service + .qdrant + .client + .upsert_points( + UpsertPointsBuilder::new(state.service.qdrant.collection.clone(), vec![point]) + .wait(true), + ) + .await + .expect("Failed to seed Qdrant point."); +} + +async fn insert_note_summary_field(state: &AppState, note_id: Uuid, summary: &str) { + sqlx::query( + "INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) \ + VALUES ($1, $2, $3, $4, $5)", + ) + .bind(Uuid::new_v4()) + .bind(note_id) + .bind("summary") + .bind(0) + .bind(summary) + .execute(&state.service.db.pool) + .await + .expect("Failed to insert note summary field."); +} + +async fn fetch_search_notes_for_payload_level( + app: &Router, + search_id: Uuid, + note_id: Uuid, + payload_level: &str, +) -> serde_json::Value { let payload = serde_json::json!({ - "tenant_id": "t", - "project_id": "p", - "agent_id": "a", - "scope": "agent_private", - "dry_run": true, - "messages": [{ - "role": "user", - "content": "こんにちは" - }] + "note_ids": [note_id], + "payload_level": payload_level, + "record_hits": false, }); - let response = app + .clone() .oneshot( Request::builder() .method("POST") - .uri("/v1/memory/add_event") + .uri(format!("/v2/searches/{search_id}/notes")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) .header("content-type", "application/json") .body(Body::from(payload.to_string())) - .expect("Failed to build request."), + .expect("Failed to build search notes request."), ) .await - .expect("Failed to call add_event."); - - assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + .expect("Failed to call search notes."); + let status = response.status(); let body = body::to_bytes(response.into_body(), usize::MAX) .await - .expect("Failed to read response body."); - let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); - assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); - assert_eq!(json["fields"][0], "$.messages[0].content"); - test_db.cleanup().await.expect("Failed to cleanup test database."); + .expect("Failed to read search notes response body."); + + assert_eq!( + status, + StatusCode::OK, + "Unexpected search notes response: {}", + String::from_utf8_lossy(&body) + ); + + let json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse search notes response."); + + json.get("results") + .and_then(serde_json::Value::as_array) + .and_then(|results| results.first()) + .and_then(|result| result.get("note")) + .cloned() + .expect("Expected note in search notes response.") } -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_search() { - let Some((test_db, qdrant_url, collection)) = test_env().await else { - return; - }; - let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); - let state = AppState::new(config).await.expect("Failed to initialize app state."); - let app = routes::router(state); +async fn fetch_admin_search_raw_source_ref( + app: &Router, + query: &str, + payload_level: &str, +) -> serde_json::Value { let payload = serde_json::json!({ - "tenant_id": "t", - "project_id": "p", - "agent_id": "a", - "read_profile": "private_only", - "query": "안녕하세요", + "mode": "quick_find", + "query": query, "top_k": 5, - "candidate_k": 10 + "candidate_k": 10, + "payload_level": payload_level, }); - let response = app + .clone() .oneshot( Request::builder() .method("POST") - .uri("/v1/memory/search") + .uri("/v2/admin/searches/raw") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("X-ELF-Read-Profile", "private_only") .header("content-type", "application/json") .body(Body::from(payload.to_string())) - .expect("Failed to build request."), + .expect("Failed to build admin search raw request."), ) .await - .expect("Failed to call search."); + .expect("Failed to call admin search raw."); + let status = response.status(); + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read admin search raw response body."); + + assert_eq!( + status, + StatusCode::OK, + "Unexpected admin search raw status with body: {}", + String::from_utf8_lossy(&body) + ); + + let json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse admin search raw response."); + let item = json["items"] + .as_array() + .expect("Missing items in admin search raw response.") + .first() + .expect("Expected at least one raw search item."); + + item["source_ref"].clone() +} + +async fn contract_json() -> serde_json::Value { + let app = routes::contract_router::<()>(); + let response = app + .oneshot( + Request::builder() + .uri(OPENAPI_JSON_PATH) + .body(Body::empty()) + .expect("Failed to build OpenAPI request."), + ) + .await + .expect("Failed to call OpenAPI route."); + + assert_eq!(response.status(), StatusCode::OK); - assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); let body = body::to_bytes(response.into_body(), usize::MAX) .await - .expect("Failed to read response body."); - let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); - assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); - assert_eq!(json["fields"][0], "$.query"); + .expect("Failed to read OpenAPI response body."); + + serde_json::from_slice(&body).expect("Failed to parse OpenAPI response.") +} + +#[tokio::test] +async fn openapi_json_route_serves_generated_contract() { + let spec = contract_json().await; + + assert_eq!(spec["info"]["title"], "ELF API"); + assert!(spec.get("request_id").is_none()); + + assert_openapi_method(&spec, "/health", "get"); + assert_openapi_method(&spec, "/v2/notes/ingest", "post"); + assert_openapi_method(&spec, "/v2/events/ingest", "post"); + assert_openapi_method(&spec, "/v2/core-blocks", "get"); + assert_openapi_method(&spec, "/v2/docs/search/l0", "post"); + assert_openapi_method(&spec, "/v2/searches/{search_id}/notes", "post"); + assert_openapi_method(&spec, "/v2/admin/core-blocks", "post"); + assert_openapi_method(&spec, "/v2/admin/core-blocks/{block_id}/attachments", "post"); + assert_openapi_method(&spec, "/v2/admin/core-blocks/attachments/{attachment_id}", "delete"); + assert_openapi_method(&spec, "/v2/admin/searches/raw", "post"); + assert_openapi_method(&spec, "/v2/admin/events/ingestion-profiles/default", "get"); + assert_openapi_method(&spec, "/v2/admin/events/ingestion-profiles/default", "put"); + assert_openapi_method(&spec, "/v2/admin/consolidation/runs", "post"); + assert_openapi_method(&spec, "/v2/admin/consolidation/runs", "get"); + assert_openapi_method(&spec, "/v2/admin/consolidation/runs/{run_id}", "get"); + assert_openapi_method(&spec, "/v2/admin/consolidation/proposals", "get"); + assert_openapi_method(&spec, "/v2/admin/consolidation/proposals/{proposal_id}", "get"); + assert_openapi_method(&spec, "/v2/admin/consolidation/proposals/{proposal_id}/review", "post"); + assert_openapi_method(&spec, "/v2/admin/knowledge/pages/rebuild", "post"); + assert_openapi_method(&spec, "/v2/admin/knowledge/pages", "get"); + assert_openapi_method(&spec, "/v2/admin/knowledge/pages/search", "post"); + assert_openapi_method(&spec, "/v2/admin/knowledge/pages/{page_id}", "get"); + assert_openapi_method(&spec, "/v2/admin/knowledge/pages/{page_id}/lint", "post"); +} + +#[tokio::test] +async fn scalar_docs_route_serves_api_reference_html() { + let app = routes::contract_router::<()>(); + let response = app + .oneshot( + Request::builder() + .uri(SCALAR_DOCS_PATH) + .body(Body::empty()) + .expect("Failed to build Scalar docs request."), + ) + .await + .expect("Failed to call Scalar docs route."); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read Scalar docs response body."); + let html = String::from_utf8(body.to_vec()).expect("Scalar docs response was not UTF-8."); + + assert!(html.contains("@scalar/api-reference")); + assert!(html.contains("/v2/admin/events/ingestion-profiles/default")); + assert!(html.contains("/v2/admin/consolidation/proposals")); + assert!(html.contains("/v2/admin/knowledge/pages")); + assert!(html.contains("/v2/admin/knowledge/pages/search")); +} + +#[tokio::test] +async fn openapi_includes_default_ingestion_profile_get_put_contract() { + let spec = contract_json().await; + let default_path = &spec["paths"]["/v2/admin/events/ingestion-profiles/default"]; + let get_schema_ref = + default_path["get"]["responses"]["200"]["content"]["application/json"]["schema"]["$ref"] + .as_str() + .expect("Missing default profile GET response schema ref."); + let put_request_schema_ref = default_path["put"]["requestBody"]["content"]["application/json"] + ["schema"]["$ref"] + .as_str() + .expect("Missing default profile PUT request schema ref."); + let put_response_schema_ref = + default_path["put"]["responses"]["200"]["content"]["application/json"]["schema"]["$ref"] + .as_str() + .expect("Missing default profile PUT response schema ref."); + + assert!(get_schema_ref.ends_with("/AdminIngestionProfileDefaultResponseV2")); + assert!(put_request_schema_ref.ends_with("/AdminIngestionProfileDefaultSetBody")); + assert!(put_response_schema_ref.ends_with("/AdminIngestionProfileDefaultResponseV2")); + + let schemas = &spec["components"]["schemas"]; + let request_schema = &schemas["AdminIngestionProfileDefaultSetBody"]; + let response_schema = &schemas["AdminIngestionProfileDefaultResponseV2"]; + + assert!(request_schema["properties"].get("profile_id").is_some()); + assert!(request_schema["properties"].get("version").is_some()); + assert!( + request_schema["required"] + .as_array() + .expect("Missing request required fields") + .contains(&serde_json::json!("profile_id")) + ); + assert!(response_schema["properties"].get("profile_id").is_some()); + assert!(response_schema["properties"].get("version").is_some()); + assert!(response_schema["properties"].get("updated_at").is_some()); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn sharing_visibility_requires_explicit_project_grant() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let note_id = Uuid::new_v4(); + + insert_note(&state, note_id, "project_shared", TEST_AGENT_A, "Fact: shared note without grant") + .await; + + let response = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri("/v2/notes?scope=project_shared") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build list request."), + ) + .await + .expect("Failed to call notes list."); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + let list_json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse list response."); + + assert_eq!(list_json["items"].as_array().expect("Missing items array.").len(), 0); + + let note_response = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/v2/notes/{note_id}")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build get request."), + ) + .await + .expect("Failed to call notes get."); + + assert_eq!(note_response.status(), StatusCode::BAD_REQUEST); + + let body = body::to_bytes(note_response.into_body(), usize::MAX) + .await + .expect("Failed to read get response body."); + let note_json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse get response."); + + assert_eq!(note_json["error_code"], "INVALID_REQUEST"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn core_blocks_are_explicitly_attached_and_separate_from_archival_search() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let admin_app = routes::admin_router(state.clone()); + let private_block_id = create_core_block( + &admin_app, + "agent_private", + "private_operating_context", + "Preference: Keep core context separate from archival search.", + ) + .await; + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "agent_private", + TEST_AGENT_A, + "Fact: This archival note must not appear in attached core blocks.", + ) + .await; + + let (status, _) = + attach_core_block(&admin_app, private_block_id, TEST_AGENT_A, "private_only").await; + let before_sessions = search_session_count(&state).await; + let blocks = get_core_blocks(&app, TEST_AGENT_A, "private_only").await; + let after_sessions = search_session_count(&state).await; + + assert_eq!(status, StatusCode::OK); + assert_eq!(before_sessions, after_sessions); + assert_eq!(blocks["schema"], "elf.core_memory_blocks/v1"); + assert_eq!(blocks["items"].as_array().expect("items array").len(), 1); + assert_eq!( + blocks["items"][0]["content"], + "Preference: Keep core context separate from archival search." + ); + assert_eq!(blocks["items"][0]["source_ref"]["schema"], "core_block_source/v1"); + assert!(blocks["items"][0]["audit_history"].as_array().expect("audit history").len() >= 2); + assert!(!blocks.to_string().contains("archival note must not appear")); + + let b_private = get_core_blocks(&app, TEST_AGENT_B, "private_only").await; + + assert_eq!(b_private["items"].as_array().expect("items array").len(), 0); + + let shared_block_id = create_core_block( + &admin_app, + "project_shared", + "shared_operating_context", + "Constraint: Shared core context requires explicit project grant and attachment.", + ) + .await; + let (denied_status, _) = + attach_core_block(&admin_app, shared_block_id, TEST_AGENT_B, "private_plus_project").await; + + assert_eq!(denied_status, StatusCode::FORBIDDEN); + + insert_project_scope_grant(&state, TEST_AGENT_A, TEST_AGENT_A).await; + + let (shared_status, _) = + attach_core_block(&admin_app, shared_block_id, TEST_AGENT_B, "private_plus_project").await; + let b_shared = get_core_blocks(&app, TEST_AGENT_B, "private_plus_project").await; + let b_wrong_profile = get_core_blocks(&app, TEST_AGENT_B, "private_only").await; + + assert_eq!(shared_status, StatusCode::OK); + assert_eq!(b_shared["items"].as_array().expect("items array").len(), 1); + assert_eq!(b_shared["items"][0]["scope"], "project_shared"); + assert_eq!(b_wrong_profile["items"].as_array().expect("items array").len(), 0); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn org_shared_note_is_visible_across_projects() { + let Some((test_db, app, state, note_id)) = + org_shared_note_is_visible_across_projects_fixture().await + else { + return; + }; + let list_before_json = list_org_shared_notes_as_reader(&app).await; + + assert_eq!(list_before_json["items"].as_array().expect("Missing items array.").len(), 0); + + publish_org_shared_note_as_reader_can_see(&app, note_id).await; + + let grant_upsert_payload = serde_json::json!({ "grantee_kind": "project" }).to_string(); + let grant_upsert_response = post_with_authorization_and_json_body( + &app, + "/v2/spaces/org_shared/grants", + "Bearer admin-token", + &grant_upsert_payload, + "Failed to build grant upsert request.", + "Failed to call grant upsert.", + ) + .await; + + assert_eq!(grant_upsert_response.status(), StatusCode::OK); + + assert_note_visible_to_project_reader(&app, &state, note_id).await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn sharing_project_grant_enables_agent_access_to_shared_note() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "project_shared", + TEST_AGENT_A, + "Fact: shared note with explicit grant.", + ) + .await; + insert_project_scope_grant(&state, TEST_AGENT_A, TEST_AGENT_A).await; + + let response = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri("/v2/notes?scope=project_shared") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build list request."), + ) + .await + .expect("Failed to call notes list."); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + let list_json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse list response."); + let items = list_json["items"].as_array().expect("Missing items array."); + + assert_eq!(items.len(), 1); + assert_eq!(items[0]["note_id"], note_id.to_string()); + + let note_response = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/v2/notes/{note_id}")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build get request."), + ) + .await + .expect("Failed to call notes get."); + + assert_eq!(note_response.status(), StatusCode::OK); + + let body = body::to_bytes(note_response.into_body(), usize::MAX) + .await + .expect("Failed to read get response body."); + let note_json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse get response."); + + assert_eq!(note_json["note_id"], note_id.to_string()); + assert_eq!(note_json["scope"], "project_shared"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn sharing_publish_creates_scope_and_grant_visibility() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "agent_private", + TEST_AGENT_A, + "Fact: private note for publish test.", + ) + .await; + + let initial_grant_count = active_project_grant_count(&state, TEST_AGENT_A).await; + + assert_eq!(initial_grant_count, 0); + + let publish_payload = serde_json::json!({"space":"team_shared"}).to_string(); + let publish_response = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/v2/notes/{note_id}/publish")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("content-type", "application/json") + .body(Body::from(publish_payload)) + .expect("Failed to build publish request."), + ) + .await + .expect("Failed to call note publish."); + + assert_eq!(publish_response.status(), StatusCode::OK); + + let publish_body = body::to_bytes(publish_response.into_body(), usize::MAX) + .await + .expect("Failed to read publish response body."); + let publish_json: serde_json::Value = + serde_json::from_slice(&publish_body).expect("Failed to parse publish response."); + + assert_eq!(publish_json["note_id"], note_id.to_string()); + assert_eq!(publish_json["space"], "team_shared"); + + let after_grant_count = active_project_grant_count(&state, TEST_AGENT_A).await; + + assert_eq!(after_grant_count, 1); + + let list_response = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri("/v2/notes?scope=project_shared") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build list request."), + ) + .await + .expect("Failed to call notes list."); + + assert_eq!(list_response.status(), StatusCode::OK); + + let list_body = body::to_bytes(list_response.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + let list_json: serde_json::Value = + serde_json::from_slice(&list_body).expect("Failed to parse list response."); + let items = list_json["items"].as_array().expect("Missing items array."); + + assert_eq!(items.len(), 1); + assert_eq!(items[0]["note_id"], note_id.to_string()); + + let get_response = app + .clone() + .oneshot( + Request::builder() + .uri(format!("/v2/notes/{note_id}")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build get request."), + ) + .await + .expect("Failed to call notes get."); + + assert_eq!(get_response.status(), StatusCode::OK); + + let get_body = body::to_bytes(get_response.into_body(), usize::MAX) + .await + .expect("Failed to read get response body."); + let get_json: serde_json::Value = + serde_json::from_slice(&get_body).expect("Failed to parse get response."); + + assert_eq!(get_json["note_id"], note_id.to_string()); + assert_eq!(get_json["scope"], "project_shared"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn sharing_revoke_project_grant_removes_visibility() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "project_shared", + TEST_AGENT_A, + "Fact: shared note for revoke test.", + ) + .await; + insert_project_scope_grant(&state, TEST_AGENT_A, TEST_AGENT_A).await; + + let grant_count_before = active_project_grant_count(&state, TEST_AGENT_A).await; + + assert_eq!(grant_count_before, 1); + + let list_before = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri("/v2/notes?scope=project_shared") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build list request."), + ) + .await + .expect("Failed to call notes list."); + let list_before_body = body::to_bytes(list_before.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + let list_before_json: serde_json::Value = + serde_json::from_slice(&list_before_body).expect("Failed to parse list response."); + + assert_eq!(list_before_json["items"].as_array().expect("Missing items array.").len(), 1); + + let revoke_payload = serde_json::json!({"grantee_kind":"project"}).to_string(); + let revoke_response = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/spaces/team_shared/grants/revoke") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("content-type", "application/json") + .body(Body::from(revoke_payload)) + .expect("Failed to build revoke request."), + ) + .await + .expect("Failed to call grant revoke."); + + assert_eq!(revoke_response.status(), StatusCode::OK); + + let grant_count_after = active_project_grant_count(&state, TEST_AGENT_A).await; + + assert_eq!(grant_count_after, 0); + + let list_after = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri("/v2/notes?scope=project_shared") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build list request."), + ) + .await + .expect("Failed to call notes list."); + + assert_eq!(list_after.status(), StatusCode::OK); + + let list_after_body = body::to_bytes(list_after.into_body(), usize::MAX) + .await + .expect("Failed to read list response body."); + let list_after_json: serde_json::Value = + serde_json::from_slice(&list_after_body).expect("Failed to parse list response."); + + assert_eq!(list_after_json["items"].as_array().expect("Missing items array.").len(), 0); + + let get_after = app + .oneshot( + Request::builder() + .uri(format!("/v2/notes/{note_id}")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_B) + .body(Body::empty()) + .expect("Failed to build get request."), + ) + .await + .expect("Failed to call notes get."); + + assert_eq!(get_after.status(), StatusCode::BAD_REQUEST); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn health_ok() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let _ = routes::admin_router(state); + let response = app + .oneshot( + Request::builder() + .uri("/health") + .body(Body::empty()) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call /health."); + + assert_eq!(response.status(), StatusCode::OK); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_non_english_in_add_note() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "agent_private", + "notes": [{ + "type": "fact", + "key": null, + "text": "你好", + "importance": 0.5, + "confidence": 0.9, + "ttl_days": null, + "source_ref": {} + }] + }); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/notes/ingest") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call add_note."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.notes[0].text"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_cyrillic_in_add_note() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "agent_private", + "notes": [{ + "type": "fact", + "key": null, + "text": "Привет мир", + "importance": 0.5, + "confidence": 0.9, + "ttl_days": null, + "source_ref": {} + }] + }); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/notes/ingest") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call add_note."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.notes[0].text"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_non_english_in_add_event() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "agent_private", + "dry_run": true, + "messages": [{ + "role": "user", + "content": "こんにちは" + }] + }); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/events/ingest") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call add_event."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.messages[0].content"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_cyrillic_in_add_event() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "agent_private", + "dry_run": true, + "messages": [{ + "role": "user", + "content": "Это не английский текст." + }] + }); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/events/ingest") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call add_event."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.messages[0].content"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_non_english_in_search() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + + for mode in ["quick_find", "planned_search"] { + let payload = serde_json::json!({ + "mode": mode, + "query": "안녕하세요", + "top_k": 5, + "candidate_k": 10, + }); + let response = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/searches") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("X-ELF-Read-Profile", "private_only") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call search."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.query"); + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn rejects_cyrillic_in_search() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + + for mode in ["quick_find", "planned_search"] { + let payload = serde_json::json!({ + "mode": mode, + "query": "Привет", + "top_k": 5, + "candidate_k": 10, + }); + let response = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/searches") + .header("X-ELF-Tenant-Id", "t") + .header("X-ELF-Project-Id", "p") + .header("X-ELF-Agent-Id", "a") + .header("X-ELF-Read-Profile", "private_only") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call search."); + + assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = + serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "NON_ENGLISH_INPUT"); + assert_eq!(json["fields"][0], "$.query"); + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn searches_notes_payload_level_shapes_source_ref_and_structured() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let source_ref = serde_json::json!({ + "schema": "note_source_ref/v1", + "locator": { + "document_id": Uuid::new_v4().to_string(), + "chunk_id": Uuid::new_v4().to_string(), + "revision": "payload-shaping-contract-test" + }, + "metadata": { + "heavy_field": "This field should be hidden when payload_level is below l2." + } + }); + let structured_summary = "Compact structured summary used for payload-level l1 and l2 shaping."; + let note_text = + "Payload shaping note used in contract tests for search details output shaping."; + let note_id = + create_note_for_payload_level_tests(&app, &state, note_text, source_ref.clone()).await; + + insert_note_summary_field(&state, note_id, structured_summary).await; + + let search_response = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/searches") + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("X-ELF-Read-Profile", "private_only") + .header("content-type", "application/json") + .body(Body::from( + serde_json::json!({ + "mode": "quick_find", + "query": "payload shaping", + "top_k": 5, + "candidate_k": 10, + }) + .to_string(), + )) + .expect("Failed to build searches request."), + ) + .await + .expect("Failed to call searches."); + + assert_eq!(search_response.status(), StatusCode::OK); + + let search_body = body::to_bytes(search_response.into_body(), usize::MAX) + .await + .expect("Failed to read searches response body."); + let search_json: serde_json::Value = + serde_json::from_slice(&search_body).expect("Failed to parse searches response."); + let trajectory = &search_json["trajectory_summary"]; + + if !trajectory.is_null() { + assert!(trajectory.is_object()); + assert!(trajectory.get("stages").is_some()); + } + + let search_id = Uuid::parse_str( + search_json["search_id"].as_str().expect("Missing search_id in searches response."), + ) + .expect("Invalid search_id value."); + let notes_l0 = fetch_search_notes_for_payload_level(&app, search_id, note_id, "l0").await; + let notes_l1 = fetch_search_notes_for_payload_level(&app, search_id, note_id, "l1").await; + let notes_l2 = fetch_search_notes_for_payload_level(&app, search_id, note_id, "l2").await; + let search_get_response = app + .clone() + .oneshot( + Request::builder() + .method("GET") + .uri(format!("/v2/searches/{search_id}")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("X-ELF-Read-Profile", "private_only") + .body(Body::empty()) + .expect("Failed to build searches get request."), + ) + .await + .expect("Failed to call searches get."); + + assert_eq!(search_get_response.status(), StatusCode::OK); + + let search_get_body = body::to_bytes(search_get_response.into_body(), usize::MAX) + .await + .expect("Failed to read searches get response body."); + let search_get_json: serde_json::Value = + serde_json::from_slice(&search_get_body).expect("Failed to parse searches get response."); + let search_get_trajectory = &search_get_json["trajectory_summary"]; + + if !search_get_trajectory.is_null() { + assert!(search_get_trajectory.is_object()); + assert!(search_get_trajectory.get("stages").is_some()); + } + + let notes_l0_text = notes_l0["text"].as_str().expect("Missing l0 text."); + let notes_l1_text = notes_l1["text"].as_str().expect("Missing l1 text."); + let notes_l2_text = notes_l2["text"].as_str().expect("Missing l2 text."); + + assert_eq!(notes_l0["source_ref"], serde_json::json!({})); + assert_eq!(notes_l1["source_ref"], serde_json::json!({})); + assert_eq!(notes_l2["source_ref"], source_ref); + assert!(notes_l0["structured"].is_null()); + assert!(notes_l1["structured"].is_object()); + assert!(notes_l2["structured"].is_object()); + assert!(notes_l0_text.len() <= 240); + assert_eq!(notes_l0_text, note_text); + assert_eq!(notes_l1_text, structured_summary); + assert_eq!(notes_l2_text, note_text); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn admin_searches_raw_payload_level_shapes_source_ref() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let admin_app = routes::admin_router(state.clone()); + let source_ref = serde_json::json!({ + "schema": "note_source_ref/v1", + "locator": { + "document_id": Uuid::new_v4().to_string(), + "chunk_id": Uuid::new_v4().to_string(), + "revision": "admin-raw-contract-test" + }, + "metadata": { + "heavy_field": "This field should be hidden when payload_level is below l2." + } + }); + let note_text = + "Admin raw search payload shaping contract note. This long note should be indexed."; + let _note_id = + create_note_for_payload_level_tests(&app, &state, note_text, source_ref.clone()).await; + let raw_l0 = fetch_admin_search_raw_source_ref(&admin_app, "payload shaping", "l0").await; + let raw_l1 = fetch_admin_search_raw_source_ref(&admin_app, "payload shaping", "l1").await; + let raw_l2 = fetch_admin_search_raw_source_ref(&admin_app, "payload shaping", "l2").await; + + assert_eq!(raw_l0, serde_json::json!({})); + assert_eq!(raw_l1, serde_json::json!({})); + assert_eq!(raw_l2, source_ref); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_requires_bearer_header() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let no_auth = app + .clone() + .oneshot(Request::builder().uri("/health").body(Body::empty()).expect("build request")) + .await + .expect("call /health without auth"); + + assert_eq!(no_auth.status(), StatusCode::UNAUTHORIZED); + + let non_bearer_auth = app + .clone() + .oneshot( + Request::builder() + .uri("/health") + .header("Authorization", "Basic secret") + .body(Body::empty()) + .expect("build non-bearer auth request"), + ) + .await + .expect("call /health with non-bearer auth"); + + assert_eq!(non_bearer_auth.status(), StatusCode::UNAUTHORIZED); + + let bearer_auth = app + .oneshot( + Request::builder() + .uri("/health") + .header("Authorization", "Bearer secret") + .body(Body::empty()) + .expect("build bearer auth request"), + ) + .await + .expect("call /health with bearer auth"); + + assert_eq!(bearer_auth.status(), StatusCode::OK); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn static_keys_admin_required_for_org_shared_writes_fixture() +-> Option<(TestDatabase, Router, Uuid)> { + let (test_db, qdrant_url, collection) = test_env().await?; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "user-token-id".to_string(), + token: "user-token".to_string(), + tenant_id: TEST_TENANT_ID.to_string(), + project_id: TEST_PROJECT_ID.to_string(), + agent_id: Some("user-agent".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }, + SecurityAuthKey { + token_id: "admin-token-id".to_string(), + token: "admin-token".to_string(), + tenant_id: TEST_TENANT_ID.to_string(), + project_id: TEST_PROJECT_ID.to_string(), + agent_id: Some("admin-agent".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state.clone()); + let note_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "agent_private", + "admin-agent", + "Fact: org-shared publish setup note.", + ) + .await; + + Some((test_db, app, note_id)) +} + +async fn static_keys_admin_required_for_org_shared_writes_requests(app: &Router, note_id: Uuid) { + static_keys_admin_required_for_org_shared_writes_ingest_checks(app).await; + static_keys_admin_required_for_org_shared_writes_publish_checks(app, note_id).await; + static_keys_admin_required_for_org_shared_writes_grant_checks(app).await; +} + +async fn static_keys_admin_required_for_org_shared_writes_ingest_checks(app: &Router) { + let notes_payload = serde_json::json!({ + "scope": "org_shared", + "notes": [{ + "type": "fact", + "key": null, + "text": "你好", + "importance": 0.5, + "confidence": 0.9, + "ttl_days": null, + "source_ref": {} + }] + }) + .to_string(); + let user_ingest = post_with_authorization_and_json_body( + app, + "/v2/notes/ingest", + "Bearer user-token", + ¬es_payload, + "Failed to build notes ingest request.", + "Failed to call notes ingest.", + ) + .await; + + assert_eq!(user_ingest.status(), StatusCode::FORBIDDEN); + + let admin_ingest = post_with_authorization_and_json_body( + app, + "/v2/notes/ingest", + "Bearer admin-token", + ¬es_payload, + "Failed to build notes ingest request.", + "Failed to call notes ingest (admin).", + ) + .await; + + assert_eq!(admin_ingest.status(), StatusCode::UNPROCESSABLE_ENTITY); + + let admin_ingest_body = body::to_bytes(admin_ingest.into_body(), usize::MAX) + .await + .expect("Failed to read notes ingest response body."); + let admin_ingest_json: serde_json::Value = + serde_json::from_slice(&admin_ingest_body).expect("Failed to parse response."); + + assert_eq!(admin_ingest_json["error_code"], "NON_ENGLISH_INPUT"); +} + +async fn static_keys_admin_required_for_org_shared_writes_publish_checks( + app: &Router, + note_id: Uuid, +) { + let publish_payload = serde_json::json!({ "space": "org_shared" }).to_string(); + let user_publish = post_with_authorization_and_json_body( + app, + &format!("/v2/notes/{note_id}/publish"), + "Bearer user-token", + &publish_payload, + "Failed to build note publish request.", + "Failed to call notes publish.", + ) + .await; + + assert_eq!(user_publish.status(), StatusCode::FORBIDDEN); + + let admin_publish = post_with_authorization_and_json_body( + app, + &format!("/v2/notes/{note_id}/publish"), + "Bearer admin-token", + &publish_payload, + "Failed to build note publish request.", + "Failed to call notes publish (admin).", + ) + .await; + + assert_eq!(admin_publish.status(), StatusCode::OK); +} + +async fn static_keys_admin_required_for_org_shared_writes_grant_checks(app: &Router) { + let grant_upsert_payload = serde_json::json!({ "grantee_kind": "project" }).to_string(); + let user_grant_upsert = post_with_authorization_and_json_body( + app, + "/v2/spaces/org_shared/grants", + "Bearer user-token", + &grant_upsert_payload, + "Failed to build grant upsert request.", + "Failed to call grant upsert.", + ) + .await; + + assert_eq!(user_grant_upsert.status(), StatusCode::FORBIDDEN); + + let admin_grant_upsert = post_with_authorization_and_json_body( + app, + "/v2/spaces/org_shared/grants", + "Bearer admin-token", + &grant_upsert_payload, + "Failed to build grant upsert request.", + "Failed to call grant upsert (admin).", + ) + .await; + + assert_eq!(admin_grant_upsert.status(), StatusCode::OK); + + let grant_revoke_payload = serde_json::json!({ "grantee_kind": "project" }).to_string(); + let user_grant_revoke = post_with_authorization_and_json_body( + app, + "/v2/spaces/org_shared/grants/revoke", + "Bearer user-token", + &grant_revoke_payload, + "Failed to build grant revoke request.", + "Failed to call grant revoke.", + ) + .await; + + assert_eq!(user_grant_revoke.status(), StatusCode::FORBIDDEN); + + let admin_grant_revoke = post_with_authorization_and_json_body( + app, + "/v2/spaces/org_shared/grants/revoke", + "Bearer admin-token", + &grant_revoke_payload, + "Failed to build grant revoke request.", + "Failed to call grant revoke (admin).", + ) + .await; + + assert_eq!(admin_grant_revoke.status(), StatusCode::OK); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_admin_required_for_org_shared_writes() { + let Some((test_db, app, note_id)) = + static_keys_admin_required_for_org_shared_writes_fixture().await + else { + return; + }; + + static_keys_admin_required_for_org_shared_writes_requests(&app, note_id).await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_org_shared_ingest_requires_admin() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "user".to_string(), + token: "user-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }, + SecurityAuthKey { + token_id: "admin".to_string(), + token: "admin-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "org_shared", + "notes": [{ + "type": "fact", + "key": null, + "text": "你好", + "importance": 0.5, + "confidence": 0.9, + "ttl_days": null, + "source_ref": {} + }] + }); + let response_user = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/notes/ingest") + .header("Authorization", "Bearer user-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call notes ingest (user)."); + + assert_eq!(response_user.status(), StatusCode::FORBIDDEN); + + let response_admin = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/notes/ingest") + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call notes ingest (admin)."); + + assert_eq!(response_admin.status(), StatusCode::UNPROCESSABLE_ENTITY); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_org_shared_events_ingest_requires_admin() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "user".to_string(), + token: "user-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }, + SecurityAuthKey { + token_id: "admin".to_string(), + token: "admin-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({ + "scope": "org_shared", + "dry_run": true, + "messages": [{ + "role": "user", + "content": "こんにちは" + }] + }); + let response_user = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/events/ingest") + .header("Authorization", "Bearer user-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call events ingest (user)."); + + assert_eq!(response_user.status(), StatusCode::FORBIDDEN); + + let response_admin = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/events/ingest") + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call events ingest (admin)."); + + assert_eq!(response_admin.status(), StatusCode::UNPROCESSABLE_ENTITY); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_org_shared_publish_requires_admin() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "user".to_string(), + token: "user-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }, + SecurityAuthKey { + token_id: "admin".to_string(), + token: "admin-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let note_id = Uuid::new_v4(); + let payload = serde_json::json!({"space":"org_shared"}).to_string(); + let response_user = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/v2/notes/{note_id}/publish")) + .header("Authorization", "Bearer user-token") + .header("content-type", "application/json") + .body(Body::from(payload.clone())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call note publish (user)."); + + assert_eq!(response_user.status(), StatusCode::FORBIDDEN); + + let response_admin = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/v2/notes/{note_id}/publish")) + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload)) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call note publish (admin)."); + + assert_ne!(response_admin.status(), StatusCode::FORBIDDEN); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn static_keys_org_shared_grants_require_admin() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { return }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "user".to_string(), + token: "user-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + }, + SecurityAuthKey { + token_id: "admin".to_string(), + token: "admin-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::router(state); + let payload = serde_json::json!({"grantee_kind":"project","grantee_agent_id":null}).to_string(); + let response_user = app + .clone() + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/spaces/org_shared/grants") + .header("Authorization", "Bearer user-token") + .header("content-type", "application/json") + .body(Body::from(payload.clone())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call grant upsert (user)."); + + assert_eq!(response_user.status(), StatusCode::FORBIDDEN); + + let response_admin = app + .oneshot( + Request::builder() + .method("POST") + .uri("/v2/spaces/org_shared/grants") + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload)) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call grant upsert (admin)."); + + assert_ne!(response_admin.status(), StatusCode::FORBIDDEN); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn admin_note_provenance_includes_request_id_on_success() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "off".to_string(); + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::admin_router(state.clone()); + let note_id = Uuid::new_v4(); + let request_id = Uuid::new_v4(); + + insert_note( + &state, + note_id, + "agent_private", + TEST_AGENT_A, + "Provenance integration test note.", + ) + .await; + + let response = app + .oneshot( + Request::builder() + .uri(format!("/v2/admin/notes/{note_id}/provenance")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("X-ELF-Request-Id", request_id.to_string()) + .body(Body::empty()) + .expect("Failed to build provenance request."), + ) + .await + .expect("Failed to call admin note provenance."); + + assert_eq!(response.status(), StatusCode::OK); + + let expected_request_id = request_id.to_string(); + + assert_eq!( + response.headers().get("X-ELF-Request-Id").and_then(|value| value.to_str().ok()), + Some(expected_request_id.as_str()) + ); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read provenance response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["schema"], "elf.note_provenance_bundle/v1"); + assert_eq!(json["request_id"], request_id.to_string()); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn admin_note_history_includes_request_id_on_success() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "off".to_string(); + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::admin_router(state.clone()); + let note_id = Uuid::new_v4(); + let request_id = Uuid::new_v4(); + + insert_note(&state, note_id, "agent_private", TEST_AGENT_A, "History integration test note.") + .await; + + let response = app + .oneshot( + Request::builder() + .uri(format!("/v2/admin/notes/{note_id}/history")) + .header("X-ELF-Tenant-Id", TEST_TENANT_ID) + .header("X-ELF-Project-Id", TEST_PROJECT_ID) + .header("X-ELF-Agent-Id", TEST_AGENT_A) + .header("X-ELF-Request-Id", request_id.to_string()) + .body(Body::empty()) + .expect("Failed to build history request."), + ) + .await + .expect("Failed to call admin note history."); + + assert_eq!(response.status(), StatusCode::OK); + + let expected_request_id = request_id.to_string(); + + assert_eq!( + response.headers().get("X-ELF-Request-Id").and_then(|value| value.to_str().ok()), + Some(expected_request_id.as_str()) + ); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read history response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["schema"], "elf.memory_history/v1"); + assert_eq!(json["request_id"], request_id.to_string()); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn admin_note_provenance_rejects_invalid_request_id_header() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "off".to_string(); + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::admin_router(state); + let note_id = Uuid::new_v4(); + let response = app + .oneshot( + Request::builder() + .uri(format!("/v2/admin/notes/{note_id}/provenance")) + .header("X-ELF-Request-Id", "not-a-uuid") + .body(Body::empty()) + .expect("Failed to build provenance request."), + ) + .await + .expect("Failed to call admin note provenance."); + let response_request_id = response + .headers() + .get("X-ELF-Request-Id") + .and_then(|value| value.to_str().ok()) + .expect("Expected request id header in error response."); + let generated_request_id = Uuid::parse_str(response_request_id) + .expect("Expected valid generated request_id in response header."); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); + + let body = body::to_bytes(response.into_body(), usize::MAX) + .await + .expect("Failed to read provenance response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "INVALID_REQUEST"); + assert_eq!(json["fields"][0], "$.headers.X-ELF-Request-Id"); + assert_eq!(json["request_id"], serde_json::Value::String(generated_request_id.to_string()),); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to run."] +async fn global_graph_predicate_write_requires_super_admin() { + let Some((test_db, qdrant_url, collection)) = test_env().await else { + return; + }; + let mut config = test_config(test_db.dsn().to_string(), qdrant_url, collection); + + config.security.auth_mode = "static_keys".to_string(); + config.security.auth_keys = vec![ + SecurityAuthKey { + token_id: "admin".to_string(), + token: "admin-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::Admin, + }, + SecurityAuthKey { + token_id: "super".to_string(), + token: "super-token".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::SuperAdmin, + }, + ]; + + let state = AppState::new(config).await.expect("Failed to initialize app state."); + let app = routes::admin_router(state.clone()); + let predicate_id = Uuid::new_v4(); + + sqlx::query( + "\ + INSERT INTO graph_predicates ( + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at + ) + VALUES ($1, '__global__', NULL, NULL, 'global_test', 'global_test', 'multi', 'pending', now(), now())", + ) + .bind(predicate_id) + .execute(&state.service.db.pool) + .await + .expect("Failed to insert global predicate."); + + let payload = serde_json::json!({ "status": "active" }); + let response_admin = app + .clone() + .oneshot( + Request::builder() + .method("PATCH") + .uri(format!("/v2/admin/graph/predicates/{predicate_id}")) + .header("Authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call admin graph predicate patch (admin)."); + + assert_eq!(response_admin.status(), StatusCode::FORBIDDEN); + + let body = body::to_bytes(response_admin.into_body(), usize::MAX) + .await + .expect("Failed to read response body."); + let json: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse response."); + + assert_eq!(json["error_code"], "SCOPE_DENIED"); + + let response_super = app + .oneshot( + Request::builder() + .method("PATCH") + .uri(format!("/v2/admin/graph/predicates/{predicate_id}")) + .header("Authorization", "Bearer super-token") + .header("content-type", "application/json") + .body(Body::from(payload.to_string())) + .expect("Failed to build request."), + ) + .await + .expect("Failed to call admin graph predicate patch (super_admin)."); + + assert_eq!(response_super.status(), StatusCode::OK); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/apps/elf-cli/Cargo.toml b/apps/elf-cli/Cargo.toml new file mode 100644 index 00000000..cf159fbd --- /dev/null +++ b/apps/elf-cli/Cargo.toml @@ -0,0 +1,17 @@ +[package] +edition = "2024" +name = "elf" +version = "0.2.0" + +[[bin]] +name = "elf" +path = "src/main.rs" + +[dependencies] +clap = { workspace = true } +color-eyre = { workspace = true } +reqwest = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } + +elf-cli = { workspace = true } diff --git a/apps/elf-cli/src/main.rs b/apps/elf-cli/src/main.rs new file mode 100644 index 00000000..680058d1 --- /dev/null +++ b/apps/elf-cli/src/main.rs @@ -0,0 +1,968 @@ +//! Local ELF CLI wrappers for production memory workflows. + +use std::{ + collections::BTreeMap, + io::{self, Write as _}, + path::{Path, PathBuf}, + process::Command, +}; + +use clap::{Args, Parser, Subcommand, ValueEnum}; +use color_eyre::{Result, eyre}; +use reqwest::{Client, Method, RequestBuilder, Response, StatusCode, header::HeaderMap}; +use serde_json::{self, Value}; + +const DEFAULT_API_URL: &str = "http://127.0.0.1:51892"; +const DEFAULT_ADMIN_URL: &str = "http://127.0.0.1:51891"; +const DEFAULT_TENANT_ID: &str = "local-tenant"; +const DEFAULT_PROJECT_ID: &str = "local-project"; +const DEFAULT_AGENT_ID: &str = "local-agent"; +const DEFAULT_READ_PROFILE: &str = "private_only"; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), + about = "Local ELF workflow wrappers over the HTTP API and repo benchmark tasks." +)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Debug, Args)] +struct PublicEndpointArgs { + /// Public ELF API base URL. + #[arg(long, env = "ELF_API_URL", default_value = DEFAULT_API_URL)] + api_url: String, + /// Optional bearer token for static-key auth. + #[arg(long, env = "ELF_USER_TOKEN")] + token: Option, +} + +#[derive(Debug, Args)] +struct AdminEndpointArgs { + /// Admin ELF API base URL. + #[arg(long, env = "ELF_ADMIN_URL", default_value = DEFAULT_ADMIN_URL)] + admin_url: String, + /// Optional admin bearer token for static-key auth. + #[arg(long, env = "ELF_ADMIN_TOKEN")] + admin_token: Option, +} + +#[derive(Clone, Debug, Args)] +struct ContextArgs { + /// Tenant id sent in X-ELF-Tenant-Id. + #[arg(long, env = "ELF_TENANT_ID", default_value = DEFAULT_TENANT_ID)] + tenant_id: String, + /// Project id sent in X-ELF-Project-Id. + #[arg(long, env = "ELF_PROJECT_ID", default_value = DEFAULT_PROJECT_ID)] + project_id: String, + /// Agent id sent in X-ELF-Agent-Id. + #[arg(long, env = "ELF_AGENT_ID", default_value = DEFAULT_AGENT_ID)] + agent_id: String, +} + +#[derive(Clone, Debug, Args)] +struct ReadContextArgs { + #[command(flatten)] + context: ContextArgs, + /// Read profile sent in X-ELF-Read-Profile. + #[arg(long, env = "ELF_READ_PROFILE", default_value = DEFAULT_READ_PROFILE)] + read_profile: String, +} + +#[derive(Debug, Args)] +struct OutputArgs { + /// Pretty-print the JSON output. + #[arg(long)] + pretty: bool, +} + +#[derive(Debug, Args)] +struct AddNoteArgs { + #[command(flatten)] + endpoint: PublicEndpointArgs, + #[command(flatten)] + context: ContextArgs, + #[command(flatten)] + output: OutputArgs, + /// Scope applied to the note. + #[arg(long, default_value = "agent_private")] + scope: String, + /// Memory note type. + #[arg(long = "type", default_value = "fact")] + note_type: String, + /// Optional note key used by the update resolver. + #[arg(long)] + key: Option, + /// English note text. + #[arg(long)] + text: String, + /// Ranking importance value. + #[arg(long, default_value_t = 0.7)] + importance: f32, + /// Ranking confidence value. + #[arg(long, default_value_t = 0.9)] + confidence: f32, + /// Optional TTL override in days. + #[arg(long)] + ttl_days: Option, + /// Operator-visible source id copied into source_ref.ref.source_id. + #[arg(long)] + source_id: Option, + /// Full JSON object source_ref override. + #[arg(long)] + source_ref_json: Option, +} + +#[derive(Debug, Args)] +struct SearchArgs { + #[command(flatten)] + endpoint: PublicEndpointArgs, + #[command(flatten)] + read_context: ReadContextArgs, + #[command(flatten)] + output: OutputArgs, + /// English query string. + #[arg(long)] + query: String, + /// Search mode to request from the service. + #[arg(long, value_enum, default_value_t = SearchMode::QuickFind)] + mode: SearchMode, + /// Number of final items to return. + #[arg(long)] + top_k: Option, + /// Candidate breadth before ranking. + #[arg(long)] + candidate_k: Option, + /// Payload level requested from the service. + #[arg(long, value_enum, default_value_t = PayloadLevel::L0)] + payload_level: PayloadLevel, + /// Optional search filter JSON object. + #[arg(long)] + filter_json: Option, +} + +#[derive(Debug, Args)] +struct StatusArgs { + #[command(flatten)] + endpoint: PublicEndpointArgs, + #[command(flatten)] + output: OutputArgs, +} + +#[derive(Debug, Args)] +struct BackfillArgs { + #[command(flatten)] + output: OutputArgs, + /// Backfill corpus document count override. + #[arg(long)] + docs: Option, + /// Worker concurrency override for the backfill runner. + #[arg(long)] + worker_concurrency: Option, + /// Use the checked-in 10k operator profile task. + #[arg(long)] + ten_k: bool, + /// Use the guarded 100k operator profile task. + #[arg(long, conflicts_with = "ten_k")] + hundred_k: bool, + /// Set the required expensive-run guard for the 100k task. + #[arg(long)] + enable_expensive: bool, + /// Print the resolved task and environment without running it. + #[arg(long)] + dry_run: bool, +} + +#[derive(Debug, Args)] +struct BenchmarkArgs { + #[command(subcommand)] + command: BenchmarkCommand, +} + +#[derive(Debug, Args)] +struct BenchmarkRunArgs { + #[command(flatten)] + output: OutputArgs, + /// Benchmark task wrapper to run. + #[arg(long, value_enum, default_value_t = BenchmarkRunKind::Live)] + kind: BenchmarkRunKind, + /// Project filter passed to ELF_BASELINE_PROJECTS. + #[arg(long)] + projects: Option, + /// Corpus profile passed to ELF_BASELINE_PROFILE. + #[arg(long)] + profile: Option, + /// Private production corpus manifest path. + #[arg(long)] + production_corpus_manifest: Option, + /// Markdown addendum path for production-private-addendum. + #[arg(long)] + private_addendum: Option, + /// Soak duration override in seconds. + #[arg(long)] + soak_seconds: Option, + /// Print the resolved task and environment without running it. + #[arg(long)] + dry_run: bool, +} + +#[derive(Debug, Args)] +struct BenchmarkReportArgs { + #[command(flatten)] + output: OutputArgs, + /// Source live-baseline report JSON path. + #[arg(long)] + report: Option, + /// Markdown output path. + #[arg(long)] + out: Option, + /// Print the resolved task and environment without running it. + #[arg(long)] + dry_run: bool, +} + +#[derive(Debug, Args)] +struct DiagnosticsArgs { + #[command(subcommand)] + command: DiagnosticsCommand, +} + +#[derive(Debug, Args)] +struct AdminPostArgs { + #[command(flatten)] + endpoint: AdminEndpointArgs, + #[command(flatten)] + context: ContextArgs, + #[command(flatten)] + output: OutputArgs, +} + +#[derive(Debug, Args)] +struct AdminSearchArgs { + #[command(flatten)] + endpoint: AdminEndpointArgs, + #[command(flatten)] + read_context: ReadContextArgs, + #[command(flatten)] + output: OutputArgs, + /// English query string. + #[arg(long)] + query: String, + /// Search mode to request from the service. + #[arg(long, value_enum, default_value_t = SearchMode::QuickFind)] + mode: SearchMode, + /// Number of final items to return. + #[arg(long)] + top_k: Option, + /// Candidate breadth before ranking. + #[arg(long)] + candidate_k: Option, + /// Payload level requested from the service. + #[arg(long, value_enum, default_value_t = PayloadLevel::L2)] + payload_level: PayloadLevel, + /// Optional search filter JSON object. + #[arg(long)] + filter_json: Option, +} + +#[derive(Debug, Args)] +struct RecentTracesArgs { + #[command(flatten)] + endpoint: AdminEndpointArgs, + #[command(flatten)] + context: ContextArgs, + #[command(flatten)] + output: OutputArgs, + /// Maximum trace headers to return. + #[arg(long)] + limit: Option, +} + +#[derive(Debug, Args)] +struct TraceBundleArgs { + #[command(flatten)] + endpoint: AdminEndpointArgs, + #[command(flatten)] + context: ContextArgs, + #[command(flatten)] + output: OutputArgs, + /// Trace id to load. + #[arg(long)] + trace_id: String, + /// Bundle mode: bounded or full. + #[arg(long, default_value = "bounded")] + mode: String, + /// Optional per-stage item cap. + #[arg(long)] + stage_items_limit: Option, + /// Optional replay candidate cap. + #[arg(long)] + candidates_limit: Option, +} + +#[derive(Debug, Args)] +struct NoteProvenanceArgs { + #[command(flatten)] + endpoint: AdminEndpointArgs, + #[command(flatten)] + context: ContextArgs, + #[command(flatten)] + output: OutputArgs, + /// Note id to inspect. + #[arg(long)] + note_id: String, +} + +struct JsonRequest<'a> { + method: Method, + base_url: &'a str, + path: &'a str, + token: Option<&'a str>, + context: Option<&'a ContextArgs>, + read_profile: Option<&'a str>, + body: Option<&'a Value>, +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum Commands { + /// Add one deterministic note through POST /v2/notes/ingest. + AddNote(AddNoteArgs), + /// Create a search session through POST /v2/searches. + Search(SearchArgs), + /// Check local API process health. + Status(StatusArgs), + /// Run the checked-in resumable backfill benchmark workflow. + Backfill(BackfillArgs), + /// Run or render checked-in live baseline benchmark reports. + Benchmark(BenchmarkArgs), + /// Read production diagnostics through admin HTTP endpoints. + Diagnostics(DiagnosticsArgs), +} + +#[derive(Clone, Copy, Debug, ValueEnum)] +#[value(rename_all = "snake_case")] +enum SearchMode { + QuickFind, + PlannedSearch, +} +impl SearchMode { + fn as_str(self) -> &'static str { + match self { + Self::QuickFind => "quick_find", + Self::PlannedSearch => "planned_search", + } + } +} + +#[derive(Clone, Copy, Debug, ValueEnum)] +#[value(rename_all = "lower")] +enum PayloadLevel { + L0, + L1, + L2, +} +impl PayloadLevel { + fn as_str(self) -> &'static str { + match self { + Self::L0 => "l0", + Self::L1 => "l1", + Self::L2 => "l2", + } + } +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum BenchmarkCommand { + /// Run one checked-in Docker baseline task. + Run(BenchmarkRunArgs), + /// Render Markdown from a live-baseline JSON report. + Report(BenchmarkReportArgs), +} + +#[derive(Clone, Copy, Debug, ValueEnum)] +#[value(rename_all = "kebab")] +enum BenchmarkRunKind { + Live, + ProductionSynthetic, + ProductionPrivate, + ProductionPrivateAddendum, + Soak, +} +impl BenchmarkRunKind { + fn task_name(self) -> &'static str { + match self { + Self::Live => "baseline-live-docker", + Self::ProductionSynthetic => "baseline-production-synthetic", + Self::ProductionPrivate => "baseline-production-private", + Self::ProductionPrivateAddendum => "baseline-production-private-addendum", + Self::Soak => "baseline-soak-docker", + } + } +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum DiagnosticsCommand { + /// Rebuild Qdrant from Postgres vectors through the admin API. + QdrantRebuild(AdminPostArgs), + /// Run raw admin search and include trace/result/source_ref data. + RawSearch(AdminSearchArgs), + /// List recent persisted search traces. + RecentTraces(RecentTracesArgs), + /// Read a bounded or full trace bundle. + TraceBundle(TraceBundleArgs), + /// Read note provenance, ingest decisions, outbox rows, and recent traces. + NoteProvenance(NoteProvenanceArgs), +} + +fn run_backfill(args: BackfillArgs) -> Result<()> { + let task = if args.hundred_k { + "baseline-backfill-100k-docker" + } else if args.ten_k { + "baseline-backfill-10k-docker" + } else { + "baseline-backfill-docker" + }; + let mut env = BTreeMap::new(); + + if let Some(docs) = args.docs { + env.insert("ELF_BASELINE_BACKFILL_DOCS".to_string(), docs.to_string()); + } + if let Some(worker_concurrency) = args.worker_concurrency { + env.insert("ELF_BASELINE_WORKER_CONCURRENCY".to_string(), worker_concurrency.to_string()); + } + + if args.enable_expensive { + env.insert("ELF_BASELINE_ENABLE_EXPENSIVE".to_string(), "1".to_string()); + } + + run_cargo_make("elf.cli.backfill/v1", task, env, args.dry_run, args.output.pretty) +} + +fn run_benchmark(args: BenchmarkArgs) -> Result<()> { + match args.command { + BenchmarkCommand::Run(args) => run_benchmark_run(args), + BenchmarkCommand::Report(args) => run_benchmark_report(args), + } +} + +fn run_benchmark_run(args: BenchmarkRunArgs) -> Result<()> { + let task = args.kind.task_name(); + let mut env = BTreeMap::new(); + + if let Some(projects) = args.projects { + env.insert("ELF_BASELINE_PROJECTS".to_string(), projects); + } + if let Some(profile) = args.profile { + env.insert("ELF_BASELINE_PROFILE".to_string(), profile); + } + if let Some(path) = args.production_corpus_manifest { + env.insert("ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST".to_string(), path_display(&path)); + } + if let Some(path) = args.private_addendum { + env.insert("ELF_BASELINE_PRIVATE_ADDENDUM".to_string(), path_display(&path)); + } + if let Some(seconds) = args.soak_seconds { + env.insert("ELF_BASELINE_SOAK_SECONDS".to_string(), seconds.to_string()); + } + + run_cargo_make("elf.cli.benchmark_run/v1", task, env, args.dry_run, args.output.pretty) +} + +fn run_benchmark_report(args: BenchmarkReportArgs) -> Result<()> { + let mut env = BTreeMap::new(); + + if let Some(path) = args.report { + env.insert("ELF_BASELINE_REPORT".to_string(), path_display(&path)); + } + if let Some(path) = args.out { + env.insert("ELF_BASELINE_MARKDOWN_REPORT".to_string(), path_display(&path)); + } + + run_cargo_make( + "elf.cli.benchmark_report/v1", + "baseline-live-report", + env, + args.dry_run, + args.output.pretty, + ) +} + +fn search_body( + query: String, + mode: SearchMode, + top_k: Option, + candidate_k: Option, + payload_level: PayloadLevel, + filter_json: Option<&str>, +) -> Result { + let mut body = serde_json::json!({ + "mode": mode.as_str(), + "query": query, + "top_k": top_k, + "candidate_k": candidate_k, + "payload_level": payload_level.as_str(), + }); + + if let Some(filter_json) = filter_json { + body["filter"] = parse_json_object(filter_json, "--filter-json")?; + } + + Ok(body) +} + +fn source_ref(source_id: &Option, source_ref_json: Option<&str>) -> Result { + if let Some(source_ref_json) = source_ref_json { + return parse_json_object(source_ref_json, "--source-ref-json"); + } + + Ok(source_id.as_ref().map_or_else( + || serde_json::json!({}), + |source_id| serde_json::json!({"schema": "elf_cli/v1", "ref": {"source_id": source_id}}), + )) +} + +fn parse_json_object(raw: &str, flag: &str) -> Result { + let value: Value = + serde_json::from_str(raw).map_err(|err| eyre::eyre!("{flag} must be valid JSON: {err}"))?; + + if !value.is_object() { + return Err(eyre::eyre!("{flag} must be a JSON object.")); + } + + Ok(value) +} + +fn add_context_headers(request: RequestBuilder, context: &ContextArgs) -> RequestBuilder { + request + .header("X-ELF-Tenant-Id", &context.tenant_id) + .header("X-ELF-Project-Id", &context.project_id) + .header("X-ELF-Agent-Id", &context.agent_id) +} + +fn run_cargo_make( + schema: &str, + task: &str, + env: BTreeMap, + dry_run: bool, + pretty: bool, +) -> Result<()> { + let command = serde_json::json!({ + "program": "cargo", + "args": ["make", task], + "env": env, + }); + + if dry_run { + let output = serde_json::json!({ + "schema": schema, + "dry_run": true, + "command": command, + }); + + return write_json(&output, pretty); + } + + let output = Command::new("cargo").arg("make").arg(task).envs(env.iter()).output()?; + + io::stderr().write_all(&output.stdout)?; + io::stderr().write_all(&output.stderr)?; + + let status_code = output.status.code(); + let summary = serde_json::json!({ + "schema": schema, + "dry_run": false, + "command": command, + "status_code": status_code, + "success": output.status.success(), + }); + + write_json(&summary, pretty)?; + + if output.status.success() { + Ok(()) + } else { + Err(eyre::eyre!("cargo make {task} failed with status {status_code:?}.")) + } +} + +fn write_json(value: &Value, pretty: bool) -> Result<()> { + if pretty { + serde_json::to_writer_pretty(io::stdout(), value)?; + } else { + serde_json::to_writer(io::stdout(), value)?; + } + + writeln!(io::stdout())?; + + Ok(()) +} + +fn join_url(base_url: &str, path: &str) -> String { + format!("{}/{}", base_url.trim_end_matches('/'), path.trim_start_matches('/')) +} + +fn redact_url(url: &str) -> String { + url.to_string() +} + +fn header_string(headers: &HeaderMap, name: &str) -> Option { + headers.get(name).and_then(|value| value.to_str().ok()).map(str::to_string) +} + +fn path_display(path: &Path) -> String { + path.display().to_string() +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + + run(Cli::parse()).await +} + +async fn run(cli: Cli) -> Result<()> { + let client = Client::new(); + + match cli.command { + Commands::AddNote(args) => run_add_note(&client, args).await, + Commands::Search(args) => run_search(&client, args).await, + Commands::Status(args) => run_status(&client, args).await, + Commands::Backfill(args) => run_backfill(args), + Commands::Benchmark(args) => run_benchmark(args), + Commands::Diagnostics(args) => run_diagnostics(&client, args).await, + } +} + +async fn run_add_note(client: &Client, args: AddNoteArgs) -> Result<()> { + let source_ref = source_ref(&args.source_id, args.source_ref_json.as_deref())?; + let body = serde_json::json!({ + "scope": args.scope, + "notes": [{ + "type": args.note_type, + "key": args.key, + "text": args.text, + "importance": args.importance, + "confidence": args.confidence, + "ttl_days": args.ttl_days, + "source_ref": source_ref, + }], + }); + let response = request_json( + client, + JsonRequest { + method: Method::POST, + base_url: &args.endpoint.api_url, + path: "/v2/notes/ingest", + token: args.endpoint.token.as_deref(), + context: Some(&args.context), + read_profile: None, + body: Some(&body), + }, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.add_note/v1", + "request": { + "api_url": redact_url(&args.endpoint.api_url), + "tenant_id": args.context.tenant_id, + "project_id": args.context.project_id, + "agent_id": args.context.agent_id, + "scope": body["scope"], + "source_id": args.source_id, + "source_ref": body["notes"][0]["source_ref"], + }, + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_search(client: &Client, args: SearchArgs) -> Result<()> { + let body = search_body( + args.query, + args.mode, + args.top_k, + args.candidate_k, + args.payload_level, + args.filter_json.as_deref(), + )?; + let response = request_json( + client, + JsonRequest { + method: Method::POST, + base_url: &args.endpoint.api_url, + path: "/v2/searches", + token: args.endpoint.token.as_deref(), + context: Some(&args.read_context.context), + read_profile: Some(&args.read_context.read_profile), + body: Some(&body), + }, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.search/v1", + "request": { + "api_url": redact_url(&args.endpoint.api_url), + "tenant_id": args.read_context.context.tenant_id, + "project_id": args.read_context.context.project_id, + "agent_id": args.read_context.context.agent_id, + "read_profile": args.read_context.read_profile, + "mode": body["mode"], + "payload_level": body["payload_level"], + }, + "trace_id": response.get("trace_id").cloned().unwrap_or(Value::Null), + "search_id": response.get("search_id").cloned().unwrap_or(Value::Null), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_status(client: &Client, args: StatusArgs) -> Result<()> { + let url = join_url(&args.endpoint.api_url, "/health"); + let mut request = client.get(&url); + + if let Some(token) = args.endpoint.token.as_deref() { + request = request.bearer_auth(token); + } + + let response = request.send().await?; + let status = response.status(); + let request_id = header_string(response.headers(), "x-elf-request-id"); + let body = response.text().await?; + let output = serde_json::json!({ + "schema": "elf.cli.status/v1", + "api": { + "url": redact_url(&args.endpoint.api_url), + "healthy": status == StatusCode::OK, + "status": status.as_u16(), + "request_id": request_id, + "body": body, + }, + }); + + write_json(&output, args.output.pretty)?; + + if status.is_success() { + Ok(()) + } else { + Err(eyre::eyre!("ELF API health check failed with HTTP status {status}.")) + } +} + +async fn run_diagnostics(client: &Client, args: DiagnosticsArgs) -> Result<()> { + match args.command { + DiagnosticsCommand::QdrantRebuild(args) => run_qdrant_rebuild(client, args).await, + DiagnosticsCommand::RawSearch(args) => run_raw_search(client, args).await, + DiagnosticsCommand::RecentTraces(args) => run_recent_traces(client, args).await, + DiagnosticsCommand::TraceBundle(args) => run_trace_bundle(client, args).await, + DiagnosticsCommand::NoteProvenance(args) => run_note_provenance(client, args).await, + } +} + +async fn run_qdrant_rebuild(client: &Client, args: AdminPostArgs) -> Result<()> { + let response = request_json( + client, + JsonRequest { + method: Method::POST, + base_url: &args.endpoint.admin_url, + path: "/v2/admin/qdrant/rebuild", + token: args.endpoint.admin_token.as_deref(), + context: Some(&args.context), + read_profile: None, + body: None, + }, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.diagnostics.qdrant_rebuild/v1", + "admin_url": redact_url(&args.endpoint.admin_url), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_raw_search(client: &Client, args: AdminSearchArgs) -> Result<()> { + let body = search_body( + args.query, + args.mode, + args.top_k, + args.candidate_k, + args.payload_level, + args.filter_json.as_deref(), + )?; + let response = request_json( + client, + JsonRequest { + method: Method::POST, + base_url: &args.endpoint.admin_url, + path: "/v2/admin/searches/raw", + token: args.endpoint.admin_token.as_deref(), + context: Some(&args.read_context.context), + read_profile: Some(&args.read_context.read_profile), + body: Some(&body), + }, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.diagnostics.raw_search/v1", + "request": { + "admin_url": redact_url(&args.endpoint.admin_url), + "tenant_id": args.read_context.context.tenant_id, + "project_id": args.read_context.context.project_id, + "agent_id": args.read_context.context.agent_id, + "read_profile": args.read_context.read_profile, + "mode": body["mode"], + "payload_level": body["payload_level"], + }, + "trace_id": response.get("trace_id").cloned().unwrap_or(Value::Null), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_recent_traces(client: &Client, args: RecentTracesArgs) -> Result<()> { + let mut query = Vec::new(); + + if let Some(limit) = args.limit { + query.push(("limit", limit.to_string())); + } + + let response = request_json_query( + client, + &args.endpoint.admin_url, + "/v2/admin/traces/recent", + args.endpoint.admin_token.as_deref(), + &args.context, + &query, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.diagnostics.recent_traces/v1", + "admin_url": redact_url(&args.endpoint.admin_url), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_trace_bundle(client: &Client, args: TraceBundleArgs) -> Result<()> { + let path = format!("/v2/admin/traces/{}/bundle", args.trace_id); + let mut query = vec![("mode", args.mode)]; + + if let Some(limit) = args.stage_items_limit { + query.push(("stage_items_limit", limit.to_string())); + } + if let Some(limit) = args.candidates_limit { + query.push(("candidates_limit", limit.to_string())); + } + + let response = request_json_query( + client, + &args.endpoint.admin_url, + &path, + args.endpoint.admin_token.as_deref(), + &args.context, + &query, + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.diagnostics.trace_bundle/v1", + "admin_url": redact_url(&args.endpoint.admin_url), + "trace_id": response.pointer("/trace/trace_id").cloned().unwrap_or(Value::Null), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn run_note_provenance(client: &Client, args: NoteProvenanceArgs) -> Result<()> { + let path = format!("/v2/admin/notes/{}/provenance", args.note_id); + let response = request_json_query( + client, + &args.endpoint.admin_url, + &path, + args.endpoint.admin_token.as_deref(), + &args.context, + &[], + ) + .await?; + let output = serde_json::json!({ + "schema": "elf.cli.diagnostics.note_provenance/v1", + "admin_url": redact_url(&args.endpoint.admin_url), + "note_id": response.pointer("/note/note_id").cloned().unwrap_or(Value::String(args.note_id)), + "response": response, + }); + + write_json(&output, args.output.pretty) +} + +async fn request_json(client: &Client, args: JsonRequest<'_>) -> Result { + let mut request = client.request(args.method, join_url(args.base_url, args.path)); + + if let Some(token) = args.token { + request = request.bearer_auth(token); + } + if let Some(context) = args.context { + request = add_context_headers(request, context); + } + if let Some(read_profile) = args.read_profile { + request = request.header("X-ELF-Read-Profile", read_profile); + } + if let Some(body) = args.body { + request = request.json(body); + } + + parse_json_response(request.send().await?).await +} + +async fn request_json_query( + client: &Client, + base_url: &str, + path: &str, + token: Option<&str>, + context: &ContextArgs, + query: &[(&str, String)], +) -> Result { + let mut request = client.get(join_url(base_url, path)).query(query); + + if let Some(token) = token { + request = request.bearer_auth(token); + } + + request = add_context_headers(request, context); + + parse_json_response(request.send().await?).await +} + +async fn parse_json_response(response: Response) -> Result { + let status = response.status(); + let request_id = header_string(response.headers(), "x-elf-request-id"); + let text = response.text().await?; + + if !status.is_success() { + return Err(eyre::eyre!( + "ELF request failed with HTTP status {status} and request_id {}: {text}", + request_id.as_deref().unwrap_or("unknown") + )); + } + if text.trim().is_empty() { + return Ok(serde_json::json!({"status": status.as_u16(), "request_id": request_id})); + } + + serde_json::from_str(&text).map_err(|err| { + eyre::eyre!( + "ELF response was not valid JSON for request_id {}: {err}", + request_id.as_deref().unwrap_or("unknown") + ) + }) +} diff --git a/apps/elf-eval/Cargo.toml b/apps/elf-eval/Cargo.toml index 16ef7fe1..5e0d8baa 100644 --- a/apps/elf-eval/Cargo.toml +++ b/apps/elf-eval/Cargo.toml @@ -1,22 +1,32 @@ [package] -build = "../../build.rs" -edition = "2024" -name = "elf-eval" -version = "0.1.0" +build = "../../build.rs" +default-run = "elf-eval" +edition = "2024" +name = "elf-eval" +version = "0.2.0" [dependencies] +blake3 = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } -elf-cli = { path = "../../packages/elf-cli" } -elf-config = { path = "../../packages/elf-config" } -elf-service = { path = "../../packages/elf-service" } -elf-storage = { path = "../../packages/elf-storage" } +reqwest = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +sqlx = { workspace = true } +time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } uuid = { workspace = true } +elf-chunking = { workspace = true } +elf-cli = { workspace = true } +elf-config = { workspace = true } +elf-domain = { workspace = true } +elf-service = { workspace = true } +elf-storage = { workspace = true } +elf-testkit = { workspace = true } +elf-worker = { workspace = true } + [build-dependencies] vergen-gitcl = { workspace = true } diff --git a/apps/elf-eval/fixtures/agentmemory/sample_session.json b/apps/elf-eval/fixtures/agentmemory/sample_session.json new file mode 100644 index 00000000..c02c4162 --- /dev/null +++ b/apps/elf-eval/fixtures/agentmemory/sample_session.json @@ -0,0 +1,106 @@ +{ + "schema": "agentmemory.fixture/v1", + "fixture_id": "agentmemory-sample-2026-06-08", + "source": { + "system": "agentmemory", + "version": "v0.9.27", + "export_id": "agentmemory-export-sample", + "exported_at": "2026-06-08T06:30:00Z" + }, + "sessions": [ + { + "session_id": "am-session-2026-06-08", + "agent": "codex", + "project": "ELF", + "started_at": "2026-06-08T05:45:00Z", + "ended_at": "2026-06-08T06:10:00Z", + "observations": [ + { + "observation_id": "obs-architecture", + "ts": "2026-06-08T05:50:00Z", + "role": "assistant", + "kind": "implementation_note", + "text": "ELF keeps Postgres as the source of truth and treats Qdrant as a rebuildable derived index.", + "metadata": { + "agentmemory_workspace": "elf-local", + "capture_method": "fixture" + } + }, + { + "observation_id": "obs-policy", + "ts": "2026-06-08T05:55:00Z", + "role": "assistant", + "kind": "implementation_note", + "text": "Imported agentmemory facts must still pass ELF note write policy before they become authoritative notes.", + "metadata": { + "agentmemory_workspace": "elf-local", + "capture_method": "fixture" + } + } + ], + "memories": [ + { + "memory_id": "mem-architecture-sot", + "kind": "fact", + "key": "architecture_sot", + "text": "ELF keeps Postgres as the source of truth and Qdrant as a rebuildable derived index.", + "importance": 0.8, + "confidence": 0.9, + "created_at": "2026-06-08T05:50:00Z", + "updated_at": "2026-06-08T05:50:00Z", + "source_observation_ids": ["obs-architecture"], + "metadata": { + "agentmemory_memory_type": "fact", + "capture_method": "fixture" + } + }, + { + "memory_id": "mem-import-policy", + "kind": "constraint", + "key": "agentmemory_import_policy", + "text": "Agentmemory imports must use ELF ingestion policy instead of writing directly to storage.", + "importance": 0.7, + "confidence": 0.9, + "created_at": "2026-06-08T05:55:00Z", + "updated_at": "2026-06-08T05:55:00Z", + "source_observation_ids": ["obs-policy"], + "metadata": { + "agentmemory_memory_type": "constraint", + "capture_method": "fixture" + } + }, + { + "memory_id": "mem-raw-summary", + "kind": "summary", + "text": "This raw summary is intentionally ignored because the adapter does not infer ELF note types from unsupported agentmemory kinds.", + "importance": 0.4, + "confidence": 0.5, + "created_at": "2026-06-08T06:00:00Z", + "updated_at": "2026-06-08T06:00:00Z", + "source_observation_ids": ["obs-architecture"], + "metadata": { + "agentmemory_memory_type": "summary", + "capture_method": "fixture" + } + } + ], + "retrieval_cases": [ + { + "query_id": "q-architecture-sot", + "query": "where does ELF keep the authoritative memory store", + "expected_memory_ids": ["mem-architecture-sot"], + "agentmemory_results": [ + { + "memory_id": "mem-architecture-sot", + "rank": 1, + "score": 0.98 + } + ], + "metadata": { + "claim_source": "fixture_only" + } + } + ] + } + ] +} diff --git a/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json new file mode 100644 index 00000000..62873c40 --- /dev/null +++ b/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json @@ -0,0 +1,105 @@ +{ + "schema": "elf.production_corpus_manifest/v1", + "manifest_id": "synthetic-coding-agent-prod-corpus-2026-06-09", + "description": "Synthetic, sanitized production-style coding-agent memory corpus for ELF adoption benchmarking.", + "evidence": [ + { + "evidence_id": "issue-xy812-resume", + "category": "issue", + "title": "XY-812 Resume Lane", + "text": "XY-812 resume lane uses branch y/elf-xy-812. The next command is `cargo make trace-gate`; the stale blocker cleared after PR #108 merged." + }, + { + "evidence_id": "pr-110-review", + "category": "pr", + "title": "PR 110 Review Status", + "text": "PR #110 is review-ready for the ELF viewer lane. It passed `cargo make check` and waits for the non-draft review handoff." + }, + { + "evidence_id": "worktree-xy791-repair", + "category": "worktree", + "title": "XY-791 Strict Config Repair", + "text": "Worktree XY-791 recovered strict-config repair after rebase. The exact gate was `cargo make fmt && cargo make lint-fix && cargo make check`." + }, + { + "evidence_id": "runbook-live-baseline", + "category": "runbook", + "title": "Private Production Corpus Runbook", + "text": "Private production fixtures use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` with `cargo make baseline-production-private` and stay out of git." + }, + { + "evidence_id": "decision-qdrant-derived", + "category": "decision", + "title": "Qdrant Derived Index Decision", + "text": "Decision: Qdrant remains a rebuildable derived index. Postgres stores source-of-truth vectors, notes, chunks, and audit rows." + }, + { + "evidence_id": "blocker-stale-qwen-key", + "category": "blocker", + "title": "Stale Provider Key Blocker", + "text": "Stale blocker: missing Qwen key applied only to provider stress runs. The synthetic production corpus uses local deterministic embeddings." + }, + { + "evidence_id": "recovery-xy640-ledger", + "category": "recovery_note", + "title": "XY-640 Ledger Replay Recovery", + "text": "Recovery note: XY-640 ledger replay resumes from checkpoint `ledger-replay-42` and verifies the retained lane with `cargo make test`." + }, + { + "evidence_id": "decision-xy818-supersedes", + "category": "decision", + "title": "Superseded Command Decision", + "text": "Update case: old command `cargo make lint` was superseded by `cargo make lint-fix` for Decodex ELF lanes." + } + ], + "queries": [ + { + "query_id": "q-resume-lane", + "task": "resume_lane", + "query": "How do I resume XY-812 and what command is next?", + "expected_evidence_ids": ["issue-xy812-resume"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["XY-812", "cargo make trace-gate"] + }, + { + "query_id": "q-recover-exact-command", + "task": "recover_exact_command", + "query": "Recover the exact repair gate command for XY-791 strict config.", + "expected_evidence_ids": ["worktree-xy791-repair"], + "allowed_alternate_evidence_ids": ["runbook-live-baseline"], + "expected_terms": ["XY-791", "cargo make fmt && cargo make lint-fix && cargo make check"] + }, + { + "query_id": "q-explain-stale-blocker", + "task": "explain_stale_blocker", + "query": "Why is the missing Qwen key blocker stale for the synthetic production corpus?", + "expected_evidence_ids": ["blocker-stale-qwen-key"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["missing Qwen key", "local deterministic embeddings"] + }, + { + "query_id": "q-find-prior-decision", + "task": "find_prior_decision", + "query": "What prior decision explains why Qdrant can be rebuilt?", + "expected_evidence_ids": ["decision-qdrant-derived"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["Qdrant", "rebuildable derived index"] + }, + { + "query_id": "q-compare-project-status", + "task": "compare_project_status", + "query": "Compare PR #110 and XY-640 status.", + "expected_evidence_ids": ["pr-110-review"], + "allowed_alternate_evidence_ids": ["recovery-xy640-ledger"], + "expected_terms": ["PR #110", "review-ready"] + }, + { + "query_id": "q-detect-contradiction-update", + "task": "detect_contradiction_update", + "query": "Which command superseded cargo make lint for Decodex ELF lanes?", + "expected_evidence_ids": ["decision-xy818-supersedes"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["cargo make lint-fix", "superseded"] + } + ] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json new file mode 100644 index 00000000..68cc2395 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json @@ -0,0 +1,208 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-agentmemory-durable-capture-blocked-001", + "suite": "capture_integration", + "title": "Select the durable agentmemory capture path before scoring hooks", + "encoding": { + "status": "blocked", + "reason": "agentmemory's current Docker baseline still uses a process-local SDK/KV mock, so work-resume and write-policy hook capture cannot be scored until a persistent local session, KV, and index path survives a fresh process.", + "follow_up": { + "title": "Wire agentmemory durable local session capture for work-resume jobs", + "reason": "The fair path is a Docker-contained adapter that persists the agentmemory observation log, KV store, and searchable index between capture and replay processes." + } + }, + "corpus": { + "corpus_id": "first-generation-oss-agentmemory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "agentmemory-selected-durable-path", + "kind": "adapter_plan", + "text": "Selected agentmemory path: run capture hooks into a Docker-local session directory, persist the SDK KV store and searchable index, restart a fresh process, then score work_resume and write-policy prompts against that recovered store.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-selected-durable-path" + }, + "locator": { + "quote": "persist the SDK KV store and searchable index" + } + }, + "created_at": "2026-06-11T10:00:00Z" + }, + { + "evidence_id": "agentmemory-mock-boundary", + "kind": "adapter_blocker", + "text": "Current blocker: the live-baseline adapter registers agentmemory functions against a process-local StateKV Map and in-memory index, so it cannot prove cold-start recovery or hook capture durability.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-mock-boundary" + }, + "locator": { + "quote": "process-local StateKV Map and in-memory index" + } + }, + "created_at": "2026-06-11T10:01:00Z" + }, + { + "evidence_id": "agentmemory-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: agentmemory same-corpus retrieval passing through the mock proves durable coding-agent continuity and write-policy capture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "agentmemory_durable_capture_path_blocked", + "evidence_id": "agentmemory-pass-decoy" + } + }, + "created_at": "2026-06-11T09:59:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "agentmemory remains blocked for durable work-resume and write-policy hook capture. The selected local path is a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; the current StateKV Map and in-memory index cannot prove that.", + "claims": [ + { + "claim_id": "selected_durable_path", + "text": "The selected local path persists the SDK KV store and searchable index across a fresh process.", + "evidence_ids": ["agentmemory-selected-durable-path"], + "confidence": "high" + }, + { + "claim_id": "current_mock_blocker", + "text": "The current StateKV Map and in-memory index cannot prove durable continuity.", + "evidence_ids": ["agentmemory-mock-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["agentmemory-selected-durable-path", "agentmemory-mock-boundary"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + }, + "capture_behaviors": { + "blocked": [ + "agentmemory durable hook capture waits for a persistent Docker-local session, KV, and index path." + ], + "notes": [ + "Same-corpus mock retrieval is not promoted into work-resume or capture integration pass evidence." + ] + } + }, + "timeline": [ + { + "event_id": "agentmemory-durable-path-selected", + "ts": "2026-06-11T10:00:00Z", + "actor": "benchmark", + "action": "selected_durable_adapter_path", + "evidence_ids": ["agentmemory-selected-durable-path"], + "summary": "The next fair agentmemory path must persist capture state across a fresh process." + }, + { + "event_id": "agentmemory-mock-blocker-preserved", + "ts": "2026-06-11T10:01:00Z", + "actor": "benchmark", + "action": "kept_blocked_state", + "evidence_ids": ["agentmemory-mock-boundary"], + "summary": "The current in-memory adapter remains blocked for durable continuity." + } + ], + "prompt": { + "role": "user", + "content": "What local agentmemory path should be used for work-resume and write-policy capture, and can the current mock be scored?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "do_not_promote_mock_smoke"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "selected_durable_path", + "text": "The selected local path persists the SDK KV store and searchable index across a fresh process." + }, + { + "claim_id": "current_mock_blocker", + "text": "The current StateKV Map and in-memory index cannot prove durable continuity." + } + ], + "must_not_include": [ + "same-corpus retrieval passing through the mock proves durable coding-agent continuity" + ], + "evidence_links": { + "selected_durable_path": ["agentmemory-selected-durable-path"], + "current_mock_blocker": ["agentmemory-mock-boundary"] + }, + "answer_type": "blocked_plan", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "agentmemory-selected-durable-path", + "claim_id": "selected_durable_path", + "requirement": "cite", + "quote": "persist the SDK KV store and searchable index" + }, + { + "evidence_id": "agentmemory-mock-boundary", + "claim_id": "current_mock_blocker", + "requirement": "cite", + "quote": "process-local StateKV Map and in-memory index" + } + ], + "negative_traps": [ + { + "trap_id": "mock-smoke-durable-pass", + "type": "unsupported_prior", + "evidence_ids": ["agentmemory-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Keeps the durable path blocked until persistent state is proven." + }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the concrete local path needed for the next adapter." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the selected path and the current mock boundary." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not promote the mock same-corpus smoke into durable continuity proof." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "agentmemory", "capture_integration", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json new file mode 100644 index 00000000..49d0dc92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json @@ -0,0 +1,208 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-claude-mem-hook-viewer-blocked-001", + "suite": "capture_integration", + "title": "Keep claude-mem hook and viewer workflows blocked until Docker-contained", + "encoding": { + "status": "blocked", + "reason": "The current claude-mem Docker baseline exercises repository classes and durable SQLite only; it does not launch hooks, timeline capture, the local viewer, or an operator workflow over the same corpus.", + "follow_up": { + "title": "Encode claude-mem hook capture and viewer workflow in Docker", + "reason": "A fair UX comparison requires hook observations, timeline/viewer readback, and retrieval repair artifacts produced inside the same containerized run." + } + }, + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-hook-viewer-blocker", + "kind": "adapter_blocker", + "text": "claude-mem hook/viewer blocker: the current Docker runner uses repository classes only and does not execute hook capture, local viewer timeline readback, or operator repair workflows.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-hook-viewer-blocker" + }, + "locator": { + "quote": "does not execute hook capture, local viewer timeline readback" + } + }, + "created_at": "2026-06-11T10:50:00Z" + }, + { + "evidence_id": "claude-mem-needed-docker-path", + "kind": "adapter_plan", + "text": "Needed claude-mem path: run hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus, then emit timeline, detail hydration, and repair-command artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-needed-docker-path" + }, + "locator": { + "quote": "run hook capture and viewer/operator readback inside Docker" + } + }, + "created_at": "2026-06-11T10:51:00Z" + }, + { + "evidence_id": "claude-mem-hook-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: repository class tests prove claude-mem hook capture and viewer workflows pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_hook_viewer_blocked", + "evidence_id": "claude-mem-hook-pass-decoy" + } + }, + "created_at": "2026-06-11T10:49:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem hook capture and viewer/operator workflows remain blocked. The current runner uses repository classes only; the next comparable path must run hook capture plus viewer/operator readback inside Docker against the same durable SQLite corpus and emit timeline, hydration, and repair-command artifacts.", + "claims": [ + { + "claim_id": "hook_viewer_blocked", + "text": "The current runner does not execute hook capture or local viewer timeline readback.", + "evidence_ids": ["claude-mem-hook-viewer-blocker"], + "confidence": "high" + }, + { + "claim_id": "needed_docker_path", + "text": "The needed path is hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus.", + "evidence_ids": ["claude-mem-needed-docker-path"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-hook-viewer-blocker", "claude-mem-needed-docker-path"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + }, + "capture_behaviors": { + "blocked": [ + "claude-mem hook capture and viewer/operator readback are not Docker-contained yet." + ], + "notes": [ + "Repository class lifecycle and hydration evidence must not be reused as hook or viewer workflow proof." + ] + } + }, + "timeline": [ + { + "event_id": "claude-mem-hook-viewer-blocker-recorded", + "ts": "2026-06-11T10:50:00Z", + "actor": "benchmark", + "action": "recorded_blocker", + "evidence_ids": ["claude-mem-hook-viewer-blocker"], + "summary": "Hook capture and local viewer readback are outside the current Docker runner." + }, + { + "event_id": "claude-mem-needed-path-recorded", + "ts": "2026-06-11T10:51:00Z", + "actor": "benchmark", + "action": "selected_next_path", + "evidence_ids": ["claude-mem-needed-docker-path"], + "summary": "The next fair path must run hook capture and viewer/operator readback inside Docker." + } + ], + "prompt": { + "role": "user", + "content": "Can claude-mem hook capture and viewer workflows be scored from the current Docker baseline?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "avoid_repository_overclaim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "hook_viewer_blocked", + "text": "The current runner does not execute hook capture or local viewer timeline readback." + }, + { + "claim_id": "needed_docker_path", + "text": "The needed path is hook capture and viewer/operator readback inside Docker against the same durable SQLite corpus." + } + ], + "must_not_include": [ + "repository class tests prove claude-mem hook capture and viewer workflows pass" + ], + "evidence_links": { + "hook_viewer_blocked": ["claude-mem-hook-viewer-blocker"], + "needed_docker_path": ["claude-mem-needed-docker-path"] + }, + "answer_type": "blocked_plan", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-hook-viewer-blocker", + "claim_id": "hook_viewer_blocked", + "requirement": "cite", + "quote": "does not execute hook capture, local viewer timeline readback" + }, + { + "evidence_id": "claude-mem-needed-docker-path", + "claim_id": "needed_docker_path", + "requirement": "explain", + "quote": "run hook capture and viewer/operator readback inside Docker" + } + ], + "negative_traps": [ + { + "trap_id": "repository-class-hook-viewer-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-hook-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Keeps hook/viewer workflow blocked until a Docker-contained run exists." + }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the next comparable Docker path." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current blocker and needed path." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not reuse repository class checks as hook/viewer proof." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "claude-mem", "capture_integration", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json new file mode 100644 index 00000000..48bd8092 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json @@ -0,0 +1,215 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-claude-mem-progressive-disclosure-001", + "suite": "operator_debugging_ux", + "title": "Preserve claude-mem progressive-disclosure evidence boundary", + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-detail-hydration", + "kind": "adapter_artifact", + "text": "claude-mem progressive evidence: the Docker repository path verified search result to getById detail hydration plus listSources source evidence on a durable SQLite repository.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-detail-hydration" + }, + "locator": { + "quote": "getById detail hydration plus listSources source evidence" + } + }, + "created_at": "2026-06-11T10:30:00Z" + }, + { + "evidence_id": "claude-mem-progressive-boundary", + "kind": "claim_boundary", + "text": "claude-mem boundary: repository search-to-detail hydration is useful progressive-disclosure evidence, but it does not execute hooks, timeline capture, viewer workflows, or real-world prompt scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-progressive-boundary" + }, + "locator": { + "quote": "does not execute hooks, timeline capture, viewer workflows" + } + }, + "created_at": "2026-06-11T10:31:00Z" + }, + { + "evidence_id": "claude-mem-viewer-decoy", + "kind": "adapter_state", + "text": "Decoy: repository detail hydration proves claude-mem viewer and hook workflows pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_progressive_disclosure", + "evidence_id": "claude-mem-viewer-decoy" + } + }, + "created_at": "2026-06-11T10:29:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem has Docker-contained progressive-disclosure evidence at the repository layer: search results can be hydrated through getById and listSources on durable SQLite. That should stay separate from hook, timeline, viewer, and real-world prompt scoring, which are not executed by the current runner.", + "claims": [ + { + "claim_id": "repository_progressive_evidence", + "text": "claude-mem search results can be hydrated through getById and listSources on durable SQLite.", + "evidence_ids": ["claude-mem-detail-hydration"], + "confidence": "high" + }, + { + "claim_id": "viewer_hook_boundary", + "text": "Hook, timeline, viewer, and real-world prompt scoring are not executed by the current runner.", + "evidence_ids": ["claude-mem-progressive-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-detail-hydration", "claude-mem-progressive-boundary"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "claude-mem-detail-hydration-recorded", + "ts": "2026-06-11T10:30:00Z", + "actor": "benchmark", + "action": "recorded_progressive_disclosure_evidence", + "evidence_ids": ["claude-mem-detail-hydration"], + "summary": "The Docker repository path exposes search-to-detail/source hydration." + }, + { + "event_id": "claude-mem-viewer-boundary-recorded", + "ts": "2026-06-11T10:31:00Z", + "actor": "benchmark", + "action": "preserved_viewer_hook_boundary", + "evidence_ids": ["claude-mem-progressive-boundary"], + "summary": "Repository hydration is not promoted into hook or viewer pass evidence." + } + ], + "prompt": { + "role": "user", + "content": "What claude-mem progressive-disclosure evidence is measured, and what remains outside the Docker-contained path?", + "job_mode": "debug", + "constraints": ["cite_evidence", "separate_repository_from_viewer", "avoid_hook_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "repository_progressive_evidence", + "text": "claude-mem search results can be hydrated through getById and listSources on durable SQLite." + }, + { + "claim_id": "viewer_hook_boundary", + "text": "Hook, timeline, viewer, and real-world prompt scoring are not executed by the current runner." + } + ], + "must_not_include": [ + "repository detail hydration proves claude-mem viewer and hook workflows pass" + ], + "evidence_links": { + "repository_progressive_evidence": ["claude-mem-detail-hydration"], + "viewer_hook_boundary": ["claude-mem-progressive-boundary"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-detail-hydration", + "claim_id": "repository_progressive_evidence", + "requirement": "cite", + "quote": "getById detail hydration plus listSources source evidence" + }, + { + "evidence_id": "claude-mem-progressive-boundary", + "claim_id": "viewer_hook_boundary", + "requirement": "cite", + "quote": "does not execute hooks, timeline capture, viewer workflows" + } + ], + "negative_traps": [ + { + "trap_id": "repository-hydration-viewer-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-viewer-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Explains the measured progressive-disclosure path." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites detail hydration and boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Separates repository evidence from viewer/hook follow-up." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not promote repository hydration into viewer or hook claims." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "viewer_hook_workflow_not_encoded", + "trace_id": "claude-mem-repository-detail", + "root_cause": "The Docker-contained evidence stops at repository detail/source hydration and does not run the product viewer or hooks.", + "steps_to_root_cause": 2, + "raw_sql_needed": false, + "dropped_candidate_visibility": "repository search result can be hydrated to detail and source rows", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": true, + "replay_command_available": true, + "replay_command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "replay_artifact": "tmp/live-baseline/claude-mem.log", + "viewer_panels": ["Repository Search Result", "Memory Item Detail", "Source List"], + "cli_steps": [ + "run the claude-mem Docker baseline", + "inspect getById detail hydration", + "inspect listSources evidence", + "keep hook and viewer workflows blocked until separately encoded" + ], + "trace_evidence": ["claude-mem-detail-hydration", "claude-mem-progressive-boundary"], + "ux_gaps": [] + }, + "tags": ["external_adapter", "claude-mem", "operator_debugging_ux", "progressive_disclosure", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json new file mode 100644 index 00000000..4fb20191 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-claude-mem-retrieval-repair-001", + "suite": "retrieval", + "title": "Preserve claude-mem retrieval repair evidence after same-corpus miss", + "corpus": { + "corpus_id": "first-generation-oss-claude-mem-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "claude-mem-same-corpus-miss", + "kind": "adapter_artifact", + "text": "claude-mem retrieval repair evidence: the Docker baseline built the durable SQLite repository but same-corpus retrieval returned 0 of 3 expected query checks, so retrieval quality remains wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-same-corpus-miss" + }, + "locator": { + "quote": "same-corpus retrieval returned 0 of 3 expected query checks" + } + }, + "created_at": "2026-06-11T10:40:00Z" + }, + { + "evidence_id": "claude-mem-repair-command", + "kind": "debug_command", + "text": "claude-mem repair command: rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker, then inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json before changing retrieval scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-repair-command" + }, + "locator": { + "quote": "inspect tmp/live-baseline/claude-mem.log" + } + }, + "created_at": "2026-06-11T10:41:00Z" + }, + { + "evidence_id": "claude-mem-retrieval-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: because claude-mem repository lifecycle passed, same-corpus retrieval also passed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "claude_mem_retrieval_repair", + "evidence_id": "claude-mem-retrieval-pass-decoy" + } + }, + "created_at": "2026-06-11T10:39:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "claude-mem retrieval remains wrong_result: the durable SQLite repository built, but same-corpus retrieval returned 0 of 3 expected query checks. The repair path is to rerun the claude-mem baseline, inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json, then fix retrieval before any pass claim.", + "claims": [ + { + "claim_id": "retrieval_wrong_result", + "text": "claude-mem same-corpus retrieval returned 0 of 3 expected query checks.", + "evidence_ids": ["claude-mem-same-corpus-miss"], + "confidence": "high" + }, + { + "claim_id": "repair_artifact_path", + "text": "The repair path is to inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json.", + "evidence_ids": ["claude-mem-repair-command"], + "confidence": "high" + } + ], + "evidence_ids": ["claude-mem-same-corpus-miss", "claude-mem-repair-command"], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "claude-mem-wrong-result-recorded", + "ts": "2026-06-11T10:40:00Z", + "actor": "benchmark", + "action": "recorded_same_corpus_wrong_result", + "evidence_ids": ["claude-mem-same-corpus-miss"], + "summary": "The same-corpus result remains wrong_result despite durable repository lifecycle evidence." + }, + { + "event_id": "claude-mem-repair-artifact-recorded", + "ts": "2026-06-11T10:41:00Z", + "actor": "benchmark", + "action": "recorded_repair_artifact_path", + "evidence_ids": ["claude-mem-repair-command"], + "summary": "The repair path points at the reproducible Docker baseline and logs." + } + ], + "prompt": { + "role": "user", + "content": "Did claude-mem retrieval pass, and what artifact should I inspect to repair the miss?", + "job_mode": "debug", + "constraints": ["cite_evidence", "preserve_wrong_result", "name_repair_artifact"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "retrieval_wrong_result", + "text": "claude-mem same-corpus retrieval returned 0 of 3 expected query checks." + }, + { + "claim_id": "repair_artifact_path", + "text": "The repair path is to inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + } + ], + "must_not_include": [ + "same-corpus retrieval also passed" + ], + "evidence_links": { + "retrieval_wrong_result": ["claude-mem-same-corpus-miss"], + "repair_artifact_path": ["claude-mem-repair-command"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "claude-mem-same-corpus-miss", + "claim_id": "retrieval_wrong_result", + "requirement": "cite", + "quote": "same-corpus retrieval returned 0 of 3 expected query checks" + }, + { + "evidence_id": "claude-mem-repair-command", + "claim_id": "repair_artifact_path", + "requirement": "explain", + "quote": "inspect tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json" + } + ], + "negative_traps": [ + { + "trap_id": "lifecycle-pass-implies-retrieval-pass", + "type": "unsupported_prior", + "evidence_ids": ["claude-mem-retrieval-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Keeps same-corpus retrieval as wrong_result." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the wrong-result artifact and repair command." + }, + "workflow_helpfulness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Names the concrete artifact path for repair." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not infer retrieval pass from lifecycle pass." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "claude-mem", "retrieval", "wrong_result", "repair"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json new file mode 100644 index 00000000..c94b9486 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-memsearch-markdown-rebuild-reload-001", + "suite": "trust_source_of_truth", + "title": "Verify memsearch canonical Markdown rebuild and reload boundary", + "corpus": { + "corpus_id": "first-generation-oss-memsearch-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "memsearch-canonical-markdown-store", + "kind": "source_store", + "text": "memsearch source-store evidence: the canonical Markdown corpus file is the source of truth, and the index is rebuilt by rerunning memsearch index over the file tree.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-canonical-markdown-store" + }, + "locator": { + "quote": "canonical Markdown corpus file is the source of truth" + } + }, + "created_at": "2026-06-11T10:10:00Z" + }, + { + "evidence_id": "memsearch-reload-proof", + "kind": "adapter_artifact", + "text": "memsearch reload proof: the Docker baseline rewrote auth-memory.md, deleted another corpus file, reran memsearch index, and a fresh memsearch search process retrieved the replacement marker while suppressing deleted evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-reload-proof" + }, + "locator": { + "quote": "a fresh memsearch search process retrieved the replacement marker" + } + }, + "created_at": "2026-06-11T10:11:00Z" + }, + { + "evidence_id": "memsearch-suite-pass-decoy", + "kind": "claim_boundary", + "text": "Decoy: because memsearch reload passed a Docker smoke, memsearch has passed the full real-world source-of-truth suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_markdown_rebuild_reload", + "evidence_id": "memsearch-suite-pass-decoy" + } + }, + "created_at": "2026-06-11T10:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "memsearch's comparable source-store path is the canonical Markdown corpus file, with the derived index rebuilt by rerunning memsearch index. The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior, but it must not be promoted to a full real-world suite pass.", + "claims": [ + { + "claim_id": "markdown_is_source_store", + "text": "The canonical Markdown corpus file is the source of truth for memsearch.", + "evidence_ids": ["memsearch-canonical-markdown-store"], + "confidence": "high" + }, + { + "claim_id": "rebuild_reload_smoke", + "text": "The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior.", + "evidence_ids": ["memsearch-reload-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["memsearch-canonical-markdown-store", "memsearch-reload-proof"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "memsearch-markdown-store-selected", + "ts": "2026-06-11T10:10:00Z", + "actor": "benchmark", + "action": "selected_canonical_markdown_store", + "evidence_ids": ["memsearch-canonical-markdown-store"], + "summary": "The memsearch comparable source-store job uses the Markdown corpus as authoritative state." + }, + { + "event_id": "memsearch-reload-artifact-recorded", + "ts": "2026-06-11T10:11:00Z", + "actor": "benchmark", + "action": "recorded_reindex_reload_smoke", + "evidence_ids": ["memsearch-reload-proof"], + "summary": "The Docker smoke supplies command-level reindex/reload evidence." + } + ], + "prompt": { + "role": "user", + "content": "What is the comparable memsearch source-of-truth path, and what does the rebuild/reload evidence prove?", + "job_mode": "answer", + "constraints": ["cite_evidence", "state_claim_boundary", "avoid_suite_promotion"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "markdown_is_source_store", + "text": "The canonical Markdown corpus file is the source of truth for memsearch." + }, + { + "claim_id": "rebuild_reload_smoke", + "text": "The Docker smoke proves rewrite, delete, reindex, and fresh-process reload behavior." + } + ], + "must_not_include": [ + "memsearch has passed the full real-world source-of-truth suite" + ], + "evidence_links": { + "markdown_is_source_store": ["memsearch-canonical-markdown-store"], + "rebuild_reload_smoke": ["memsearch-reload-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "memsearch-canonical-markdown-store", + "claim_id": "markdown_is_source_store", + "requirement": "cite", + "quote": "canonical Markdown corpus file is the source of truth" + }, + { + "evidence_id": "memsearch-reload-proof", + "claim_id": "rebuild_reload_smoke", + "requirement": "cite", + "quote": "a fresh memsearch search process retrieved the replacement marker" + } + ], + "negative_traps": [ + { + "trap_id": "memsearch-smoke-suite-pass", + "type": "unsupported_prior", + "evidence_ids": ["memsearch-suite-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Identifies Markdown as source store and index as rebuildable derived state." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites source-store and reload proof evidence." + }, + "lifecycle_behavior": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Explains rewrite, delete, reindex, and fresh-process reload behavior." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not promote smoke evidence into full suite pass evidence." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "memsearch", "source_store", "markdown", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json new file mode 100644 index 00000000..e3dbacdc --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json @@ -0,0 +1,254 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "first-gen-memsearch-retrieval-debug-001", + "suite": "operator_debugging_ux", + "title": "Debug memsearch retrieval through Markdown file and index artifacts", + "corpus": { + "corpus_id": "first-generation-oss-memsearch-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "memsearch-debug-command", + "kind": "debug_command", + "text": "memsearch retrieval-debug evidence: rerun memsearch search with --top-k, inspect the matching Markdown file, and rerun memsearch index after any file rewrite or delete.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-debug-command" + }, + "locator": { + "quote": "inspect the matching Markdown file" + } + }, + "created_at": "2026-06-11T10:20:00Z" + }, + { + "evidence_id": "memsearch-debug-boundary", + "kind": "claim_boundary", + "text": "memsearch debug boundary: the current adapter exposes CLI search output and canonical Markdown files, but it does not emit staged query-expansion, fusion, rerank, or candidate-drop trace bundles.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-debug-boundary" + }, + "locator": { + "quote": "does not emit staged query-expansion, fusion, rerank, or candidate-drop trace bundles" + } + }, + "created_at": "2026-06-11T10:21:00Z" + }, + { + "evidence_id": "memsearch-trace-decoy", + "kind": "adapter_state", + "text": "Decoy: memsearch exposes the same staged retrieval trajectory and candidate-drop trace bundle as ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "memsearch_retrieval_debug_prompt", + "evidence_id": "memsearch-trace-decoy" + } + }, + "created_at": "2026-06-11T10:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_first_generation_oss", + "answer": { + "content": "For memsearch retrieval debugging, rerun memsearch search with --top-k, inspect the matching Markdown file, and rerun memsearch index after file changes. The useful debug surface is source-file transparency plus CLI replay; staged expansion, fusion, rerank, and candidate-drop trace bundles are not emitted by the current adapter.", + "claims": [ + { + "claim_id": "debug_replay_path", + "text": "Rerun memsearch search with --top-k and inspect the matching Markdown file.", + "evidence_ids": ["memsearch-debug-command"], + "confidence": "high" + }, + { + "claim_id": "trace_boundary", + "text": "The current adapter does not emit staged expansion, fusion, rerank, or candidate-drop trace bundles.", + "evidence_ids": ["memsearch-debug-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["memsearch-debug-command", "memsearch-debug-boundary"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "memsearch-cli-debug", + "failure_stage": "trace_bundle", + "failure_reason": "memsearch exposes CLI replay and source Markdown inspection, not staged retrieval trace bundles.", + "stages": [ + { + "stage_name": "cli.search", + "kept_evidence": ["memsearch-debug-command"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["memsearch-trace-decoy"], + "notes": "CLI replay can reproduce the visible result set." + }, + { + "stage_name": "source.markdown", + "kept_evidence": ["memsearch-debug-command"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The Markdown file remains inspectable as canonical source." + }, + { + "stage_name": "trace_bundle", + "kept_evidence": ["memsearch-debug-boundary"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["memsearch-trace-decoy"], + "notes": "Candidate-drop trace bundles are not encoded for memsearch." + } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "memsearch-debug-path-recorded", + "ts": "2026-06-11T10:20:00Z", + "actor": "benchmark", + "action": "recorded_debug_path", + "evidence_ids": ["memsearch-debug-command"], + "summary": "The retrieval-debug job points at CLI replay and source Markdown inspection." + }, + { + "event_id": "memsearch-trace-boundary-recorded", + "ts": "2026-06-11T10:21:00Z", + "actor": "benchmark", + "action": "recorded_trace_gap", + "evidence_ids": ["memsearch-debug-boundary"], + "summary": "The job keeps staged trace bundles as not encoded for memsearch." + } + ], + "prompt": { + "role": "user", + "content": "How should I debug a wrong memsearch retrieval result, and what trace visibility is not available?", + "job_mode": "debug", + "constraints": ["cite_evidence", "identify_debug_surface", "avoid_trace_overclaim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "debug_replay_path", + "text": "Rerun memsearch search with --top-k and inspect the matching Markdown file." + }, + { + "claim_id": "trace_boundary", + "text": "The current adapter does not emit staged expansion, fusion, rerank, or candidate-drop trace bundles." + } + ], + "must_not_include": [ + "memsearch exposes the same staged retrieval trajectory and candidate-drop trace bundle as ELF" + ], + "evidence_links": { + "debug_replay_path": ["memsearch-debug-command"], + "trace_boundary": ["memsearch-debug-boundary"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "memsearch-debug-command", + "claim_id": "debug_replay_path", + "requirement": "explain", + "quote": "inspect the matching Markdown file" + }, + { + "evidence_id": "memsearch-debug-boundary", + "claim_id": "trace_boundary", + "requirement": "explain", + "quote": "does not emit staged query-expansion, fusion, rerank, or candidate-drop trace bundles" + } + ], + "negative_traps": [ + { + "trap_id": "memsearch-full-trace-decoy", + "type": "unsupported_prior", + "evidence_ids": ["memsearch-trace-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Names the available memsearch debug path." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites CLI/source debug and trace-boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Provides a concrete replay and reindex sequence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not overclaim staged trace visibility." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "memsearch_trace_bundle_not_encoded", + "trace_id": "memsearch-cli-debug", + "root_cause": "memsearch debugging is available through CLI replay and canonical Markdown inspection, while staged candidate-drop trace bundles are not encoded.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not encoded; inspect CLI search output and Markdown source instead", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": false, + "replay_command_available": true, + "replay_command": "memsearch search '' --top-k 10 && memsearch index ", + "replay_artifact": "tmp/live-baseline/memsearch.log", + "viewer_panels": ["CLI Search Output", "Markdown Source File", "Index Rebuild Log"], + "cli_steps": [ + "rerun memsearch search with --top-k", + "open the matching Markdown file", + "edit or delete the canonical file if needed", + "rerun memsearch index", + "rerun search from a fresh process" + ], + "trace_evidence": ["memsearch-debug-command", "memsearch-debug-boundary"], + "ux_gaps": [ + { + "gap_id": "staged-trace-bundle-not-encoded", + "severity": "medium", + "description": "No staged expansion/fusion/rerank/candidate-drop bundle is emitted by the current memsearch adapter.", + "follow_up_issue": "XY-925" + } + ] + }, + "tags": ["external_adapter", "memsearch", "operator_debugging_ux", "retrieval_debug", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json new file mode 100644 index 00000000..bb6d9b92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json @@ -0,0 +1,285 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphify-graph-report-001", + "suite": "knowledge_compilation", + "title": "Score graphify graph-report navigation, stale-source lint, and unsupported summaries", + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphify-graph-summary-output", + "kind": "adapter_artifact", + "text": "graphify representative output: graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes with source file references.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-graph-summary-output" + }, + "locator": { + "quote": "graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes" + } + }, + "created_at": "2026-06-11T17:20:00Z" + }, + { + "evidence_id": "graphify-source-location-output", + "kind": "adapter_artifact", + "text": "graphify source-location output: query output includes source files and line-like locations for generated corpus snippets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-source-location-output" + }, + "locator": { + "quote": "query output includes source files and line-like locations" + } + }, + "created_at": "2026-06-11T17:21:00Z" + }, + { + "evidence_id": "graphify-derived-report-boundary", + "kind": "claim_boundary", + "text": "graphify boundary: GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-derived-report-boundary" + }, + "locator": { + "quote": "GRAPH_REPORT.md is a derived graph/report artifact" + } + }, + "created_at": "2026-06-11T17:22:00Z" + }, + { + "evidence_id": "graphify-stale-source-trap", + "kind": "adapter_state", + "text": "Stale graphify trap: GRAPH_REPORT.md became the authoritative ELF memory store and replaces source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphify_graph_report_wrong_result", + "evidence_id": "graphify-stale-source-trap" + }, + "locator": { + "quote": "GRAPH_REPORT.md became the authoritative ELF memory store" + } + }, + "created_at": "2026-06-11T17:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_graph_rag_external_adapters", + "answer": { + "content": "graphify provides derived graph/report navigation: graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes, and query output carries source-file locations. The graph/report output is derived graph/report evidence, not authoritative ELF memory.", + "claims": [ + { + "claim_id": "graph_report_navigation", + "text": "graphify graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes.", + "evidence_ids": ["graphify-graph-summary-output"], + "confidence": "high" + }, + { + "claim_id": "source_location_citations", + "text": "graphify query output includes source files and line-like locations for generated corpus snippets.", + "evidence_ids": ["graphify-source-location-output"], + "confidence": "high" + }, + { + "claim_id": "derived_report_boundary", + "text": "GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory.", + "evidence_ids": ["graphify-derived-report-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "graphify-graph-summary-output", + "graphify-source-location-output", + "graphify-derived-report-boundary" + ], + "pages": [ + { + "page_id": "graphify:representative-graph-report", + "page_type": "concept", + "title": "graphify Representative Graph Report", + "path": "tmp/real-world-memory/graph-rag/graphify/GRAPH_REPORT.md", + "sections": [ + { + "section_id": "graph-summary", + "heading": "Graph Summary", + "role": "summary", + "content": "graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes.", + "evidence_ids": ["graphify-graph-summary-output"], + "timeline_event_ids": ["graphify-graph-output-recorded"] + }, + { + "section_id": "source-locations", + "heading": "Source Locations", + "role": "citations", + "content": "Query output includes source files and line-like locations for generated corpus snippets.", + "evidence_ids": ["graphify-source-location-output"], + "timeline_event_ids": ["graphify-source-location-recorded"] + }, + { + "section_id": "unsupported-quality-summary", + "heading": "Unsupported Quality Summary", + "role": "summary", + "content": "This fixture does not prove broad graph-navigation quality for graphify or an ELF-over-graphify result.", + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": "The representative fixture is based on bounded graph/report output and not a broad quality evaluation." + } + ], + "backlinks": ["project:elf-memory-service", "entity:qdrant-rebuild"], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:graphify-representative-001", + "second_hash": "blake3:graphify-representative-001", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 4.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "graphify-stale-source-recorded", + "ts": "2026-06-11T17:19:00Z", + "actor": "agent", + "action": "recorded_stale_graph_report_claim", + "evidence_ids": ["graphify-stale-source-trap"], + "summary": "A stale trap claims graphify output became authoritative memory." + }, + { + "event_id": "graphify-graph-output-recorded", + "ts": "2026-06-11T17:20:00Z", + "actor": "adapter", + "action": "recorded_graph_summary_output", + "evidence_ids": ["graphify-graph-summary-output"], + "summary": "The representative output includes graph summary links." + }, + { + "event_id": "graphify-source-location-recorded", + "ts": "2026-06-11T17:21:00Z", + "actor": "adapter", + "action": "recorded_source_locations", + "evidence_ids": ["graphify-source-location-output"], + "summary": "The representative output includes source-file and location citations." + } + ], + "prompt": { + "role": "user", + "content": "Use graphify graph/report output to explain the graph navigation path, source citations, stale-source lint state, and unsupported quality boundary.", + "job_mode": "compile", + "constraints": ["cite_graph_report", "score_stale_lint", "flag_unsupported_summary", "no_broad_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "graph_report_navigation", + "text": "graphify graph.json links the ELF memory service, Qdrant rebuild, and graph report mapping nodes." + }, + { + "claim_id": "source_location_citations", + "text": "graphify query output includes source files and line-like locations for generated corpus snippets." + }, + { + "claim_id": "derived_report_boundary", + "text": "GRAPH_REPORT.md is a derived graph/report artifact and must not be treated as authoritative ELF memory." + } + ], + "must_not_include": [ + "GRAPH_REPORT.md became the authoritative ELF memory store.", + "ELF beats graphify on graph-navigation quality." + ], + "evidence_links": { + "graph_report_navigation": ["graphify-graph-summary-output"], + "source_location_citations": ["graphify-source-location-output"], + "derived_report_boundary": ["graphify-derived-report-boundary"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphify-graph-summary-output", + "claim_id": "graph_report_navigation", + "requirement": "cite", + "quote": "graph.json links the ELF memory service node to Qdrant rebuild and graph report mapping nodes" + }, + { + "evidence_id": "graphify-source-location-output", + "claim_id": "source_location_citations", + "requirement": "cite", + "quote": "query output includes source files and line-like locations" + }, + { + "evidence_id": "graphify-derived-report-boundary", + "claim_id": "derived_report_boundary", + "requirement": "cite", + "quote": "GRAPH_REPORT.md is a derived graph/report artifact" + } + ], + "negative_traps": [ + { + "trap_id": "graphify-authoritative-report-trap", + "type": "stale_fact", + "evidence_ids": ["graphify-stale-source-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must identify the graph/report navigation path and source citation boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must cite graph summary, source-location, and derived-report boundary evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must expose graph report, source citations, stale-source lint, and unsupported-summary handling." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must lint the stale authoritative-report trap instead of silently missing it." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["derived graph/report evidence"], + "fallback_action": "state_bounded_graph_report_boundary" + }, + "tags": ["external_adapter", "graph_rag", "graphify", "graph_report", "stale_source_lint", "unsupported_summary"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json new file mode 100644 index 00000000..1c649e71 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json @@ -0,0 +1,197 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphiti-temporal-validity-001", + "suite": "memory_evolution", + "title": "Keep Graphiti/Zep temporal-validity scoring provider-blocked until current and historical facts return", + "encoding": { + "status": "blocked", + "reason": "Graphiti/Zep representative temporal-validity scoring requires explicit provider configuration before Docker-local Graphiti can return current, historical, and rationale facts with validity windows.", + "follow_up": { + "title": "Run Graphiti/Zep temporal-validity job with explicit provider config", + "reason": "The representative job can score only after Graphiti search output maps current and historical validity-window facts to generated evidence ids." + } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphiti-current-fact-contract", + "kind": "adapter_contract", + "text": "Graphiti/Zep representative contract: a current fact must carry a validity window and map to the generated current evidence id.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-current-fact-contract" + }, + "locator": { + "quote": "a current fact must carry a validity window" + } + }, + "created_at": "2026-06-11T17:15:00Z" + }, + { + "evidence_id": "graphiti-historical-fact-contract", + "kind": "adapter_contract", + "text": "Graphiti/Zep representative contract: a historical fact must remain queryable as historical instead of being presented as the current fact.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-historical-fact-contract" + }, + "locator": { + "quote": "a historical fact must remain queryable as historical" + } + }, + "created_at": "2026-06-11T17:16:00Z" + }, + { + "evidence_id": "graphiti-provider-boundary", + "kind": "adapter_blocker", + "text": "Graphiti/Zep blocker: the live temporal smoke is provider-bound and must report provider_api_key_missing when explicit credentials are absent.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphiti_temporal_validity_blocked", + "evidence_id": "graphiti-provider-boundary" + }, + "locator": { + "quote": "must report provider_api_key_missing when explicit credentials are absent" + } + }, + "created_at": "2026-06-11T17:17:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "graphiti-temporal-contract-recorded", + "ts": "2026-06-11T17:15:00Z", + "actor": "agent", + "action": "recorded_temporal_contract", + "evidence_ids": ["graphiti-current-fact-contract", "graphiti-historical-fact-contract"], + "summary": "Graphiti/Zep representative scoring requires current and historical validity-window facts." + } + ], + "prompt": { + "role": "user", + "content": "Score Graphiti/Zep temporal validity only when current and historical facts with validity windows are returned.", + "job_mode": "answer", + "constraints": ["distinguish_current_from_historical", "cite_temporal_facts", "typed_provider_blocker"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "graphiti_temporal_contract", + "text": "Graphiti/Zep temporal scoring requires current and historical facts with validity windows." + } + ], + "must_not_include": [ + "Graphiti/Zep temporal validity passes without provider-backed output.", + "ELF beats Graphiti/Zep temporal graph memory." + ], + "evidence_links": { + "graphiti_temporal_contract": [ + "graphiti-current-fact-contract", + "graphiti-historical-fact-contract", + "graphiti-provider-boundary" + ] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphiti-current-fact-contract", + "claim_id": "graphiti_temporal_contract", + "requirement": "cite", + "quote": "a current fact must carry a validity window" + }, + { + "evidence_id": "graphiti-historical-fact-contract", + "claim_id": "graphiti_temporal_contract", + "requirement": "cite", + "quote": "a historical fact must remain queryable as historical" + }, + { + "evidence_id": "graphiti-provider-boundary", + "claim_id": "graphiti_temporal_contract", + "requirement": "explain", + "quote": "must report provider_api_key_missing when explicit credentials are absent" + } + ], + "negative_traps": [ + { + "trap_id": "graphiti-providerless-temporal-pass", + "type": "stale_fact", + "evidence_ids": ["graphiti-historical-fact-contract"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Must distinguish current and historical validity windows before scoring." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must preserve the provider-backed temporal boundary." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must cite current, historical, and provider-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Must not report historical facts as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_provider_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["graphiti-current-fact-contract"], + "historical_evidence_ids": ["graphiti-historical-fact-contract"], + "stale_trap_ids": ["graphiti-providerless-temporal-pass"], + "conflicts": [ + { + "conflict_id": "graphiti-current-historical-validity", + "claim_id": "graphiti_temporal_contract", + "current_evidence_id": "graphiti-current-fact-contract", + "historical_evidence_id": "graphiti-historical-fact-contract", + "resolved_by_evidence_id": "graphiti-provider-boundary" + } + ], + "update_rationale": { + "claim_id": "graphiti_temporal_contract", + "evidence_ids": ["graphiti-provider-boundary"], + "available": true + }, + "temporal_validity": { + "required": true, + "encoded": false, + "follow_up": "Run the provider-backed Graphiti/Zep temporal smoke and map validity windows to evidence ids." + } + }, + "tags": ["external_adapter", "graph_rag", "graphiti_zep", "temporal_validity", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json new file mode 100644 index 00000000..7f851b0f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json @@ -0,0 +1,146 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-graphrag-output-tables-001", + "suite": "knowledge_compilation", + "title": "Score GraphRAG output-table citations only after provider-backed tables map to evidence ids", + "encoding": { + "status": "blocked", + "reason": "GraphRAG representative knowledge-synthesis scoring is blocked until an explicitly provider-backed Docker run emits output tables whose document, text-unit, community, and report identifiers map to generated evidence ids.", + "follow_up": { + "title": "Run GraphRAG representative output-table citation job with explicit provider config", + "reason": "The representative job can score graph summaries and citations only after parquet output tables and local-search context are mapped to benchmark evidence ids." + } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "graphrag-output-table-contract", + "kind": "adapter_contract", + "text": "GraphRAG representative contract: score graph summaries only when documents, text_units, communities, community_reports, entities, and relationships tables map to generated evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphrag_output_tables_blocked", + "evidence_id": "graphrag-output-table-contract" + }, + "locator": { + "quote": "documents, text_units, communities, community_reports, entities, and relationships tables" + } + }, + "created_at": "2026-06-11T17:10:00Z" + }, + { + "evidence_id": "graphrag-provider-boundary", + "kind": "adapter_blocker", + "text": "GraphRAG blocker: live indexing and local search require explicit provider configuration; missing provider configuration remains a typed blocker.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "graphrag_output_tables_blocked", + "evidence_id": "graphrag-provider-boundary" + }, + "locator": { + "quote": "live indexing and local search require explicit provider configuration" + } + }, + "created_at": "2026-06-11T17:11:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "graphrag-output-contract-recorded", + "ts": "2026-06-11T17:10:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["graphrag-output-table-contract"], + "summary": "GraphRAG representative scoring requires output tables and source ids." + } + ], + "prompt": { + "role": "user", + "content": "Compile a GraphRAG graph-summary benchmark only when output tables and citations exist.", + "job_mode": "compile", + "constraints": ["cite_output_tables", "score_graph_summaries", "typed_provider_blocker"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "output_table_contract", + "text": "GraphRAG graph-summary scoring requires output tables mapped to generated evidence ids." + } + ], + "must_not_include": [ + "GraphRAG passes graph-summary quality without provider-backed output tables.", + "ELF beats GraphRAG on graph synthesis." + ], + "evidence_links": { + "output_table_contract": ["graphrag-output-table-contract", "graphrag-provider-boundary"] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "graphrag-output-table-contract", + "claim_id": "output_table_contract", + "requirement": "cite", + "quote": "documents, text_units, communities, community_reports, entities, and relationships tables" + }, + { + "evidence_id": "graphrag-provider-boundary", + "claim_id": "output_table_contract", + "requirement": "explain", + "quote": "live indexing and local search require explicit provider configuration" + } + ], + "negative_traps": [ + { + "trap_id": "graphrag-providerless-pass", + "type": "unsupported_claim", + "evidence_ids": ["graphrag-provider-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Must keep GraphRAG provider-backed output as a prerequisite." + }, + "evidence_grounding": { + "weight": 0.45, + "max_points": 1.0, + "criteria": "Must require output-table identifiers before citation scoring." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must identify graph-summary and citation artifacts needed for rerun." + }, + "trap_avoidance": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Must not turn a provider blocker into a graph-synthesis pass." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_provider_blocker" + }, + "tags": ["external_adapter", "graph_rag", "graphrag", "output_tables", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json new file mode 100644 index 00000000..04629878 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json @@ -0,0 +1,141 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-lightrag-context-sources-001", + "suite": "retrieval", + "title": "Score LightRAG context-source references only after the Docker API exports source paths", + "encoding": { + "status": "incomplete", + "reason": "LightRAG representative context-source scoring is incomplete when the opt-in Docker API service is not started or does not export context, references, or file paths for the generated corpus.", + "follow_up": { + "title": "Run LightRAG context-source export with the Docker service profile", + "reason": "The representative job can score source references after /query only_need_context returns generated file paths or content that maps to evidence ids." + } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "lightrag-context-output-contract", + "kind": "adapter_contract", + "text": "LightRAG representative contract: score context navigation only when /query context export returns generated source file paths, source snippets, or reference content mapped to benchmark evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "lightrag_context_sources_incomplete", + "evidence_id": "lightrag-context-output-contract" + }, + "locator": { + "quote": "/query context export returns generated source file paths, source snippets, or reference content" + } + }, + "created_at": "2026-06-11T17:05:00Z" + }, + { + "evidence_id": "lightrag-service-boundary", + "kind": "adapter_blocker", + "text": "LightRAG boundary: missing or unreachable Docker API service is an incomplete setup state, not evidence of graph-RAG citation quality.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "lightrag_context_sources_incomplete", + "evidence_id": "lightrag-service-boundary" + }, + "locator": { + "quote": "missing or unreachable Docker API service is an incomplete setup state" + } + }, + "created_at": "2026-06-11T17:06:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "lightrag-context-contract-recorded", + "ts": "2026-06-11T17:05:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["lightrag-context-output-contract"], + "summary": "LightRAG context-source scoring needs context export with generated source mappings." + } + ], + "prompt": { + "role": "user", + "content": "Score LightRAG source-reference navigation only when context export is available.", + "job_mode": "answer", + "constraints": ["cite_source_paths", "typed_incomplete_setup", "no_graph_rag_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "context_source_contract", + "text": "LightRAG context-source scoring requires exported context or references mapped to evidence ids." + } + ], + "must_not_include": [ + "LightRAG passes representative graph-RAG navigation.", + "ELF beats LightRAG on source-reference navigation." + ], + "evidence_links": { + "context_source_contract": ["lightrag-context-output-contract", "lightrag-service-boundary"] + }, + "answer_type": "typed_incomplete", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "lightrag-context-output-contract", + "claim_id": "context_source_contract", + "requirement": "cite", + "quote": "/query context export returns generated source file paths, source snippets, or reference content" + }, + { + "evidence_id": "lightrag-service-boundary", + "claim_id": "context_source_contract", + "requirement": "explain", + "quote": "missing or unreachable Docker API service is an incomplete setup state" + } + ], + "negative_traps": [ + { + "trap_id": "lightrag-context-pass-claim", + "type": "unsupported_claim", + "evidence_ids": ["lightrag-service-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must preserve incomplete setup status when the API does not export context." + }, + "evidence_grounding": { + "weight": 0.5, + "max_points": 1.0, + "criteria": "Must require generated source paths or content mappings before scoring." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must not treat service reachability as graph-RAG quality." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_incomplete_setup_state" + }, + "tags": ["external_adapter", "graph_rag", "lightrag", "context_sources", "typed_incomplete"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json new file mode 100644 index 00000000..5121966a --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json @@ -0,0 +1,149 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "graph-rag-ragflow-reference-chunks-001", + "suite": "retrieval", + "title": "Keep RAGFlow reference-chunk citation scoring blocked until live chunks map to evidence ids", + "encoding": { + "status": "blocked", + "reason": "RAGFlow reference-chunk citation scoring requires an explicit Docker resource opt-in plus a local API key before returned reference chunks can be mapped to generated evidence ids.", + "follow_up": { + "title": "Run RAGFlow reference-chunk citation job with Docker resource opt-in", + "reason": "The representative job can score only after the RAGFlow smoke returns reference chunks containing document, chunk, and content fields for the generated public corpus." + } + }, + "corpus": { + "corpus_id": "graph-rag-representative-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "ragflow-reference-chunk-contract", + "kind": "adapter_contract", + "text": "RAGFlow representative contract: score only when returned reference chunks include generated document ids, chunk ids, content, and document metadata that map to benchmark evidence ids.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "ragflow_reference_chunks_blocked", + "evidence_id": "ragflow-reference-chunk-contract" + }, + "locator": { + "quote": "returned reference chunks include generated document ids, chunk ids, content, and document metadata" + } + }, + "created_at": "2026-06-11T17:00:00Z" + }, + { + "evidence_id": "ragflow-resource-boundary", + "kind": "adapter_blocker", + "text": "RAGFlow blocker: the checked-in smoke remains typed blocked until Docker resource-envelope opt-in and explicit local API configuration are present.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "ragflow_reference_chunks_blocked", + "evidence_id": "ragflow-resource-boundary" + }, + "locator": { + "quote": "Docker resource-envelope opt-in and explicit local API configuration" + } + }, + "created_at": "2026-06-11T17:01:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "ragflow-reference-contract-recorded", + "ts": "2026-06-11T17:00:00Z", + "actor": "agent", + "action": "recorded_adapter_contract", + "evidence_ids": ["ragflow-reference-chunk-contract"], + "summary": "RAGFlow can be scored only from generated reference chunks with stable evidence mapping." + }, + { + "event_id": "ragflow-blocker-recorded", + "ts": "2026-06-11T17:01:00Z", + "actor": "agent", + "action": "recorded_typed_blocker", + "evidence_ids": ["ragflow-resource-boundary"], + "summary": "RAGFlow representative scoring remains blocked by resource and API setup." + } + ], + "prompt": { + "role": "user", + "content": "Score RAGFlow citation quality only if reference chunks from the generated corpus are available.", + "job_mode": "answer", + "constraints": ["cite_chunk_references", "preserve_typed_blocker", "no_smoke_to_quality_claim"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "reference_chunk_contract", + "text": "RAGFlow citation scoring requires returned reference chunks mapped to generated evidence ids." + } + ], + "must_not_include": [ + "RAGFlow passes broad citation quality.", + "ELF beats RAGFlow on RAG citation quality." + ], + "evidence_links": { + "reference_chunk_contract": ["ragflow-reference-chunk-contract", "ragflow-resource-boundary"] + }, + "answer_type": "typed_blocker", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "ragflow-reference-chunk-contract", + "claim_id": "reference_chunk_contract", + "requirement": "cite", + "quote": "returned reference chunks include generated document ids, chunk ids, content, and document metadata" + }, + { + "evidence_id": "ragflow-resource-boundary", + "claim_id": "reference_chunk_contract", + "requirement": "explain", + "quote": "Docker resource-envelope opt-in and explicit local API configuration" + } + ], + "negative_traps": [ + { + "trap_id": "ragflow-smoke-quality-win", + "type": "unsupported_claim", + "evidence_ids": ["ragflow-resource-boundary"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Must preserve the blocked citation-scoring boundary." + }, + "evidence_grounding": { + "weight": 0.5, + "max_points": 1.0, + "criteria": "Must require reference chunk ids and document metadata before scoring." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Must not convert the smoke contract into a broad RAGFlow quality claim." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "preserve_typed_blocker" + }, + "tags": ["external_adapter", "graph_rag", "ragflow", "reference_chunks", "typed_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json new file mode 100644 index 00000000..0ba49733 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -0,0 +1,2996 @@ +{ + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." + ] + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." + ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." + }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defer/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." + ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." + ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." + }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." + ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." + }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "guide", + "ref": "docs/guide/research/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." + }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." + ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." + ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." + } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." + }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." + } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." + ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." + ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." + ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." + } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." + } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." + }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-lightrag-docker-context", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." + }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." + } + ], + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." + } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphrag-docker", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." + }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." + ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." + } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." + }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." + } + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." + } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." + }, + "run": { + "status": "not_encoded", + "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." + }, + "result": { + "status": "not_encoded", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." + }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No Letta materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "evidence": "Archival memory decision retrieval is not encoded for Letta." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/letta-ai/letta", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta repository", + "url": "https://github.com/letta-ai/letta", + "evidence": "Official source for Letta stateful agents and memory." + }, + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker/", + "evidence": "Official Docker deployment guide and embedding configuration boundary." + } + ], + "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", + "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", + "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", + "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" + } + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." + }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." + }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + } + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." + } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + } + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + } + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." + ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + } + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." + }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphify-docker-graph-report", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." + } + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." + } + } + ] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json new file mode 100644 index 00000000..d950c523 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json @@ -0,0 +1,155 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-dropped-evidence-001", + "suite": "operator_debugging_ux", + "title": "Debug expected evidence dropped after recall filtering", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-dropped-expected", + "kind": "trace", + "text": "Trace 11111111-1111-4111-8111-111111111111 shows the expected note present in recall.candidates before service-side filtering and absent after the read-profile scope filter.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-expected"}}, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "trace-dropped-decoy", + "kind": "note", + "text": "Decoy note: the auth retry policy note ranked first but does not explain the missing expected deployment evidence.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-dropped-decoy"}}, + "created_at": "2026-06-09T02:01:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The expected evidence was dropped after recall by the read-profile filter; the auth retry policy note was only the selected decoy.", + "claims": [ + { + "claim_id": "root_cause", + "text": "The expected evidence was dropped after recall by the read-profile filter.", + "evidence_ids": ["trace-dropped-expected"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-dropped-expected"], + "latency_ms": 2.4, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0}, + "trace_explainability": { + "trace_id": "11111111-1111-4111-8111-111111111111", + "failure_stage": "filter.read_profile", + "failure_reason": "Expected evidence survived recall.candidates but was removed by the read-profile scope filter before final selection.", + "stages": [ + { + "stage_name": "recall.candidates", + "kept_evidence": ["trace-dropped-expected", "trace-dropped-decoy"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "Candidate recall found both expected evidence and the decoy top note." + }, + { + "stage_name": "filter.read_profile", + "kept_evidence": ["trace-dropped-decoy"], + "dropped_evidence": ["trace-dropped-expected"], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "The expected evidence failed the read-profile scope check." + }, + { + "stage_name": "selection.final", + "kept_evidence": ["trace-dropped-decoy"], + "dropped_evidence": ["trace-dropped-expected"], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "Final selection only saw the decoy after filtering." + } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "expected-evidence-recalled", + "ts": "2026-06-09T02:00:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-dropped-expected"], + "summary": "The trace captured recall-stage visibility for the expected evidence before filtering." + } + ], + "prompt": { + "role": "user", + "content": "Why did the memory result miss the expected deployment evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "avoid_repeating_completed_work"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "The expected evidence was dropped after recall by the read-profile filter." + } + ], + "must_not_include": ["No expected deployment evidence was dropped."], + "evidence_links": {"root_cause": ["trace-dropped-expected"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-dropped-expected", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "present in recall.candidates before service-side filtering and absent after the read-profile scope filter" + } + ], + "negative_traps": [ + { + "trap_id": "decoy-top-auth-note", + "type": "decoy_evidence", + "evidence_ids": ["trace-dropped-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Identifies the trace stage that dropped expected evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Uses trace evidence rather than the decoy top note."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Names a concrete repair action."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports the correct root cause."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "expected_evidence_dropped", + "trace_id": "11111111-1111-4111-8111-111111111111", + "viewer_url": "/viewer?trace_id=11111111-1111-4111-8111-111111111111", + "admin_trace_bundle_url": "/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The expected candidate survived recall but was removed by the read-profile scope filter before final selection.", + "steps_to_root_cause": 4, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in Retrieval Funnel and Replay Candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Retrieval Funnel", "Replay Candidates", "Stage Details"], + "cli_steps": ["open viewer trace link", "compare recall before and after filter", "inspect replay candidates", "repair read profile or grant"], + "trace_evidence": ["trace-dropped-expected"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json new file mode 100644 index 00000000..c1562e83 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/provider_latency_failure.json @@ -0,0 +1,107 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-provider-latency-001", + "suite": "operator_debugging_ux", + "title": "Debug provider latency degrading retrieval quality", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-provider-timeout", + "kind": "trace", + "text": "Trace 33333333-3333-4333-8333-333333333333 records provider metadata with embedding provider latency near timeout and expansion fallback to the original query only.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-provider-timeout"}}, + "created_at": "2026-06-09T02:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "Provider latency caused expansion fallback to the original query only, which reduced candidate recall.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Provider latency caused expansion fallback to the original query only.", + "evidence_ids": ["trace-provider-timeout"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-provider-timeout"], + "latency_ms": 4.8, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "provider-timeout-recorded", + "ts": "2026-06-09T02:10:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-provider-timeout"], + "summary": "Provider metadata and stage details recorded degraded expansion behavior." + } + ], + "prompt": { + "role": "user", + "content": "Why did recall get worse during the slow provider window?", + "job_mode": "debug", + "constraints": ["cite_evidence", "state_blockers"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Provider latency caused expansion fallback to the original query only." + } + ], + "must_not_include": ["The corpus did not contain the expected evidence."], + "evidence_links": {"root_cause": ["trace-provider-timeout"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-provider-timeout", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "embedding provider latency near timeout and expansion fallback to the original query only" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses provider and stage metadata."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites trace provider metadata."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests timeout or provider health repair."}, + "latency_resource": {"weight": 0.15, "max_points": 1.0, "criteria": "Reports latency as part of the root cause."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "provider_latency_or_failure", + "trace_id": "33333333-3333-4333-8333-333333333333", + "viewer_url": "/viewer?trace_id=33333333-3333-4333-8333-333333333333", + "admin_trace_bundle_url": "/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "Provider latency forced fallback behavior, shrinking expanded-query recall.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible as low recall counts rather than a post-recall drop", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Providers And Ranking", "Stage Summary", "Stage Details"], + "cli_steps": ["open trace bundle", "inspect provider metadata", "compare expanded queries", "raise timeout or repair provider health"], + "trace_evidence": ["trace-provider-timeout"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "agentmemory_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json new file mode 100644 index 00000000..abd8c048 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rebuild_changed_results.json @@ -0,0 +1,135 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-rebuild-changed-results-001", + "suite": "operator_debugging_ux", + "title": "Debug result changes after Qdrant rebuild", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-before-rebuild", + "kind": "trace", + "text": "Before rebuild, trace 44444444-4444-4444-8444-444444444440 returned an orphan Qdrant candidate that no longer had an active source-of-truth note.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-before-rebuild"}}, + "created_at": "2026-06-09T02:15:00Z" + }, + { + "evidence_id": "trace-after-rebuild", + "kind": "trace", + "text": "After rebuild, trace 44444444-4444-4444-8444-444444444444 shows the orphan candidate removed and the active Postgres-backed note selected.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-after-rebuild"}}, + "created_at": "2026-06-09T02:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "Rebuild changed results because a stale derived-index candidate was removed and the active Postgres-backed note became top result.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Qdrant rebuild removed a stale derived-index candidate and selected the active source-of-truth note.", + "evidence_ids": ["trace-before-rebuild", "trace-after-rebuild"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-before-rebuild", "trace-after-rebuild"], + "latency_ms": 3.3, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "before-rebuild-trace", + "ts": "2026-06-09T02:15:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-before-rebuild"], + "summary": "The pre-rebuild trace included a stale derived-index candidate." + }, + { + "event_id": "after-rebuild-trace", + "ts": "2026-06-09T02:20:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-after-rebuild"], + "summary": "The post-rebuild trace selected only source-of-truth-backed evidence." + } + ], + "prompt": { + "role": "user", + "content": "Why did search change after rebuild?", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Qdrant rebuild removed a stale derived-index candidate and selected the active source-of-truth note." + } + ], + "must_not_include": ["Postgres source-of-truth changed during rebuild."], + "evidence_links": {"root_cause": ["trace-before-rebuild", "trace-after-rebuild"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-before-rebuild", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "orphan Qdrant candidate that no longer had an active source-of-truth note" + }, + { + "evidence_id": "trace-after-rebuild", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "orphan candidate removed and the active Postgres-backed note selected" + } + ], + "negative_traps": [ + { + "trap_id": "treat-qdrant-as-source-of-truth", + "type": "unsupported_prior", + "evidence_ids": ["trace-before-rebuild"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.3, "max_points": 1.0, "criteria": "Compares before and after trace evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Uses both rebuild traces."}, + "workflow_helpfulness": {"weight": 0.25, "max_points": 1.0, "criteria": "Explains source-of-truth versus derived index repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Does not claim Postgres changed."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rebuild_changed_results", + "trace_id": "44444444-4444-4444-8444-444444444444", + "viewer_url": "/viewer?trace_id=44444444-4444-4444-8444-444444444444", + "admin_trace_bundle_url": "/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "Rebuild removed stale derived-index state and restored source-of-truth-backed ranking.", + "steps_to_root_cause": 5, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible by comparing before and after trace candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Replay Candidates", "Selected Final Results"], + "cli_steps": ["open before trace", "open after trace", "compare replay candidates", "confirm active note selected", "keep Qdrant rebuild as repair"], + "trace_evidence": ["trace-before-rebuild", "trace-after-rebuild"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json new file mode 100644 index 00000000..8bdc01e5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/relation_context_mislead.json @@ -0,0 +1,121 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-relation-context-mislead-001", + "suite": "operator_debugging_ux", + "title": "Debug relation context that misleads search", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-relation-context", + "kind": "trace", + "text": "Trace 55555555-5555-4555-8555-555555555555 includes relation_context with deprecated predicate deployment_owner pointing to a stale owner, while the selected note text says the current owner is release engineering.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-relation-context"}}, + "created_at": "2026-06-09T02:25:00Z" + }, + { + "evidence_id": "stale-relation-fact", + "kind": "adapter_state", + "text": "Stale graph fact: deployment_owner points to the old infra group and should not drive the current answer.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "stale-relation-fact"}}, + "created_at": "2026-06-08T02:25:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "Relation context misled the search because a deprecated deployment_owner fact conflicted with the selected note text.", + "claims": [ + { + "claim_id": "root_cause", + "text": "A deprecated relation_context fact conflicted with the selected note text.", + "evidence_ids": ["trace-relation-context"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-relation-context"], + "latency_ms": 2.9, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "relation-context-trace", + "ts": "2026-06-09T02:25:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-relation-context"], + "summary": "The trace captured relation_context and selected note text for the misleading result." + } + ], + "prompt": { + "role": "user", + "content": "Why did graph context point to the wrong owner?", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "A deprecated relation_context fact conflicted with the selected note text." + } + ], + "must_not_include": ["The old infra group is the current owner."], + "evidence_links": {"root_cause": ["trace-relation-context"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-relation-context", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "relation_context with deprecated predicate deployment_owner pointing to a stale owner" + } + ], + "negative_traps": [ + { + "trap_id": "trust-stale-relation", + "type": "stale_fact", + "evidence_ids": ["stale-relation-fact"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses relation context panel evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites trace relation_context evidence."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests relation invalidation or predicate repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Does not trust the stale owner."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact", "use of a negative trap marked failure_if_used = true"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "relation_context_misled_search", + "trace_id": "55555555-5555-4555-8555-555555555555", + "viewer_url": "/viewer?trace_id=55555555-5555-4555-8555-555555555555", + "admin_trace_bundle_url": "/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "A deprecated graph relation remained visible in relation_context and conflicted with the selected note text.", + "steps_to_root_cause": 4, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; misleading context is visible on selected result", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Relation Context", "Stage Details"], + "cli_steps": ["open trace link", "inspect selected result relation count", "open Relation Context", "invalidate stale relation fact"], + "trace_evidence": ["trace-relation-context"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "claude_mem_reference", "openmemory_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json new file mode 100644 index 00000000..5be298b7 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/rerank_bad_candidate.json @@ -0,0 +1,121 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-rerank-bad-candidate-001", + "suite": "operator_debugging_ux", + "title": "Debug rerank promotion of a bad candidate", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-rerank-promotion", + "kind": "trace", + "text": "Trace 22222222-2222-4222-8222-222222222222 shows the correct candidate at retrieval rank 2 and the decoy at retrieval rank 5, then rerank.score promotes the decoy above the correct candidate.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "trace-rerank-promotion"}}, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "rerank-decoy-note", + "kind": "note", + "text": "Decoy note: deployment retry discussion shares query terms but belongs to a different project.", + "source_ref": {"schema": "source_ref/v1", "resolver": "real_world_job_fixture/v1", "ref": {"fixture": "operator_debugging_ux", "evidence_id": "rerank-decoy-note"}}, + "created_at": "2026-06-09T02:06:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The wrong result came from rerank.score promoting a cross-project decoy over the correct retrieval candidate.", + "claims": [ + { + "claim_id": "root_cause", + "text": "Rerank promoted a cross-project decoy above the correct retrieval candidate.", + "evidence_ids": ["trace-rerank-promotion"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-rerank-promotion"], + "latency_ms": 2.1, + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + } + } + }, + "timeline": [ + { + "event_id": "rerank-trace-captured", + "ts": "2026-06-09T02:05:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-rerank-promotion"], + "summary": "The trace captured retrieval ranks and rerank scores for the correct and decoy candidates." + } + ], + "prompt": { + "role": "user", + "content": "Explain why the wrong note ranked first.", + "job_mode": "debug", + "constraints": ["cite_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "Rerank promoted a cross-project decoy above the correct retrieval candidate." + } + ], + "must_not_include": ["The correct candidate was missing from retrieval."], + "evidence_links": {"root_cause": ["trace-rerank-promotion"]}, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-rerank-promotion", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "rerank.score promotes the decoy above the correct candidate" + } + ], + "negative_traps": [ + { + "trap_id": "accept-decoy-as-answer", + "type": "decoy_evidence", + "evidence_ids": ["rerank-decoy-note"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "debuggability": {"weight": 0.35, "max_points": 1.0, "criteria": "Uses rerank and replay candidate evidence."}, + "evidence_grounding": {"weight": 0.3, "max_points": 1.0, "criteria": "Cites the trace rather than the decoy note."}, + "workflow_helpfulness": {"weight": 0.2, "max_points": 1.0, "criteria": "Suggests rerank or scope repair."}, + "answer_correctness": {"weight": 0.15, "max_points": 1.0, "criteria": "Names rerank promotion as the cause."} + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact", "use of a negative trap marked failure_if_used = true"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rerank_promoted_bad_candidate", + "trace_id": "22222222-2222-4222-8222-222222222222", + "viewer_url": "/viewer?trace_id=22222222-2222-4222-8222-222222222222", + "admin_trace_bundle_url": "/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The correct item was in the candidate set, but rerank.score elevated a cross-project decoy.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; visible with lower final rank in Replay Candidates", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Replay Candidates", "Providers And Ranking"], + "cli_steps": ["open trace bundle", "compare retrieval rank with final rank", "inspect rerank score", "tighten scope or rerank inputs"], + "trace_evidence": ["trace-rerank-promotion"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "qmd_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json new file mode 100644 index 00000000..3f670ac7 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json @@ -0,0 +1,160 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-selected-not-narrated-001", + "suite": "operator_debugging_ux", + "title": "Debug evidence selected but not narrated", + "corpus": { + "corpus_id": "operator-debugging-ux-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "trace-selected-not-narrated", + "kind": "trace", + "text": "Trace 66666666-6666-4666-8666-666666666666 shows final selection included supersession evidence for the release owner change, but the generated answer narrated only the current owner and omitted the selected historical handoff evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "operator_debugging_ux", + "evidence_id": "trace-selected-not-narrated" + } + }, + "created_at": "2026-06-11T02:30:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_operator_ux", + "answer": { + "content": "The trace selected the supersession evidence, but the answer did not narrate it.", + "claims": [ + { + "claim_id": "root_cause", + "text": "The trace selected the supersession evidence, but the answer did not narrate it.", + "evidence_ids": ["trace-selected-not-narrated"], + "confidence": "high" + } + ], + "evidence_ids": ["trace-selected-not-narrated"], + "latency_ms": 2.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "66666666-6666-4666-8666-666666666666", + "failure_stage": "selection.narration", + "failure_reason": "The selected evidence was present in the final set, but the answer omitted the historical handoff narration.", + "stages": [ + { + "stage_name": "selection.final", + "kept_evidence": ["trace-selected-not-narrated"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Final selection retained the trace that explains the supersession history." + }, + { + "stage_name": "selection.narration", + "kept_evidence": ["trace-selected-not-narrated"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The narration step did not surface the selected historical handoff evidence." + } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "selected-not-narrated-trace", + "ts": "2026-06-11T02:30:00Z", + "actor": "system", + "action": "captured_trace", + "evidence_ids": ["trace-selected-not-narrated"], + "summary": "The trace captured selected evidence that the final answer failed to narrate." + } + ], + "prompt": { + "role": "user", + "content": "Why did the debug answer miss the release owner handoff even though the trace had the evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "state_repair_action"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "root_cause", + "text": "The trace selected the supersession evidence, but the answer did not narrate it." + } + ], + "must_not_include": ["The supersession evidence was absent from final selection."], + "evidence_links": { + "root_cause": ["trace-selected-not-narrated"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "trace-selected-not-narrated", + "claim_id": "root_cause", + "requirement": "explain", + "quote": "final selection included supersession evidence for the release owner change" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "debuggability": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Identifies that the evidence was selected but not narrated." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites selected trace evidence." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Names a narration or answer-composition repair action." + }, + "answer_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not claim the evidence was absent." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": ["unsupported high-confidence claim about a required decision or fact"] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "selected_but_not_narrated", + "trace_id": "66666666-6666-4666-8666-666666666666", + "viewer_url": "/viewer?trace_id=66666666-6666-4666-8666-666666666666", + "admin_trace_bundle_url": "/v2/admin/traces/66666666-6666-4666-8666-666666666666/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The evidence survived final selection, but answer composition failed to narrate the selected supersession context.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "not dropped; selected evidence is visible in final results and narration stage details", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Selected Final Results", "Stage Details", "Trace"], + "cli_steps": ["open trace bundle", "inspect final selected evidence", "inspect narration stage", "repair answer composition"], + "trace_evidence": ["trace-selected-not-narrated"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "qmd_reference", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json b/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json new file mode 100644 index 00000000..31289144 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json @@ -0,0 +1,183 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-smoke-001", + "suite": "work_resume", + "title": "Resume an ELF retained lane without repeating completed work", + "corpus": { + "corpus_id": "real-world-job-smoke-coding-agent-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "issue-xy812-resume", + "kind": "issue", + "text": "XY-812 resume lane uses branch y/elf-xy-812. The next command is `cargo make trace-gate`; the stale blocker cleared after PR #108 merged.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_smoke", + "evidence_id": "issue-xy812-resume" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "stale-command-trap", + "kind": "decision", + "text": "Old decision: run `cargo make lint` next for XY-812.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_smoke", + "evidence_id": "stale-command-trap" + } + }, + "created_at": "2026-06-08T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_smoke", + "answer": { + "content": "Resume XY-812 on branch y/elf-xy-812 and run `cargo make trace-gate` next; the stale blocker cleared after PR #108 merged.", + "claims": [ + { + "claim_id": "resume_next_command", + "text": "Run `cargo make trace-gate` next for XY-812.", + "evidence_ids": ["issue-xy812-resume"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker_cleared", + "text": "The stale blocker cleared after PR #108 merged.", + "evidence_ids": ["issue-xy812-resume"], + "confidence": "high" + } + ], + "evidence_ids": ["issue-xy812-resume"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy812-stale-decision", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "recorded_old_next_command", + "evidence_ids": ["stale-command-trap"], + "summary": "The old next command said to run cargo make lint." + }, + { + "event_id": "xy812-current-resume", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_resume_evidence", + "evidence_ids": ["issue-xy812-resume"], + "summary": "The current resume evidence changed the next command to cargo make trace-gate and cleared the stale blocker." + } + ], + "prompt": { + "role": "user", + "content": "Resume XY-812 and tell me the next command without repeating completed work.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_repeating_completed_work", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_next_command", + "text": "Run `cargo make trace-gate` next for XY-812." + }, + { + "claim_id": "stale_blocker_cleared", + "text": "The stale blocker cleared after PR #108 merged." + } + ], + "must_not_include": [ + "Run `cargo make lint` next for XY-812.", + "The stale blocker is still active." + ], + "evidence_links": { + "resume_next_command": ["issue-xy812-resume"], + "stale_blocker_cleared": ["issue-xy812-resume"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "issue-xy812-resume", + "claim_id": "resume_next_command", + "requirement": "cite", + "quote": "The next command is `cargo make trace-gate`" + }, + { + "evidence_id": "issue-xy812-resume", + "claim_id": "stale_blocker_cleared", + "requirement": "use", + "quote": "the stale blocker cleared after PR #108 merged" + } + ], + "negative_traps": [ + { + "trap_id": "old-lint-command", + "type": "stale_fact", + "evidence_ids": ["stale-command-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Includes the current next command and current blocker state." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the current issue evidence for every required claim." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use stale command evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Advances the resume job without repeated completed work." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "smoke", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json b/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json new file mode 100644 index 00000000..e0da7b8e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/project_decision_fixture_boundary.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-project-decision-boundary-001", + "suite": "project_decisions", + "title": "Live adapter retrieves the decision that fixture scoring must not imply service behavior", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "fixture-live-service-boundary", + "kind": "decision", + "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "project_decision_fixture_boundary", + "evidence_id": "fixture-live-service-boundary" + } + }, + "created_at": "2026-06-10T06:20:00Z" + }, + { + "evidence_id": "old-fixture-superiority-trap", + "kind": "decision", + "text": "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "project_decision_fixture_boundary", + "evidence_id": "old-fixture-superiority-trap" + } + }, + "created_at": "2026-06-09T06:20:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "old-fixture-superiority-recorded", + "ts": "2026-06-09T06:20:00Z", + "actor": "agent", + "action": "recorded_old_decision", + "evidence_ids": ["old-fixture-superiority-trap"], + "summary": "The old decision incorrectly treated fixture-backed scoring as live service proof." + }, + { + "event_id": "fixture-live-boundary-recorded", + "ts": "2026-06-10T06:20:00Z", + "actor": "agent", + "action": "recorded_current_decision", + "evidence_ids": ["fixture-live-service-boundary"], + "summary": "The current decision requires live_real_world evidence before service/runtime superiority claims." + } + ], + "prompt": { + "role": "user", + "content": "What is the current decision about fixture_backed scoring and live-service behavior claims?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_boundary", + "text": "Current adapter decision: fixture_backed results must not imply live-service behavior; live_real_world evidence is required before service/runtime superiority claims." + } + ], + "must_not_include": [ + "Old adapter decision: fixture_backed scoring alone proves live-service superiority for ELF." + ], + "evidence_links": { + "fixture_boundary": ["fixture-live-service-boundary"] + }, + "answer_type": "decision", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "fixture-live-service-boundary", + "claim_id": "fixture_boundary", + "requirement": "cite", + "quote": "fixture_backed results must not imply live-service behavior" + } + ], + "negative_traps": [ + { + "trap_id": "old-fixture-superiority-claim", + "type": "stale_fact", + "evidence_ids": ["old-fixture-superiority-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the current fixture-backed boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current decision evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the stale superiority decision." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps README/adoption claim boundaries clear." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "project_decisions"] +} diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json b/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json new file mode 100644 index 00000000..8302311c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/retrieval_claim_boundary.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-retrieval-claim-boundary-001", + "suite": "retrieval", + "title": "Live adapter retrieves the live-real-world claim boundary", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "live-real-world-claim-boundary", + "kind": "decision", + "text": "Live adapter claim boundary: qmd and ELF may be reported as `live_real_world` only when generated JSON and Markdown artifacts include command evidence, artifact paths, and typed status.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "retrieval_claim_boundary", + "evidence_id": "live-real-world-claim-boundary" + } + }, + "created_at": "2026-06-10T06:10:00Z" + }, + { + "evidence_id": "fixture-only-claim-trap", + "kind": "decision", + "text": "Incorrect claim: fixture-only ELF scoring is enough to imply live service behavior for real-world jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "retrieval_claim_boundary", + "evidence_id": "fixture-only-claim-trap" + } + }, + "created_at": "2026-06-09T06:10:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "fixture-only-trap-recorded", + "ts": "2026-06-09T06:10:00Z", + "actor": "agent", + "action": "recorded_invalid_claim", + "evidence_ids": ["fixture-only-claim-trap"], + "summary": "An invalid claim conflated fixture-only scoring with live service behavior." + }, + { + "event_id": "live-real-world-boundary-recorded", + "ts": "2026-06-10T06:10:00Z", + "actor": "agent", + "action": "recorded_claim_boundary", + "evidence_ids": ["live-real-world-claim-boundary"], + "summary": "The live claim boundary requires generated JSON/Markdown artifacts and typed status." + } + ], + "prompt": { + "role": "user", + "content": "When may qmd and ELF be reported as live_real_world in the real-world benchmark?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_unsupported_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "claim_boundary", + "text": "Live adapter claim boundary: qmd and ELF may be reported as `live_real_world` only when generated JSON and Markdown artifacts include command evidence, artifact paths, and typed status." + } + ], + "must_not_include": [ + "Incorrect claim: fixture-only ELF scoring is enough to imply live service behavior for real-world jobs." + ], + "evidence_links": { + "claim_boundary": ["live-real-world-claim-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "live-real-world-claim-boundary", + "claim_id": "claim_boundary", + "requirement": "use", + "quote": "generated JSON and Markdown artifacts include command evidence" + } + ], + "negative_traps": [ + { + "trap_id": "fixture-only-live-claim", + "type": "unsupported_claim", + "evidence_ids": ["fixture-only-claim-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the artifact and typed-status boundary for live_real_world claims." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the live-real-world claim boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the fixture-only live-service claim." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the claim boundary explicit." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "retrieval"] +} diff --git a/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json new file mode 100644 index 00000000..d3dd6d44 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_live_adapters/work_resume_exact_next_action.json @@ -0,0 +1,133 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "live-adapter-work-resume-next-action-001", + "suite": "work_resume", + "title": "Live adapter retrieves the current next action instead of a stale baseline command", + "corpus": { + "corpus_id": "real-world-live-adapters-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "xy868-current-next-action", + "kind": "runbook", + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-868.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "work_resume_exact_next_action", + "evidence_id": "xy868-current-next-action" + } + }, + "created_at": "2026-06-10T06:00:00Z" + }, + { + "evidence_id": "xy868-stale-baseline-command", + "kind": "runbook", + "text": "Old XY-868 note: only run `cargo make baseline-live-docker`; do not add live real-world adapter evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_live_adapter_fixture/v1", + "ref": { + "fixture": "work_resume_exact_next_action", + "evidence_id": "xy868-stale-baseline-command" + } + }, + "created_at": "2026-06-09T06:00:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "xy868-stale-note", + "ts": "2026-06-09T06:00:00Z", + "actor": "agent", + "action": "recorded_stale_command", + "evidence_ids": ["xy868-stale-baseline-command"], + "summary": "A stale note pointed only at the same-corpus live-baseline command." + }, + { + "event_id": "xy868-current-live-adapter-action", + "ts": "2026-06-10T06:00:00Z", + "actor": "agent", + "action": "recorded_current_next_action", + "evidence_ids": ["xy868-current-next-action"], + "summary": "The current note identifies the live-adapter task and pre-push validation sequence." + } + ], + "prompt": { + "role": "user", + "content": "What is the exact next action and validation sequence for XY-868 live real-world adapters?", + "job_mode": "resume", + "constraints": ["cite_evidence", "avoid_stale_facts", "state_exact_next_action"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "next_action", + "text": "Exact next action for XY-868: run `cargo make real-world-memory-live-adapters`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-868." + } + ], + "must_not_include": [ + "Old XY-868 note: only run `cargo make baseline-live-docker`; do not add live real-world adapter evidence." + ], + "evidence_links": { + "next_action": ["xy868-current-next-action"] + }, + "answer_type": "work_plan", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy868-current-next-action", + "claim_id": "next_action", + "requirement": "cite", + "quote": "run `cargo make real-world-memory-live-adapters`" + } + ], + "negative_traps": [ + { + "trap_id": "stale-baseline-only-command", + "type": "stale_fact", + "evidence_ids": ["xy868-stale-baseline-command"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the current live-adapter command and validation sequence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current next-action evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the stale same-corpus live-baseline command." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the answer executable." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The live adapter did not retrieve that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "live_real_world", "work_resume"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json new file mode 100644 index 00000000..6e5f0e9b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/redaction_exclusion.json @@ -0,0 +1,226 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-redaction-exclusion-001", + "suite": "capture_integration", + "title": "Capture a public decision while excluding private text", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-09", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF live add_note capture can persist public evidence with source ids and skip excluded evidence ids through the Docker live adapter." + ], + "fixture_backed": [ + "The fixture encodes public capture, write-policy audit evidence, and a private excluded span as a negative trap." + ], + "blocked": [ + "agentmemory hook breadth remains blocked until a durable local session/capture adapter replaces the in-memory mock.", + "claude-mem hook and viewer capture remain not encoded because the Docker baseline does not execute hooks, timeline, or viewer workflows." + ], + "not_encoded": [ + "Host-global capture hooks, Slack/browser capture, and credentialed tool capture are outside this Docker-scoped benchmark." + ], + "notes": [ + "Live ELF scoring must not store or retrieve the private-excluded-text evidence id." + ] + }, + "items": [ + { + "evidence_id": "public-captured-decision", + "kind": "message", + "text": "Public memory candidate: benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count.", + "capture": { + "action": "store", + "source_id": "capture:linear-comment-933", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "public-captured-decision" + }, + "locator": { + "quote": "benchmark reports should include evidence coverage" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "write-policy-audit", + "kind": "trace", + "text": "Write policy audit: private spans were excluded before evidence binding, and the stored note only retained the public benchmark-report requirement.", + "capture": { + "action": "store", + "source_id": "capture:write-policy-audit-933", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "write-policy-audit" + }, + "locator": { + "quote": "private spans were excluded before evidence binding" + } + }, + "created_at": "2026-06-09T02:01:00Z" + }, + { + "evidence_id": "private-excluded-text", + "kind": "message", + "text": "Private excluded text: operator's home server label is cobalt-ridge.", + "capture": { + "action": "exclude", + "source_id": "capture:excluded-private-span-933", + "evidence_binding": "negative_trap" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "redaction_exclusion", + "evidence_id": "private-excluded-text" + } + }, + "created_at": "2026-06-09T02:00:30Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Remember the public benchmark-report requirement: reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count. The write-policy audit says private spans were excluded before evidence binding, so the private span must not be repeated.", + "claims": [ + { + "claim_id": "public_requirement_remembered", + "text": "Benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count.", + "evidence_ids": ["public-captured-decision"], + "confidence": "high" + }, + { + "claim_id": "private_span_excluded", + "text": "Private spans were excluded before evidence binding.", + "evidence_ids": ["write-policy-audit"], + "confidence": "high" + } + ], + "evidence_ids": ["public-captured-decision", "write-policy-audit"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "message-ingested-with-policy", + "ts": "2026-06-09T02:00:00Z", + "actor": "agent", + "action": "captured_message", + "evidence_ids": ["public-captured-decision", "private-excluded-text"], + "summary": "The source message contained public benchmark requirements and a private excluded span." + }, + { + "event_id": "policy-audit-recorded", + "ts": "2026-06-09T02:01:00Z", + "actor": "tool", + "action": "recorded_write_policy_audit", + "evidence_ids": ["write-policy-audit"], + "summary": "The audit shows private spans were excluded before evidence binding." + } + ], + "prompt": { + "role": "user", + "content": "What should the benchmark report remember from the captured message, and did the private text leak?", + "job_mode": "answer", + "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "public_requirement_remembered", + "text": "Benchmark reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count." + }, + { + "claim_id": "private_span_excluded", + "text": "Private spans were excluded before evidence binding." + } + ], + "must_not_include": [ + "cobalt-ridge", + "operator's home server label" + ], + "evidence_links": { + "public_requirement_remembered": ["public-captured-decision"], + "private_span_excluded": ["write-policy-audit"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "public-captured-decision", + "claim_id": "public_requirement_remembered", + "requirement": "cite", + "quote": "benchmark reports should include evidence coverage" + }, + { + "evidence_id": "write-policy-audit", + "claim_id": "private_span_excluded", + "requirement": "explain", + "quote": "private spans were excluded before evidence binding" + } + ], + "negative_traps": [ + { + "trap_id": "private-redacted-span", + "type": "privacy_leak", + "evidence_ids": ["private-excluded-text"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the public memory requirement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites both public capture and write-policy audit evidence." + }, + "trap_avoidance": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Does not cite or repeat private excluded text." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Honors the redaction/private exclusion write policy." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "redaction", "write_policy", "privacy_leak", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json new file mode 100644 index 00000000..1d7bf424 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/source_id_evidence_binding.json @@ -0,0 +1,187 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-source-id-binding-001", + "suite": "capture_integration", + "title": "Preserve source ids and evidence binding during live capture", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-11", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF live add_note capture stores source_id values in source_ref and returns evidence-bound notes through search_raw." + ], + "blocked": [ + "agentmemory host-global capture hooks are not installed; durable capture breadth remains blocked until a Docker-local session path exists.", + "claude-mem hook/viewer capture breadth remains not encoded in the Docker baseline." + ], + "notes": [ + "This job is a source-id and evidence-binding check, not a host-global hook installation." + ] + }, + "items": [ + { + "evidence_id": "source-id-release-summary", + "kind": "message", + "text": "Public capture: The source id capture:issue-comment-42 is bound to the release-summary requirement. Public audit: source ids remained attached to evidence-bound notes.", + "capture": { + "action": "store", + "source_id": "capture:issue-comment-42", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_id_evidence_binding", + "evidence_id": "source-id-release-summary" + }, + "locator": { + "quote": "source ids remained attached to evidence-bound notes" + } + }, + "created_at": "2026-06-11T04:10:00Z" + }, + { + "evidence_id": "source-id-command-log", + "kind": "trace", + "text": "Public capture: command log source id capture:command-log-7 proves the benchmark ran inside Docker and did not require host-global hooks.", + "capture": { + "action": "store", + "source_id": "capture:command-log-7", + "evidence_binding": "source_ref" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_id_evidence_binding", + "evidence_id": "source-id-command-log" + }, + "locator": { + "quote": "did not require host-global hooks" + } + }, + "created_at": "2026-06-11T04:11:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The release-summary requirement is bound to source id capture:issue-comment-42, and source ids remained attached to evidence-bound notes. The command-log evidence says the benchmark ran inside Docker and did not require host-global hooks.", + "claims": [ + { + "claim_id": "source_id_bound", + "text": "The source id capture:issue-comment-42 is bound to the release-summary requirement.", + "evidence_ids": ["source-id-release-summary"], + "confidence": "high" + }, + { + "claim_id": "docker_no_host_hooks", + "text": "The benchmark ran inside Docker and did not require host-global hooks.", + "evidence_ids": ["source-id-command-log"], + "confidence": "high" + } + ], + "evidence_ids": ["source-id-release-summary", "source-id-command-log"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "source-id-capture-import", + "ts": "2026-06-11T04:10:00Z", + "actor": "agent", + "action": "captured_source_ids", + "evidence_ids": ["source-id-release-summary", "source-id-command-log"], + "summary": "The capture path recorded source ids in evidence-bound source refs." + } + ], + "prompt": { + "role": "user", + "content": "Which source ids were preserved by live capture, and did the benchmark require host-global hooks?", + "job_mode": "answer", + "constraints": ["cite_evidence", "preserve_source_ids", "do_not_claim_host_global_hooks"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "source_id_bound", + "text": "The source id capture:issue-comment-42 is bound to the release-summary requirement." + }, + { + "claim_id": "docker_no_host_hooks", + "text": "The benchmark ran inside Docker and did not require host-global hooks." + } + ], + "must_not_include": [ + "host-global hooks were installed", + "agentmemory hooks passed", + "claude-mem hooks passed" + ], + "evidence_links": { + "source_id_bound": ["source-id-release-summary"], + "docker_no_host_hooks": ["source-id-command-log"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "source-id-release-summary", + "claim_id": "source_id_bound", + "requirement": "cite", + "quote": "source ids remained attached to evidence-bound notes" + }, + { + "evidence_id": "source-id-command-log", + "claim_id": "docker_no_host_hooks", + "requirement": "cite", + "quote": "did not require host-global hooks" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Reports preserved source ids and Docker-scoped capture." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both source-id evidence records." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not claim host-global hooks or external hook passes." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves source ids as evidence-bound capture metadata." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about external hook execution", + "missing evidence binding for a source-id claim" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "capture_integration", "source_id", "evidence_binding", "no_host_global_hooks"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json b/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json new file mode 100644 index 00000000..fae8459b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/capture_integration/write_policy_redaction.json @@ -0,0 +1,203 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-write-policy-redaction-001", + "suite": "capture_integration", + "title": "Apply write-policy redaction before live capture storage", + "corpus": { + "corpus_id": "real-world-memory-capture-2026-06-11", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "ELF live add_note capture applies write_policy redactions before storage and records write-policy audit counts in materialization artifacts." + ], + "blocked": [ + "agentmemory hook breadth remains blocked by the current in-memory storage adapter.", + "claude-mem hook capture remains not encoded because hooks, timeline, observations, and viewer workflows are not executed by the Docker baseline." + ], + "not_encoded": [ + "Host-global automatic capture hooks are intentionally not installed by this benchmark." + ], + "notes": [ + "The synthetic private token label is a negative trap and must not appear in live generated answers." + ] + }, + "items": [ + { + "evidence_id": "redacted-source-message", + "kind": "message", + "text": "Public capture: The live capture report must preserve source id capture:terminal-log-17. Private token label: orchid-envelope. Public audit: the private token label was redacted before storage.", + "capture": { + "action": "store", + "source_id": "capture:terminal-log-17", + "evidence_binding": "source_ref", + "write_policy": { + "exclusions": [], + "redactions": [ + { + "kind": "replace", + "span": { + "start": 110, + "end": 125 + }, + "replacement": "[redacted-token]" + } + ] + } + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "write_policy_redaction", + "evidence_id": "redacted-source-message" + }, + "locator": { + "quote": "the private token label was redacted before storage" + } + }, + "created_at": "2026-06-11T04:00:00Z" + }, + { + "evidence_id": "redacted-private-token-trap", + "kind": "message", + "text": "orchid-envelope", + "capture": { + "action": "exclude", + "source_id": "capture:terminal-log-17-private-token", + "evidence_binding": "negative_trap" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "write_policy_redaction", + "evidence_id": "redacted-private-token-trap" + } + }, + "created_at": "2026-06-11T04:00:01Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "The live capture report must preserve source id capture:terminal-log-17. The write-policy audit says the private token label was redacted before storage, and the private token value must not be repeated.", + "claims": [ + { + "claim_id": "source_id_preserved", + "text": "The live capture report must preserve source id capture:terminal-log-17.", + "evidence_ids": ["redacted-source-message"], + "confidence": "high" + }, + { + "claim_id": "redaction_applied", + "text": "The private token label was redacted before storage.", + "evidence_ids": ["redacted-source-message"], + "confidence": "high" + } + ], + "evidence_ids": ["redacted-source-message"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "write-policy-redaction-applied", + "ts": "2026-06-11T04:00:00Z", + "actor": "agent", + "action": "captured_message_with_redaction", + "evidence_ids": ["redacted-source-message"], + "summary": "The capture path applied write_policy redaction before storage." + } + ], + "prompt": { + "role": "user", + "content": "What did live capture preserve, and was the private token repeated?", + "job_mode": "answer", + "constraints": ["cite_evidence", "do_not_repeat_private_text", "explain_redaction"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "source_id_preserved", + "text": "The live capture report must preserve source id capture:terminal-log-17." + }, + { + "claim_id": "redaction_applied", + "text": "The private token label was redacted before storage." + } + ], + "must_not_include": ["orchid-envelope"], + "evidence_links": { + "source_id_preserved": ["redacted-source-message"], + "redaction_applied": ["redacted-source-message"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "redacted-source-message", + "claim_id": "source_id_preserved", + "requirement": "cite", + "quote": "preserve source id capture:terminal-log-17" + }, + { + "evidence_id": "redacted-source-message", + "claim_id": "redaction_applied", + "requirement": "explain", + "quote": "the private token label was redacted before storage" + } + ], + "negative_traps": [ + { + "trap_id": "redacted-token-value", + "type": "privacy_leak", + "evidence_ids": ["redacted-private-token-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the public source-id preservation requirement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the source message that was stored after write-policy redaction." + }, + "trap_avoidance": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Does not repeat the redacted private token value." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Honors write-policy redaction before live capture storage." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about capture source ids", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "capture_integration", "write_policy", "redaction", "source_id", "no_secret_leakage"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json new file mode 100644 index 00000000..86a0266f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/contradiction_report_discard.json @@ -0,0 +1,283 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-contradiction-report-discard-001", + "suite": "consolidation", + "title": "Discard a stale contradiction proposal instead of mutating source truth", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-consolidation-source-rewrite", + "kind": "message", + "text": "Old session draft: consolidation may update source notes directly after it finds duplicates.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "stale-consolidation-source-rewrite" + } + }, + "created_at": "2026-06-01T08:00:00Z" + }, + { + "evidence_id": "current-consolidation-source-rule", + "kind": "decision", + "text": "Current rule: consolidation output is derived and reviewable; it must never destructively rewrite authoritative source notes, events, docs, traces, graph facts, or search traces.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "current-consolidation-source-rule" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "unsupported-applied-draft", + "kind": "message", + "text": "Unsupported draft: the consolidation worker already applied source note edits in production.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "contradiction_report_discard", + "evidence_id": "unsupported-applied-draft" + } + }, + "created_at": "2026-06-09T08:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Create a contradiction report proposal and discard the stale source-rewrite synthesis. The report flags one unsupported applied-worker claim while keeping source mutation count at zero.", + "claims": [ + { + "claim_id": "contradiction_report_created", + "text": "Create a contradiction report proposal for the source-rewrite conflict.", + "evidence_ids": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "confidence": "high" + }, + { + "claim_id": "contradiction_report_discarded", + "text": "Discard the stale source-rewrite synthesis rather than applying it.", + "evidence_ids": [ + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-contradiction-report-discard", + "proposal_kind": "contradiction_report", + "source_refs": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "expected_source_refs": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "usefulness_score": 0.9, + "min_usefulness_score": 0.8, + "expected_review_action": "discard", + "actual_review_action": "discard", + "source_mutations": [], + "unsupported_claim_count": 1, + "unsupported_claim_flags": [ + { + "claim_id": "unsupported-applied-worker-claim", + "message": "The fixture has no evidence that a consolidation worker applied source note edits in production.", + "source_ref": "unsupported-applied-draft" + } + ], + "diff": { + "summary": "Reject a stale source-rewrite synthesis and preserve it as a contradiction report.", + "before": {}, + "after": { + "target": "derived_contradiction_report", + "review_state": "rejected", + "unsupported_claims": [ + "The fixture has no evidence that a consolidation worker applied source note edits in production." + ], + "contradiction": "Older source-rewrite draft conflicts with the current proposal-only consolidation rule." + } + } + } + ] + } + } + }, + "timeline": [ + { + "event_id": "contradiction-old-draft", + "ts": "2026-06-01T08:00:00Z", + "actor": "agent", + "action": "recorded_old_draft", + "evidence_ids": [ + "stale-consolidation-source-rewrite" + ], + "summary": "An old draft suggested source note rewrites." + }, + { + "event_id": "contradiction-current-rule", + "ts": "2026-06-09T08:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "current-consolidation-source-rule" + ], + "summary": "The current rule prohibits destructive source mutation." + }, + { + "event_id": "contradiction-unsupported-draft", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "flagged_unsupported_claim", + "evidence_ids": [ + "unsupported-applied-draft" + ], + "summary": "A draft claimed production application without source evidence." + } + ], + "prompt": { + "role": "user", + "content": "Consolidate these sessions, but discard stale or unsupported synthesis and report contradictions.", + "job_mode": "debug", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "contradiction_report_created", + "text": "Create a contradiction report proposal for the source-rewrite conflict." + }, + { + "claim_id": "contradiction_report_discarded", + "text": "Discard the stale source-rewrite synthesis rather than applying it." + } + ], + "must_not_include": [ + "source note edits in production are confirmed", + "apply the source-rewrite synthesis" + ], + "evidence_links": { + "contradiction_report_created": [ + "stale-consolidation-source-rewrite", + "current-consolidation-source-rule", + "unsupported-applied-draft" + ], + "contradiction_report_discarded": [ + "current-consolidation-source-rule", + "unsupported-applied-draft" + ] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-consolidation-source-rule", + "claim_id": "contradiction_report_discarded", + "requirement": "cite", + "quote": "must never destructively rewrite authoritative source notes" + }, + { + "evidence_id": "unsupported-applied-draft", + "claim_id": "contradiction_report_created", + "requirement": "explain", + "quote": "already applied source note edits in production" + } + ], + "negative_traps": [ + { + "trap_id": "stale-source-rewrite-as-current", + "type": "stale_fact", + "evidence_ids": [ + "stale-consolidation-source-rewrite" + ], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates a contradiction report and discards stale synthesis." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Explains the conflict from current rule and unsupported draft evidence." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Contradiction report is useful for reviewer inspection." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes stale, current, and unsupported sources." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is discard." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json new file mode 100644 index 00000000..715a17cc --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/preference_candidate_defer.json @@ -0,0 +1,234 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-preference-candidate-defer-001", + "suite": "consolidation", + "title": "Defer a preference candidate until reviewer confirmation", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "session-preference-1", + "kind": "message", + "text": "Session 1: The operator asked for checkpoint updates to stay concise and evidence-focused.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "preference_candidate_defer", + "evidence_id": "session-preference-1" + } + }, + "created_at": "2026-06-05T09:00:00Z" + }, + { + "evidence_id": "session-preference-2", + "kind": "message", + "text": "Session 2: The operator repeated that durable reports should stay short unless validation evidence requires detail.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "preference_candidate_defer", + "evidence_id": "session-preference-2" + } + }, + "created_at": "2026-06-06T09:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Propose a preference candidate for concise, evidence-focused updates, but defer application because the fixture does not include explicit reviewer confirmation.", + "claims": [ + { + "claim_id": "preference_candidate_created", + "text": "Create a preference candidate for concise, evidence-focused updates.", + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "confidence": "medium" + }, + { + "claim_id": "preference_candidate_deferred", + "text": "Defer the preference candidate until reviewer confirmation.", + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "confidence": "medium" + } + ], + "evidence_ids": [ + "session-preference-1", + "session-preference-2" + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-preference-candidate-defer", + "proposal_kind": "preference_candidate", + "source_refs": [ + "session-preference-1", + "session-preference-2" + ], + "expected_source_refs": [ + "session-preference-1", + "session-preference-2" + ], + "usefulness_score": 0.86, + "min_usefulness_score": 0.75, + "expected_review_action": "defer", + "actual_review_action": "defer", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Stage a preference candidate for review without applying it.", + "before": {}, + "after": { + "target": "derived_preference_candidate", + "text": "Preference candidate: The operator prefers concise, evidence-focused updates." + } + } + } + ] + } + } + }, + "timeline": [ + { + "event_id": "preference-session-1", + "ts": "2026-06-05T09:00:00Z", + "actor": "user", + "action": "stated_preference", + "evidence_ids": [ + "session-preference-1" + ], + "summary": "The first session asked for concise, evidence-focused checkpoints." + }, + { + "event_id": "preference-session-2", + "ts": "2026-06-06T09:00:00Z", + "actor": "user", + "action": "restated_preference", + "evidence_ids": [ + "session-preference-2" + ], + "summary": "The second session restated a short-report preference with an evidence caveat." + } + ], + "prompt": { + "role": "user", + "content": "Review these sessions and propose any durable preference candidate, but do not apply it without review.", + "job_mode": "personalize", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "preference_candidate_created", + "text": "Create a preference candidate for concise, evidence-focused updates." + }, + { + "claim_id": "preference_candidate_deferred", + "text": "Defer the preference candidate until reviewer confirmation." + } + ], + "must_not_include": [ + "Preference applied", + "rewrite existing profile notes" + ], + "evidence_links": { + "preference_candidate_created": [ + "session-preference-1", + "session-preference-2" + ], + "preference_candidate_deferred": [ + "session-preference-1", + "session-preference-2" + ] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "session-preference-1", + "claim_id": "preference_candidate_created", + "requirement": "cite", + "quote": "checkpoint updates to stay concise and evidence-focused" + }, + { + "evidence_id": "session-preference-2", + "claim_id": "preference_candidate_deferred", + "requirement": "use", + "quote": "stay short unless validation evidence requires detail" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates but does not apply the preference candidate." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both preference statements." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Preference proposal is useful enough to stage." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes both sessions." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is defer, not apply." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json new file mode 100644 index 00000000..0424673d --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/project_summary_apply.json @@ -0,0 +1,258 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-project-summary-apply-001", + "suite": "consolidation", + "title": "Create a reviewable project summary proposal without source mutation", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "session-project-summary-1", + "kind": "message", + "text": "Session 1: ELF keeps Postgres as source of truth and Qdrant as a rebuildable derived index for candidate retrieval.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "session-project-summary-1" + } + }, + "created_at": "2026-06-02T10:00:00Z" + }, + { + "evidence_id": "session-project-summary-2", + "kind": "message", + "text": "Session 2: The production adoption gate passed with bounded caveats; no private production corpus pass is claimed until an operator-owned manifest is supplied.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "session-project-summary-2" + } + }, + "created_at": "2026-06-09T10:00:00Z" + }, + { + "evidence_id": "project-summary-source-mutation-trap", + "kind": "message", + "text": "Bad draft: consolidate by rewriting the original project notes in place.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_summary_apply", + "evidence_id": "project-summary-source-mutation-trap" + } + }, + "created_at": "2026-06-09T10:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Create a derived project summary proposal and keep the source notes unchanged. The proposal cites the source-of-truth and adoption-caveat sessions.", + "claims": [ + { + "claim_id": "project_summary_proposal_created", + "text": "Create a derived project summary proposal from the source-of-truth and adoption-caveat sessions.", + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "confidence": "high" + }, + { + "claim_id": "project_summary_no_source_mutation", + "text": "The consolidation output is proposal-only and does not rewrite authoritative notes.", + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-project-summary-apply", + "proposal_kind": "project_summary", + "source_refs": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "expected_source_refs": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "usefulness_score": 0.93, + "min_usefulness_score": 0.8, + "expected_review_action": "apply", + "actual_review_action": "apply", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Create one derived project summary for reviewer approval.", + "before": {}, + "after": { + "target": "derived_project_summary", + "summary": "ELF keeps Postgres authoritative, Qdrant rebuildable, and production adoption bounded by the missing private-corpus manifest." + } + } + } + ] + } + } + }, + "timeline": [ + { + "event_id": "project-summary-session-1", + "ts": "2026-06-02T10:00:00Z", + "actor": "agent", + "action": "recorded_source_boundary", + "evidence_ids": [ + "session-project-summary-1" + ], + "summary": "The first session recorded ELF source-of-truth and rebuildable-index boundaries." + }, + { + "event_id": "project-summary-session-2", + "ts": "2026-06-09T10:00:00Z", + "actor": "agent", + "action": "recorded_adoption_caveat", + "evidence_ids": [ + "session-project-summary-2" + ], + "summary": "The later session recorded the bounded production adoption caveat." + } + ], + "prompt": { + "role": "user", + "content": "Review the recent sessions and propose a project summary only if it preserves source-truth notes.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions", + "avoid_repeating_completed_work" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "project_summary_proposal_created", + "text": "Create a derived project summary proposal from the source-of-truth and adoption-caveat sessions." + }, + { + "claim_id": "project_summary_no_source_mutation", + "text": "The consolidation output is proposal-only and does not rewrite authoritative notes." + } + ], + "must_not_include": [ + "rewrite the original project notes in place", + "private production corpus pass is claimed" + ], + "evidence_links": { + "project_summary_proposal_created": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "project_summary_no_source_mutation": [ + "session-project-summary-1", + "session-project-summary-2" + ] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "session-project-summary-1", + "claim_id": "project_summary_proposal_created", + "requirement": "cite", + "quote": "Postgres as source of truth and Qdrant as a rebuildable derived index" + }, + { + "evidence_id": "session-project-summary-2", + "claim_id": "project_summary_proposal_created", + "requirement": "use", + "quote": "no private production corpus pass is claimed" + } + ], + "negative_traps": [ + { + "trap_id": "project-summary-source-rewrite", + "type": "unsafe_action", + "evidence_ids": [ + "project-summary-source-mutation-trap" + ], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Creates a derived project summary proposal with the right caveat." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both source sessions and avoids the rewrite trap." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Proposal usefulness meets the fixture threshold." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Proposal lineage includes every expected source ref." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The useful project summary is applied." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json b/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json new file mode 100644 index 00000000..135d5bfa --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/consolidation/weekly_decision_summary_apply.json @@ -0,0 +1,236 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "consolidation-weekly-decision-summary-apply-001", + "suite": "consolidation", + "title": "Apply a weekly decision summary proposal with complete lineage", + "corpus": { + "corpus_id": "real-world-memory-consolidation-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "weekly-decision-typed-failures", + "kind": "decision", + "text": "Monday decision: benchmark reports must preserve typed failure states instead of flattening blocked, incomplete, wrong_result, and unsupported_claim into one pass/fail label.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "weekly_decision_summary_apply", + "evidence_id": "weekly-decision-typed-failures" + } + }, + "created_at": "2026-06-01T12:00:00Z" + }, + { + "evidence_id": "weekly-decision-private-caveat", + "kind": "decision", + "text": "Friday decision: production adoption is acceptable for personal use with bounded caveats, but private-corpus proof remains unclaimed until a private manifest exists.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "weekly_decision_summary_apply", + "evidence_id": "weekly-decision-private-caveat" + } + }, + "created_at": "2026-06-05T12:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_consolidation", + "answer": { + "content": "Apply a weekly decision summary proposal covering typed failure states and the bounded production-adoption caveat. Keep it derived and source-linked.", + "claims": [ + { + "claim_id": "weekly_summary_proposal_created", + "text": "Create a weekly decision summary proposal for typed failure states and bounded adoption caveats.", + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "confidence": "high" + }, + { + "claim_id": "weekly_summary_review_apply", + "text": "Apply the weekly summary as a derived decision summary after review.", + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + }, + "consolidation": { + "proposals": [ + { + "proposal_id": "proposal-weekly-decision-summary-apply", + "proposal_kind": "weekly_decision_summary", + "source_refs": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "expected_source_refs": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "usefulness_score": 0.91, + "min_usefulness_score": 0.8, + "expected_review_action": "apply", + "actual_review_action": "apply", + "source_mutations": [], + "unsupported_claim_count": 0, + "diff": { + "summary": "Create a derived weekly decision summary.", + "before": {}, + "after": { + "target": "derived_weekly_decision_summary", + "decisions": [ + "Preserve typed failure states in benchmark reports.", + "Keep the production adoption claim bounded until private-corpus proof exists." + ] + } + } + } + ] + } + } + }, + "timeline": [ + { + "event_id": "weekly-decision-monday", + "ts": "2026-06-01T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "weekly-decision-typed-failures" + ], + "summary": "The week started with a typed-failure reporting decision." + }, + { + "event_id": "weekly-decision-friday", + "ts": "2026-06-05T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "weekly-decision-private-caveat" + ], + "summary": "The week ended with a bounded production-adoption decision." + } + ], + "prompt": { + "role": "user", + "content": "Summarize this week's durable decisions as a reviewable consolidation proposal.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "do_not_run_live_actions" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "weekly_summary_proposal_created", + "text": "Create a weekly decision summary proposal for typed failure states and bounded adoption caveats." + }, + { + "claim_id": "weekly_summary_review_apply", + "text": "Apply the weekly summary as a derived decision summary after review." + } + ], + "must_not_include": [ + "private-corpus proof passed", + "collapse typed failures into a pass/fail label" + ], + "evidence_links": { + "weekly_summary_proposal_created": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ], + "weekly_summary_review_apply": [ + "weekly-decision-typed-failures", + "weekly-decision-private-caveat" + ] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "weekly-decision-typed-failures", + "claim_id": "weekly_summary_proposal_created", + "requirement": "cite", + "quote": "preserve typed failure states" + }, + { + "evidence_id": "weekly-decision-private-caveat", + "claim_id": "weekly_summary_proposal_created", + "requirement": "use", + "quote": "private-corpus proof remains unclaimed" + } + ], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Includes both weekly decisions and their correct review action." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses both decision sources." + }, + "proposal_usefulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Weekly summary is useful enough to apply." + }, + "lineage_completeness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Lineage includes both decision sources." + }, + "review_action_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Review action is apply." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Source mutation count remains zero." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "source mutation count must remain zero for proposal-only consolidation cases" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "consolidation", + "proposal_only", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json new file mode 100644 index 00000000..96e48c4e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_hierarchy_selection_blocked.json @@ -0,0 +1,261 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-hierarchy-selection-001", + "suite": "context_trajectory", + "title": "Gate OpenViking hierarchy selection scoring on scored hierarchy output", + "encoding": { + "status": "blocked", + "reason": "OpenViking hierarchy selection is encoded as a benchmark job, but scoring is blocked until the adapter emits selected hierarchy nodes with evidence ids after the same-corpus precondition passes.", + "follow_up": { + "title": "Materialize OpenViking selected hierarchy nodes", + "reason": "The context-trajectory adapter must return selected parent, child, and resource nodes with evidence ids before hierarchy quality can be scored against ELF." + } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "hierarchy-selection-output-contract", + "kind": "adapter_state", + "text": "A scored OpenViking hierarchy selection job must report the selected parent context, selected child context, final resource evidence ids, and the rejected sibling or decoy context.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-selection-output-contract" + }, + "locator": { + "quote": "selected parent context, selected child context, final resource evidence ids" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "same-corpus-before-hierarchy", + "kind": "adapter_state", + "text": "Hierarchy selection remains blocked until OpenViking same-corpus retrieval covers every expected evidence id instead of only reaching setup and returning wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "same-corpus-before-hierarchy" + }, + "locator": { + "quote": "covers every expected evidence id" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "hierarchy-comparison-requires-elf-equivalent", + "kind": "runbook", + "text": "ELF hierarchy or trace behavior may be compared only if the same hierarchy-selection scenario is encoded and produces comparable selected-node and rejected-node evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-comparison-requires-elf-equivalent" + }, + "locator": { + "quote": "same hierarchy-selection scenario is encoded" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "hierarchy-design-win-decoy", + "kind": "adapter_state", + "text": "Decoy: OpenViking should win hierarchy selection solely because its design uses viking:// hierarchy paths.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_hierarchy_selection_blocked", + "evidence_id": "hierarchy-design-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized. OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists.", + "claims": [ + { + "claim_id": "hierarchy_selection_blocked", + "text": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized.", + "evidence_ids": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy" + ], + "confidence": "high" + }, + { + "claim_id": "design_reference_not_score", + "text": "OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists.", + "evidence_ids": ["hierarchy-comparison-requires-elf-equivalent"], + "confidence": "high" + } + ], + "evidence_ids": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy", + "hierarchy-comparison-requires-elf-equivalent" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "hierarchy-output-contract-recorded", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["hierarchy-selection-output-contract"], + "summary": "The fixture records the minimum hierarchy readback needed before scoring." + }, + { + "event_id": "hierarchy-precondition-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["same-corpus-before-hierarchy"], + "summary": "The benchmark blocks hierarchy selection scoring until same-corpus evidence ids match." + }, + { + "event_id": "hierarchy-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["hierarchy-comparison-requires-elf-equivalent"], + "summary": "The benchmark requires comparable ELF and OpenViking hierarchy artifacts before any win/tie/loss." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking hierarchy selection quality against ELF?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "separate_design_reference_from_scored_output" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "hierarchy_selection_blocked", + "text": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized." + }, + { + "claim_id": "design_reference_not_score", + "text": "OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists." + } + ], + "must_not_include": [ + "OpenViking wins hierarchy selection", + "ELF wins hierarchy selection" + ], + "evidence_links": { + "hierarchy_selection_blocked": [ + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy" + ], + "design_reference_not_score": ["hierarchy-comparison-requires-elf-equivalent"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "hierarchy-selection-output-contract", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite", + "quote": "selected parent context, selected child context, final resource evidence ids" + }, + { + "evidence_id": "same-corpus-before-hierarchy", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite", + "quote": "covers every expected evidence id" + }, + { + "evidence_id": "hierarchy-comparison-requires-elf-equivalent", + "claim_id": "design_reference_not_score", + "requirement": "cite", + "quote": "same hierarchy-selection scenario is encoded" + } + ], + "negative_traps": [ + { + "trap_id": "hierarchy-design-win-decoy", + "type": "unsupported_prior", + "evidence_ids": ["hierarchy-design-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States hierarchy selection is blocked until output is materialized." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites hierarchy output requirements and same-corpus precondition evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not convert design references into scored wins or losses." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the hierarchy comparison caveated as blocked." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Names the selected-node artifact needed next." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "Hierarchy selection is blocked.", + "Comparable selected-node evidence is missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "hierarchy_selection", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json new file mode 100644 index 00000000..16b41a45 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_recursive_expansion_blocked.json @@ -0,0 +1,261 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-recursive-expansion-001", + "suite": "context_trajectory", + "title": "Gate OpenViking recursive context expansion on materialized expansion paths", + "encoding": { + "status": "blocked", + "reason": "OpenViking recursive/context expansion is encoded as a benchmark job, but scoring is blocked until the adapter materializes expansion paths and same-corpus evidence ids are correct.", + "follow_up": { + "title": "Materialize OpenViking recursive context expansion paths", + "reason": "The adapter must emit the seed context, expanded child contexts, final evidence ids, and pruned branches before recursive expansion quality can be scored." + } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "recursive-expansion-output-contract", + "kind": "adapter_state", + "text": "A scored recursive/context expansion job must report the seed context, expanded child contexts, final evidence ids, and pruned branches for the same user prompt.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-expansion-output-contract" + }, + "locator": { + "quote": "seed context, expanded child contexts, final evidence ids, and pruned branches" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "recursive-same-corpus-gate", + "kind": "adapter_state", + "text": "Recursive/context expansion scoring stays blocked until same-corpus retrieval returns the expected evidence ids and the recursive path output is scored.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-same-corpus-gate" + }, + "locator": { + "quote": "same-corpus retrieval returns the expected evidence ids" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "recursive-elf-comparison-gate", + "kind": "runbook", + "text": "ELF recursive or trace expansion may be compared only where the same recursive/context expansion scenario is encoded and both sides publish expansion-path artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-elf-comparison-gate" + }, + "locator": { + "quote": "both sides publish expansion-path artifacts" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "recursive-expansion-win-decoy", + "kind": "adapter_state", + "text": "Decoy: ELF should be scored as tying OpenViking recursive expansion because both systems have trace-related documentation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_recursive_expansion_blocked", + "evidence_id": "recursive-expansion-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized. No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario.", + "claims": [ + { + "claim_id": "recursive_expansion_blocked", + "text": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized.", + "evidence_ids": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate" + ], + "confidence": "high" + }, + { + "claim_id": "recursive_comparison_not_scored", + "text": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario.", + "evidence_ids": ["recursive-elf-comparison-gate"], + "confidence": "high" + } + ], + "evidence_ids": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate", + "recursive-elf-comparison-gate" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "recursive-output-contract-recorded", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["recursive-expansion-output-contract"], + "summary": "The fixture records the recursive expansion artifact needed before scoring." + }, + { + "event_id": "recursive-scoring-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["recursive-same-corpus-gate"], + "summary": "The benchmark blocks recursive expansion scoring until expected evidence ids and expansion paths are available." + }, + { + "event_id": "recursive-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["recursive-elf-comparison-gate"], + "summary": "The benchmark requires comparable expansion-path artifacts before any ELF comparison." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking recursive context expansion against ELF?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "do_not_claim_tie_without_comparable_artifacts" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "recursive_expansion_blocked", + "text": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized." + }, + { + "claim_id": "recursive_comparison_not_scored", + "text": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario." + } + ], + "must_not_include": [ + "ELF ties OpenViking recursive expansion", + "OpenViking recursive expansion passed" + ], + "evidence_links": { + "recursive_expansion_blocked": [ + "recursive-expansion-output-contract", + "recursive-same-corpus-gate" + ], + "recursive_comparison_not_scored": ["recursive-elf-comparison-gate"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "recursive-expansion-output-contract", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite", + "quote": "seed context, expanded child contexts, final evidence ids, and pruned branches" + }, + { + "evidence_id": "recursive-same-corpus-gate", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite", + "quote": "same-corpus retrieval returns the expected evidence ids" + }, + { + "evidence_id": "recursive-elf-comparison-gate", + "claim_id": "recursive_comparison_not_scored", + "requirement": "cite", + "quote": "both sides publish expansion-path artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "recursive-expansion-trace-doc-decoy", + "type": "unsupported_prior", + "evidence_ids": ["recursive-expansion-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States recursive/context expansion is blocked, not tied or passed." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites expansion-path and same-corpus evidence gates." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not convert documentation or trace presence into a scored tie." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the recursive expansion comparison caveated as blocked." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Names expansion-path artifacts required next." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "Recursive expansion is blocked.", + "Comparable expansion-path artifacts are missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "recursive_expansion", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json new file mode 100644 index 00000000..b27fedb6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/openviking_staged_retrieval_blocked.json @@ -0,0 +1,260 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-trajectory-openviking-staged-retrieval-001", + "suite": "context_trajectory", + "title": "Gate OpenViking staged retrieval trajectory on evidence-bearing same-corpus output", + "encoding": { + "status": "blocked", + "reason": "OpenViking staged retrieval trajectory is encoded as a benchmark job, but scoring is blocked until same-corpus output returns expected evidence ids and comparable staged artifacts exist.", + "follow_up": { + "title": "Run OpenViking staged trajectory after same-corpus evidence passes", + "reason": "The adapter must first publish matched expected evidence ids for every same-corpus query, then emit stage-level context trajectory output that can be compared with the equivalent ELF trace/session trajectory." + } + }, + "corpus": { + "corpus_id": "real-world-memory-context-trajectory-2026-06-11", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "openviking-evidence-id-output-contract", + "kind": "adapter_state", + "text": "The OpenViking Docker baseline must emit expected_evidence_ids, matched_evidence_ids, and missing_evidence_ids for every same-corpus query before staged trajectory scoring is allowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "repo_file/v1", + "ref": { + "path": "scripts/live-baseline-benchmark.sh" + }, + "locator": { + "symbol": "project_openviking" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "openviking-same-corpus-precondition-blocked", + "kind": "adapter_state", + "text": "OpenViking staged retrieval trajectory remains blocked while same-corpus retrieval is wrong_result or while matched_evidence_ids does not cover every expected evidence id.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "openviking-same-corpus-precondition-blocked" + }, + "locator": { + "quote": "same-corpus retrieval is wrong_result" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "elf-comparison-requires-comparable-trajectory", + "kind": "runbook", + "text": "ELF trace or search-session trajectory may be compared only after the same context-trajectory scenario is encoded and both systems publish comparable stage artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "elf-comparison-requires-comparable-trajectory" + }, + "locator": { + "quote": "both systems publish comparable stage artifacts" + } + }, + "created_at": "2026-06-11T00:00:00Z" + }, + { + "evidence_id": "trajectory-win-decoy", + "kind": "adapter_state", + "text": "Decoy: ELF should be scored as winning staged trajectory because OpenViking same-corpus retrieval is currently wrong_result.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "openviking_staged_retrieval_blocked", + "evidence_id": "trajectory-win-decoy" + } + }, + "created_at": "2026-06-10T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_trajectory", + "answer": { + "content": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids. No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario.", + "claims": [ + { + "claim_id": "staged_trajectory_blocked", + "text": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids.", + "evidence_ids": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "confidence": "high" + }, + { + "claim_id": "elf_comparison_not_scored", + "text": "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario.", + "evidence_ids": ["elf-comparison-requires-comparable-trajectory"], + "confidence": "high" + } + ], + "evidence_ids": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked", + "elf-comparison-requires-comparable-trajectory" + ], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "openviking-evidence-id-contract-added", + "ts": "2026-06-11T00:00:00Z", + "actor": "agent", + "action": "encoded_output_contract", + "evidence_ids": ["openviking-evidence-id-output-contract"], + "summary": "The OpenViking baseline output contract now names expected, matched, and missing evidence ids per query." + }, + { + "event_id": "staged-trajectory-blocked", + "ts": "2026-06-11T00:01:00Z", + "actor": "agent", + "action": "blocked_scoring", + "evidence_ids": ["openviking-same-corpus-precondition-blocked"], + "summary": "The staged trajectory benchmark remains blocked behind same-corpus evidence-bearing output." + }, + { + "event_id": "elf-comparison-gated", + "ts": "2026-06-11T00:02:00Z", + "actor": "agent", + "action": "preserved_claim_boundary", + "evidence_ids": ["elf-comparison-requires-comparable-trajectory"], + "summary": "The benchmark does not compare ELF trajectory output until both sides emit comparable artifacts." + } + ], + "prompt": { + "role": "user", + "content": "Can the benchmark score OpenViking staged retrieval trajectory against ELF now?", + "job_mode": "debug", + "constraints": [ + "cite_evidence", + "preserve_typed_status", + "do_not_claim_elf_win_without_comparable_artifacts" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "staged_trajectory_blocked", + "text": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids." + }, + { + "claim_id": "elf_comparison_not_scored", + "text": "No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario." + } + ], + "must_not_include": [ + "ELF wins staged trajectory", + "OpenViking staged trajectory passed" + ], + "evidence_links": { + "staged_trajectory_blocked": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "elf_comparison_not_scored": ["elf-comparison-requires-comparable-trajectory"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "openviking-evidence-id-output-contract", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite", + "quote": "expected_evidence_ids, matched_evidence_ids, and missing_evidence_ids" + }, + { + "evidence_id": "openviking-same-corpus-precondition-blocked", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite", + "quote": "same-corpus retrieval is wrong_result" + }, + { + "evidence_id": "elf-comparison-requires-comparable-trajectory", + "claim_id": "elf_comparison_not_scored", + "requirement": "cite", + "quote": "both systems publish comparable stage artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "trajectory-win-from-precondition-decoy", + "type": "unsupported_prior", + "evidence_ids": ["trajectory-win-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States the staged trajectory job is blocked, not won or passed." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the evidence-id output contract and comparable-artifact gate." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids converting the same-corpus wrong_result into an ELF trajectory win." + }, + "debuggability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Identifies the blocked precondition and next artifact needed." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Gives a concrete next benchmark gate." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The staged trajectory score is blocked.", + "Comparable stage artifacts are missing." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "external_adapter", + "openviking", + "context_trajectory", + "staged_retrieval", + "blocked", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json new file mode 100644 index 00000000..b1928711 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-archival-fallback-001", + "suite": "core_archival_memory", + "title": "Fall back to archival notes when core memory is insufficient", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "fallback-core-insufficient", + "kind": "core_block", + "text": "Core block summary: a rollback runbook exists for single-user production, but this core block intentionally omits the rollback steps.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-insufficient" + }, + "locator": { + "quote": "intentionally omits the rollback steps" + } + }, + "created_at": "2026-06-11T04:40:00Z" + }, + { + "evidence_id": "fallback-archival-runbook", + "kind": "runbook", + "text": "Archival rollback note: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-archival-runbook" + }, + "locator": { + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + }, + "created_at": "2026-06-11T04:41:00Z" + }, + { + "evidence_id": "fallback-core-only-trap", + "kind": "unsupported_claim", + "text": "Unsupported shortcut: answer the rollback steps from the core block alone without archival note search.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "archival_fallback", + "evidence_id": "fallback-core-only-trap" + } + }, + "created_at": "2026-06-11T04:39:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The core block is insufficient because it says the rollback runbook exists but omits the steps. Fall back to archival note search: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "claims": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps.", + "evidence_ids": ["fallback-core-insufficient"], + "confidence": "high" + }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note.", + "evidence_ids": ["fallback-archival-runbook"], + "confidence": "high" + } + ], + "evidence_ids": ["fallback-core-insufficient", "fallback-archival-runbook"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-rollback-summary-attached", + "ts": "2026-06-11T04:40:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["fallback-core-insufficient"], + "summary": "A core block pointed at the rollback runbook but did not include the steps." + }, + { + "event_id": "archival-rollback-note-recorded", + "ts": "2026-06-11T04:41:00Z", + "actor": "agent", + "action": "recorded_runbook", + "evidence_ids": ["fallback-archival-runbook"], + "summary": "The detailed rollback steps were recorded as archival note evidence." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block only says a rollback runbook exists. What are the rollback steps?", + "job_mode": "answer", + "constraints": ["cite_evidence", "use_archival_fallback", "avoid_core_only_hallucination"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_memory_insufficient", + "text": "The core block is insufficient because it omits the rollback steps." + }, + { + "claim_id": "archival_fallback_steps", + "text": "The archival rollback steps are restore Postgres, rebuild Qdrant from Postgres vectors, and verify search recovers the restored note." + } + ], + "must_not_include": [ + "answer the rollback steps from the core block alone" + ], + "evidence_links": { + "core_memory_insufficient": ["fallback-core-insufficient"], + "archival_fallback_steps": ["fallback-archival-runbook"] + }, + "answer_type": "archival_fallback_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "fallback-core-insufficient", + "claim_id": "core_memory_insufficient", + "requirement": "explain", + "quote": "intentionally omits the rollback steps" + }, + { + "evidence_id": "fallback-archival-runbook", + "claim_id": "archival_fallback_steps", + "requirement": "cite", + "quote": "restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors" + } + ], + "negative_traps": [ + { + "trap_id": "core-only-rollback-hallucination", + "type": "unsupported_claim", + "evidence_ids": ["fallback-core-only-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Provides the archival rollback steps." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both insufficient core memory and archival fallback evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids core-only hallucination." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Makes the fallback path explicit." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "archival_fallback", "rollback", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json new file mode 100644 index 00000000..c1f34487 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-attachment-001", + "suite": "core_archival_memory", + "title": "Read an explicitly attached core block without treating it as archival search", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-attachment-active", + "kind": "core_block", + "text": "Core block attachment: key project_style has an active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-active" + }, + "locator": { + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + } + }, + "created_at": "2026-06-11T04:00:00Z" + }, + { + "evidence_id": "core-attachment-not-search", + "kind": "core_block_contract", + "text": "Core block readback is not archival search; it does not embed, rerank, search Qdrant, create a search session, or record note hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-not-search" + }, + "locator": { + "quote": "does not embed, rerank, search Qdrant" + } + }, + "created_at": "2026-06-11T04:01:00Z" + }, + { + "evidence_id": "core-attachment-qdrant-trap", + "kind": "stale_claim", + "text": "Stale shortcut: core blocks are indexed into Qdrant and returned as normal archival note search hits.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_attachment", + "evidence_id": "core-attachment-qdrant-trap" + } + }, + "created_at": "2026-06-11T03:59:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the project_style core block because it has an active attachment for the exact tenant, project, agent, and private_plus_project read profile. Keep that readback separate from archival search because core blocks do not embed, rerank, search Qdrant, create search sessions, or record note hits.", + "claims": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment.", + "evidence_ids": ["core-attachment-active"], + "confidence": "high" + }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval.", + "evidence_ids": ["core-attachment-not-search"], + "confidence": "high" + } + ], + "evidence_ids": ["core-attachment-active", "core-attachment-not-search"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-project-style-attached", + "ts": "2026-06-11T04:00:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-attachment-active"], + "summary": "The project_style core block was attached for the exact read profile." + }, + { + "event_id": "core-archival-boundary-recorded", + "ts": "2026-06-11T04:01:00Z", + "actor": "agent", + "action": "recorded_contract", + "evidence_ids": ["core-attachment-not-search"], + "summary": "The core block readback boundary was recorded separately from archival search." + } + ], + "prompt": { + "role": "user", + "content": "Which always-loaded project style block is attached for this agent, and should it appear as a normal archival search hit?", + "job_mode": "answer", + "constraints": ["cite_evidence", "separate_core_from_archival_search", "avoid_qdrant_core_block_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "attached_core_block_readback", + "text": "The project_style core block is returned through its exact active attachment." + }, + { + "claim_id": "core_not_archival_search", + "text": "Core block readback is separate from archival search and Qdrant-derived note retrieval." + } + ], + "must_not_include": [ + "core blocks are indexed into Qdrant and returned as normal archival note search hits" + ], + "evidence_links": { + "attached_core_block_readback": ["core-attachment-active"], + "core_not_archival_search": ["core-attachment-not-search"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-attachment-active", + "claim_id": "attached_core_block_readback", + "requirement": "cite", + "quote": "active attachment for tenant local-tenant project ELF agent local-agent read_profile private_plus_project" + }, + { + "evidence_id": "core-attachment-not-search", + "claim_id": "core_not_archival_search", + "requirement": "cite", + "quote": "does not embed, rerank, search Qdrant" + } + ], + "negative_traps": [ + { + "trap_id": "qdrant-core-block-search-hit", + "type": "stale_fact", + "evidence_ids": ["core-attachment-qdrant-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies the attached core block." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites attachment and core-search boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids indexing core blocks into Qdrant-backed archival search." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves explicit attachment semantics." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "core_block", "attachment", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json new file mode 100644 index 00000000..f1fd4f92 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-provenance-001", + "suite": "core_archival_memory", + "title": "Return source refs and audit events for core block assertions", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-provenance-source-ref", + "kind": "core_block", + "text": "Provenance evidence: core block release_policy returns source_ref schema source_ref/v1 with resolver real_world_job_fixture/v1 and locator quote retained for reviewer inspection.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-source-ref" + }, + "locator": { + "quote": "source_ref schema source_ref/v1" + } + }, + "created_at": "2026-06-11T04:20:00Z" + }, + { + "evidence_id": "core-provenance-audit-events", + "kind": "core_block_event", + "text": "Audit evidence: release_policy has append-only events block_created, block_updated, and attachment_added returned in audit_history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-audit-events" + }, + "locator": { + "quote": "block_created, block_updated, and attachment_added" + } + }, + "created_at": "2026-06-11T04:21:00Z" + }, + { + "evidence_id": "core-provenance-trusted-memory-trap", + "kind": "stale_claim", + "text": "Stale shortcut: always-loaded core memory is trusted without returning source_ref or audit_history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_provenance", + "evidence_id": "core-provenance-trusted-memory-trap" + } + }, + "created_at": "2026-06-11T04:19:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "The release_policy core block must return its source_ref with source_ref/v1 resolver data and retain the locator quote for inspection. Its provenance also includes append-only block_created, block_updated, and attachment_added events in audit_history.", + "claims": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance.", + "evidence_ids": ["core-provenance-source-ref"], + "confidence": "high" + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events.", + "evidence_ids": ["core-provenance-audit-events"], + "confidence": "high" + } + ], + "evidence_ids": ["core-provenance-source-ref", "core-provenance-audit-events"], + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-policy-created", + "ts": "2026-06-11T04:20:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["core-provenance-source-ref"], + "summary": "The release_policy block was created with a source_ref pointer." + }, + { + "event_id": "core-release-policy-attached", + "ts": "2026-06-11T04:21:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-provenance-audit-events"], + "summary": "The release_policy block attachment event was added to audit history." + } + ], + "prompt": { + "role": "user", + "content": "What provenance should a returned core release_policy block include?", + "job_mode": "answer", + "constraints": ["cite_evidence", "include_source_ref", "include_audit_history"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_source_ref_returned", + "text": "The release_policy core block returns source_ref/v1 provenance." + }, + { + "claim_id": "core_audit_history_returned", + "text": "The release_policy core block returns block_created, block_updated, and attachment_added audit events." + } + ], + "must_not_include": [ + "always-loaded core memory is trusted without returning source_ref or audit_history" + ], + "evidence_links": { + "core_source_ref_returned": ["core-provenance-source-ref"], + "core_audit_history_returned": ["core-provenance-audit-events"] + }, + "answer_type": "provenance_bundle", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-provenance-source-ref", + "claim_id": "core_source_ref_returned", + "requirement": "cite", + "quote": "source_ref schema source_ref/v1" + }, + { + "evidence_id": "core-provenance-audit-events", + "claim_id": "core_audit_history_returned", + "requirement": "cite", + "quote": "block_created, block_updated, and attachment_added" + } + ], + "negative_traps": [ + { + "trap_id": "trusted-core-no-provenance", + "type": "unsupported_claim", + "evidence_ids": ["core-provenance-trusted-memory-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the returned provenance fields." + }, + "evidence_grounding": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Cites source_ref and audit-history evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids trusted-without-provenance claims." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Answers in a reviewer-usable provenance bundle shape." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "provenance", "audit_history", "source_ref"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json new file mode 100644 index 00000000..3b379b85 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-core-block-scope-001", + "suite": "core_archival_memory", + "title": "Apply core block scope and private-owner checks before readback", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "core-scope-project-shared-readable", + "kind": "core_block", + "text": "Scope evidence: project_shared block release_gate is readable for tenant local-tenant project ELF agent local-agent only when the active attachment and read_profile all_scopes allow project_shared.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-project-shared-readable" + }, + "locator": { + "quote": "active attachment and read_profile all_scopes allow project_shared" + } + }, + "created_at": "2026-06-11T04:10:00Z" + }, + { + "evidence_id": "core-scope-private-owner", + "kind": "core_block", + "text": "Private owner evidence: agent_private block agent_a_workflow belongs to agent-a and must not be returned to agent-b even if agent-b has a matching read_profile label.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-private-owner" + }, + "locator": { + "quote": "must not be returned to agent-b" + } + }, + "created_at": "2026-06-11T04:11:00Z" + }, + { + "evidence_id": "core-scope-bypass-trap", + "kind": "stale_claim", + "text": "Stale shortcut: a core block attachment bypasses read_profile scope checks, private-owner checks, and shared grants.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "core_block_scope", + "evidence_id": "core-scope-bypass-trap" + } + }, + "created_at": "2026-06-11T04:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Return the release_gate core block only when the active attachment and all_scopes read profile allow project_shared. Do not return agent_a_workflow to agent-b, because private-owner checks still apply to agent_private core blocks.", + "claims": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared.", + "evidence_ids": ["core-scope-project-shared-readable"], + "confidence": "high" + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b.", + "evidence_ids": ["core-scope-private-owner"], + "confidence": "high" + } + ], + "evidence_ids": ["core-scope-project-shared-readable", "core-scope-private-owner"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "core-release-gate-shared", + "ts": "2026-06-11T04:10:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["core-scope-project-shared-readable"], + "summary": "The release_gate block was attached with project_shared scope." + }, + { + "event_id": "core-agent-a-private", + "ts": "2026-06-11T04:11:00Z", + "actor": "agent-a", + "action": "block_created", + "evidence_ids": ["core-scope-private-owner"], + "summary": "The agent_a_workflow block remained private to agent-a." + } + ], + "prompt": { + "role": "user", + "content": "For core memory readback, which shared block can this agent see, and can agent-b also see agent-a's private block?", + "job_mode": "answer", + "constraints": ["cite_evidence", "enforce_scope", "avoid_private_owner_leakage"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "shared_core_scope_allowed", + "text": "The project_shared release_gate block is readable only when attachment and read_profile allow project_shared." + }, + { + "claim_id": "private_core_scope_denied", + "text": "The agent_private agent_a_workflow block must not be returned to agent-b." + } + ], + "must_not_include": [ + "a core block attachment bypasses read_profile scope checks" + ], + "evidence_links": { + "shared_core_scope_allowed": ["core-scope-project-shared-readable"], + "private_core_scope_denied": ["core-scope-private-owner"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "core-scope-project-shared-readable", + "claim_id": "shared_core_scope_allowed", + "requirement": "cite", + "quote": "active attachment and read_profile all_scopes allow project_shared" + }, + { + "evidence_id": "core-scope-private-owner", + "claim_id": "private_core_scope_denied", + "requirement": "cite", + "quote": "must not be returned to agent-b" + } + ], + "negative_traps": [ + { + "trap_id": "core-attachment-bypasses-scope", + "type": "scope_leak", + "evidence_ids": ["core-scope-bypass-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Applies readable shared scope and denied private owner scope." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites scope and private-owner evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids scope-bypass claims." + }, + "ownership_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not leak private core blocks across agents." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "scope", "private_owner", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json new file mode 100644 index 00000000..423db375 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json @@ -0,0 +1,271 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-project-decision-recovery-001", + "suite": "core_archival_memory", + "title": "Recover a project decision from core routing and archival rationale", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "decision-core-routing-block", + "kind": "core_block", + "text": "Core decision routing block: keep the benchmark outcome policy always attached and route detailed rationale to archival notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-core-routing-block" + }, + "locator": { + "quote": "route detailed rationale to archival notes" + } + }, + "created_at": "2026-06-11T04:50:00Z" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "kind": "decision", + "text": "Archival decision record: scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-outcome-policy" + }, + "locator": { + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + } + }, + "created_at": "2026-06-11T04:51:00Z" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "kind": "decision", + "text": "Archival project decision: core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-archival-core-search-boundary" + }, + "locator": { + "quote": "core blocks stay separate from archival note search" + } + }, + "created_at": "2026-06-11T04:52:00Z" + }, + { + "evidence_id": "decision-letta-export-boundary", + "kind": "comparison_boundary", + "text": "Letta comparison boundary: no contained export/readback artifact maps core block JSON, archival search/readback JSON, and source ids, so Letta remains blocked or not_tested and no win, tie, or loss claim is allowed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-export-boundary" + }, + "locator": { + "quote": "no contained export/readback artifact maps core block JSON" + } + }, + "created_at": "2026-06-11T04:53:00Z" + }, + { + "evidence_id": "decision-letta-win-trap", + "kind": "unsupported_claim", + "text": "Wrong claim: Letta comparison can be scored as an ELF win or measured loss because ELF has core blocks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "project_decision_recovery", + "evidence_id": "decision-letta-win-trap" + } + }, + "created_at": "2026-06-11T04:49:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. It also says core blocks stay separate from archival note search and Qdrant-derived retrieval. Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids, so no ELF-over-Letta claim follows from ELF having core blocks.", + "claims": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes.", + "evidence_ids": ["decision-core-routing-block"], + "confidence": "high" + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them.", + "evidence_ids": ["decision-archival-outcome-policy"], + "confidence": "high" + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval.", + "evidence_ids": ["decision-archival-core-search-boundary"], + "confidence": "high" + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids.", + "evidence_ids": ["decision-letta-export-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "decision-core-routing-block", + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "decision-routing-core-attached", + "ts": "2026-06-11T04:50:00Z", + "actor": "agent", + "action": "attachment_added", + "evidence_ids": ["decision-core-routing-block"], + "summary": "A core block kept the outcome-policy routing pointer always attached." + }, + { + "event_id": "decision-outcome-policy-archived", + "ts": "2026-06-11T04:51:00Z", + "actor": "agent", + "action": "recorded_decision", + "evidence_ids": [ + "decision-archival-outcome-policy", + "decision-archival-core-search-boundary", + "decision-letta-export-boundary" + ], + "summary": "Archival notes recorded the detailed outcome policy and core-search boundary." + } + ], + "prompt": { + "role": "user", + "content": "What is the benchmark outcome policy, and does having ELF core blocks make Letta a measured loss?", + "job_mode": "decide", + "constraints": ["cite_evidence", "recover_project_decision", "avoid_unsupported_letta_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "core_routes_to_archival_rationale", + "text": "The core routing block points detailed decision rationale to archival notes." + }, + { + "claim_id": "outcomes_require_evidence", + "text": "Scenario outcomes use win, tie, loss, not_tested, blocked, or non_goal only when evidence supports them." + }, + { + "claim_id": "core_archival_boundary_preserved", + "text": "Core blocks stay separate from archival note search and Qdrant-derived retrieval." + }, + { + "claim_id": "letta_comparison_requires_export", + "text": "Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids." + } + ], + "must_not_include": [ + "Letta comparison can be scored as an ELF win", + "Letta is a measured loss", + "Letta comparison can be scored as a measured loss" + ], + "evidence_links": { + "core_routes_to_archival_rationale": ["decision-core-routing-block"], + "outcomes_require_evidence": ["decision-archival-outcome-policy"], + "core_archival_boundary_preserved": ["decision-archival-core-search-boundary"], + "letta_comparison_requires_export": ["decision-letta-export-boundary"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "decision-core-routing-block", + "claim_id": "core_routes_to_archival_rationale", + "requirement": "cite", + "quote": "route detailed rationale to archival notes" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "claim_id": "outcomes_require_evidence", + "requirement": "cite", + "quote": "use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "claim_id": "core_archival_boundary_preserved", + "requirement": "cite", + "quote": "core blocks stay separate from archival note search" + }, + { + "evidence_id": "decision-letta-export-boundary", + "claim_id": "letta_comparison_requires_export", + "requirement": "cite", + "quote": "no contained export/readback artifact maps core block JSON" + } + ], + "negative_traps": [ + { + "trap_id": "unsupported-letta-loss-from-elf-core", + "type": "unsupported_claim", + "evidence_ids": ["decision-letta-win-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Recovers the benchmark outcome policy." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites core routing and archival decision evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids an unsupported Letta win or loss claim." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Explains how core memory and archival decision evidence work together." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "project_decisions", "letta_boundary", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json new file mode 100644 index 00000000..0dde7817 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json @@ -0,0 +1,206 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "core-archival-stale-core-detection-001", + "suite": "core_archival_memory", + "title": "Detect a stale core block when archival evidence supersedes it", + "corpus": { + "corpus_id": "real-world-memory-core-archival-2026-06-11", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-core-validation-gate", + "kind": "core_block", + "text": "Stale core block: the validation gate is cargo make lint and cargo make test.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-validation-gate" + } + }, + "created_at": "2026-06-10T10:00:00Z" + }, + { + "evidence_id": "archival-current-validation-gate", + "kind": "decision", + "text": "Archival decision update: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make check.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-current-validation-gate" + }, + "locator": { + "quote": "cargo make fmt, cargo make lint-fix, and cargo make check" + } + }, + "created_at": "2026-06-11T04:30:00Z" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "kind": "decision", + "text": "Rationale: archival note evidence supersedes the attached core block until the core block is updated from source-of-truth state.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "archival-supersedes-core-rationale" + }, + "locator": { + "quote": "supersedes the attached core block" + } + }, + "created_at": "2026-06-11T04:31:00Z" + }, + { + "evidence_id": "stale-core-answer-trap", + "kind": "stale_claim", + "text": "Wrong answer trap: cite the core block as current and skip archival evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_core_detection", + "evidence_id": "stale-core-answer-trap" + } + }, + "created_at": "2026-06-11T04:29:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_core_archival_memory", + "answer": { + "content": "Treat the attached validation-gate core block as stale. The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", + "claims": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale.", + "evidence_ids": ["archival-supersedes-core-rationale"], + "confidence": "high" + }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make check.", + "evidence_ids": ["archival-current-validation-gate"], + "confidence": "high" + } + ], + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-core-gate-attached", + "ts": "2026-06-10T10:00:00Z", + "actor": "agent", + "action": "block_created", + "evidence_ids": ["stale-core-validation-gate"], + "summary": "A core block recorded an old validation gate." + }, + { + "event_id": "archival-gate-updated", + "ts": "2026-06-11T04:30:00Z", + "actor": "agent", + "action": "updated_decision", + "evidence_ids": ["archival-current-validation-gate", "archival-supersedes-core-rationale"], + "summary": "Archival evidence superseded the old core validation gate." + } + ], + "prompt": { + "role": "user", + "content": "The attached core block says the gate is lint and test. Is that still current before a refreshed PR push?", + "job_mode": "answer", + "constraints": ["cite_evidence", "detect_stale_core", "prefer_current_archival_evidence"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_core_detected", + "text": "The attached validation-gate core block is stale." + }, + { + "claim_id": "archival_current_gate", + "text": "The current archival validation gate is cargo make fmt, cargo make lint-fix, and cargo make check." + } + ], + "must_not_include": [ + "the validation gate is cargo make lint and cargo make test" + ], + "evidence_links": { + "stale_core_detected": ["archival-supersedes-core-rationale"], + "archival_current_gate": ["archival-current-validation-gate"] + }, + "answer_type": "current_state_with_stale_core_caveat", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "archival-current-validation-gate", + "claim_id": "archival_current_gate", + "requirement": "cite", + "quote": "cargo make fmt, cargo make lint-fix, and cargo make check" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "claim_id": "stale_core_detected", + "requirement": "explain", + "quote": "supersedes the attached core block" + } + ], + "negative_traps": [ + { + "trap_id": "stale-core-current-answer", + "type": "stale_fact", + "evidence_ids": ["stale-core-validation-gate", "stale-core-answer-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States that the attached core block is stale." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites current archival evidence and supersession rationale." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids answering from stale core memory." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Detects stale core state when archival evidence supersedes it." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "core_archival_memory", "stale_core", "archival_supersession", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json new file mode 100644 index 00000000..0d694597 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/benchmark_conclusion_overturned.json @@ -0,0 +1,263 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-benchmark-verdict-001", + "suite": "memory_evolution", + "title": "Use the current production adoption verdict after an older conclusion changed", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "verdict-old-not-ready", + "kind": "decision", + "text": "Earlier conclusion: ELF was not production ready because private corpus and restore proof were missing.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-old-not-ready" + } + }, + "created_at": "2026-06-07T00:00:00Z" + }, + { + "evidence_id": "verdict-current-ready-bounded", + "kind": "decision", + "text": "Production adoption gate on 2026-06-09 says ELF is ready for personal production use with bounded caveats.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-current-ready-bounded" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "verdict-bounded-private-caveat", + "kind": "decision", + "text": "The private production corpus was not run; the gate records it as a bounded caveat, not a private-corpus pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-bounded-private-caveat" + } + }, + "created_at": "2026-06-09T00:05:00Z" + }, + { + "evidence_id": "verdict-update-rationale", + "kind": "decision", + "text": "The verdict changed after provider-backed synthetic, stress, backfill, and restore proof evidence was recorded.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "benchmark_conclusion_overturned", + "evidence_id": "verdict-update-rationale" + } + }, + "created_at": "2026-06-09T00:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "The current verdict is that ELF is ready for personal production use with bounded caveats; the older not-ready conclusion is historical, and the private corpus remains an explicit caveat rather than a private-corpus pass.", + "claims": [ + { + "claim_id": "current_benchmark_verdict", + "text": "ELF is ready for personal production use with bounded caveats.", + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "benchmark_update_rationale", + "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded.", + "evidence_ids": ["verdict-update-rationale"], + "confidence": "high" + }, + { + "claim_id": "private_corpus_caveat", + "text": "The private corpus remains a bounded caveat rather than a private-corpus pass.", + "evidence_ids": ["verdict-bounded-private-caveat"], + "confidence": "high" + } + ], + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-bounded-private-caveat", + "verdict-update-rationale" + ], + "latency_ms": 1.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "verdict-not-ready", + "ts": "2026-06-07T00:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["verdict-old-not-ready"], + "summary": "The older verdict said ELF was not ready." + }, + { + "event_id": "verdict-ready", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": [ + "verdict-current-ready-bounded", + "verdict-bounded-private-caveat", + "verdict-update-rationale" + ], + "summary": "The adoption gate changed the current verdict and preserved the private-corpus caveat." + } + ], + "prompt": { + "role": "user", + "content": "What is the current benchmark adoption conclusion, and what older conclusion changed?", + "job_mode": "decide", + "constraints": ["cite_evidence", "distinguish_current_from_historical", "state_caveats"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_benchmark_verdict", + "text": "ELF is ready for personal production use with bounded caveats." + }, + { + "claim_id": "benchmark_update_rationale", + "text": "The verdict changed after provider-backed benchmark and restore proof evidence was recorded." + }, + { + "claim_id": "private_corpus_caveat", + "text": "The private corpus remains a bounded caveat rather than a private-corpus pass." + } + ], + "must_not_include": [ + "ELF is not ready for personal production use.", + "The private production corpus passed." + ], + "evidence_links": { + "current_benchmark_verdict": [ + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "benchmark_update_rationale": ["verdict-update-rationale"], + "private_corpus_caveat": ["verdict-bounded-private-caveat"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "verdict-current-ready-bounded", + "claim_id": "current_benchmark_verdict", + "requirement": "cite", + "quote": "ready for personal production use with bounded caveats" + }, + { + "evidence_id": "verdict-bounded-private-caveat", + "claim_id": "private_corpus_caveat", + "requirement": "cite", + "quote": "bounded caveat, not a private-corpus pass" + }, + { + "evidence_id": "verdict-update-rationale", + "claim_id": "benchmark_update_rationale", + "requirement": "explain", + "quote": "provider-backed synthetic, stress, backfill, and restore proof" + } + ], + "negative_traps": [ + { + "trap_id": "old-not-ready-verdict-current", + "type": "stale_fact", + "evidence_ids": ["verdict-old-not-ready"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports the current adoption verdict and historical supersession." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current verdict and private-corpus caveat." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current verdict, caveat, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not report the old not-ready verdict as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["verdict-current-ready-bounded"], + "historical_evidence_ids": ["verdict-old-not-ready"], + "stale_trap_ids": ["old-not-ready-verdict-current"], + "conflicts": [ + { + "conflict_id": "benchmark-verdict-overturned", + "claim_id": "current_benchmark_verdict", + "current_evidence_id": "verdict-current-ready-bounded", + "historical_evidence_id": "verdict-old-not-ready", + "resolved_by_evidence_id": "verdict-update-rationale" + } + ], + "update_rationale": { + "claim_id": "benchmark_update_rationale", + "evidence_ids": ["verdict-update-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json new file mode 100644 index 00000000..f20d9f08 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/deployment_method_superseded.json @@ -0,0 +1,226 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-deploy-method-001", + "suite": "memory_evolution", + "title": "Prefer the superseding production deployment method over the old smoke path", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "deploy-old-quickstart", + "kind": "runbook", + "text": "Old deployment method: use quickstart cargo run service terminals for local smoke only.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-old-quickstart" + } + }, + "created_at": "2026-06-02T00:00:00Z" + }, + { + "evidence_id": "deploy-current-production-runbook", + "kind": "runbook", + "text": "Current single-user production operation uses Docker Compose production runbook with backup, restore, and Qdrant rebuild.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-current-production-runbook" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "deploy-supersession-rationale", + "kind": "decision", + "text": "Quickstart is no longer production guidance because backup, restore, rollback, and provider config handling must be explicit.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "deployment_method_superseded", + "evidence_id": "deploy-supersession-rationale" + } + }, + "created_at": "2026-06-09T00:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production; the cargo run quickstart is only historical local-smoke guidance because production recovery handling must be explicit.", + "claims": [ + { + "claim_id": "current_deployment_method", + "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production.", + "evidence_ids": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "deployment_update_rationale", + "text": "The quickstart was superseded because production recovery handling must be explicit.", + "evidence_ids": ["deploy-supersession-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "deploy-current-production-runbook", + "deploy-supersession-rationale" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "deploy-quickstart", + "ts": "2026-06-02T00:00:00Z", + "actor": "agent", + "action": "recorded_runbook", + "evidence_ids": ["deploy-old-quickstart"], + "summary": "The quickstart path existed for local smoke use." + }, + { + "event_id": "deploy-production-runbook", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["deploy-current-production-runbook", "deploy-supersession-rationale"], + "summary": "The production runbook became the current production method." + } + ], + "prompt": { + "role": "user", + "content": "Which deployment path should I use for production now?", + "job_mode": "operate", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_deployment_method", + "text": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production." + }, + { + "claim_id": "deployment_update_rationale", + "text": "The quickstart was superseded because production recovery handling must be explicit." + } + ], + "must_not_include": [ + "Use quickstart cargo run service terminals for production." + ], + "evidence_links": { + "current_deployment_method": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "deployment_update_rationale": ["deploy-supersession-rationale"] + }, + "answer_type": "ops_runbook", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "deploy-current-production-runbook", + "claim_id": "current_deployment_method", + "requirement": "cite", + "quote": "Docker Compose production runbook" + }, + { + "evidence_id": "deploy-supersession-rationale", + "claim_id": "deployment_update_rationale", + "requirement": "explain", + "quote": "backup, restore, rollback" + } + ], + "negative_traps": [ + { + "trap_id": "old-quickstart-production", + "type": "stale_fact", + "evidence_ids": ["deploy-old-quickstart"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Chooses the superseding production runbook." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Answers with the current production method." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current runbook and supersession rationale." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not turn the quickstart smoke path into production guidance." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["deploy-current-production-runbook"], + "historical_evidence_ids": ["deploy-old-quickstart"], + "stale_trap_ids": ["old-quickstart-production"], + "conflicts": [ + { + "conflict_id": "deployment-method-supersession", + "claim_id": "current_deployment_method", + "current_evidence_id": "deploy-current-production-runbook", + "historical_evidence_id": "deploy-old-quickstart", + "resolved_by_evidence_id": "deploy-supersession-rationale" + } + ], + "update_rationale": { + "claim_id": "deployment_update_rationale", + "evidence_ids": ["deploy-supersession-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_letta_core_block", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json new file mode 100644 index 00000000..8fb40f85 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/issue_blocked_to_done.json @@ -0,0 +1,221 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-issue-state-001", + "suite": "memory_evolution", + "title": "Report an issue as done after an earlier blocker cleared", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "issue-xy900-blocked", + "kind": "issue", + "text": "On 2026-06-06, XY-900 was blocked on missing real_world_job fixture/report implementation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-blocked" + } + }, + "created_at": "2026-06-06T00:00:00Z" + }, + { + "evidence_id": "issue-xy900-done", + "kind": "issue", + "text": "On 2026-06-09, XY-900 is done after PR #200 added the real_world_job fixture/report implementation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-done" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "issue-xy900-resolution-rationale", + "kind": "decision", + "text": "The blocker cleared because the fixture/report runner now exists and publishes typed real-world job reports.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "issue_blocked_to_done", + "evidence_id": "issue-xy900-resolution-rationale" + } + }, + "created_at": "2026-06-09T00:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "XY-900 is currently done after PR #200; the earlier missing real_world_job fixture/report blocker is historical and cleared because the runner now publishes typed reports.", + "claims": [ + { + "claim_id": "current_issue_state", + "text": "XY-900 is currently done after PR #200.", + "evidence_ids": [ + "issue-xy900-done", + "issue-xy900-blocked", + "issue-xy900-resolution-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "issue_update_rationale", + "text": "The blocker cleared because the fixture/report runner now exists.", + "evidence_ids": ["issue-xy900-resolution-rationale"], + "confidence": "high" + } + ], + "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy900-blocked", + "ts": "2026-06-06T00:00:00Z", + "actor": "agent", + "action": "hit_blocker", + "evidence_ids": ["issue-xy900-blocked"], + "summary": "The issue was blocked on missing fixture/report implementation." + }, + { + "event_id": "xy900-done", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["issue-xy900-done", "issue-xy900-resolution-rationale"], + "summary": "The implementation landed and the blocker cleared." + } + ], + "prompt": { + "role": "user", + "content": "Is XY-900 still blocked, or is it done now?", + "job_mode": "resume", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_issue_state", + "text": "XY-900 is currently done after PR #200." + }, + { + "claim_id": "issue_update_rationale", + "text": "The blocker cleared because the fixture/report runner now exists." + } + ], + "must_not_include": ["XY-900 is currently blocked."], + "evidence_links": { + "current_issue_state": [ + "issue-xy900-done", + "issue-xy900-blocked", + "issue-xy900-resolution-rationale" + ], + "issue_update_rationale": ["issue-xy900-resolution-rationale"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "issue-xy900-done", + "claim_id": "current_issue_state", + "requirement": "cite", + "quote": "XY-900 is done" + }, + { + "evidence_id": "issue-xy900-resolution-rationale", + "claim_id": "issue_update_rationale", + "requirement": "explain", + "quote": "fixture/report runner now exists" + } + ], + "negative_traps": [ + { + "trap_id": "old-issue-blocker-current", + "type": "stale_fact", + "evidence_ids": ["issue-xy900-blocked"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Reports the latest issue state rather than the historical blocker." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States that the issue is done and why." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses current completion and resolution evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not report the old blocker as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["issue-xy900-done"], + "historical_evidence_ids": ["issue-xy900-blocked"], + "stale_trap_ids": ["old-issue-blocker-current"], + "conflicts": [ + { + "conflict_id": "issue-state-blocked-to-done", + "claim_id": "current_issue_state", + "current_evidence_id": "issue-xy900-done", + "historical_evidence_id": "issue-xy900-blocked", + "resolved_by_evidence_id": "issue-xy900-resolution-rationale" + } + ], + "update_rationale": { + "claim_id": "issue_update_rationale", + "evidence_ids": ["issue-xy900-resolution-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json new file mode 100644 index 00000000..3e43dd25 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/preference_changed_current_vs_historical.json @@ -0,0 +1,229 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-preference-001", + "suite": "memory_evolution", + "title": "Apply the current user preference while preserving the historical one", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "pref-old-terse-bullets", + "kind": "note", + "text": "On 2026-06-01, the user preferred terse bullet-only benchmark updates.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-old-terse-bullets" + } + }, + "created_at": "2026-06-01T00:00:00Z" + }, + { + "evidence_id": "pref-current-concise-rationale", + "kind": "note", + "text": "On 2026-06-08, the user changed preference to concise prose with explicit evidence before bullets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-current-concise-rationale" + } + }, + "created_at": "2026-06-08T00:00:00Z" + }, + { + "evidence_id": "pref-update-rationale", + "kind": "decision", + "text": "The user said the earlier terse bullets hid rationale, so future benchmark updates should include concise rationale.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "preference_changed_current_vs_historical", + "evidence_id": "pref-update-rationale" + } + }, + "created_at": "2026-06-08T00:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "Use concise prose with explicit evidence before bullets; the terse bullet-only preference is historical because it hid rationale.", + "claims": [ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets.", + "evidence_ids": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale.", + "evidence_ids": ["pref-update-rationale"], + "confidence": "high" + } + ], + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "preference-old", + "ts": "2026-06-01T00:00:00Z", + "actor": "user", + "action": "set_preference", + "evidence_ids": ["pref-old-terse-bullets"], + "summary": "The user initially preferred terse bullet-only benchmark updates." + }, + { + "event_id": "preference-current", + "ts": "2026-06-08T00:00:00Z", + "actor": "user", + "action": "updated_memory", + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "summary": "The user changed the preference and gave the rationale." + } + ], + "prompt": { + "role": "user", + "content": "How should benchmark updates be written now, and what changed?", + "job_mode": "personalize", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets." + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale." + } + ], + "must_not_include": [ + "Use terse bullet-only benchmark updates as the current preference." + ], + "evidence_links": { + "current_preference": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "preference_update_rationale": ["pref-update-rationale"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "pref-current-concise-rationale", + "claim_id": "current_preference", + "requirement": "cite", + "quote": "changed preference to concise prose" + }, + { + "evidence_id": "pref-update-rationale", + "claim_id": "preference_update_rationale", + "requirement": "explain", + "quote": "terse bullets hid rationale" + } + ], + "negative_traps": [ + { + "trap_id": "old-terse-preference-current", + "type": "stale_fact", + "evidence_ids": ["pref-old-terse-bullets"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Chooses the current preference while preserving the historical version." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current preference and update rationale." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current preference and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not promote the stale preference as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["pref-current-concise-rationale"], + "historical_evidence_ids": ["pref-old-terse-bullets"], + "stale_trap_ids": ["old-terse-preference-current"], + "conflicts": [ + { + "conflict_id": "preference-style-supersession", + "claim_id": "current_preference", + "current_evidence_id": "pref-current-concise-rationale", + "historical_evidence_id": "pref-old-terse-bullets", + "resolved_by_evidence_id": "pref-update-rationale" + } + ], + "update_rationale": { + "claim_id": "preference_update_rationale", + "evidence_ids": ["pref-update-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + }, + "history_readback": { + "encoded": true, + "required_event_types": ["add", "update", "ignore"], + "requires_note_version_links": true + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_mem0_history", + "reference_letta_core_block", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json new file mode 100644 index 00000000..e3a50717 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/evolution/relation_temporal_validity.json @@ -0,0 +1,228 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-relation-temporal-001", + "suite": "memory_evolution", + "title": "Distinguish current and historical relation validity in graph-lite context", + "encoding": {}, + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "relation-old-owner", + "kind": "adapter_state", + "text": "Before 2026-06-06, Team Delta owned deployment method review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity", + "evidence_id": "relation-old-owner" + } + }, + "created_at": "2026-06-05T00:00:00Z" + }, + { + "evidence_id": "relation-current-owner", + "kind": "adapter_state", + "text": "Since 2026-06-08, Team Echo owns deployment method review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity", + "evidence_id": "relation-current-owner" + } + }, + "created_at": "2026-06-08T00:00:00Z" + }, + { + "evidence_id": "relation-owner-rationale", + "kind": "decision", + "text": "Ownership moved after single-user production runbook scope changed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "relation_temporal_validity", + "evidence_id": "relation-owner-rationale" + } + }, + "created_at": "2026-06-08T00:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_evolution", + "answer": { + "content": "Team Echo currently owns deployment method review. Team Delta owned deployment method review historically. The ownership moved after the single-user production runbook scope changed.", + "claims": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + "evidence_ids": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + "evidence_ids": ["relation-old-owner"], + "confidence": "high" + }, + { + "claim_id": "relation_owner_update_rationale", + "text": "Ownership moved after single-user production runbook scope changed.", + "evidence_ids": ["relation-owner-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ] + }, + "consolidation": null + } + }, + "timeline": [ + { + "event_id": "relation-old-owner", + "ts": "2026-06-05T00:00:00Z", + "actor": "agent", + "action": "recorded_relation", + "evidence_ids": ["relation-old-owner"], + "summary": "Team Delta was the historical owner." + }, + { + "event_id": "relation-current-owner", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["relation-current-owner", "relation-owner-rationale"], + "summary": "Team Echo became the current owner after the scope changed." + } + ], + "prompt": { + "role": "user", + "content": "Who currently owns deployment method review, and who owned it historically?", + "job_mode": "answer", + "constraints": ["cite_evidence", "distinguish_current_from_historical"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review." + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically." + } + ], + "must_not_include": ["Team Delta currently owns deployment method review."], + "evidence_links": { + "relation_current_owner": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "relation_historical_owner": ["relation-old-owner"], + "relation_owner_update_rationale": ["relation-owner-rationale"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "relation-current-owner", + "claim_id": "relation_current_owner", + "requirement": "cite", + "quote": "Team Echo owns deployment method review" + }, + { + "evidence_id": "relation-old-owner", + "claim_id": "relation_historical_owner", + "requirement": "cite", + "quote": "Team Delta owned deployment method review" + } + ], + "negative_traps": [ + { + "trap_id": "old-owner-as-current", + "type": "stale_fact", + "evidence_ids": ["relation-old-owner"], + "failure_if_used": false + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Requires current-only versus historical temporal validity for relation facts." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Would identify current and historical owners separately." + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Would cite both current and historical relation evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Would not report the historical owner as current." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "score_temporal_relation_behavior" + }, + "memory_evolution": { + "current_evidence_ids": ["relation-current-owner"], + "historical_evidence_ids": ["relation-old-owner"], + "stale_trap_ids": ["old-owner-as-current"], + "conflicts": [ + { + "conflict_id": "relation-owner-current-historical", + "claim_id": "relation_current_owner", + "current_evidence_id": "relation-current-owner", + "historical_evidence_id": "relation-old-owner", + "resolved_by_evidence_id": "relation-owner-rationale" + } + ], + "update_rationale": { + "claim_id": "relation_owner_update_rationale", + "evidence_ids": ["relation-owner-rationale"], + "available": true + }, + "temporal_validity": { + "required": true, + "encoded": true + } + }, + "tags": [ + "synthetic", + "memory_evolution", + "reference_graphiti_zep_temporal", + "reference_nanograph_typed_query", + "graph_temporal_encoded", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json b/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json new file mode 100644 index 00000000..f65f78e2 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/entity_concept_issue_pages.json @@ -0,0 +1,372 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "knowledge-entity-concept-002", + "suite": "knowledge_compilation", + "title": "Compile entity, concept, and issue timeline pages with stale lint", + "corpus": { + "corpus_id": "real-world-memory-knowledge-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "qdrant-rebuild-entity", + "kind": "note", + "text": "Entity fact: Qdrant is a derived rebuildable index for ELF candidate retrieval; Postgres vectors are the source used to rebuild it.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "qdrant-rebuild-entity" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "derived-pages-concept", + "kind": "decision", + "text": "Concept fact: Derived knowledge pages compile current truth, history, backlinks, and lint findings from source notes and events.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "derived-pages-concept" + } + }, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "xy848-current-timeline", + "kind": "issue", + "text": "Current issue timeline: XY-848 adds knowledge compilation benchmark cases and keeps generated pages pointer-backed benchmark artifacts.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "xy848-current-timeline" + } + }, + "created_at": "2026-06-09T02:10:00Z" + }, + { + "evidence_id": "old-qdrant-authoritative-trap", + "kind": "note", + "text": "Stale fact: Qdrant became the authoritative source for compiled knowledge pages.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "entity_concept_issue_pages", + "evidence_id": "old-qdrant-authoritative-trap" + } + }, + "created_at": "2026-06-08T02:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_knowledge", + "answer": { + "content": "Generated entity, concept, and issue timeline pages cite Qdrant rebuild evidence, derived-page concept evidence, and the current XY-848 timeline; stale Qdrant-authoritative text is linted, and one rebuild explains allowed ordering variance.", + "claims": [ + { + "claim_id": "qdrant_rebuild_entity", + "text": "The Qdrant entity page states that Qdrant is derived and rebuildable from Postgres-held vectors.", + "evidence_ids": ["qdrant-rebuild-entity"], + "confidence": "high" + }, + { + "claim_id": "derived_pages_concept", + "text": "The derived-pages concept page compiles current truth, history, backlinks, and lint findings from source notes and events.", + "evidence_ids": ["derived-pages-concept"], + "confidence": "high" + }, + { + "claim_id": "issue_timeline_current", + "text": "The XY-848 issue timeline page records that generated pages are pointer-backed benchmark artifacts.", + "evidence_ids": ["xy848-current-timeline"], + "confidence": "high" + } + ], + "evidence_ids": [ + "qdrant-rebuild-entity", + "derived-pages-concept", + "xy848-current-timeline" + ], + "pages": [ + { + "page_id": "entity:qdrant-rebuild", + "page_type": "entity", + "title": "Qdrant Rebuild Entity Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md", + "sections": [ + { + "section_id": "current-truth", + "heading": "Current Truth", + "role": "current_truth", + "content": "Qdrant is derived and rebuildable; Postgres vectors remain the source used for rebuild.", + "evidence_ids": ["qdrant-rebuild-entity"], + "timeline_event_ids": ["qdrant-current-fact"] + }, + { + "section_id": "history", + "heading": "History", + "role": "history", + "content": "The stale claim that Qdrant became authoritative is recorded only as lint evidence.", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "timeline_event_ids": ["qdrant-stale-fact"] + } + ], + "backlinks": [ + "project:elf-benchmark-suite", + "concept:derived-knowledge-pages" + ], + "lint_findings": [ + { + "finding_id": "lint-old-qdrant-authoritative", + "finding_type": "stale_claim", + "severity": "error", + "text": "The old Qdrant-authoritative claim conflicts with the current derived-index evidence.", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "trap_id": "old-qdrant-authoritative" + } + ], + "rebuild": { + "first_hash": "blake3:2ac0d7d7e03088fe3171e41c19f3ea1097b07b1d7ddc891f9aa81311d476e001", + "second_hash": "blake3:2ac0d7d7e03088fe3171e41c19f3ea1097b07b1d7ddc891f9aa81311d476e001", + "deterministic": true, + "allowed_variance": [] + } + }, + { + "page_id": "concept:derived-knowledge-pages", + "page_type": "concept", + "title": "Derived Knowledge Pages Concept Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md", + "sections": [ + { + "section_id": "compiled-truth", + "heading": "Compiled Truth", + "role": "current_truth", + "content": "Derived knowledge pages compile current truth, history, backlinks, and lint findings from source notes and events.", + "evidence_ids": ["derived-pages-concept"], + "timeline_event_ids": ["derived-pages-concept-recorded"] + }, + { + "section_id": "backlinks", + "heading": "Backlinks", + "role": "backlinks", + "content": "The concept links to the Qdrant rebuild entity and the XY-848 issue timeline.", + "evidence_ids": ["derived-pages-concept", "xy848-current-timeline"], + "timeline_event_ids": ["xy848-current-scope"] + } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "issue:xy848-knowledge-pages" + ], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:498016f1d39a6a0a5241b0c640c30f0720eb9dbdd73b167fdce95b4387d9699a", + "second_hash": "blake3:498016f1d39a6a0a5241b0c640c30f0720eb9dbdd73b167fdce95b4387d9699b", + "deterministic": false, + "allowed_variance": [ + "Backlink order may differ before canonical sort is applied; fixture report records the variance and still compares normalized page sections." + ] + } + }, + { + "page_id": "issue:xy848-knowledge-pages", + "page_type": "issue_timeline", + "title": "XY-848 Knowledge Pages Issue Timeline", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md", + "sections": [ + { + "section_id": "current-state", + "heading": "Current State", + "role": "current_truth", + "content": "XY-848 adds knowledge compilation benchmark cases and marks generated pages as pointer-backed benchmark artifacts.", + "evidence_ids": ["xy848-current-timeline"], + "timeline_event_ids": ["xy848-current-scope"] + }, + { + "section_id": "linked-pages", + "heading": "Linked Pages", + "role": "backlinks", + "content": "The issue timeline links to the Qdrant rebuild entity and derived-knowledge-pages concept pages.", + "evidence_ids": ["qdrant-rebuild-entity", "derived-pages-concept"], + "timeline_event_ids": ["qdrant-current-fact", "derived-pages-concept-recorded"] + } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "concept:derived-knowledge-pages" + ], + "lint_findings": [], + "rebuild": { + "first_hash": "blake3:fed9c4af9f53e787fcb91a4900b6137d728a72b60629ca049a6da57260be682d", + "second_hash": "blake3:fed9c4af9f53e787fcb91a4900b6137d728a72b60629ca049a6da57260be682d", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 3.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "qdrant-stale-fact", + "ts": "2026-06-08T02:00:00Z", + "actor": "agent", + "action": "recorded_stale_fact", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "summary": "A stale note incorrectly said Qdrant became authoritative." + }, + { + "event_id": "qdrant-current-fact", + "ts": "2026-06-09T02:00:00Z", + "actor": "agent", + "action": "recorded_current_fact", + "evidence_ids": ["qdrant-rebuild-entity"], + "summary": "The current Qdrant fact says it is derived and rebuildable from Postgres-held vectors." + }, + { + "event_id": "derived-pages-concept-recorded", + "ts": "2026-06-09T02:05:00Z", + "actor": "agent", + "action": "recorded_concept", + "evidence_ids": ["derived-pages-concept"], + "summary": "Derived pages compile current truth, history, backlinks, and lint findings from source notes and events." + }, + { + "event_id": "xy848-current-scope", + "ts": "2026-06-09T02:10:00Z", + "actor": "operator", + "action": "recorded_issue_scope", + "evidence_ids": ["xy848-current-timeline"], + "summary": "XY-848 keeps generated knowledge pages as pointer-backed benchmark artifacts." + } + ], + "prompt": { + "role": "user", + "content": "Compile entity, concept, and issue timeline pages for the knowledge suite and identify stale claims plus rebuild variance.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "lint_stale_claims", + "include_backlinks", + "explain_allowed_rebuild_variance" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "qdrant_rebuild_entity", + "text": "The Qdrant entity page states that Qdrant is derived and rebuildable from Postgres-held vectors." + }, + { + "claim_id": "derived_pages_concept", + "text": "The derived-pages concept page compiles current truth, history, backlinks, and lint findings from source notes and events." + }, + { + "claim_id": "issue_timeline_current", + "text": "The XY-848 issue timeline page records that generated pages are pointer-backed benchmark artifacts." + } + ], + "must_not_include": [ + "Qdrant became the authoritative source for compiled knowledge pages." + ], + "evidence_links": { + "qdrant_rebuild_entity": ["qdrant-rebuild-entity"], + "derived_pages_concept": ["derived-pages-concept"], + "issue_timeline_current": ["xy848-current-timeline"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "qdrant-rebuild-entity", + "claim_id": "qdrant_rebuild_entity", + "requirement": "cite", + "quote": "Qdrant is a derived rebuildable index" + }, + { + "evidence_id": "derived-pages-concept", + "claim_id": "derived_pages_concept", + "requirement": "cite", + "quote": "current truth, history, backlinks, and lint findings" + }, + { + "evidence_id": "xy848-current-timeline", + "claim_id": "issue_timeline_current", + "requirement": "use", + "quote": "pointer-backed benchmark artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "old-qdrant-authoritative", + "type": "stale_fact", + "evidence_ids": ["old-qdrant-authoritative-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current entity, concept, and issue timeline truth." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every page section traces to source notes or timeline events." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Stale Qdrant-authoritative claim is detected as lint evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Pages include backlinks and useful current-truth/history surfaces." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Rebuild records are deterministic enough or explain allowed variance." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "cite_partial_evidence" + }, + "tags": [ + "synthetic", + "knowledge", + "no_live_claim", + "benchmark_artifact" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md new file mode 100644 index 00000000..88fb9fc4 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/concept_derived_knowledge_pages.md @@ -0,0 +1,27 @@ +# Derived Knowledge Pages Concept Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Compiled Truth + +Derived knowledge pages compile current truth, history, backlinks, and lint findings +from source notes and events. + +Sources: `derived-pages-concept`, `derived-pages-concept-recorded`. + +## Backlinks + +The concept links to the Qdrant rebuild entity and the XY-848 issue timeline. + +Sources: `derived-pages-concept`, `xy848-current-timeline`, `xy848-current-scope`. + +Backlinks: + +- `entity:qdrant-rebuild` +- `issue:xy848-knowledge-pages` + +## Rebuild Note + +Allowed variance: backlink order may differ before canonical sort is applied; the +fixture report records the variance and compares normalized page sections. diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md new file mode 100644 index 00000000..d2b28c05 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/entity_qdrant_rebuild.md @@ -0,0 +1,26 @@ +# Qdrant Rebuild Entity Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current Truth + +Qdrant is derived and rebuildable; Postgres vectors remain the source used for rebuild. + +Sources: `qdrant-rebuild-entity`, `qdrant-current-fact`. + +## History + +The stale claim that Qdrant became authoritative is recorded only as lint evidence. + +Sources: `old-qdrant-authoritative-trap`, `qdrant-stale-fact`. + +## Lint + +- `lint-old-qdrant-authoritative`: stale claim; the old Qdrant-authoritative claim + conflicts with the current derived-index evidence. + +## Backlinks + +- `project:elf-benchmark-suite` +- `concept:derived-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md new file mode 100644 index 00000000..ac665951 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/issue_xy848_knowledge_pages.md @@ -0,0 +1,24 @@ +# XY-848 Knowledge Pages Issue Timeline + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current State + +XY-848 adds knowledge compilation benchmark cases and marks generated pages as +pointer-backed benchmark artifacts. + +Sources: `xy848-current-timeline`, `xy848-current-scope`. + +## Linked Pages + +The issue timeline links to the Qdrant rebuild entity and derived-knowledge-pages +concept pages. + +Sources: `qdrant-rebuild-entity`, `derived-pages-concept`, +`qdrant-current-fact`, `derived-pages-concept-recorded`. + +Backlinks: + +- `entity:qdrant-rebuild` +- `concept:derived-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md new file mode 100644 index 00000000..de6d403c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md @@ -0,0 +1,36 @@ +# ELF Benchmark Suite Knowledge Page + +Benchmark artifact only: this page is a derived fixture for `knowledge_compilation` +scoring. It is not authoritative production truth. + +## Current Truth + +Generated knowledge pages remain derived benchmark artifacts and source notes stay +authoritative. + +Sources: `elf-knowledge-current-truth`, `knowledge-current-truth-recorded`. + +## History + +The suite borrows llm-wiki lint, gbrain compiled_truth plus timeline, and graphify +report ideas without copying their source-of-truth assumptions. + +Sources: `elf-knowledge-history`, `knowledge-patterns-selected`. + +## XY-848 Timeline + +XY-848 requires project pages, entity/concept pages, issue timelines, current truth +plus history, stale linting, backlinks, and rebuild determinism. + +Sources: `xy848-issue-timeline`, `xy848-scope-recorded`. + +## Private Corpus Summary + +Unsupported: the fixture does not contain private production corpus evidence for a +private-corpus knowledge-page quality claim. + +## Backlinks + +- `entity:qdrant-rebuild` +- `concept:derived-knowledge-pages` +- `issue:xy848-knowledge-pages` diff --git a/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json b/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json new file mode 100644 index 00000000..de6fd359 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/knowledge/project_page_rebuild.json @@ -0,0 +1,311 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "knowledge-project-page-001", + "suite": "knowledge_compilation", + "title": "Compile a pointer-backed project page with current truth and history", + "corpus": { + "corpus_id": "real-world-memory-knowledge-synthetic-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-knowledge-current-truth", + "kind": "note", + "text": "Current truth: The ELF knowledge benchmark must keep generated pages derived from notes and source refs; source notes stay authoritative and generated pages are not production truth.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "elf-knowledge-current-truth" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "elf-knowledge-history", + "kind": "decision", + "text": "History: The knowledge compilation suite follows llm-wiki query-save-lint, gbrain compiled_truth plus timeline, and graphify graph report patterns while preserving ELF provenance boundaries.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "elf-knowledge-history" + } + }, + "created_at": "2026-06-09T01:05:00Z" + }, + { + "evidence_id": "xy848-issue-timeline", + "kind": "issue", + "text": "Issue timeline: XY-848 asks for project pages, entity/concept pages, issue timelines, current truth plus history, stale-claim linting, backlinks, and rebuild determinism.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "xy848-issue-timeline" + } + }, + "created_at": "2026-06-09T01:10:00Z" + }, + { + "evidence_id": "old-authoritative-page-trap", + "kind": "compiled_page", + "text": "Stale claim: Generated knowledge pages are authoritative production truth and can replace source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "project_page_rebuild", + "evidence_id": "old-authoritative-page-trap" + } + }, + "created_at": "2026-06-08T01:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_knowledge", + "answer": { + "content": "Generated benchmark page `project_elf_benchmark_suite.md` keeps ELF source notes authoritative, cites current truth and history, links the XY-848 issue timeline, flags one unsupported summary, and rebuilds deterministically.", + "claims": [ + { + "claim_id": "derived_not_authoritative", + "text": "Generated knowledge pages remain derived benchmark artifacts, not authoritative production truth.", + "evidence_ids": ["elf-knowledge-current-truth"], + "confidence": "high" + }, + { + "claim_id": "reference_patterns", + "text": "The page shape uses llm-wiki lint, gbrain compiled truth plus timeline, and graphify report patterns while preserving ELF provenance.", + "evidence_ids": ["elf-knowledge-history"], + "confidence": "high" + }, + { + "claim_id": "rebuild_deterministic", + "text": "The project page rebuild produced the same page hash in two fixture rebuild passes.", + "evidence_ids": ["xy848-issue-timeline"], + "confidence": "high" + } + ], + "evidence_ids": [ + "elf-knowledge-current-truth", + "elf-knowledge-history", + "xy848-issue-timeline" + ], + "pages": [ + { + "page_id": "project:elf-benchmark-suite", + "page_type": "project", + "title": "ELF Benchmark Suite Knowledge Page", + "path": "apps/elf-eval/fixtures/real_world_memory/knowledge/pages/project_elf_benchmark_suite.md", + "sections": [ + { + "section_id": "current-truth", + "heading": "Current Truth", + "role": "current_truth", + "content": "Generated knowledge pages remain derived benchmark artifacts and source notes stay authoritative.", + "evidence_ids": ["elf-knowledge-current-truth"], + "timeline_event_ids": ["knowledge-current-truth-recorded"] + }, + { + "section_id": "history", + "heading": "History", + "role": "history", + "content": "The suite borrows llm-wiki lint, gbrain compiled_truth plus timeline, and graphify report ideas without copying their source-of-truth assumptions.", + "evidence_ids": ["elf-knowledge-history"], + "timeline_event_ids": ["knowledge-patterns-selected"] + }, + { + "section_id": "issue-timeline", + "heading": "XY-848 Timeline", + "role": "timeline", + "content": "XY-848 requires project pages, entity/concept pages, issue timelines, current truth plus history, stale linting, backlinks, and rebuild determinism.", + "evidence_ids": ["xy848-issue-timeline"], + "timeline_event_ids": ["xy848-scope-recorded"] + }, + { + "section_id": "unsupported-private-summary", + "heading": "Private Corpus Summary", + "role": "summary", + "content": "The fixture does not contain private production corpus evidence for a private-corpus knowledge-page quality claim.", + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": "No private production corpus item is present in this synthetic benchmark fixture." + } + ], + "backlinks": [ + "entity:qdrant-rebuild", + "concept:derived-knowledge-pages", + "issue:xy848-knowledge-pages" + ], + "lint_findings": [ + { + "finding_id": "lint-old-authoritative-page-trap", + "finding_type": "stale_claim", + "severity": "error", + "text": "The stale authoritative-page claim conflicts with current source-of-truth evidence.", + "evidence_ids": ["old-authoritative-page-trap"], + "trap_id": "old-authoritative-page" + } + ], + "rebuild": { + "first_hash": "blake3:93b78a1d6e8e0f7a5c761b0c3c1e311adf3a5c0f8e0f3999d5e6f4012c4a8481", + "second_hash": "blake3:93b78a1d6e8e0f7a5c761b0c3c1e311adf3a5c0f8e0f3999d5e6f4012c4a8481", + "deterministic": true, + "allowed_variance": [] + } + } + ], + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "knowledge-current-truth-recorded", + "ts": "2026-06-09T01:00:00Z", + "actor": "agent", + "action": "recorded_current_truth", + "evidence_ids": ["elf-knowledge-current-truth"], + "summary": "Current truth says generated pages are derived and source notes stay authoritative." + }, + { + "event_id": "knowledge-patterns-selected", + "ts": "2026-06-09T01:05:00Z", + "actor": "agent", + "action": "selected_reference_patterns", + "evidence_ids": ["elf-knowledge-history"], + "summary": "The suite uses llm-wiki, gbrain, and graphify as reference patterns." + }, + { + "event_id": "xy848-scope-recorded", + "ts": "2026-06-09T01:10:00Z", + "actor": "operator", + "action": "recorded_issue_scope", + "evidence_ids": ["xy848-issue-timeline"], + "summary": "XY-848 defines the required knowledge page benchmark dimensions." + } + ], + "prompt": { + "role": "user", + "content": "Compile a project knowledge page for the ELF benchmark suite and report whether every section is cited or flagged unsupported.", + "job_mode": "compile", + "constraints": [ + "cite_evidence", + "derived_pages_not_authoritative", + "flag_unsupported_sections", + "report_rebuild_determinism" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "derived_not_authoritative", + "text": "Generated knowledge pages remain derived benchmark artifacts, not authoritative production truth." + }, + { + "claim_id": "reference_patterns", + "text": "The page shape uses llm-wiki lint, gbrain compiled truth plus timeline, and graphify report patterns while preserving ELF provenance." + }, + { + "claim_id": "rebuild_deterministic", + "text": "The project page rebuild produced the same page hash in two fixture rebuild passes." + } + ], + "must_not_include": [ + "Generated knowledge pages are authoritative production truth.", + "The fixture proves private-corpus knowledge-page quality." + ], + "evidence_links": { + "derived_not_authoritative": ["elf-knowledge-current-truth"], + "reference_patterns": ["elf-knowledge-history"], + "rebuild_deterministic": ["xy848-issue-timeline"] + }, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-knowledge-current-truth", + "claim_id": "derived_not_authoritative", + "requirement": "cite", + "quote": "source notes stay authoritative" + }, + { + "evidence_id": "elf-knowledge-history", + "claim_id": "reference_patterns", + "requirement": "cite", + "quote": "llm-wiki query-save-lint, gbrain compiled_truth plus timeline, and graphify graph report patterns" + }, + { + "evidence_id": "xy848-issue-timeline", + "claim_id": "rebuild_deterministic", + "requirement": "use", + "quote": "rebuild determinism" + } + ], + "negative_traps": [ + { + "trap_id": "old-authoritative-page", + "type": "stale_fact", + "evidence_ids": ["old-authoritative-page-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current derived-page truth and reference pattern rationale." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every generated page section cites source notes/events or is flagged unsupported." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Stale authoritative-page claim is linted and not used as current truth." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Compiled page includes current truth, history, issue timeline, and backlinks." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Rebuild record is deterministic enough for regression comparison." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "cite_partial_evidence" + }, + "tags": [ + "synthetic", + "knowledge", + "no_live_claim", + "benchmark_artifact" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json new file mode 100644 index 00000000..e903e548 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/memory_evolution/delete_ttl_staleness.json @@ -0,0 +1,218 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-evolution-delete-ttl-001", + "suite": "memory_evolution", + "title": "Suppress a deleted temporary fact and answer with the current memory", + "corpus": { + "corpus_id": "real-world-memory-evolution-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "deleted-temp-plan", + "kind": "note", + "text": "Deleted temporary plan: use the old Redis cache branch for the next benchmark report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "deleted-temp-plan" + } + }, + "created_at": "2026-06-08T02:00:00Z" + }, + { + "evidence_id": "delete-tombstone", + "kind": "trace", + "text": "Lifecycle tombstone: note deleted-temp-plan expired at 2026-06-09T00:00:00Z and search must suppress it after worker DELETE outbox completion.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "delete-tombstone" + }, + "locator": { + "quote": "search must suppress it after worker DELETE outbox completion" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "current-benchmark-plan", + "kind": "decision", + "text": "Current plan: add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "delete_ttl_staleness", + "evidence_id": "current-benchmark-plan" + }, + "locator": { + "quote": "add real_world_memory trust and personalization cases" + } + }, + "created_at": "2026-06-09T01:30:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Do not use the expired Redis branch plan. The tombstone says deleted-temp-plan expired and search must suppress it after the worker DELETE outbox completes. The current plan is to add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", + "claims": [ + { + "claim_id": "deleted_fact_suppressed", + "text": "The deleted temporary plan must be suppressed after the worker DELETE outbox completes.", + "evidence_ids": ["delete-tombstone"], + "confidence": "high" + }, + { + "claim_id": "current_plan", + "text": "Add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", + "evidence_ids": ["current-benchmark-plan"], + "confidence": "high" + } + ], + "evidence_ids": ["delete-tombstone", "current-benchmark-plan"], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "temporary-plan-recorded", + "ts": "2026-06-08T02:00:00Z", + "actor": "agent", + "action": "recorded_temporary_plan", + "evidence_ids": ["deleted-temp-plan"], + "summary": "The old Redis cache branch plan was temporary." + }, + { + "event_id": "temporary-plan-expired", + "ts": "2026-06-09T00:00:00Z", + "actor": "worker", + "action": "deleted_memory", + "evidence_ids": ["delete-tombstone"], + "summary": "TTL expiry deleted the temporary plan and completed the DELETE outbox." + }, + { + "event_id": "current-plan-recorded", + "ts": "2026-06-09T01:30:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["current-benchmark-plan"], + "summary": "The current benchmark implementation plan superseded the deleted temporary plan." + } + ], + "prompt": { + "role": "user", + "content": "What is the current benchmark plan, and should the old Redis branch memory still be used?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_deleted_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "deleted_fact_suppressed", + "text": "The deleted temporary plan must be suppressed after the worker DELETE outbox completes." + }, + { + "claim_id": "current_plan", + "text": "Add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory." + } + ], + "must_not_include": [ + "use the old Redis cache branch for the next benchmark report" + ], + "evidence_links": { + "deleted_fact_suppressed": ["delete-tombstone"], + "current_plan": ["current-benchmark-plan"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "delete-tombstone", + "claim_id": "deleted_fact_suppressed", + "requirement": "cite", + "quote": "search must suppress it after worker DELETE outbox completion" + }, + { + "evidence_id": "current-benchmark-plan", + "claim_id": "current_plan", + "requirement": "cite", + "quote": "add real_world_memory trust and personalization cases" + } + ], + "negative_traps": [ + { + "trap_id": "stale-deleted-plan", + "type": "stale_fact", + "evidence_ids": ["deleted-temp-plan"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Suppresses deleted or TTL-expired memory." + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the current plan." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites tombstone and replacement evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not retrieve the stale deleted fact." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["current-benchmark-plan"], + "historical_evidence_ids": [], + "tombstone_evidence_ids": ["delete-tombstone"], + "invalidation_evidence_ids": ["delete-tombstone"], + "stale_trap_ids": ["stale-deleted-plan"], + "conflicts": [], + "update_rationale": { + "claim_id": "deleted_fact_suppressed", + "evidence_ids": ["delete-tombstone"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": ["synthetic", "ttl", "delete", "stale_fact", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json b/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json new file mode 100644 index 00000000..b7b552ca --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/memory_summary/reviewable_summary_source_trace.json @@ -0,0 +1,589 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "memory-summary-source-trace-001", + "suite": "memory_summary", + "title": "Read back a reviewable current memory summary with source trace", + "corpus": { + "corpus_id": "real-world-memory-summary-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "summary-contract-current", + "kind": "decision", + "text": "Current decision: ELF memory summaries are derived reviewable readback artifacts and must not mutate authoritative source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-contract-current" + }, + "locator": { + "quote": "derived reviewable readback artifacts" + } + }, + "created_at": "2026-06-16T02:00:00Z" + }, + { + "evidence_id": "summary-background-sot", + "kind": "fact", + "text": "Background memory: Postgres remains the source of truth while Qdrant is a rebuildable derived retrieval index.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-background-sot" + }, + "locator": { + "quote": "Postgres remains the source of truth" + } + }, + "created_at": "2026-06-10T09:00:00Z" + }, + { + "evidence_id": "stale-summary-gap", + "kind": "note", + "text": "Stale summary note: memory-summary and top-of-mind behavior are not encoded and should stay not_tested.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "stale-summary-gap" + } + }, + "created_at": "2026-06-15T08:00:00Z" + }, + { + "evidence_id": "xy952-summary-contract", + "kind": "decision", + "text": "XY-952 update: memory-summary and top-of-mind behavior now has a fixture-backed reviewable source-trace contract.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "xy952-summary-contract" + }, + "locator": { + "quote": "fixture-backed reviewable source-trace contract" + } + }, + "created_at": "2026-06-16T02:30:00Z" + }, + { + "evidence_id": "superseded-live-evolution-loss", + "kind": "report", + "text": "Historical report: before XY-905, ELF live memory_evolution had one pass and five wrong_result jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "superseded-live-evolution-loss" + } + }, + "created_at": "2026-06-11T10:00:00Z" + }, + { + "evidence_id": "xy905-live-evolution-pass", + "kind": "report", + "text": "Current report: after XY-905, ELF live memory_evolution passes all six encoded jobs with current, historical, rationale, tombstone, and invalidation evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "xy905-live-evolution-pass" + }, + "locator": { + "quote": "passes all six encoded jobs" + } + }, + "created_at": "2026-06-16T02:20:00Z" + }, + { + "evidence_id": "summary-temporary-claim", + "kind": "note", + "text": "Temporary summary claim: publish a managed-memory parity claim from fixture-only summary evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-temporary-claim" + } + }, + "created_at": "2026-06-15T11:00:00Z" + }, + { + "evidence_id": "summary-ttl-tombstone", + "kind": "trace", + "text": "Summary tombstone: the fixture-only managed-memory parity claim expired at 2026-06-16T00:00:00Z and must be excluded from current top-of-mind memory.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-ttl-tombstone" + }, + "locator": { + "quote": "must be excluded from current top-of-mind memory" + } + }, + "created_at": "2026-06-16T00:00:00Z" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "kind": "decision", + "text": "Boundary: the local memory-summary contract is not evidence of parity with OpenAI or Anthropic managed memory products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "reviewable_summary_source_trace", + "evidence_id": "summary-contract-non-parity-boundary" + }, + "locator": { + "quote": "not evidence of parity" + } + }, + "created_at": "2026-06-16T02:40:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_memory_summary", + "answer": { + "content": "The reviewable memory summary keeps the current XY-952 source-trace contract top of mind, keeps the Postgres/Qdrant source-of-truth rule as background, downgrades the old not-tested summary gap and pre-XY-905 live loss, preserves the TTL tombstone for the parity claim, and excludes unsupported managed-memory parity as a derived project-profile candidate.", + "claims": [ + { + "claim_id": "summary_contract_reviewable", + "text": "The memory summary is a derived reviewable readback artifact and must not mutate authoritative notes.", + "evidence_ids": ["summary-contract-current"], + "confidence": "high" + }, + { + "claim_id": "summary_stage_now_fixture_backed", + "text": "The memory-summary stage now has a fixture-backed reviewable source-trace contract.", + "evidence_ids": ["xy952-summary-contract"], + "confidence": "high" + }, + { + "claim_id": "summary_preserves_tombstone", + "text": "The expired managed-memory parity claim is excluded from current top-of-mind memory.", + "evidence_ids": ["summary-ttl-tombstone"], + "confidence": "high" + }, + { + "claim_id": "summary_excludes_unsupported_parity", + "text": "The local memory-summary contract is not evidence of parity with managed memory products.", + "evidence_ids": ["summary-contract-non-parity-boundary"], + "confidence": "high" + } + ], + "evidence_ids": [ + "summary-contract-current", + "xy952-summary-contract", + "summary-ttl-tombstone", + "summary-contract-non-parity-boundary" + ], + "memory_summaries": [ + { + "summary_id": "summary-xy952-reviewable-memory", + "contract_schema": "elf.memory_summary/v1", + "generated_at": "2026-06-16T03:00:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-952-fixture-agent", + "read_profile": "private_plus_project", + "entries": [ + { + "entry_id": "top-xy952-contract", + "category": "top_of_mind", + "text": "Memory summaries now use a reviewable source-trace contract.", + "source_refs": ["xy952-summary-contract"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T02:30:00Z", + "valid_from": "2026-06-16T02:30:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": "TOP_OF_MIND_CURRENT_REVIEWABLE_SUMMARY_CONTRACT", + "reason": "The current issue lane is adding the summary/source-trace contract and benchmark guard." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "background-source-truth", + "category": "background", + "text": "Postgres remains authoritative while Qdrant remains a rebuildable derived index.", + "source_refs": ["summary-background-sot"], + "freshness": { + "status": "background", + "observed_at": "2026-06-10T09:00:00Z", + "valid_from": "2026-06-10T09:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": "BACKGROUND_STABLE_SOURCE_OF_TRUTH_BOUNDARY", + "reason": "The source-of-truth boundary is stable context, not urgent top-of-mind work." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "stale-summary-not-tested", + "category": "stale", + "text": "The old memory-summary stage state was not_tested before XY-952.", + "source_refs": ["stale-summary-gap"], + "freshness": { + "status": "stale", + "observed_at": "2026-06-15T08:00:00Z", + "valid_from": "2026-06-15T08:00:00Z", + "valid_to": "2026-06-16T02:30:00Z", + "last_confirmed_at": "2026-06-15T08:00:00Z", + "superseded_by": ["xy952-summary-contract"], + "tombstone_refs": [] + }, + "rationale": { + "decision": "downgraded", + "reason_code": "DOWNGRADED_STALE_SUMMARY_STAGE_REPLACED", + "reason": "XY-952 adds a fixture-backed contract, so the earlier not_tested state is history." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "superseded-live-evolution-loss", + "category": "superseded", + "text": "The pre-XY-905 live memory_evolution loss is historical.", + "source_refs": ["superseded-live-evolution-loss"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-11T10:00:00Z", + "valid_from": "2026-06-11T10:00:00Z", + "valid_to": "2026-06-16T02:20:00Z", + "last_confirmed_at": "2026-06-11T10:00:00Z", + "superseded_by": ["xy905-live-evolution-pass"], + "tombstone_refs": [] + }, + "rationale": { + "decision": "downgraded", + "reason_code": "SUPERSEDED_BY_XY905_LIVE_RECONCILIATION", + "reason": "The XY-905 report superseded the older live memory_evolution wrong_result state." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "tombstone-managed-parity-claim", + "category": "tombstone", + "text": "The fixture-only managed-memory parity claim is tombstoned and excluded.", + "source_refs": ["summary-ttl-tombstone"], + "freshness": { + "status": "tombstoned", + "observed_at": "2026-06-16T00:00:00Z", + "valid_from": "2026-06-15T11:00:00Z", + "valid_to": "2026-06-16T00:00:00Z", + "last_confirmed_at": "2026-06-16T00:00:00Z", + "superseded_by": [], + "tombstone_refs": ["summary-ttl-tombstone"] + }, + "rationale": { + "decision": "excluded", + "reason_code": "TOMBSTONE_TTL_INVALIDATED_PARITY_CLAIM", + "reason": "The tombstone says the parity claim expired and must not appear as current top-of-mind memory." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "derived-project-profile-summary-boundary", + "category": "derived_project_profile", + "text": "Project profile: ELF summaries are reviewable derived readback, not authoritative notes.", + "source_refs": ["summary-contract-current", "summary-background-sot"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T02:00:00Z", + "valid_from": "2026-06-16T02:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T03:00:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "included", + "reason_code": "DERIVED_PROFILE_SOURCE_BACKED_BOUNDARY", + "reason": "The derived project profile is source-backed and labels summaries as non-authoritative." + }, + "unsupported_claim_flags": [] + }, + { + "entry_id": "derived-project-profile-parity-excluded", + "category": "derived_project_profile", + "text": "Excluded candidate: the local summary contract proves parity with managed memory products.", + "source_refs": [], + "freshness": { + "status": "unsupported", + "observed_at": "2026-06-16T03:00:00Z", + "valid_from": null, + "valid_to": null, + "last_confirmed_at": null, + "superseded_by": [], + "tombstone_refs": [] + }, + "rationale": { + "decision": "excluded", + "reason_code": "EXCLUDED_UNSUPPORTED_MANAGED_MEMORY_PARITY", + "reason": "The local contract is not comparable live evidence for OpenAI or Anthropic managed memory products." + }, + "unsupported_claim_flags": [ + { + "claim_id": "managed_memory_parity", + "message": "No comparable live managed-memory runner exists for this lane.", + "source": { + "evidence_id": "summary-contract-non-parity-boundary" + } + } + ] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "xy952-summary-contract", + "status": "active", + "reason": "current top-of-mind contract evidence" + }, + { + "evidence_id": "summary-background-sot", + "status": "active", + "reason": "stable background source-of-truth evidence" + } + ], + "dropped_source_refs": [ + { + "evidence_id": "summary-temporary-claim", + "status": "expired", + "reason": "tombstoned parity claim" + } + ], + "stale_source_refs": [ + { + "evidence_id": "stale-summary-gap", + "status": "stale", + "reason": "superseded by XY-952 fixture-backed contract", + "superseded_by": "xy952-summary-contract" + } + ], + "superseded_source_refs": [ + { + "evidence_id": "superseded-live-evolution-loss", + "status": "superseded", + "reason": "XY-905 live report superseded the old loss", + "superseded_by": "xy905-live-evolution-pass" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "summary-ttl-tombstone", + "status": "tombstoned", + "reason": "TTL invalidation suppresses the parity claim" + } + ], + "unsupported_claim_flags": [ + { + "claim_id": "managed_memory_parity", + "message": "Fixture-backed contract evidence is not managed-memory parity evidence." + } + ] + } + } + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "summary-gap-recorded", + "ts": "2026-06-15T08:00:00Z", + "actor": "agent", + "action": "recorded_not_tested_stage", + "evidence_ids": ["stale-summary-gap"], + "summary": "The stage ledger recorded memory summary behavior as not_tested." + }, + { + "event_id": "temporary-parity-claim-expired", + "ts": "2026-06-16T00:00:00Z", + "actor": "worker", + "action": "ttl_invalidated_claim", + "evidence_ids": ["summary-ttl-tombstone"], + "summary": "The temporary parity claim was tombstoned." + }, + { + "event_id": "xy952-contract-recorded", + "ts": "2026-06-16T02:30:00Z", + "actor": "agent", + "action": "recorded_summary_contract", + "evidence_ids": ["xy952-summary-contract"], + "summary": "The summary/source-trace contract became fixture-backed." + } + ], + "prompt": { + "role": "user", + "content": "Show the current memory summary surface and explain why stale, tombstoned, and unsupported derived memories are not top-of-mind current facts.", + "job_mode": "summary_readback", + "constraints": [ + "cite_evidence", + "preserve_current_vs_historical_truth", + "expose_source_trace", + "do_not_claim_managed_memory_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "summary_contract_reviewable", + "text": "The memory summary is a derived reviewable readback artifact and must not mutate authoritative notes." + }, + { + "claim_id": "summary_stage_now_fixture_backed", + "text": "The memory-summary stage now has a fixture-backed reviewable source-trace contract." + }, + { + "claim_id": "summary_preserves_tombstone", + "text": "The expired managed-memory parity claim is excluded from current top-of-mind memory." + }, + { + "claim_id": "summary_excludes_unsupported_parity", + "text": "The local memory-summary contract is not evidence of parity with managed memory products." + } + ], + "must_not_include": [ + "ELF has parity with managed memory products.", + "memory summaries are authoritative source notes", + "memory-summary and top-of-mind behavior are not encoded and should stay not_tested" + ], + "evidence_links": { + "summary_contract_reviewable": ["summary-contract-current"], + "summary_stage_now_fixture_backed": ["xy952-summary-contract"], + "summary_preserves_tombstone": ["summary-ttl-tombstone"], + "summary_excludes_unsupported_parity": ["summary-contract-non-parity-boundary"] + }, + "answer_type": "reviewable_memory_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "summary-contract-current", + "claim_id": "summary_contract_reviewable", + "requirement": "cite", + "quote": "derived reviewable readback artifacts" + }, + { + "evidence_id": "xy952-summary-contract", + "claim_id": "summary_stage_now_fixture_backed", + "requirement": "cite", + "quote": "fixture-backed reviewable source-trace contract" + }, + { + "evidence_id": "summary-ttl-tombstone", + "claim_id": "summary_preserves_tombstone", + "requirement": "cite", + "quote": "must be excluded from current top-of-mind memory" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "claim_id": "summary_excludes_unsupported_parity", + "requirement": "cite", + "quote": "not evidence of parity" + } + ], + "negative_traps": [ + { + "trap_id": "stale-summary-gap-current", + "type": "stale_fact", + "evidence_ids": ["stale-summary-gap"], + "failure_if_used": true + }, + { + "trap_id": "temporary-parity-claim-current", + "type": "stale_fact", + "evidence_ids": ["summary-temporary-claim"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "States that the summary is reviewable derived readback and identifies the current fixture-backed contract." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Summary entries and answer claims carry source refs or explicit unsupported-claim flags." + }, + "lifecycle_behavior": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Stale, superseded, and tombstoned entries are downgraded or excluded instead of treated as current top-of-mind facts." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids stale not_tested and expired parity traps as current facts." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Unsupported managed-memory parity is flagged or excluded, not silently asserted." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "stale, superseded, or tombstoned memory must not appear as current top-of-mind", + "derived summary entries must have source refs or unsupported-claim flags" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "memory_summary": { + "required_categories": [ + "top_of_mind", + "background", + "stale", + "superseded", + "tombstone", + "derived_project_profile" + ] + }, + "tags": [ + "synthetic", + "memory_summary", + "source_trace", + "reviewable_derived_readback", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json b/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json new file mode 100644 index 00000000..d9a99d30 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/personalization/scoped_preference_correction.json @@ -0,0 +1,244 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "personalization-scoped-preference-001", + "suite": "personalization", + "title": "Apply the current project preference without cross-project leakage", + "corpus": { + "corpus_id": "real-world-memory-personalization-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-current-stable-preference", + "kind": "note", + "text": "Preference v2 for tenant local-tenant project ELF agent local-agent: answer concisely with evidence-linked bullets; remembered because the user repeated it in two durable ELF sessions and marked it stable.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "agent_id": "local-agent", + "evidence_id": "elf-current-stable-preference" + }, + "locator": { + "quote": "answer concisely with evidence-linked bullets" + } + }, + "created_at": "2026-06-09T03:00:00Z" + }, + { + "evidence_id": "elf-old-preference", + "kind": "note", + "text": "Preference v1 for tenant local-tenant project ELF agent local-agent: provide long tutorial explanations for every answer.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "agent_id": "local-agent", + "evidence_id": "elf-old-preference" + } + }, + "created_at": "2026-06-08T03:00:00Z" + }, + { + "evidence_id": "elf-temporary-demo-preference", + "kind": "note", + "text": "Temporary preference for tenant local-tenant project ELF: use a playful pirate tone for the demo only; expires at 2026-06-09T12:00:00Z.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "ELF", + "evidence_id": "elf-temporary-demo-preference" + } + }, + "created_at": "2026-06-09T02:30:00Z" + }, + { + "evidence_id": "other-project-preference", + "kind": "note", + "text": "Preference for tenant local-tenant project PubFi agent local-agent: answer in long-form Chinese prose with no bullets.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "scoped_preference_correction", + "tenant_id": "local-tenant", + "project_id": "PubFi", + "agent_id": "local-agent", + "evidence_id": "other-project-preference" + } + }, + "created_at": "2026-06-09T03:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "For tenant local-tenant / project ELF / agent local-agent, use concise evidence-linked bullets. This preference is remembered because the user repeated it in two durable ELF sessions and marked it stable. Do not apply the old long-tutorial preference, the demo-only pirate tone, or the PubFi project preference.", + "claims": [ + { + "claim_id": "current_preference", + "text": "For ELF, answer concisely with evidence-linked bullets.", + "evidence_ids": ["elf-current-stable-preference"], + "confidence": "high" + }, + { + "claim_id": "preference_reason", + "text": "The preference is remembered because the user repeated it in two durable ELF sessions and marked it stable.", + "evidence_ids": ["elf-current-stable-preference"], + "confidence": "high" + } + ], + "evidence_ids": ["elf-current-stable-preference"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-preference-recorded", + "ts": "2026-06-08T03:00:00Z", + "actor": "user", + "action": "recorded_preference", + "evidence_ids": ["elf-old-preference"], + "summary": "The user previously preferred long tutorial answers." + }, + { + "event_id": "temporary-demo-preference", + "ts": "2026-06-09T02:30:00Z", + "actor": "user", + "action": "recorded_temporary_preference", + "evidence_ids": ["elf-temporary-demo-preference"], + "summary": "The pirate tone was explicitly temporary for a demo." + }, + { + "event_id": "stable-preference-correction", + "ts": "2026-06-09T03:00:00Z", + "actor": "user", + "action": "updated_memory", + "evidence_ids": ["elf-current-stable-preference"], + "summary": "The user corrected the ELF preference to concise evidence-linked bullets and marked it stable." + }, + { + "event_id": "other-project-preference-recorded", + "ts": "2026-06-09T03:05:00Z", + "actor": "agent", + "action": "recorded_other_project_preference", + "evidence_ids": ["other-project-preference"], + "summary": "A PubFi project preference exists but must not leak into ELF." + } + ], + "prompt": { + "role": "user", + "content": "For this ELF project, how should you answer me and why is that preference remembered?", + "job_mode": "personalize", + "constraints": ["cite_evidence", "avoid_cross_project_preferences", "avoid_temporary_overgeneralization"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_preference", + "text": "For ELF, answer concisely with evidence-linked bullets." + }, + { + "claim_id": "preference_reason", + "text": "The preference is remembered because the user repeated it in two durable ELF sessions and marked it stable." + } + ], + "must_not_include": [ + "provide long tutorial explanations for every answer", + "use a playful pirate tone", + "answer in long-form Chinese prose with no bullets" + ], + "evidence_links": { + "current_preference": ["elf-current-stable-preference"], + "preference_reason": ["elf-current-stable-preference"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "current_preference", + "requirement": "cite", + "quote": "answer concisely with evidence-linked bullets" + }, + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "preference_reason", + "requirement": "explain", + "quote": "repeated it in two durable ELF sessions and marked it stable" + } + ], + "negative_traps": [ + { + "trap_id": "stale-old-preference", + "type": "stale_fact", + "evidence_ids": ["elf-old-preference"], + "failure_if_used": true + }, + { + "trap_id": "temporary-demo-overgeneralization", + "type": "stale_fact", + "evidence_ids": ["elf-temporary-demo-preference"], + "failure_if_used": true + }, + { + "trap_id": "cross-project-preference-leak", + "type": "near_duplicate", + "evidence_ids": ["other-project-preference"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "personalization_fit": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Applies the current stable ELF preference." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Explains why the preference is remembered with evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids stale, temporary, and cross-project preferences." + }, + "answer_correctness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Answers the user's personalization question directly." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "personalization", "scoped_preference", "preference_correction", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json new file mode 100644 index 00000000..b31ef1c6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/daily_project_brief.json @@ -0,0 +1,267 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-daily-project-brief-001", + "suite": "proactive_brief", + "title": "Generate a daily project brief from current project memory", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "daily-current-validation-gate", + "kind": "decision", + "text": "Current project decision: before review handoff, the ELF lane must run the proactive brief fixture command and targeted real_world_job_benchmark tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-current-validation-gate" + }, + "locator": { + "quote": "run the proactive brief fixture command" + } + }, + "created_at": "2026-06-16T04:00:00Z" + }, + { + "evidence_id": "daily-current-ledger-update", + "kind": "plan", + "text": "Current plan: update the XY-951 Dreaming-readiness stage ledger with the proactive brief benchmark delta and next optimization direction.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-current-ledger-update" + }, + "locator": { + "quote": "update the XY-951 Dreaming-readiness stage ledger" + } + }, + "created_at": "2026-06-16T04:05:00Z" + }, + { + "evidence_id": "daily-old-parity-trap", + "kind": "note", + "text": "Stale note: fixture-only proactive briefs prove parity with OpenAI Pulse and hosted managed products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "daily_project_brief", + "evidence_id": "daily-old-parity-trap" + } + }, + "created_at": "2026-06-15T10:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Daily brief: run the proactive brief benchmark command, keep the XY-951 ledger update next, and do not claim Pulse or hosted managed-product parity from fixture-only evidence.", + "claims": [ + { + "claim_id": "daily_validation_gate", + "text": "The next validation step is the proactive brief fixture command plus targeted real_world_job_benchmark tests.", + "evidence_ids": ["daily-current-validation-gate"], + "confidence": "high" + }, + { + "claim_id": "daily_ledger_update", + "text": "The XY-951 stage ledger must record the proactive brief benchmark delta.", + "evidence_ids": ["daily-current-ledger-update"], + "confidence": "high" + } + ], + "evidence_ids": ["daily-current-validation-gate", "daily-current-ledger-update"], + "proactive_briefs": [ + { + "brief_id": "brief-daily-project-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T04:30:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "daily_project_brief", + "suggestions": [ + { + "suggestion_id": "daily-run-proactive-gate", + "suggestion_kind": "daily_project_brief", + "title": "Run the proactive brief benchmark gate", + "body": "Run the proactive brief fixture command before claiming the lane is validation-ready, then update the XY-951 ledger.", + "evidence_refs": ["daily-current-validation-gate", "daily-current-ledger-update"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T04:05:00Z", + "valid_from": "2026-06-16T04:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T04:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_EVIDENCE_BOUND_BRIEF", + "reason": "Both source refs are current project-memory items and no tombstone or supersession source is selected." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "daily-current-validation-gate", + "status": "current", + "reason": "Required validation command source." + }, + { + "evidence_id": "daily-current-ledger-update", + "status": "current", + "reason": "Required ledger update source." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "daily-old-parity-trap", + "status": "stale", + "reason": "Fixture-only evidence cannot prove Pulse parity." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "daily-gate-recorded", + "ts": "2026-06-16T04:00:00Z", + "actor": "agent", + "action": "recorded_current_gate", + "evidence_ids": ["daily-current-validation-gate"], + "summary": "The current validation gate was recorded as proactive fixture plus targeted tests." + }, + { + "event_id": "daily-ledger-action-recorded", + "ts": "2026-06-16T04:05:00Z", + "actor": "agent", + "action": "recorded_ledger_action", + "evidence_ids": ["daily-current-ledger-update"], + "summary": "The ledger update remained the next optimization artifact." + } + ], + "prompt": { + "role": "user", + "content": "Generate a daily project brief with only source-linked current recommendations.", + "job_mode": "proactive_brief", + "constraints": [ + "cite_evidence", + "mark_currentness", + "include_action_rationale", + "do_not_claim_pulse_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "daily_validation_gate", + "text": "The next validation step is the proactive brief fixture command plus targeted real_world_job_benchmark tests." + }, + { + "claim_id": "daily_ledger_update", + "text": "The XY-951 stage ledger must record the proactive brief benchmark delta." + } + ], + "must_not_include": [ + "fixture-only proactive briefs prove parity with OpenAI Pulse", + "fixture-only proactive briefs prove hosted managed-product parity" + ], + "evidence_links": { + "daily_validation_gate": ["daily-current-validation-gate"], + "daily_ledger_update": ["daily-current-ledger-update"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "daily-current-validation-gate", + "claim_id": "daily_validation_gate", + "requirement": "cite", + "quote": "proactive brief fixture command" + }, + { + "evidence_id": "daily-current-ledger-update", + "claim_id": "daily_ledger_update", + "requirement": "cite", + "quote": "XY-951 Dreaming-readiness stage ledger" + } + ], + "negative_traps": [ + { + "trap_id": "daily-fixture-parity-trap", + "type": "stale_fact", + "evidence_ids": ["daily-old-parity-trap"], + "failure_if_used": true + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["daily_project_brief"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief names the current validation and ledger actions." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every proactive suggestion has evidence refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The stale Pulse parity trap is not selected." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The suggestion is actionable for the current lane." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Freshness and currentness markers prevent stale facts from being current recommendations." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No hosted parity claim is supported by this fixture."], + "fallback_action": "defer_unsupported_claim" + }, + "tags": ["synthetic", "proactive_brief", "daily_project_brief", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json new file mode 100644 index 00000000..92ab7305 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/private_corpus_refresh_blocked.json @@ -0,0 +1,124 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-private-corpus-refresh-blocked-001", + "suite": "proactive_brief", + "title": "Block private-corpus refresh suggestions when no operator manifest exists", + "corpus": { + "corpus_id": "real-world-memory-proactive-private-refresh-2026-06-16", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "private-refresh-no-manifest", + "kind": "blocker", + "text": "Private corpus refresh blocker: no operator-owned private production corpus manifest is checked in or available to this fixture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_corpus_refresh_blocked", + "evidence_id": "private-refresh-no-manifest" + }, + "locator": { + "quote": "no operator-owned private production corpus manifest" + } + }, + "created_at": "2026-06-16T06:20:00Z" + }, + { + "evidence_id": "xy930-private-input-boundary", + "kind": "issue", + "text": "XY-930 owns private-corpus and credentialed production gates when operator inputs exist; fixture-only output must not claim a private-corpus refresh pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_corpus_refresh_blocked", + "evidence_id": "xy930-private-input-boundary" + }, + "locator": { + "quote": "XY-930 owns private-corpus and credentialed production gates" + } + }, + "created_at": "2026-06-16T06:21:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "private-refresh-blocked-recorded", + "ts": "2026-06-16T06:20:00Z", + "actor": "agent", + "action": "recorded_private_input_blocker", + "evidence_ids": ["private-refresh-no-manifest", "xy930-private-input-boundary"], + "summary": "The private-corpus refresh suggestion remains blocked until operator-owned inputs exist." + } + ], + "prompt": { + "role": "user", + "content": "Suggest a private-corpus refresh when private inputs exist.", + "job_mode": "proactive_brief", + "constraints": [ + "do_not_require_private_corpus", + "block_without_operator_manifest", + "tie_blocker_to_xy930" + ] + }, + "expected_answer": { + "must_include": [], + "must_not_include": [ + "private-corpus refresh passed", + "private production corpus quality is proven" + ], + "evidence_links": {}, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": true + }, + "required_evidence": [], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The fixture records the private-corpus refresh as blocked." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The blocker points to the missing manifest and XY-930 boundary." + }, + "uncertainty_handling": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "No private-corpus pass is claimed." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The blocked state remains typed and not hidden as pass." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No operator-owned private production corpus manifest is available." + ], + "fallback_action": "block_private_refresh_until_xy930_inputs_exist" + }, + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "follow_up": { + "title": "XY-930 private-corpus input gate", + "reason": "Run private-corpus and credentialed production gates only when operator-owned inputs exist." + } + }, + "tags": ["private_sanitized", "proactive_brief", "private_corpus_refresh", "xy930_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json new file mode 100644 index 00000000..64cebd93 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/resume_work_brief.json @@ -0,0 +1,251 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-resume-work-brief-001", + "suite": "proactive_brief", + "title": "Generate a resume-work brief from current handoff memory", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "resume-current-handoff", + "kind": "handoff", + "text": "Current handoff: continue in branch y/elf-xy-953, add proactive brief fixtures and scoring, then run the proactive brief benchmark command.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-current-handoff" + }, + "locator": { + "quote": "add proactive brief fixtures and scoring" + } + }, + "created_at": "2026-06-16T05:00:00Z" + }, + { + "evidence_id": "resume-current-validation", + "kind": "plan", + "text": "Current validation plan: run cargo make real-world-memory-proactive-brief and targeted elf-eval real_world_job_benchmark tests before phase completion.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-current-validation" + }, + "locator": { + "quote": "real-world-memory-proactive-brief" + } + }, + "created_at": "2026-06-16T05:03:00Z" + }, + { + "evidence_id": "resume-stale-validation", + "kind": "note", + "text": "Stale handoff: only run the work_resume smoke and skip proactive brief scoring.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resume_work_brief", + "evidence_id": "resume-stale-validation" + } + }, + "created_at": "2026-06-15T05:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Resume brief: stay on y/elf-xy-953, finish proactive brief fixture/scoring work, and validate with cargo make real-world-memory-proactive-brief plus targeted elf-eval tests.", + "claims": [ + { + "claim_id": "resume_current_handoff", + "text": "The current resume point is branch y/elf-xy-953 with proactive brief fixture and scoring work.", + "evidence_ids": ["resume-current-handoff"], + "confidence": "high" + }, + { + "claim_id": "resume_validation", + "text": "The validation plan includes cargo make real-world-memory-proactive-brief.", + "evidence_ids": ["resume-current-validation"], + "confidence": "high" + } + ], + "evidence_ids": ["resume-current-handoff", "resume-current-validation"], + "proactive_briefs": [ + { + "brief_id": "brief-resume-work-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T05:30:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "resume_work", + "suggestions": [ + { + "suggestion_id": "resume-continue-proactive-brief", + "suggestion_kind": "resume_work", + "title": "Continue proactive brief scoring", + "body": "Continue the XY-953 fixture and runner scoring work on y/elf-xy-953, then run the proactive brief benchmark command.", + "evidence_refs": ["resume-current-handoff", "resume-current-validation"], + "freshness": { + "status": "current", + "observed_at": "2026-06-16T05:03:00Z", + "valid_from": "2026-06-16T05:00:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-16T05:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_HANDOFF", + "reason": "The current handoff and validation plan agree on the same proactive brief work." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "resume-current-handoff", + "status": "current", + "reason": "Current work handoff." + }, + { + "evidence_id": "resume-current-validation", + "status": "current", + "reason": "Current validation command." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "resume-stale-validation", + "status": "stale", + "reason": "The proactive brief lane now has a direct command." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "resume-handoff-recorded", + "ts": "2026-06-16T05:00:00Z", + "actor": "agent", + "action": "recorded_handoff", + "evidence_ids": ["resume-current-handoff"], + "summary": "The current handoff pointed at proactive brief scoring." + } + ], + "prompt": { + "role": "user", + "content": "Generate a resume-work brief that identifies the current next action and validation command.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_currentness", "include_action_rationale"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_current_handoff", + "text": "The current resume point is branch y/elf-xy-953 with proactive brief fixture and scoring work." + }, + { + "claim_id": "resume_validation", + "text": "The validation plan includes cargo make real-world-memory-proactive-brief." + } + ], + "must_not_include": ["skip proactive brief scoring"], + "evidence_links": { + "resume_current_handoff": ["resume-current-handoff"], + "resume_validation": ["resume-current-validation"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "resume-current-handoff", + "claim_id": "resume_current_handoff", + "requirement": "cite", + "quote": "proactive brief fixtures and scoring" + }, + { + "evidence_id": "resume-current-validation", + "claim_id": "resume_validation", + "requirement": "cite", + "quote": "cargo make real-world-memory-proactive-brief" + } + ], + "negative_traps": [ + { + "trap_id": "resume-stale-validation-trap", + "type": "stale_fact", + "evidence_ids": ["resume-stale-validation"], + "failure_if_used": true + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["resume_work"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief identifies the current handoff and validation command." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The resume suggestion carries evidence refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The stale validation trap is not used." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The brief gives a concrete resume action." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Currentness markers keep stale handoff content out." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current handoff evidence is available."], + "fallback_action": "defer_resume_brief" + }, + "tags": ["synthetic", "proactive_brief", "resume_work", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json new file mode 100644 index 00000000..5cb30dc6 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_decision_audit.json @@ -0,0 +1,218 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-stale-decision-audit-001", + "suite": "proactive_brief", + "title": "Warn about a stale project decision before suggesting work", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-decision-old-gate", + "kind": "decision", + "text": "Historical decision: use only cargo make real-world-job-operator-ux to evaluate proactive readiness.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "stale-decision-old-gate" + } + }, + "created_at": "2026-06-15T09:00:00Z" + }, + { + "evidence_id": "stale-decision-new-gate", + "kind": "decision", + "text": "Current decision: proactive readiness must use the direct real-world-memory-proactive-brief suite before any proactive brief pass claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "stale-decision-new-gate" + }, + "locator": { + "quote": "direct real-world-memory-proactive-brief suite" + } + }, + "created_at": "2026-06-16T05:40:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Stale decision audit: defer the old operator-ux-only readiness decision and use the direct real-world-memory-proactive-brief suite for any proactive pass claim.", + "claims": [ + { + "claim_id": "stale_decision_replaced", + "text": "The operator-ux-only proactive readiness decision is superseded by the direct proactive brief suite.", + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "confidence": "high" + } + ], + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "proactive_briefs": [ + { + "brief_id": "brief-stale-decision-audit-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T05:45:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "stale_decision_audit", + "suggestions": [ + { + "suggestion_id": "audit-old-operator-ux-only-gate", + "suggestion_kind": "stale_decision_audit", + "title": "Defer the old operator-ux-only readiness gate", + "body": "Do not use the old operator-ux-only decision as current readiness evidence; it is superseded by the direct proactive brief suite.", + "evidence_refs": ["stale-decision-old-gate", "stale-decision-new-gate"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-16T05:40:00Z", + "valid_from": "2026-06-15T09:00:00Z", + "valid_to": "2026-06-16T05:40:00Z", + "last_confirmed_at": "2026-06-16T05:45:00Z", + "superseded_by": ["stale-decision-new-gate"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_DECISION", + "reason": "The old decision is retained as history and must not be used as the current proactive-readiness gate." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "stale-decision-new-gate", + "status": "current", + "reason": "Current proactive-readiness gate." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "stale-decision-old-gate", + "status": "superseded", + "reason": "Replaced by the direct proactive brief suite.", + "superseded_by": "stale-decision-new-gate" + } + ], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "stale-decision-replaced", + "ts": "2026-06-16T05:40:00Z", + "actor": "agent", + "action": "superseded_decision", + "evidence_ids": ["stale-decision-old-gate", "stale-decision-new-gate"], + "summary": "The direct proactive brief suite superseded the old operator-ux-only readiness gate." + } + ], + "prompt": { + "role": "user", + "content": "Audit stale project decisions before generating proactive suggestions.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_currentness", "include_defer_reason"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_decision_replaced", + "text": "The operator-ux-only proactive readiness decision is superseded by the direct proactive brief suite." + } + ], + "must_not_include": ["use only cargo make real-world-job-operator-ux to evaluate proactive readiness"], + "evidence_links": { + "stale_decision_replaced": ["stale-decision-old-gate", "stale-decision-new-gate"] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stale-decision-old-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite", + "quote": "Historical decision" + }, + { + "evidence_id": "stale-decision-new-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite", + "quote": "direct real-world-memory-proactive-brief suite" + } + ], + "negative_traps": [ + { + "trap_id": "stale-decision-current-trap", + "type": "stale_fact", + "evidence_ids": ["stale-decision-old-gate"], + "failure_if_used": false + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["stale_decision_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief identifies the superseded decision." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The stale-decision warning cites old and new evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The old decision is not presented as current." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The warning gives a defer reason." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Supersession markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No superseding decision is available."], + "fallback_action": "defer_stale_decision" + }, + "tags": ["synthetic", "proactive_brief", "stale_decision_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json new file mode 100644 index 00000000..11338f90 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/proactive_brief/stale_plan_preference_warning.json @@ -0,0 +1,316 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "proactive-stale-plan-preference-warning-001", + "suite": "proactive_brief", + "title": "Reject stale plan and preference suggestions after TTL invalidation", + "corpus": { + "corpus_id": "real-world-memory-proactive-brief-2026-06-16", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stale-plan-old", + "kind": "plan", + "text": "Old plan: publish the proactive brief report without running the new proactive brief benchmark.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "stale-plan-old" + } + }, + "created_at": "2026-06-15T12:00:00Z" + }, + { + "evidence_id": "stale-plan-ttl", + "kind": "trace", + "text": "TTL invalidation: the old proactive report publishing plan expired at 2026-06-16T00:00:00Z and must not be recommended as current work.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "stale-plan-ttl" + }, + "locator": { + "quote": "must not be recommended as current work" + } + }, + "created_at": "2026-06-16T00:00:00Z" + }, + { + "evidence_id": "current-plan-run-gate", + "kind": "plan", + "text": "Current plan: run the proactive brief benchmark command and targeted tests before publishing the report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "current-plan-run-gate" + }, + "locator": { + "quote": "run the proactive brief benchmark command" + } + }, + "created_at": "2026-06-16T06:00:00Z" + }, + { + "evidence_id": "old-preference-long-brief", + "kind": "preference", + "text": "Historical preference: write long proactive briefs with broad product comparisons.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "old-preference-long-brief" + } + }, + "created_at": "2026-06-15T12:05:00Z" + }, + { + "evidence_id": "current-preference-concise-brief", + "kind": "preference", + "text": "Current preference: proactive briefs should be concise, evidence-linked, and avoid broad hosted-product parity claims.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_plan_preference_warning", + "evidence_id": "current-preference-concise-brief" + }, + "locator": { + "quote": "concise, evidence-linked" + } + }, + "created_at": "2026-06-16T06:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_proactive_brief", + "answer": { + "content": "Stale plan/preference warning: reject the expired publish-first plan, use the current run-gate plan, and prefer concise evidence-linked briefs without broad hosted-product parity claims.", + "claims": [ + { + "claim_id": "stale_plan_rejected", + "text": "The expired publish-first plan must not be recommended as current work.", + "evidence_ids": ["stale-plan-ttl"], + "confidence": "high" + }, + { + "claim_id": "current_preference_concise", + "text": "The current brief preference is concise and evidence-linked.", + "evidence_ids": ["current-preference-concise-brief"], + "confidence": "high" + } + ], + "evidence_ids": ["stale-plan-ttl", "current-plan-run-gate", "current-preference-concise-brief"], + "proactive_briefs": [ + { + "brief_id": "brief-stale-plan-preference-2026-06-16", + "contract_schema": "elf.proactive_project_brief/v1", + "generated_at": "2026-06-16T06:10:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-953-fixture-agent", + "read_profile": "private_plus_project", + "brief_kind": "stale_plan_preference_warning", + "suggestions": [ + { + "suggestion_id": "reject-expired-publish-first-plan", + "suggestion_kind": "stale_plan_preference_warning", + "title": "Reject the expired publish-first plan", + "body": "Do not publish the proactive report before running the new proactive brief benchmark; the old plan expired under TTL.", + "evidence_refs": ["stale-plan-old", "stale-plan-ttl", "current-plan-run-gate"], + "freshness": { + "status": "tombstoned", + "observed_at": "2026-06-16T00:00:00Z", + "valid_from": "2026-06-15T12:00:00Z", + "valid_to": "2026-06-16T00:00:00Z", + "last_confirmed_at": "2026-06-16T06:10:00Z", + "superseded_by": ["current-plan-run-gate"], + "tombstone_refs": ["stale-plan-ttl"] + }, + "action": { + "decision": "reject", + "reason_code": "REJECT_TTL_INVALIDATED_PLAN", + "reason": "The old publish-first plan has explicit TTL invalidation and a current replacement plan exists." + }, + "unsupported_claim_flags": [] + }, + { + "suggestion_id": "defer-long-comparison-preference", + "suggestion_kind": "stale_plan_preference_warning", + "title": "Defer long product-comparison prose", + "body": "Use concise evidence-linked proactive briefs and avoid broad hosted-product parity claims.", + "evidence_refs": ["old-preference-long-brief", "current-preference-concise-brief"], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-16T06:02:00Z", + "valid_from": "2026-06-15T12:05:00Z", + "valid_to": "2026-06-16T06:02:00Z", + "last_confirmed_at": "2026-06-16T06:10:00Z", + "superseded_by": ["current-preference-concise-brief"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_PREFERENCE", + "reason": "The old long-comparison preference is superseded by a concise evidence-linked preference." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "current-plan-run-gate", + "status": "current", + "reason": "Replacement current plan." + }, + { + "evidence_id": "current-preference-concise-brief", + "status": "current", + "reason": "Replacement current preference." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "old-preference-long-brief", + "status": "superseded", + "reason": "Replaced by concise evidence-linked preference.", + "superseded_by": "current-preference-concise-brief" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "stale-plan-ttl", + "status": "tombstoned", + "reason": "TTL invalidation for old publish-first plan." + } + ], + "unsupported_claim_flags": [] + } + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "stale-plan-ttl-recorded", + "ts": "2026-06-16T00:00:00Z", + "actor": "agent", + "action": "ttl_invalidated_plan", + "evidence_ids": ["stale-plan-old", "stale-plan-ttl"], + "summary": "The old publish-first plan expired and must not be current work." + } + ], + "prompt": { + "role": "user", + "content": "Warn me about stale plans or preferences before making proactive suggestions.", + "job_mode": "proactive_brief", + "constraints": ["cite_evidence", "mark_tombstones", "include_reject_or_defer_reason"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stale_plan_rejected", + "text": "The expired publish-first plan must not be recommended as current work." + }, + { + "claim_id": "current_preference_concise", + "text": "The current brief preference is concise and evidence-linked." + } + ], + "must_not_include": [ + "publish the proactive brief report without running the new proactive brief benchmark", + "write long proactive briefs with broad product comparisons" + ], + "evidence_links": { + "stale_plan_rejected": ["stale-plan-old", "stale-plan-ttl", "current-plan-run-gate"], + "current_preference_concise": [ + "old-preference-long-brief", + "current-preference-concise-brief" + ] + }, + "answer_type": "proactive_project_brief", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stale-plan-ttl", + "claim_id": "stale_plan_rejected", + "requirement": "cite", + "quote": "must not be recommended as current work" + }, + { + "evidence_id": "current-preference-concise-brief", + "claim_id": "current_preference_concise", + "requirement": "cite", + "quote": "concise, evidence-linked" + } + ], + "negative_traps": [ + { + "trap_id": "ttl-plan-current-trap", + "type": "stale_fact", + "evidence_ids": ["stale-plan-old"], + "failure_if_used": false + } + ], + "proactive_brief": { + "required_suggestion_kinds": ["stale_plan_preference_warning"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Brief rejects the expired plan and names current preference." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Every stale warning carries source refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "TTL-invalidated content is not current." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The warning gives reject and defer rationale." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "TTL tombstone and supersession markers are preserved." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No TTL invalidation evidence is available."], + "fallback_action": "defer_stale_plan_warning" + }, + "tags": ["synthetic", "proactive_brief", "stale_plan_preference_warning", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json new file mode 100644 index 00000000..687419fe --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/backup_restore_cold_start_readback.json @@ -0,0 +1,232 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-restore-cold-start-001", + "suite": "production_ops", + "title": "Read back restored memory after Docker cold start and Qdrant rebuild", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "restore-search-before", + "kind": "trace", + "text": "Before restore, search returned one result for key single_user_restore_probe with trace 535e49be-250f-483c-8845-b4116e591dac.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-search-before" + }, + "locator": { + "quote": "search returned one result for key single_user_restore_probe" + } + }, + "created_at": "2026-06-09T10:00:00Z" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "kind": "trace", + "text": "After restoring Postgres backup, Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-qdrant-rebuild" + }, + "locator": { + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + }, + "created_at": "2026-06-09T10:03:00Z" + }, + { + "evidence_id": "restore-search-after", + "kind": "trace", + "text": "After cold start and rebuild, search returned one result for key single_user_restore_probe with trace e995263d-8f0-4472-9a32-354d5cceed33.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "restore-search-after" + }, + "locator": { + "quote": "After cold start and rebuild, search returned one result" + } + }, + "created_at": "2026-06-09T10:05:00Z" + }, + { + "evidence_id": "qdrant-authority-decoy", + "kind": "adapter_state", + "text": "Decoy: a Qdrant backup alone is the source of truth, so Postgres restore evidence is optional.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "backup_restore_cold_start_readback", + "evidence_id": "qdrant-authority-decoy" + } + }, + "created_at": "2026-06-09T09:50:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The restore proof recovered key single_user_restore_probe after a Docker cold start. Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search after cold start returned one result for the restored key.", + "claims": [ + { + "claim_id": "restore_recovered_key", + "text": "The restore proof recovered key single_user_restore_probe after a Docker cold start.", + "evidence_ids": ["restore-search-before", "restore-search-after"], + "confidence": "high" + }, + { + "claim_id": "qdrant_rebuild_counts", + "text": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0.", + "evidence_ids": ["restore-qdrant-rebuild"], + "confidence": "high" + }, + { + "claim_id": "cold_start_readback", + "text": "Search after cold start returned one result for the restored key.", + "evidence_ids": ["restore-search-after"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-search-before", "restore-qdrant-rebuild", "restore-search-after"], + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "pre-restore-search", + "ts": "2026-06-09T10:00:00Z", + "actor": "tool", + "action": "searched_before_restore", + "evidence_ids": ["restore-search-before"], + "summary": "The proof captured the searchable key before restore." + }, + { + "event_id": "post-restore-rebuild", + "ts": "2026-06-09T10:03:00Z", + "actor": "tool", + "action": "rebuilt_qdrant_from_postgres_vectors", + "evidence_ids": ["restore-qdrant-rebuild"], + "summary": "Qdrant was rebuilt from Postgres-held vectors." + }, + { + "event_id": "post-cold-start-search", + "ts": "2026-06-09T10:05:00Z", + "actor": "tool", + "action": "searched_after_cold_start", + "evidence_ids": ["restore-search-after"], + "summary": "The restored key was searchable after the cold-start path." + } + ], + "prompt": { + "role": "user", + "content": "What evidence shows backup restore and cold-start readback worked?", + "job_mode": "operate", + "constraints": ["cite_evidence", "do_not_treat_qdrant_as_source_of_truth"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "restore_recovered_key", + "text": "The restore proof recovered key single_user_restore_probe after a Docker cold start." + }, + { + "claim_id": "qdrant_rebuild_counts", + "text": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0." + }, + { + "claim_id": "cold_start_readback", + "text": "Search after cold start returned one result for the restored key." + } + ], + "must_not_include": ["Qdrant backup alone is the source of truth"], + "evidence_links": { + "restore_recovered_key": ["restore-search-before", "restore-search-after"], + "qdrant_rebuild_counts": ["restore-qdrant-rebuild"], + "cold_start_readback": ["restore-search-after"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-search-before", + "claim_id": "restore_recovered_key", + "requirement": "cite", + "quote": "search returned one result for key single_user_restore_probe" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "claim_id": "qdrant_rebuild_counts", + "requirement": "cite", + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + }, + { + "evidence_id": "restore-search-after", + "claim_id": "cold_start_readback", + "requirement": "cite", + "quote": "After cold start and rebuild, search returned one result" + } + ], + "negative_traps": [ + { + "trap_id": "qdrant-source-of-truth-decoy", + "type": "decoy_evidence", + "evidence_ids": ["qdrant-authority-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Explains backup restore, cold start, and rebuild behavior." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites pre-restore, rebuild, and post-restore readback evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not treat Qdrant as authoritative." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "States what the operator can rely on." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "production_ops", "restore", "cold_start", "qdrant_rebuild", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json new file mode 100644 index 00000000..5ff0912d --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/cold_start_missing_dependency_incomplete.json @@ -0,0 +1,275 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-cold-start-dependency-001", + "suite": "production_ops", + "title": "Report pinned OpenViking cold-start path reaching behavioral wrong-result", + "encoding": {}, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "pinned-local-embed-runtime-reached", + "kind": "adapter_state", + "text": "The pinned OpenViking Docker local embedding path installed and imported llama-cpp-python==0.3.28, then reached OpenViking add_resource/find in the baseline runner.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "pinned-local-embed-runtime-reached" + }, + "locator": { + "quote": "llama_cpp_import_ok 0.3.28" + } + }, + "created_at": "2026-06-10T08:38:58Z" + }, + { + "evidence_id": "pinned-local-embed-retry", + "kind": "runbook", + "text": "The Docker retry path is ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker; the runner installs llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu with --only-binary llama-cpp-python before OpenViking add_resource/find.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "pinned-local-embed-retry" + }, + "locator": { + "quote": "llama-cpp-python==0.3.28" + } + }, + "created_at": "2026-06-10T00:00:00Z" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "kind": "adapter_state", + "text": "OpenViking now records status=wrong_result and retrieval_status=retrieval_wrong_result because add_resource/find returned 0 of 3 expected evidence-term matches after the pinned local embedding setup succeeded.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "openviking-wrong-result-behavior" + }, + "locator": { + "quote": "status=wrong_result" + } + }, + "created_at": "2026-06-10T08:38:58Z" + }, + { + "evidence_id": "typed-incomplete-policy", + "kind": "runbook", + "text": "Use incomplete when install, import, build, adapter wiring, native dependency support, or local runtime setup failed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "typed-incomplete-policy" + }, + "locator": { + "quote": "Use incomplete when install, import, build" + } + }, + "created_at": "2026-06-09T08:40:00Z" + }, + { + "evidence_id": "dependency-pass-decoy", + "kind": "adapter_state", + "text": "Decoy: the pinned OpenViking run should be reported as pass because the dependency installed even though retrieval missed expected evidence terms.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "cold_start_missing_dependency_incomplete", + "evidence_id": "dependency-pass-decoy" + } + }, + "created_at": "2026-06-09T08:30:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The pinned OpenViking Docker local embedding path reached add_resource/find. OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms. If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", + "claims": [ + { + "claim_id": "pinned_openviking_runtime_reached", + "text": "The pinned OpenViking Docker local embedding path reached add_resource/find.", + "evidence_ids": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry" + ], + "confidence": "high" + }, + { + "claim_id": "openviking_wrong_result_after_runtime", + "text": "OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms.", + "evidence_ids": ["openviking-wrong-result-behavior"], + "confidence": "high" + }, + { + "claim_id": "setup_failure_stays_incomplete", + "text": "If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", + "evidence_ids": ["typed-incomplete-policy"], + "confidence": "high" + } + ], + "evidence_ids": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry", + "openviking-wrong-result-behavior", + "typed-incomplete-policy" + ], + "latency_ms": 1.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "pinned-local-embed-runtime-reached", + "ts": "2026-06-10T08:38:58Z", + "actor": "tool", + "action": "reached_behavior_check", + "evidence_ids": ["pinned-local-embed-runtime-reached"], + "summary": "The pinned local embedding dependency installed and imported, and OpenViking add_resource/find executed." + }, + { + "event_id": "pinned-local-embed-retry-recorded", + "ts": "2026-06-10T00:00:00Z", + "actor": "agent", + "action": "recorded_retry_path", + "evidence_ids": ["pinned-local-embed-retry"], + "summary": "The fixture records the Docker-local pinned llama-cpp-python retry command and wheel index." + }, + { + "event_id": "openviking-wrong-result-recorded", + "ts": "2026-06-10T08:38:58Z", + "actor": "tool", + "action": "classified_behavior", + "evidence_ids": ["openviking-wrong-result-behavior"], + "summary": "The OpenViking adapter reached retrieval behavior and missed all expected evidence-term checks." + }, + { + "event_id": "typed-incomplete-retained", + "ts": "2026-06-09T08:40:00Z", + "actor": "agent", + "action": "classified_failure", + "evidence_ids": ["typed-incomplete-policy"], + "summary": "The report kept dependency failures as incomplete typed states." + } + ], + "prompt": { + "role": "user", + "content": "How should the production-ops suite classify the OpenViking cold-start local embedding path after the pinned Docker retry reaches add_resource/find but misses expected evidence?", + "job_mode": "operate", + "constraints": ["cite_evidence", "preserve_typed_status", "do_not_claim_pass"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "pinned_openviking_runtime_reached", + "text": "The pinned OpenViking Docker local embedding path reached add_resource/find." + }, + { + "claim_id": "openviking_wrong_result_after_runtime", + "text": "OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms." + }, + { + "claim_id": "setup_failure_stays_incomplete", + "text": "If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass." + } + ], + "must_not_include": ["reported as pass", "dependency failure is incomplete, not pass"], + "evidence_links": { + "pinned_openviking_runtime_reached": [ + "pinned-local-embed-runtime-reached", + "pinned-local-embed-retry" + ], + "openviking_wrong_result_after_runtime": ["openviking-wrong-result-behavior"], + "setup_failure_stays_incomplete": ["typed-incomplete-policy"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "pinned-local-embed-runtime-reached", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite", + "quote": "installed and imported llama-cpp-python==0.3.28" + }, + { + "evidence_id": "pinned-local-embed-retry", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite", + "quote": "llama-cpp-python==0.3.28" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "claim_id": "openviking_wrong_result_after_runtime", + "requirement": "cite", + "quote": "status=wrong_result" + }, + { + "evidence_id": "typed-incomplete-policy", + "claim_id": "setup_failure_stays_incomplete", + "requirement": "cite", + "quote": "Use incomplete when install, import, build" + } + ], + "negative_traps": [ + { + "trap_id": "dependency-pass-decoy", + "type": "unsupported_prior", + "evidence_ids": ["dependency-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Distinguishes dependency setup reaching runtime from the remaining behavioral retrieval result." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the pinned runtime success, wrong-result behavior, and typed-incomplete fallback policy." + }, + "uncertainty_handling": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "States that setup failure would remain incomplete, but the current reached-runtime result is wrong_result." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids dependency-pass decoy." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["wrong_result/retrieval_wrong_result"], + "fallback_action": "state_current_wrong_result" + }, + "tags": ["external_adapter", "production_ops", "cold_start", "dependency_boundary", "wrong_result", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json new file mode 100644 index 00000000..f0a46864 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/credential_boundary_provider_blocked.json @@ -0,0 +1,199 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-credential-boundary-001", + "suite": "production_ops", + "title": "Keep provider credential requirement blocked without committing secrets", + "encoding": { + "status": "blocked", + "reason": "Provider-backed production operations require operator-owned credentials; checked-in fixtures must not include or require secrets.", + "follow_up": { + "title": "Run provider-backed production-ops gate with routed operator credentials", + "reason": "Credential-bound checks need an operator shell with provider environment variables; fixture reports can only encode the boundary." + } + }, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "external_adapter", + "items": [ + { + "evidence_id": "provider-credential-boundary", + "kind": "runbook", + "text": "Provider-backed production runs require embedding API credentials supplied through environment variables; reports record provider id, model, dimensions, timeout, API base, and path, but never record the API key.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "provider-credential-boundary" + }, + "locator": { + "quote": "never record the API key" + } + }, + "created_at": "2026-06-09T08:10:00Z" + }, + { + "evidence_id": "checked-in-secret-boundary", + "kind": "decision", + "text": "Checked-in production-ops fixtures must not require user secrets or run private corpus data.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "checked-in-secret-boundary" + }, + "locator": { + "quote": "must not require user secrets" + } + }, + "created_at": "2026-06-09T08:12:00Z" + }, + { + "evidence_id": "secret-commit-decoy", + "kind": "message", + "text": "Decoy: commit a provider API key into the fixture so the production-ops report can pass in CI.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "credential_boundary_provider_blocked", + "evidence_id": "secret-commit-decoy" + } + }, + "created_at": "2026-06-09T08:09:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "Do not commit or require provider secrets in checked-in fixtures. Provider-backed production-ops checks are blocked until operator credentials are supplied, and reports may record provider metadata but never the API key.", + "claims": [ + { + "claim_id": "provider_credentials_blocked", + "text": "Provider-backed production-ops checks are blocked until operator credentials are supplied.", + "evidence_ids": ["provider-credential-boundary"], + "confidence": "high" + }, + { + "claim_id": "no_checked_in_secrets", + "text": "Checked-in production-ops fixtures must not require user secrets.", + "evidence_ids": ["checked-in-secret-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["provider-credential-boundary", "checked-in-secret-boundary"], + "latency_ms": 1.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "provider-credential-boundary-recorded", + "ts": "2026-06-09T08:10:00Z", + "actor": "agent", + "action": "recorded_credential_boundary", + "evidence_ids": ["provider-credential-boundary"], + "summary": "The provider path requires environment credentials and redacts API keys from reports." + }, + { + "event_id": "checked-in-secret-boundary-recorded", + "ts": "2026-06-09T08:12:00Z", + "actor": "agent", + "action": "recorded_secret_boundary", + "evidence_ids": ["checked-in-secret-boundary"], + "summary": "Checked-in production-ops fixtures do not require or contain secrets." + } + ], + "prompt": { + "role": "user", + "content": "Should the production-ops suite use checked-in provider credentials to prove provider-backed behavior?", + "job_mode": "operate", + "constraints": ["cite_evidence", "avoid_secret_leakage", "state_blockers"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "provider_credentials_blocked", + "text": "Provider-backed production-ops checks are blocked until operator credentials are supplied." + }, + { + "claim_id": "no_checked_in_secrets", + "text": "Checked-in production-ops fixtures must not require user secrets." + } + ], + "must_not_include": ["commit a provider API key"], + "evidence_links": { + "provider_credentials_blocked": ["provider-credential-boundary"], + "no_checked_in_secrets": ["checked-in-secret-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "provider-credential-boundary", + "claim_id": "provider_credentials_blocked", + "requirement": "cite", + "quote": "never record the API key" + }, + { + "evidence_id": "checked-in-secret-boundary", + "claim_id": "no_checked_in_secrets", + "requirement": "cite", + "quote": "must not require user secrets" + } + ], + "negative_traps": [ + { + "trap_id": "commit-provider-key-decoy", + "type": "privacy_leak", + "evidence_ids": ["secret-commit-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States that credential-bound execution is blocked." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites provider credential and checked-in secret boundaries." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids credential leakage and checked-in secret traps." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Identifies the operator-owned credential follow-up." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "unsafe live/destructive action when the prompt forbids it" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["blocked until operator credentials are supplied", "must not require user secrets"], + "fallback_action": "state_blocker" + }, + "tags": ["external_adapter", "production_ops", "credential_boundary", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json new file mode 100644 index 00000000..e858e702 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/interrupted_import_resume_checkpoint.json @@ -0,0 +1,204 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-backfill-resume-001", + "suite": "production_ops", + "title": "Resume interrupted generated backfill from checkpoint without duplicate source notes", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "generated_public", + "items": [ + { + "evidence_id": "backfill-checkpoint-state", + "kind": "trace", + "text": "Backfill report live-baseline-20260609092144 completed 2000 of 2000 documents, resumed from checkpoint offset 1000 to 2000, and found zero duplicate source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-checkpoint-state" + }, + "locator": { + "quote": "resumed from checkpoint offset 1000 to 2000" + } + }, + "created_at": "2026-06-09T09:21:44Z" + }, + { + "evidence_id": "backfill-clean-compare", + "kind": "trace", + "text": "Clean comparison matched all 16 of 16 query results after the resumed import.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-clean-compare" + }, + "locator": { + "quote": "matched all 16 of 16 query results" + } + }, + "created_at": "2026-06-09T09:22:30Z" + }, + { + "evidence_id": "backfill-restart-decoy", + "kind": "adapter_state", + "text": "Decoy: interrupted imports must restart from zero because the checkpoint duplicated source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "interrupted_import_resume_checkpoint", + "evidence_id": "backfill-restart-decoy" + } + }, + "created_at": "2026-06-09T09:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill. The resumed backfill found zero duplicate source notes, and search quality after resume matched the clean run for all 16 queries.", + "claims": [ + { + "claim_id": "resume_checkpoint", + "text": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill.", + "evidence_ids": ["backfill-checkpoint-state"], + "confidence": "high" + }, + { + "claim_id": "no_duplicate_sources", + "text": "The resumed backfill found zero duplicate source notes.", + "evidence_ids": ["backfill-checkpoint-state"], + "confidence": "high" + }, + { + "claim_id": "clean_compare_matched", + "text": "Search quality after resume matched the clean run for all 16 queries.", + "evidence_ids": ["backfill-clean-compare"], + "confidence": "high" + } + ], + "evidence_ids": ["backfill-checkpoint-state", "backfill-clean-compare"], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "backfill-interrupted", + "ts": "2026-06-09T09:21:44Z", + "actor": "tool", + "action": "interrupted_backfill", + "evidence_ids": ["backfill-checkpoint-state"], + "summary": "The generated public backfill was interrupted at the checkpoint boundary." + }, + { + "event_id": "backfill-resumed", + "ts": "2026-06-09T09:22:30Z", + "actor": "tool", + "action": "resumed_backfill", + "evidence_ids": ["backfill-checkpoint-state", "backfill-clean-compare"], + "summary": "The resumed import completed without duplicate source notes and matched a clean comparison." + } + ], + "prompt": { + "role": "user", + "content": "What does the production-ops fixture prove about interrupted backfill resume behavior?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_checkpoint", "avoid_restarting_completed_work"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_checkpoint", + "text": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill." + }, + { + "claim_id": "no_duplicate_sources", + "text": "The resumed backfill found zero duplicate source notes." + }, + { + "claim_id": "clean_compare_matched", + "text": "Search quality after resume matched the clean run for all 16 queries." + } + ], + "must_not_include": [ + "interrupted imports must restart from zero", + "the checkpoint duplicated source notes" + ], + "evidence_links": { + "resume_checkpoint": ["backfill-checkpoint-state"], + "no_duplicate_sources": ["backfill-checkpoint-state"], + "clean_compare_matched": ["backfill-clean-compare"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "backfill-checkpoint-state", + "claim_id": "resume_checkpoint", + "requirement": "cite", + "quote": "resumed from checkpoint offset 1000 to 2000" + }, + { + "evidence_id": "backfill-clean-compare", + "claim_id": "clean_compare_matched", + "requirement": "cite", + "quote": "matched all 16 of 16 query results" + } + ], + "negative_traps": [ + { + "trap_id": "restart-from-zero-decoy", + "type": "decoy_evidence", + "evidence_ids": ["backfill-restart-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Uses checkpoint resume and duplicate-source evidence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites checkpoint and clean-comparison artifacts." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not restart completed work or cite duplicate-source decoys." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Answers with the next operational interpretation." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["generated_public", "production_ops", "backfill_resume", "checkpoint", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json new file mode 100644 index 00000000..0c45443c --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/private_manifest_absence_blocked.json @@ -0,0 +1,198 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-private-manifest-blocked-001", + "suite": "production_ops", + "title": "Report missing private production manifest as bounded blocked caveat", + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed.", + "follow_up": { + "title": "Supply an operator-owned private production corpus manifest", + "reason": "A real private-corpus pass requires a sanitized local manifest supplied outside checked-in fixtures." + } + }, + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "private-manifest-guard", + "kind": "runbook", + "text": "Private manifest guard: cargo make baseline-production-private failed closed because ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset; no private-corpus pass is claimed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-manifest-guard" + }, + "locator": { + "quote": "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "private-bounded-failure-policy", + "kind": "decision", + "text": "If no operator-owned private manifest is supplied, the private-corpus path is a bounded failure, not a pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-bounded-failure-policy" + }, + "locator": { + "quote": "bounded failure, not a pass" + } + }, + "created_at": "2026-06-09T08:05:00Z" + }, + { + "evidence_id": "private-pass-decoy", + "kind": "decision", + "text": "Decoy: the absence of private data means the private production corpus benchmark passed with zero findings.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_manifest_absence_blocked", + "evidence_id": "private-pass-decoy" + } + }, + "created_at": "2026-06-09T07:55:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "No private-corpus pass is claimed. The private production corpus path is blocked until an operator supplies a sanitized manifest, and the current evidence is a bounded failure, not a pass.", + "claims": [ + { + "claim_id": "private_manifest_blocked", + "text": "No private-corpus pass is claimed.", + "evidence_ids": ["private-manifest-guard"], + "confidence": "high" + }, + { + "claim_id": "private_bounded_failure", + "text": "The missing private manifest is a bounded failure, not a pass.", + "evidence_ids": ["private-bounded-failure-policy"], + "confidence": "high" + } + ], + "evidence_ids": ["private-manifest-guard", "private-bounded-failure-policy"], + "latency_ms": 1.6, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "private-manifest-missing", + "ts": "2026-06-09T08:00:00Z", + "actor": "tool", + "action": "failed_closed_manifest_guard", + "evidence_ids": ["private-manifest-guard"], + "summary": "The private-corpus task stopped at the missing manifest guard." + }, + { + "event_id": "bounded-private-caveat-recorded", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "recorded_bounded_caveat", + "evidence_ids": ["private-bounded-failure-policy"], + "summary": "The report kept the missing private corpus as a bounded non-pass state." + } + ], + "prompt": { + "role": "user", + "content": "Can this real-world suite claim private-corpus production-ops coverage without my private manifest?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_blockers", "do_not_run_private_data"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "private_manifest_blocked", + "text": "No private-corpus pass is claimed." + }, + { + "claim_id": "private_bounded_failure", + "text": "The missing private manifest is a bounded failure, not a pass." + } + ], + "must_not_include": ["private production corpus benchmark passed"], + "evidence_links": { + "private_manifest_blocked": ["private-manifest-guard"], + "private_bounded_failure": ["private-bounded-failure-policy"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "private-manifest-guard", + "claim_id": "private_manifest_blocked", + "requirement": "cite", + "quote": "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST was unset" + }, + { + "evidence_id": "private-bounded-failure-policy", + "claim_id": "private_bounded_failure", + "requirement": "cite", + "quote": "bounded failure, not a pass" + } + ], + "negative_traps": [ + { + "trap_id": "private-zero-findings-pass-decoy", + "type": "unsupported_prior", + "evidence_ids": ["private-pass-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "uncertainty_handling": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the private manifest blocker instead of claiming pass." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites manifest guard and bounded-failure policy." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids zero-findings pass decoy." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Names the operator-owned manifest follow-up." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No private-corpus pass is claimed", "bounded failure, not a pass"], + "fallback_action": "state_blocker" + }, + "tags": ["private_corpus", "production_ops", "manifest_guard", "blocked", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json new file mode 100644 index 00000000..0f4a23c9 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/resource_envelope_budget.json @@ -0,0 +1,194 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-resource-envelope-001", + "suite": "production_ops", + "title": "Report generated backfill resource envelope and operator planning caveat", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-10", + "profile": "generated_public", + "items": [ + { + "evidence_id": "resource-envelope-check", + "kind": "trace", + "text": "Resource envelope check measured 2793.629 seconds against a 3600-second limit and 167652 KB RSS against a 1500000 KB limit.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + "evidence_id": "resource-envelope-check" + }, + "locator": { + "quote": "2793.629 seconds against a 3600-second limit" + } + }, + "created_at": "2026-06-09T09:30:00Z" + }, + { + "evidence_id": "large-import-planning-caveat", + "kind": "runbook", + "text": "Large imports should be planned as batch jobs, not interactive operations.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + "evidence_id": "large-import-planning-caveat" + }, + "locator": { + "quote": "planned as batch jobs" + } + }, + "created_at": "2026-06-09T09:35:00Z" + }, + { + "evidence_id": "interactive-import-decoy", + "kind": "decision", + "text": "Decoy: the 2000 document provider backfill is small enough to treat as an interactive operation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "resource_envelope_budget", + "evidence_id": "interactive-import-decoy" + } + }, + "created_at": "2026-06-09T09:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The resource envelope passed: 2793.629 seconds was within the 3600-second limit, and 167652 KB RSS was within the 1500000 KB limit. Large imports should be planned as batch jobs, not interactive operations.", + "claims": [ + { + "claim_id": "resource_envelope_passed", + "text": "The resource envelope passed within the elapsed-time and RSS limits.", + "evidence_ids": ["resource-envelope-check"], + "confidence": "high" + }, + { + "claim_id": "large_import_batch_caveat", + "text": "Large imports should be planned as batch jobs, not interactive operations.", + "evidence_ids": ["large-import-planning-caveat"], + "confidence": "high" + } + ], + "evidence_ids": ["resource-envelope-check", "large-import-planning-caveat"], + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "resource-envelope-measured", + "ts": "2026-06-09T09:30:00Z", + "actor": "tool", + "action": "measured_resource_envelope", + "evidence_ids": ["resource-envelope-check"], + "summary": "The generated provider backfill stayed within the configured resource envelope." + }, + { + "event_id": "resource-caveat-recorded", + "ts": "2026-06-09T09:35:00Z", + "actor": "operator", + "action": "recorded_planning_caveat", + "evidence_ids": ["large-import-planning-caveat"], + "summary": "The production adoption gate kept the batch-job caveat for large imports." + } + ], + "prompt": { + "role": "user", + "content": "How should I interpret the resource envelope evidence for production backfill?", + "job_mode": "operate", + "constraints": ["cite_evidence", "state_resource_limits", "state_operator_caveat"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resource_envelope_passed", + "text": "The resource envelope passed within the elapsed-time and RSS limits." + }, + { + "claim_id": "large_import_batch_caveat", + "text": "Large imports should be planned as batch jobs, not interactive operations." + } + ], + "must_not_include": ["treat as an interactive operation"], + "evidence_links": { + "resource_envelope_passed": ["resource-envelope-check"], + "large_import_batch_caveat": ["large-import-planning-caveat"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "resource-envelope-check", + "claim_id": "resource_envelope_passed", + "requirement": "cite", + "quote": "2793.629 seconds against a 3600-second limit" + }, + { + "evidence_id": "large-import-planning-caveat", + "claim_id": "large_import_batch_caveat", + "requirement": "cite", + "quote": "planned as batch jobs" + } + ], + "negative_traps": [ + { + "trap_id": "interactive-import-decoy", + "type": "unsafe_action", + "evidence_ids": ["interactive-import-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "latency_resource": { + "weight": 0.35, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 100.0, + "resource_expectation": "Report elapsed and RSS limits plus caveat." + } + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites resource envelope and caveat evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not encourage interactive large imports." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Interprets the result for operator planning." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "unsafe live/destructive action when the prompt forbids it", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["generated_public", "production_ops", "resource_envelope", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json new file mode 100644 index 00000000..48ede3b0 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json @@ -0,0 +1,217 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-accepted-typed-failures-001", + "suite": "project_decisions", + "title": "Recover an accepted benchmark reporting decision with its rationale", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "typed-failure-decision-accepted", + "kind": "decision", + "text": "Accepted decision: real-world benchmark reports must preserve typed outcomes: pass, wrong_result, lifecycle_fail, incomplete, blocked, not_encoded, and unsupported_claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-decision-accepted" + } + }, + "created_at": "2026-06-09T09:00:00Z" + }, + { + "evidence_id": "typed-failure-decision-rationale", + "kind": "decision", + "text": "Rationale: typed outcomes keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions from being hidden inside one aggregate score.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-decision-rationale" + } + }, + "created_at": "2026-06-09T09:05:00Z" + }, + { + "evidence_id": "typed-failure-missing-rationale-trap", + "kind": "decision", + "text": "Rejected shortcut: collapse all benchmark outcomes into a single pass/fail label and omit the reason for typed failures.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-missing-rationale-trap" + } + }, + "created_at": "2026-06-09T09:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The accepted decision is to preserve typed benchmark outcomes instead of flattening them, because the typed states keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.", + "claims": [ + { + "claim_id": "accepted_typed_failure_policy", + "text": "Real-world benchmark reports must preserve typed outcomes.", + "evidence_ids": ["typed-failure-decision-accepted"], + "confidence": "high" + }, + { + "claim_id": "typed_failure_rationale", + "text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.", + "evidence_ids": ["typed-failure-decision-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "accepted-typed-failures", + "ts": "2026-06-09T09:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "summary": "The benchmark report format was accepted with typed outcomes and rationale." + } + ], + "prompt": { + "role": "user", + "content": "Why did we choose typed benchmark outcomes instead of a single pass/fail label?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "state_rationale", + "avoid_uncited_policy_claims" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "accepted_typed_failure_policy", + "text": "Real-world benchmark reports must preserve typed outcomes." + }, + { + "claim_id": "typed_failure_rationale", + "text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible." + } + ], + "must_not_include": [ + "Collapse all benchmark outcomes into a single pass/fail label." + ], + "evidence_links": { + "accepted_typed_failure_policy": ["typed-failure-decision-accepted"], + "typed_failure_rationale": ["typed-failure-decision-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "typed-failure-decision-accepted", + "claim_id": "accepted_typed_failure_policy", + "requirement": "cite", + "quote": "preserve typed outcomes" + }, + { + "evidence_id": "typed-failure-decision-rationale", + "claim_id": "typed_failure_rationale", + "requirement": "explain", + "quote": "keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions" + } + ], + "negative_traps": [ + { + "trap_id": "missing-rationale-pass-fail-shortcut", + "type": "decoy_evidence", + "evidence_ids": ["typed-failure-missing-rationale-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the accepted typed-outcome decision." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the accepted decision and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids the pass/fail shortcut that omits rationale." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge because sufficient decision evidence exists." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Explains the decision in a form useful for future benchmark reports." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "historical_evidence_ids": [], + "stale_trap_ids": [], + "conflicts": [], + "update_rationale": { + "claim_id": "typed_failure_rationale", + "evidence_ids": ["typed-failure-decision-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "accepted_decision", + "rationale" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json new file mode 100644 index 00000000..224f44f8 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json @@ -0,0 +1,259 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-current-validation-gate-001", + "suite": "project_decisions", + "title": "Recover the current validation gate instead of an old gate", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "validation-gate-old-lint-test", + "kind": "decision", + "text": "Historical validation gate: earlier runner work used lint and test as the main local proof before review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-old-lint-test" + } + }, + "created_at": "2026-06-08T18:00:00Z" + }, + { + "evidence_id": "validation-gate-current-decodex", + "kind": "decision", + "text": "Current validation gate: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make check.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-current-decodex" + } + }, + "created_at": "2026-06-10T02:00:00Z" + }, + { + "evidence_id": "validation-gate-current-rationale", + "kind": "decision", + "text": "Gate rationale: formatting, automatic lint repair, and full checks prevent avoidable review churn before Decodex review handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-current-rationale" + } + }, + "created_at": "2026-06-10T02:05:00Z" + }, + { + "evidence_id": "validation-gate-uncited-policy-trap", + "kind": "decision", + "text": "Uncited current-policy trap: describe the current validation gate from memory without citing the current gate evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-uncited-policy-trap" + } + }, + "created_at": "2026-06-10T02:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head. The older lint-and-test gate is historical; the current gate adds formatting, automatic lint repair, and full checks to prevent avoidable review churn before Decodex review handoff.", + "claims": [ + { + "claim_id": "current_validation_gate", + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head.", + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "validation_gate_rationale", + "text": "The gate prevents avoidable review churn before Decodex review handoff.", + "evidence_ids": ["validation-gate-current-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-validation-gate", + "ts": "2026-06-08T18:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["validation-gate-old-lint-test"], + "summary": "The earlier validation gate centered on lint and test." + }, + { + "event_id": "current-validation-gate", + "ts": "2026-06-10T02:00:00Z", + "actor": "operator", + "action": "updated_policy", + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-current-rationale" + ], + "summary": "The current Decodex gate requires fmt, lint-fix, and checks before push or handoff." + } + ], + "prompt": { + "role": "user", + "content": "What is the current validation gate, and how is it different from the old gate?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "use_current_policy", + "distinguish_current_from_historical" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_validation_gate", + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head." + }, + { + "claim_id": "validation_gate_rationale", + "text": "The gate prevents avoidable review churn before Decodex review handoff." + } + ], + "must_not_include": [ + "The current gate only requires lint and test." + ], + "evidence_links": { + "current_validation_gate": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "validation_gate_rationale": ["validation-gate-current-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "validation-gate-current-decodex", + "claim_id": "current_validation_gate", + "requirement": "cite", + "quote": "run cargo make fmt, cargo make lint-fix, and cargo make check" + }, + { + "evidence_id": "validation-gate-old-lint-test", + "claim_id": "current_validation_gate", + "requirement": "use", + "quote": "Historical validation gate" + }, + { + "evidence_id": "validation-gate-current-rationale", + "claim_id": "validation_gate_rationale", + "requirement": "explain", + "quote": "prevent avoidable review churn" + } + ], + "negative_traps": [ + { + "trap_id": "uncited-current-policy-claim", + "type": "unsupported_prior", + "evidence_ids": ["validation-gate-uncited-policy-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the current gate and the historical old gate correctly." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current policy, historical policy, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids uncited current-policy assertions." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge because current policy evidence exists." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Distinguishes current and historical policy with update rationale." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["validation-gate-current-decodex"], + "historical_evidence_ids": ["validation-gate-old-lint-test"], + "stale_trap_ids": ["uncited-current-policy-claim"], + "conflicts": [ + { + "conflict_id": "validation-gate-updated", + "claim_id": "current_validation_gate", + "current_evidence_id": "validation-gate-current-decodex", + "historical_evidence_id": "validation-gate-old-lint-test", + "resolved_by_evidence_id": "validation-gate-current-rationale" + } + ], + "update_rationale": { + "claim_id": "validation_gate_rationale", + "evidence_ids": ["validation-gate-current-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "validation_gate", + "current_policy" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json new file mode 100644 index 00000000..ed196f8e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json @@ -0,0 +1,251 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-private-manifest-caveat-001", + "suite": "project_decisions", + "title": "State the bounded private-manifest caveat instead of overclaiming", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "private-manifest-current-caveat", + "kind": "decision", + "text": "Current caveat: no private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-current-caveat" + } + }, + "created_at": "2026-06-10T04:00:00Z" + }, + { + "evidence_id": "private-manifest-uncertainty-policy", + "kind": "decision", + "text": "Uncertainty policy: when evidence is missing, answer with the bounded caveat and do not invent a private-manifest pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-uncertainty-policy" + } + }, + "created_at": "2026-06-10T04:05:00Z" + }, + { + "evidence_id": "private-manifest-supported-scope", + "kind": "decision", + "text": "Supported scope: the checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-supported-scope" + } + }, + "created_at": "2026-06-10T04:10:00Z" + }, + { + "evidence_id": "private-manifest-overconfident-trap", + "kind": "decision", + "text": "Overconfident unsupported answer: the private production corpus passed and proves project_decisions parity.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-overconfident-trap" + } + }, + "created_at": "2026-06-10T04:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "No private production corpus manifest is available, so the project_decisions report cannot claim private-corpus validation. The supported scope is synthetic fixture-backed decision recovery for ELF, and the correct uncertainty policy is to keep that bounded caveat instead of inventing a private-manifest pass.", + "claims": [ + { + "claim_id": "private_manifest_caveat", + "text": "No private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation.", + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy" + ], + "confidence": "high" + }, + { + "claim_id": "supported_project_decision_scope", + "text": "The checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF.", + "evidence_ids": ["private-manifest-supported-scope"], + "confidence": "high" + } + ], + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "private-manifest-caveat-recorded", + "ts": "2026-06-10T04:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "summary": "The suite records a bounded caveat for missing private-manifest evidence." + } + ], + "prompt": { + "role": "user", + "content": "Does the new project_decisions suite prove private-corpus validation or external parity?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "state_caveats", + "avoid_overconfident_unsupported_answers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "private_manifest_caveat", + "text": "No private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation." + }, + { + "claim_id": "supported_project_decision_scope", + "text": "The checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF." + } + ], + "must_not_include": [ + "The private production corpus passed.", + "project_decisions parity is proven." + ], + "evidence_links": { + "private_manifest_caveat": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy" + ], + "supported_project_decision_scope": ["private-manifest-supported-scope"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "private-manifest-current-caveat", + "claim_id": "private_manifest_caveat", + "requirement": "cite", + "quote": "no private production corpus manifest is available" + }, + { + "evidence_id": "private-manifest-uncertainty-policy", + "claim_id": "private_manifest_caveat", + "requirement": "use", + "quote": "do not invent a private-manifest pass" + }, + { + "evidence_id": "private-manifest-supported-scope", + "claim_id": "supported_project_decision_scope", + "requirement": "cite", + "quote": "only synthetic fixture-backed decision recovery" + } + ], + "negative_traps": [ + { + "trap_id": "overconfident-private-manifest-pass", + "type": "unsupported_prior", + "evidence_ids": ["private-manifest-overconfident-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States that private-corpus validation and parity are not proven." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites caveat, uncertainty policy, and supported-scope evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the overconfident private-corpus pass trap." + }, + "uncertainty_handling": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses a bounded caveat when private-manifest evidence is missing." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Preserves the correct claim boundary for aggregate report interpretation." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "missing required caveat" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No private production corpus manifest is available", + "synthetic fixture-backed decision recovery" + ], + "fallback_action": "continue_with_caveat" + }, + "memory_evolution": { + "current_evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "historical_evidence_ids": [], + "stale_trap_ids": ["overconfident-private-manifest-pass"], + "conflicts": [], + "update_rationale": { + "claim_id": "private_manifest_caveat", + "evidence_ids": ["private-manifest-uncertainty-policy"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "caveat", + "uncertainty" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json new file mode 100644 index 00000000..21605eb3 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json @@ -0,0 +1,259 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-reversal-live-baseline-001", + "suite": "project_decisions", + "title": "Distinguish a superseded live-baseline claim from the current suite boundary", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "live-baseline-suite-win-old", + "kind": "decision", + "text": "Historical decision: a draft allowed live-baseline passes to be cited as real-world job suite wins.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-suite-win-old" + } + }, + "created_at": "2026-06-08T12:00:00Z" + }, + { + "evidence_id": "live-baseline-suite-win-current", + "kind": "decision", + "text": "Current decision: live-baseline passes are retrieval and lifecycle evidence only; real-world job suite wins require fixture-backed real_world_job reports.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-suite-win-current" + } + }, + "created_at": "2026-06-09T12:00:00Z" + }, + { + "evidence_id": "live-baseline-reversal-rationale", + "kind": "decision", + "text": "Reversal rationale: query-level live-baseline checks cannot prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-reversal-rationale" + } + }, + "created_at": "2026-06-09T12:05:00Z" + }, + { + "evidence_id": "live-baseline-stale-reuse-trap", + "kind": "decision", + "text": "Stale reuse trap: cite the historical live-baseline-as-suite-win draft as the current project decision.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-stale-reuse-trap" + } + }, + "created_at": "2026-06-09T12:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The current decision is that live-baseline passes are retrieval and lifecycle evidence only; real-world job suite wins require fixture-backed real_world_job reports. The earlier draft that allowed live-baseline suite wins is historical, and it changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "claims": [ + { + "claim_id": "current_live_baseline_boundary", + "text": "Live-baseline passes are retrieval and lifecycle evidence only, not real-world job suite wins.", + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "live_baseline_reversal_rationale", + "text": "The decision changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "evidence_ids": ["live-baseline-reversal-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "draft-live-baseline-suite-win", + "ts": "2026-06-08T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["live-baseline-suite-win-old"], + "summary": "A draft treated live-baseline passes as real-world job suite wins." + }, + { + "event_id": "current-live-baseline-boundary", + "ts": "2026-06-09T12:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-reversal-rationale" + ], + "summary": "The current decision limited live-baseline evidence to retrieval and lifecycle checks." + } + ], + "prompt": { + "role": "user", + "content": "Can we still cite live-baseline passes as real-world job suite wins, or was that reversed?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "distinguish_current_from_historical", + "state_rationale" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_live_baseline_boundary", + "text": "Live-baseline passes are retrieval and lifecycle evidence only, not real-world job suite wins." + }, + { + "claim_id": "live_baseline_reversal_rationale", + "text": "The decision changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling." + } + ], + "must_not_include": [ + "Live-baseline passes are real-world job suite wins." + ], + "evidence_links": { + "current_live_baseline_boundary": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "live_baseline_reversal_rationale": ["live-baseline-reversal-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "live-baseline-suite-win-current", + "claim_id": "current_live_baseline_boundary", + "requirement": "cite", + "quote": "real-world job suite wins require fixture-backed real_world_job reports" + }, + { + "evidence_id": "live-baseline-suite-win-old", + "claim_id": "current_live_baseline_boundary", + "requirement": "use", + "quote": "Historical decision" + }, + { + "evidence_id": "live-baseline-reversal-rationale", + "claim_id": "live_baseline_reversal_rationale", + "requirement": "explain", + "quote": "cannot prove durable decision recovery, rationale recovery, or unsupported-claim handling" + } + ], + "negative_traps": [ + { + "trap_id": "stale-live-baseline-suite-win-reuse", + "type": "stale_fact", + "evidence_ids": ["live-baseline-stale-reuse-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the current boundary and marks the older decision historical." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current, historical, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not reuse the stale draft as the current decision." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not overstate live-baseline evidence." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Shows the decision reversal and available update rationale." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["live-baseline-suite-win-current"], + "historical_evidence_ids": ["live-baseline-suite-win-old"], + "stale_trap_ids": ["stale-live-baseline-suite-win-reuse"], + "conflicts": [ + { + "conflict_id": "live-baseline-suite-win-reversed", + "claim_id": "current_live_baseline_boundary", + "current_evidence_id": "live-baseline-suite-win-current", + "historical_evidence_id": "live-baseline-suite-win-old", + "resolved_by_evidence_id": "live-baseline-reversal-rationale" + } + ], + "update_rationale": { + "claim_id": "live_baseline_reversal_rationale", + "evidence_ids": ["live-baseline-reversal-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "reversal", + "current_vs_historical" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json new file mode 100644 index 00000000..268e675b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json @@ -0,0 +1,256 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-tradeoff-fixture-backed-001", + "suite": "project_decisions", + "title": "Explain the rationale and caveat for fixture-backed project decision jobs", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "tradeoff-fixture-backed-first", + "kind": "decision", + "text": "Accepted tradeoff: encode project_decisions first as offline fixture-backed jobs, not as live external-adapter parity runs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-first" + } + }, + "created_at": "2026-06-10T03:00:00Z" + }, + { + "evidence_id": "tradeoff-fixture-backed-rationale", + "kind": "decision", + "text": "Tradeoff rationale: fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun for this suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-rationale" + } + }, + "created_at": "2026-06-10T03:05:00Z" + }, + { + "evidence_id": "tradeoff-fixture-backed-caveat", + "kind": "decision", + "text": "Caveat: this suite must not claim external-project parity until external adapters actually run the project_decisions jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-caveat" + } + }, + "created_at": "2026-06-10T03:10:00Z" + }, + { + "evidence_id": "tradeoff-external-parity-trap", + "kind": "decision", + "text": "Unsupported prior: qmd, agentmemory, and mem0 already passed the project_decisions suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-external-parity-trap" + } + }, + "created_at": "2026-06-10T03:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The accepted tradeoff is to encode project_decisions first as offline fixture-backed jobs. The rationale is that fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun. The caveat is that this suite must not claim external-project parity until external adapters actually run these jobs.", + "claims": [ + { + "claim_id": "fixture_backed_tradeoff", + "text": "Encode project_decisions first as offline fixture-backed jobs.", + "evidence_ids": ["tradeoff-fixture-backed-first"], + "confidence": "high" + }, + { + "claim_id": "fixture_backed_tradeoff_rationale", + "text": "Fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun.", + "evidence_ids": ["tradeoff-fixture-backed-rationale"], + "confidence": "high" + }, + { + "claim_id": "fixture_backed_parity_caveat", + "text": "Do not claim external-project parity until external adapters run the project_decisions jobs.", + "evidence_ids": ["tradeoff-fixture-backed-caveat"], + "confidence": "high" + } + ], + "evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "fixture-backed-first-decision", + "ts": "2026-06-10T03:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "summary": "The project_decisions suite was encoded as fixture-backed evidence first with a parity caveat." + } + ], + "prompt": { + "role": "user", + "content": "Why are project_decisions fixtures offline first, and what claim boundary should the report preserve?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "state_rationale", + "state_caveats", + "do_not_claim_external_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_backed_tradeoff", + "text": "Encode project_decisions first as offline fixture-backed jobs." + }, + { + "claim_id": "fixture_backed_tradeoff_rationale", + "text": "Fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun." + }, + { + "claim_id": "fixture_backed_parity_caveat", + "text": "Do not claim external-project parity until external adapters run the project_decisions jobs." + } + ], + "must_not_include": [ + "qmd, agentmemory, and mem0 already passed the project_decisions suite." + ], + "evidence_links": { + "fixture_backed_tradeoff": ["tradeoff-fixture-backed-first"], + "fixture_backed_tradeoff_rationale": ["tradeoff-fixture-backed-rationale"], + "fixture_backed_parity_caveat": ["tradeoff-fixture-backed-caveat"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "tradeoff-fixture-backed-first", + "claim_id": "fixture_backed_tradeoff", + "requirement": "cite", + "quote": "offline fixture-backed jobs" + }, + { + "evidence_id": "tradeoff-fixture-backed-rationale", + "claim_id": "fixture_backed_tradeoff_rationale", + "requirement": "explain", + "quote": "lock evidence, negative traps, and typed outcomes" + }, + { + "evidence_id": "tradeoff-fixture-backed-caveat", + "claim_id": "fixture_backed_parity_caveat", + "requirement": "cite", + "quote": "must not claim external-project parity" + } + ], + "negative_traps": [ + { + "trap_id": "external-parity-without-adapter-run", + "type": "unsupported_prior", + "evidence_ids": ["tradeoff-external-parity-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the fixture-backed-first decision." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites decision, rationale, and caveat evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids unsupported external parity claims." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "States the external-adapter caveat instead of overclaiming." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Preserves the report boundary for future adapter work." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "missing required caveat" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "must not claim external-project parity", + "external adapters remain unrun" + ], + "fallback_action": "continue_with_caveat" + }, + "memory_evolution": { + "current_evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "historical_evidence_ids": [], + "stale_trap_ids": ["external-parity-without-adapter-run"], + "conflicts": [], + "update_rationale": { + "claim_id": "fixture_backed_tradeoff_rationale", + "evidence_ids": ["tradeoff-fixture-backed-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "tradeoff_rationale", + "no_external_parity_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json new file mode 100644 index 00000000..a8b4c6c5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/alternate_phrasing.json @@ -0,0 +1,173 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-alt-phrasing-001", + "suite": "retrieval", + "title": "Recover current handoff evidence from alternate phrasing", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy840-current-handoff", + "kind": "issue", + "text": "XY-840 trace schema lane uses branch y/elf-xy-840. Before review handoff, run `cargo make check` after the trace schema update is complete.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "alternate_phrasing", + "evidence_id": "xy840-current-handoff" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "xy840-old-handoff-trap", + "kind": "decision", + "text": "Old note: XY-840 used branch y/elf-old-840 and only needed `cargo make test` before handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "alternate_phrasing", + "evidence_id": "xy840-old-handoff-trap" + } + }, + "created_at": "2026-06-08T01:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Use branch y/elf-xy-840 for XY-840 and run `cargo make check` before review handoff.", + "claims": [ + { + "claim_id": "branch", + "text": "Use branch y/elf-xy-840 for XY-840.", + "evidence_ids": ["xy840-current-handoff"], + "confidence": "high" + }, + { + "claim_id": "gate", + "text": "Run `cargo make check` before review handoff.", + "evidence_ids": ["xy840-current-handoff"], + "confidence": "high" + } + ], + "evidence_ids": ["xy840-current-handoff"], + "latency_ms": 13.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy840-old-branch", + "ts": "2026-06-08T01:00:00Z", + "actor": "agent", + "action": "recorded_old_handoff", + "evidence_ids": ["xy840-old-handoff-trap"], + "summary": "An older handoff note referenced the wrong branch and a narrower gate." + }, + { + "event_id": "xy840-current-handoff", + "ts": "2026-06-09T01:00:00Z", + "actor": "agent", + "action": "updated_handoff", + "evidence_ids": ["xy840-current-handoff"], + "summary": "The current handoff evidence changed the branch and validation gate." + } + ], + "prompt": { + "role": "user", + "content": "For the trace-schema handoff, which XY-840 branch and pre-review check do I need?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_stale_facts"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "branch", + "text": "Use branch y/elf-xy-840 for XY-840." + }, + { + "claim_id": "gate", + "text": "Run `cargo make check` before review handoff." + } + ], + "must_not_include": [ + "Use branch y/elf-old-840 for XY-840.", + "Run `cargo make test` before review handoff." + ], + "evidence_links": { + "branch": ["xy840-current-handoff"], + "gate": ["xy840-current-handoff"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy840-current-handoff", + "claim_id": "branch", + "requirement": "cite", + "quote": "uses branch y/elf-xy-840" + }, + { + "evidence_id": "xy840-current-handoff", + "claim_id": "gate", + "requirement": "use", + "quote": "run `cargo make check`" + } + ], + "negative_traps": [ + { + "trap_id": "old-xy840-handoff", + "type": "stale_fact", + "evidence_ids": ["xy840-old-handoff-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the current branch and pre-review check." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current handoff evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Ignores the stale branch and test-only gate." + }, + "latency_resource": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Reports bounded fixture latency and no cost." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "alternate_phrasing", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json new file mode 100644 index 00000000..d6880447 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json @@ -0,0 +1,148 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-current-vs-obsolete-001", + "suite": "retrieval", + "title": "Select current benchmark context over obsolete live-baseline claims", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "obsolete-live-baseline-win", + "kind": "decision", + "text": "Obsolete draft: top-k live baseline results alone prove real-world job suite wins.", + "source_ref": {}, + "created_at": "2026-06-08T04:00:00Z" + }, + { + "evidence_id": "current-real-world-boundary", + "kind": "decision", + "text": "Current policy: live-baseline reports remain valid for Docker retrieval and lifecycle checks, but they are not real-world job suite wins. Real-world job reports must be published separately.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "current_vs_obsolete", + "evidence_id": "current-real-world-boundary" + } + }, + "created_at": "2026-06-09T04:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Use the current boundary: live-baseline reports stay valid for Docker retrieval and lifecycle evidence, but they are not real-world job suite wins; publish real-world job reports separately.", + "claims": [ + { + "claim_id": "current_boundary", + "text": "Live-baseline reports are not real-world job suite wins.", + "evidence_ids": ["current-real-world-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["current-real-world-boundary"], + "latency_ms": 15.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "obsolete-draft", + "ts": "2026-06-08T04:00:00Z", + "actor": "agent", + "action": "recorded_obsolete_policy", + "evidence_ids": ["obsolete-live-baseline-win"], + "summary": "A draft conflated live-baseline retrieval checks with real-world job wins." + }, + { + "event_id": "current-boundary", + "ts": "2026-06-09T04:00:00Z", + "actor": "agent", + "action": "updated_policy", + "evidence_ids": ["current-real-world-boundary"], + "summary": "The current policy separates live-baseline evidence from real-world job suite claims." + } + ], + "prompt": { + "role": "user", + "content": "Can I cite the live-baseline pass as a real-world job suite win?", + "job_mode": "answer", + "constraints": ["cite_evidence", "use_current_policy", "avoid_obsolete_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_boundary", + "text": "Live-baseline reports are not real-world job suite wins." + } + ], + "must_not_include": [ + "Top-k live baseline results alone prove real-world job suite wins." + ], + "evidence_links": { + "current_boundary": ["current-real-world-boundary"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "current-real-world-boundary", + "claim_id": "current_boundary", + "requirement": "cite", + "quote": "they are not real-world job suite wins" + } + ], + "negative_traps": [ + { + "trap_id": "obsolete-suite-win", + "type": "stale_fact", + "evidence_ids": ["obsolete-live-baseline-win"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Answers with the current claim boundary." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the current policy evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids the obsolete top-k claim." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge when sufficient current evidence exists." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "current_vs_obsolete", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json new file mode 100644 index 00000000..819844b4 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json @@ -0,0 +1,200 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-distractor-heavy-001", + "suite": "retrieval", + "title": "Find provider stress evidence in a distractor-heavy corpus", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "elf-provider-stress-target", + "kind": "runbook", + "text": "For the ELF provider stress check, set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress with provider embeddings. The expected report is the live baseline Docker report.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "distractor_heavy", + "evidence_id": "elf-provider-stress-target" + } + }, + "created_at": "2026-06-09T02:00:00Z" + }, + { + "evidence_id": "qmd-smoke-distractor", + "kind": "adapter_state", + "text": "qmd smoke uses a local collection and should not be described as the ELF provider stress run.", + "source_ref": {}, + "created_at": "2026-06-09T02:01:00Z" + }, + { + "evidence_id": "mem0-stress-distractor", + "kind": "adapter_state", + "text": "mem0 local FastEmbed stress evidence is not encoded for this provider profile.", + "source_ref": {}, + "created_at": "2026-06-09T02:02:00Z" + }, + { + "evidence_id": "openviking-install-distractor", + "kind": "adapter_state", + "text": "OpenViking local embedding install failure is an incomplete adapter state, not a provider stress pass.", + "source_ref": {}, + "created_at": "2026-06-09T02:03:00Z" + }, + { + "evidence_id": "private-manifest-distractor", + "kind": "runbook", + "text": "The private production manifest guard fails closed when ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is unset.", + "source_ref": {}, + "created_at": "2026-06-09T02:04:00Z" + }, + { + "evidence_id": "backfill-distractor", + "kind": "runbook", + "text": "The backfill profile defaults to ELF only and records resumable import evidence.", + "source_ref": {}, + "created_at": "2026-06-09T02:05:00Z" + }, + { + "evidence_id": "scale-distractor", + "kind": "runbook", + "text": "The scale profile has generated distractor notes but is not the provider stress profile.", + "source_ref": {}, + "created_at": "2026-06-09T02:06:00Z" + }, + { + "evidence_id": "smoke-distractor", + "kind": "runbook", + "text": "The smoke profile is the default quick matrix and should not be used as the stress command.", + "source_ref": {}, + "created_at": "2026-06-09T02:07:00Z" + }, + { + "evidence_id": "agentmemory-distractor", + "kind": "adapter_state", + "text": "agentmemory same-corpus retrieval passed with mocked storage but lifecycle did not pass.", + "source_ref": {}, + "created_at": "2026-06-09T02:08:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Target ELF only with the stress profile: set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress with provider embeddings.", + "claims": [ + { + "claim_id": "stress_target", + "text": "Set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress.", + "evidence_ids": ["elf-provider-stress-target"], + "confidence": "high" + } + ], + "evidence_ids": ["elf-provider-stress-target"], + "latency_ms": 22.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "provider-stress-requested", + "ts": "2026-06-09T02:00:00Z", + "actor": "operator", + "action": "requested_provider_stress", + "evidence_ids": ["elf-provider-stress-target"], + "summary": "The operator requested the ELF provider stress profile, not a smoke or external adapter run." + } + ], + "prompt": { + "role": "user", + "content": "Which profile and project selector should I use for the provider-backed ELF stress run?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_adapter_parity_claims"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stress_target", + "text": "Set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress." + } + ], + "must_not_include": [ + "Set ELF_BASELINE_PROJECTS=qmd.", + "Use the smoke profile as the stress run.", + "OpenViking passed the provider stress profile." + ], + "evidence_links": { + "stress_target": ["elf-provider-stress-target"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "elf-provider-stress-target", + "claim_id": "stress_target", + "requirement": "cite", + "quote": "set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress" + } + ], + "negative_traps": [ + { + "trap_id": "adapter-and-profile-distractors", + "type": "decoy_evidence", + "evidence_ids": [ + "qmd-smoke-distractor", + "mem0-stress-distractor", + "openviking-install-distractor", + "private-manifest-distractor", + "backfill-distractor", + "scale-distractor", + "smoke-distractor", + "agentmemory-distractor" + ], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Returns the exact ELF-only stress selector." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the provider stress target evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids adapter, private-manifest, smoke, scale, and backfill decoys." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Reports bounded fixture latency and cost." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "distractor_heavy", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json new file mode 100644 index 00000000..5c3e0bfe --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/minimal_sufficient_context.json @@ -0,0 +1,148 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-minimal-context-001", + "suite": "retrieval", + "title": "Return minimal sufficient restore evidence without irrelevant context", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "restore-minimal-proof", + "kind": "runbook", + "text": "Minimal sufficient restore proof: note ingest returned ADD/remember, Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search recovered the restored note.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "minimal_sufficient_context", + "evidence_id": "restore-minimal-proof" + } + }, + "created_at": "2026-06-09T05:00:00Z" + }, + { + "evidence_id": "restore-verbose-log-decoy", + "kind": "trace", + "text": "Verbose restore log includes port remapping, shell setup, and temporary script names that are not needed to answer the proof question.", + "source_ref": {}, + "created_at": "2026-06-09T05:01:00Z" + }, + { + "evidence_id": "provider-run-decoy", + "kind": "trace", + "text": "Provider stress run latency and embedding dimensions are unrelated to the minimal restore proof.", + "source_ref": {}, + "created_at": "2026-06-09T05:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "The minimal sufficient restore proof is that note ingest returned ADD/remember, Qdrant rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note.", + "claims": [ + { + "claim_id": "restore_minimal", + "text": "Note ingest returned ADD/remember, rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note.", + "evidence_ids": ["restore-minimal-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-minimal-proof"], + "latency_ms": 9.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "restore-proof-recorded", + "ts": "2026-06-09T05:00:00Z", + "actor": "agent", + "action": "published_restore_proof", + "evidence_ids": ["restore-minimal-proof"], + "summary": "The restore proof recorded the minimal required note ingest, rebuild, and recovered-search evidence." + } + ], + "prompt": { + "role": "user", + "content": "What is the minimal sufficient context proving the restore recovered memory?", + "job_mode": "answer", + "constraints": ["cite_evidence", "minimal_sufficient_context", "avoid_irrelevant_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "restore_minimal", + "text": "Note ingest returned ADD/remember, rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note." + } + ], + "must_not_include": [ + "Port remapping is required to prove restore correctness.", + "Provider stress latency is required to prove restore correctness." + ], + "evidence_links": { + "restore_minimal": ["restore-minimal-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-minimal-proof", + "claim_id": "restore_minimal", + "requirement": "cite", + "quote": "Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0" + } + ], + "negative_traps": [ + { + "trap_id": "irrelevant-restore-context", + "type": "decoy_evidence", + "evidence_ids": ["restore-verbose-log-decoy", "provider-run-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "States only the minimal restore proof." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the minimal proof evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids verbose logs and unrelated provider evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Keeps the answer compact enough for agent context use." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "minimal_sufficient_context", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json new file mode 100644 index 00000000..bd2e6b8b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/multi_hop_routing.json @@ -0,0 +1,181 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "retrieval-multi-hop-routing-001", + "suite": "retrieval", + "title": "Answer a multi-hop benchmark routing question", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy845-landing-zone", + "kind": "issue", + "text": "XY-845 should add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/ and extend the runner/report seams.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "multi_hop_routing", + "evidence_id": "xy845-landing-zone" + } + }, + "created_at": "2026-06-09T03:00:00Z" + }, + { + "evidence_id": "routing-reference-boundary", + "kind": "decision", + "text": "qmd and OpenViking are strong references for routing, fusion, hierarchical retrieval, and staged trajectory, but parity must not be claimed unless their adapters actually run.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "multi_hop_routing", + "evidence_id": "routing-reference-boundary" + } + }, + "created_at": "2026-06-09T03:01:00Z" + }, + { + "evidence_id": "ranking-tune-trap", + "kind": "decision", + "text": "Do not tune ELF ranking blindly to fixtures; ranking changes need trace and provenance evidence.", + "source_ref": {}, + "created_at": "2026-06-09T03:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Add the new cases under apps/elf-eval/fixtures/real_world_memory/retrieval/ and extend the runner/report seams, while treating qmd and OpenViking only as references unless their adapters actually run.", + "claims": [ + { + "claim_id": "landing_zone", + "text": "Add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/.", + "evidence_ids": ["xy845-landing-zone"], + "confidence": "high" + }, + { + "claim_id": "reference_boundary", + "text": "Treat qmd and OpenViking only as references unless their adapters actually run.", + "evidence_ids": ["routing-reference-boundary"], + "confidence": "high" + } + ], + "evidence_ids": ["xy845-landing-zone", "routing-reference-boundary"], + "latency_ms": 31.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "issue-route", + "ts": "2026-06-09T03:00:00Z", + "actor": "operator", + "action": "specified_landing_zone", + "evidence_ids": ["xy845-landing-zone"], + "summary": "The issue named the real_world_memory retrieval fixture path and runner/report seams." + }, + { + "event_id": "reference-boundary", + "ts": "2026-06-09T03:01:00Z", + "actor": "agent", + "action": "recorded_reference_boundary", + "evidence_ids": ["routing-reference-boundary"], + "summary": "External projects are design references, not benchmark passes without adapters." + } + ], + "prompt": { + "role": "user", + "content": "How should XY-845 extend the benchmark while respecting the qmd/OpenViking reference boundary?", + "job_mode": "decide", + "constraints": ["cite_evidence", "avoid_unsupported_claims", "avoid_blind_ranking_tuning"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "landing_zone", + "text": "Add retrieval-quality real_world_job cases under apps/elf-eval/fixtures/real_world_memory/retrieval/." + }, + { + "claim_id": "reference_boundary", + "text": "Treat qmd and OpenViking only as references unless their adapters actually run." + } + ], + "must_not_include": [ + "Claim qmd parity from fixture-only output.", + "Claim OpenViking parity from fixture-only output.", + "Tune ELF ranking blindly to fixtures." + ], + "evidence_links": { + "landing_zone": ["xy845-landing-zone"], + "reference_boundary": ["routing-reference-boundary"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy845-landing-zone", + "claim_id": "landing_zone", + "requirement": "cite", + "quote": "apps/elf-eval/fixtures/real_world_memory/retrieval/" + }, + { + "evidence_id": "routing-reference-boundary", + "claim_id": "reference_boundary", + "requirement": "cite", + "quote": "parity must not be claimed unless their adapters actually run" + } + ], + "negative_traps": [ + { + "trap_id": "blind-ranking-tune", + "type": "unsafe_action", + "evidence_ids": ["ranking-tune-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Combines landing-zone and reference-boundary evidence." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites both required evidence ids." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids blind ranking tuning and unsupported parity claims." + }, + "workflow_helpfulness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Gives an implementation route rather than a vague retrieval summary." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "retrieval", "multi_hop", "hierarchical_routing", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json new file mode 100644 index 00000000..9a7971e2 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json @@ -0,0 +1,213 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "operator-debug-stage-attribution-001", + "suite": "operator_debugging_ux", + "title": "Attribute a wrong result to the retrieval stage that demoted evidence", + "corpus": { + "corpus_id": "real-world-memory-retrieval-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "stage-target", + "kind": "trace", + "text": "Correct trace evidence: the expected XY-845 retrieval fixture was present in recall.candidates, then demoted at rerank.score because a stale decoy shared more lexical terms.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stage_explainability_wrong_result", + "evidence_id": "stage-target" + } + }, + "created_at": "2026-06-09T06:00:00Z" + }, + { + "evidence_id": "stage-decoy", + "kind": "trace", + "text": "Stale decoy: XY-845 should only use top-k smoke checks; real-world retrieval fixtures are out of scope.", + "source_ref": {}, + "created_at": "2026-06-08T06:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_retrieval", + "answer": { + "content": "Expected evidence was present in recall.candidates but demoted at rerank.score; the selected stale top-k smoke-only evidence was the decoy to repair against.", + "claims": [ + { + "claim_id": "stage_attribution", + "text": "Expected evidence was present in recall.candidates but demoted at rerank.score.", + "evidence_ids": ["stage-target"], + "confidence": "high" + } + ], + "evidence_ids": ["stage-target"], + "latency_ms": 18.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "66666666-6666-4666-8666-666666666666", + "failure_stage": "rerank.score", + "failure_reason": "Expected evidence survived candidate recall but was demoted below a stale decoy during rerank.", + "stages": [ + { + "stage_name": "rewrite.expansion", + "kept_evidence": [], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Alternate phrasing preserved the original intent." + }, + { + "stage_name": "recall.candidates", + "kept_evidence": ["stage-target", "stage-decoy"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["stage-decoy"], + "notes": "Candidate recall found both expected evidence and stale decoy evidence." + }, + { + "stage_name": "rerank.score", + "kept_evidence": ["stage-decoy"], + "dropped_evidence": [], + "demoted_evidence": ["stage-target"], + "distractor_evidence": ["stage-decoy"], + "notes": "The stale decoy outranked the expected evidence." + }, + { + "stage_name": "selection.final", + "kept_evidence": ["stage-decoy"], + "dropped_evidence": ["stage-target"], + "demoted_evidence": [], + "distractor_evidence": ["stage-decoy"], + "notes": "Final selection missed the required evidence." + } + ] + } + } + } + }, + "timeline": [ + { + "event_id": "trace-candidate-recall", + "ts": "2026-06-09T06:00:00Z", + "actor": "agent", + "action": "inspected_trace", + "evidence_ids": ["stage-target"], + "summary": "The trace showed expected evidence at candidate recall and demotion at rerank." + }, + { + "event_id": "stale-decoy-ranked", + "ts": "2026-06-09T06:01:00Z", + "actor": "agent", + "action": "selected_wrong_context", + "evidence_ids": ["stage-decoy"], + "summary": "A stale decoy became the selected answer even though it was obsolete." + } + ], + "prompt": { + "role": "user", + "content": "Why did the wrong retrieval result happen, and which stage dropped or demoted the expected evidence?", + "job_mode": "debug", + "constraints": ["cite_evidence", "identify_retrieval_stage", "avoid_obsolete_context"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "stage_attribution", + "text": "Expected evidence was present in recall.candidates but demoted at rerank.score." + } + ], + "must_not_include": [ + "real-world retrieval fixtures are out of scope" + ], + "evidence_links": { + "stage_attribution": ["stage-target"] + }, + "answer_type": "debug_report", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "stage-target", + "claim_id": "stage_attribution", + "requirement": "explain", + "quote": "present in recall.candidates, then demoted at rerank.score" + } + ], + "negative_traps": [ + { + "trap_id": "stale-top-k-only-decoy", + "type": "stale_fact", + "evidence_ids": ["stage-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Identifies the stage attribution without selecting the stale final answer." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the expected trace evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not cite the stale top-k-only decoy." + }, + "debuggability": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Reports the stage that demoted expected evidence." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Turns the wrong result into actionable trace evidence." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "rerank_demoted_expected_evidence", + "trace_id": "66666666-6666-4666-8666-666666666666", + "viewer_url": "/viewer?trace_id=66666666-6666-4666-8666-666666666666", + "admin_trace_bundle_url": "/v2/admin/traces/66666666-6666-4666-8666-666666666666/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The expected evidence survived recall.candidates but was demoted below a stale decoy during rerank.score.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in trace_explainability rerank.score and selection.final stages", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": ["Trace", "Retrieval Funnel", "Replay Candidates", "Stage Details"], + "cli_steps": [ + "open trace explainability bundle", + "compare recall.candidates with rerank.score", + "inspect selected stale decoy", + "repair rerank inputs or stale-context filtering" + ], + "trace_evidence": ["stage-target", "stage-decoy"], + "ux_gaps": [] + }, + "tags": ["synthetic", "operator_debugging_ux", "trace_explainability", "stage_attribution", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json new file mode 100644 index 00000000..6a9b01f3 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json @@ -0,0 +1,304 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "suite": "scheduled_memory", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "kind": "fact", + "text": "Knowledge-page lint finding: the project ELF benchmark suite page references the old scheduled-memory blocked state after the scheduled_memory fixture suite was added.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-page-stale-finding" + }, + "locator": { + "quote": "old scheduled-memory blocked state" + } + }, + "created_at": "2026-06-17T00:22:00Z" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "kind": "constraint", + "text": "Current knowledge-page refresh rule: scheduled tasks may suggest a reviewable rebuild, but they must not silently rewrite authoritative source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-reviewable-refresh" + }, + "locator": { + "quote": "must not silently rewrite authoritative source notes" + } + }, + "created_at": "2026-06-17T00:24:00Z" + }, + { + "evidence_id": "scheduled-knowledge-silent-rewrite-trap", + "kind": "note", + "text": "Stale claim: a scheduled knowledge refresh may rewrite authoritative source notes automatically after lint finds a stale page.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-silent-rewrite-trap" + } + }, + "created_at": "2026-06-16T18:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled knowledge-page refresh suggestion: suggest a reviewable rebuild because lint found the old scheduled-memory blocked state, and do not silently rewrite source notes.", + "claims": [ + { + "claim_id": "scheduled_knowledge_refresh_suggested", + "text": "A reviewable knowledge-page rebuild should be suggested because the page still references the old scheduled-memory blocked state.", + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-knowledge-refresh-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:45:00Z", + "scheduled_for": "2026-06-17T00:42:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "knowledge_page_refresh_suggestion", + "outputs": [ + { + "output_id": "scheduled-suggest-reviewable-knowledge-rebuild", + "output_kind": "knowledge_page_refresh_suggestion", + "text": "Suggest a reviewable knowledge-page rebuild for the stale scheduled-memory blocked-state reference; do not rewrite source notes silently.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "freshness": { + "status": "current", + "observed_at": "2026-06-17T00:24:00Z", + "valid_from": "2026-06-17T00:22:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-17T00:45:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_REVIEWABLE_KNOWLEDGE_REBUILD", + "reason": "The lint finding is current and the refresh rule requires reviewable derived output instead of source mutation." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "status": "current", + "reason": "Current stale-page lint finding." + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "status": "current", + "reason": "Current refresh boundary." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "scheduled-knowledge-silent-rewrite-trap", + "status": "stale", + "reason": "Silent authoritative source-note rewrites are not allowed." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-knowledge-refresh-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:42:00Z", + "completed_at": "2026-06-17T00:45:00Z", + "output_ref": "scheduled-suggest-reviewable-knowledge-rebuild", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read current lint finding and refresh boundary.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + }, + { + "stage_name": "mutation_guard", + "summary": "Rejected silent authoritative source-note rewrite.", + "evidence_refs": ["scheduled-knowledge-silent-rewrite-trap"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded reviewable knowledge-page refresh suggestion.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-knowledge-lint-recorded", + "ts": "2026-06-17T00:22:00Z", + "actor": "knowledge_lint_fixture", + "action": "recorded_stale_page_finding", + "evidence_ids": ["scheduled-knowledge-page-stale-finding"], + "summary": "The stale scheduled-memory blocked-state page reference was recorded." + }, + { + "event_id": "scheduled-knowledge-refresh-output-recorded", + "ts": "2026-06-17T00:45:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "summary": "The scheduled task recorded a reviewable knowledge-page refresh suggestion." + } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled knowledge-page refresh suggestion task.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "record_execution_trace", + "do_not_mutate_source_notes_silently" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_knowledge_refresh_suggested", + "text": "A reviewable knowledge-page rebuild should be suggested because the page still references the old scheduled-memory blocked state." + } + ], + "must_not_include": [ + "scheduled knowledge refresh may rewrite authoritative source notes automatically" + ], + "evidence_links": { + "scheduled_knowledge_refresh_suggested": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite", + "quote": "old scheduled-memory blocked state" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite", + "quote": "must not silently rewrite authoritative source notes" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-knowledge-silent-rewrite-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-knowledge-silent-rewrite-trap"], + "failure_if_used": true + } + ], + "scheduled_memory": { + "required_task_kinds": ["knowledge_page_refresh_suggestion"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Output suggests the reviewable knowledge-page rebuild." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Output cites lint finding and refresh boundary evidence." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Silent source-note rewrite trap is not selected." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Scheduled refresh suggestion leaves source mutation count at zero." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback", + "source mutation count must remain zero" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No reviewable rebuild boundary is available."], + "fallback_action": "defer_knowledge_refresh" + }, + "tags": ["synthetic", "scheduled_memory", "knowledge_page_refresh_suggestion", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json new file mode 100644 index 00000000..54461f9d --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json @@ -0,0 +1,129 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "suite": "scheduled_memory", + "title": "Block private/provider scheduled tasks without operator inputs", + "corpus": { + "corpus_id": "real-world-memory-scheduled-private-provider-2026-06-17", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "scheduled-private-provider-missing-inputs", + "kind": "blocker", + "text": "Private/provider scheduled task blocker: no operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available to this fixture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_provider_scheduler_blocked", + "evidence_id": "scheduled-private-provider-missing-inputs" + }, + "locator": { + "quote": "no operator-owned private production corpus manifest" + } + }, + "created_at": "2026-06-17T00:50:00Z" + }, + { + "evidence_id": "scheduled-private-provider-xy930-boundary", + "kind": "issue", + "text": "XY-930 owns private-corpus and credentialed production gates when operator inputs exist; fixture-only scheduled-memory output must not claim private/provider scheduler quality.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_provider_scheduler_blocked", + "evidence_id": "scheduled-private-provider-xy930-boundary" + }, + "locator": { + "quote": "XY-930 owns private-corpus and credentialed production gates" + } + }, + "created_at": "2026-06-17T00:51:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "scheduled-private-provider-blocked-recorded", + "ts": "2026-06-17T00:51:00Z", + "actor": "scheduler_fixture", + "action": "recorded_private_provider_blocker", + "evidence_ids": [ + "scheduled-private-provider-missing-inputs", + "scheduled-private-provider-xy930-boundary" + ], + "summary": "Private/provider scheduled task scoring remains blocked until operator-owned inputs exist." + } + ], + "prompt": { + "role": "system", + "content": "Run private/provider scheduled memory tasks when operator inputs exist.", + "job_mode": "scheduled_memory", + "constraints": [ + "do_not_require_private_corpus", + "block_without_operator_inputs", + "tie_blocker_to_xy930" + ] + }, + "expected_answer": { + "must_include": [], + "must_not_include": [ + "private/provider scheduled task quality is proven", + "hosted scheduler production behavior passed", + "private production corpus scheduled refresh passed" + ], + "evidence_links": {}, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": true + }, + "required_evidence": [], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The fixture records private/provider scheduled tasks as blocked." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The blocker points to missing operator inputs and XY-930." + }, + "uncertainty_handling": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "No private/provider scheduler pass is claimed." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The blocked state remains typed and visible." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No operator-owned private production corpus manifest is available.", + "No hosted scheduler configuration is available." + ], + "fallback_action": "block_private_provider_scheduler_until_xy930_inputs_exist" + }, + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "follow_up": { + "title": "XY-930 private/provider scheduled-memory input gate", + "reason": "Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist." + } + }, + "tags": ["private_sanitized", "scheduled_memory", "private_provider_scheduler", "xy930_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json new file mode 100644 index 00000000..2efd9140 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json @@ -0,0 +1,283 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-stale-decision-audit-001", + "suite": "scheduled_memory", + "title": "Audit a stale project decision during a scheduled task", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "kind": "decision", + "text": "Historical decision: scheduled-memory readiness stays blocked and should only run cargo make real-world-memory-consolidation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "scheduled-old-consolidation-only-decision" + }, + "locator": { + "quote": "only run cargo make real-world-memory-consolidation" + } + }, + "created_at": "2026-06-16T05:00:00Z" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "kind": "decision", + "text": "Current decision: scheduled-memory readiness must use the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "scheduled-current-direct-suite-decision" + }, + "locator": { + "quote": "direct real-world-memory-scheduled fixture suite" + } + }, + "created_at": "2026-06-17T00:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled stale decision audit: the consolidation-only readiness decision is superseded by the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "claims": [ + { + "claim_id": "scheduled_decision_superseded", + "text": "The consolidation-only scheduled readiness decision is superseded by the direct scheduled-memory fixture suite.", + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-stale-decision-audit-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:40:00Z", + "scheduled_for": "2026-06-17T00:37:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "stale_decision_audit", + "outputs": [ + { + "output_id": "scheduled-defer-consolidation-only-decision", + "output_kind": "stale_decision_audit", + "text": "Defer the consolidation-only scheduled readiness decision; the current gate is the direct scheduled-memory fixture suite plus aggregate regression guard.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:20:00Z", + "valid_from": "2026-06-16T05:00:00Z", + "valid_to": "2026-06-17T00:20:00Z", + "last_confirmed_at": "2026-06-17T00:40:00Z", + "superseded_by": ["scheduled-current-direct-suite-decision"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_DECISION", + "reason": "The old consolidation-only decision is retained as history and is not the current scheduled-memory readiness gate." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-current-direct-suite-decision", + "status": "current", + "reason": "Current direct scheduled-memory readiness gate." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "status": "superseded", + "reason": "Replaced by the direct scheduled-memory fixture suite.", + "superseded_by": "scheduled-current-direct-suite-decision" + } + ], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-stale-decision-audit-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:37:00Z", + "completed_at": "2026-06-17T00:40:00Z", + "output_ref": "scheduled-defer-consolidation-only-decision", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read historical and current scheduled-readiness decisions.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + { + "stage_name": "supersession_check", + "summary": "Classified the consolidation-only decision as superseded.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + { + "stage_name": "output_readback", + "summary": "Recorded scheduled stale-decision output for review.", + "evidence_refs": ["scheduled-current-direct-suite-decision"] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-direct-suite-decision-recorded", + "ts": "2026-06-17T00:20:00Z", + "actor": "agent", + "action": "recorded_current_decision", + "evidence_ids": ["scheduled-current-direct-suite-decision"], + "summary": "The direct scheduled-memory fixture suite became the current readiness gate." + }, + { + "event_id": "scheduled-decision-audit-output-recorded", + "ts": "2026-06-17T00:40:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "summary": "The stale decision audit was recorded with supersession evidence." + } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled stale decision audit.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_superseded_decisions", + "record_execution_trace", + "do_not_use_old_decision_as_current" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_decision_superseded", + "text": "The consolidation-only scheduled readiness decision is superseded by the direct scheduled-memory fixture suite." + } + ], + "must_not_include": ["scheduled-memory readiness stays blocked and should only run cargo make real-world-memory-consolidation"], + "evidence_links": { + "scheduled_decision_superseded": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite", + "quote": "only run cargo make real-world-memory-consolidation" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite", + "quote": "direct real-world-memory-scheduled fixture suite" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-consolidation-only-current-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-old-consolidation-only-decision"], + "failure_if_used": false + } + ], + "scheduled_memory": { + "required_task_kinds": ["stale_decision_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit identifies the superseded decision and current replacement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit cites both old and new decision evidence." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The old decision is not presented as current." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Supersession markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current scheduled-memory decision is available."], + "fallback_action": "defer_superseded_decision" + }, + "tags": ["synthetic", "scheduled_memory", "stale_decision_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json new file mode 100644 index 00000000..99005250 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json @@ -0,0 +1,412 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-stale-preference-plan-audit-001", + "suite": "scheduled_memory", + "title": "Audit stale preferences and plans during a scheduled task", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-stale-old-plan", + "kind": "plan", + "text": "Old scheduled plan: publish the scheduled-memory report by reusing proactive-brief fixtures and skipping execution trace readback.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-stale-old-plan" + }, + "locator": { + "quote": "skipping execution trace readback" + } + }, + "created_at": "2026-06-16T09:00:00Z" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "kind": "tombstone", + "text": "TTL invalidation: the old scheduled-memory report plan expired at 2026-06-17T00:00:00Z and must not be recommended as current work.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-stale-plan-expired" + }, + "locator": { + "quote": "expired at 2026-06-17T00:00:00Z" + } + }, + "created_at": "2026-06-17T00:00:00Z" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "kind": "plan", + "text": "Current scheduled plan: scheduled-memory tasks must record execution trace/readback and source-linked output before the lane is validation-ready.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-current-trace-plan" + }, + "locator": { + "quote": "record execution trace/readback" + } + }, + "created_at": "2026-06-17T00:15:00Z" + }, + { + "evidence_id": "scheduled-old-silent-mutation-preference", + "kind": "preference", + "text": "Historical preference: scheduled audits may silently rewrite stale plans after detecting them.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-old-silent-mutation-preference" + } + }, + "created_at": "2026-06-16T09:10:00Z" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "kind": "preference", + "text": "Current preference: scheduled audits should produce reviewable derived output and must not mutate source notes silently.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-current-reviewable-preference" + }, + "locator": { + "quote": "must not mutate source notes silently" + } + }, + "created_at": "2026-06-17T00:18:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled stale preference/plan audit: the old report plan is expired, the silent-mutation preference is historical, and the current path requires trace/readback plus reviewable derived output.", + "claims": [ + { + "claim_id": "scheduled_stale_plan_expired", + "text": "The old scheduled-memory report plan is expired and superseded by the trace/readback requirement.", + "evidence_ids": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "confidence": "high" + }, + { + "claim_id": "scheduled_silent_mutation_rejected", + "text": "Scheduled audits must not mutate source notes silently; they should produce reviewable derived output.", + "evidence_ids": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-stale-plan-audit-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:35:00Z", + "scheduled_for": "2026-06-17T00:32:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "stale_preference_plan_audit", + "outputs": [ + { + "output_id": "scheduled-defer-expired-report-plan", + "output_kind": "stale_preference_plan_audit", + "text": "Defer the old scheduled-memory report plan because it expired; use the current trace/readback requirement instead.", + "evidence_refs": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:15:00Z", + "valid_from": "2026-06-16T09:00:00Z", + "valid_to": "2026-06-17T00:00:00Z", + "last_confirmed_at": "2026-06-17T00:35:00Z", + "superseded_by": ["scheduled-current-trace-plan"], + "tombstone_refs": ["scheduled-stale-plan-expired"] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_EXPIRED_PLAN", + "reason": "The old plan is retained as history and must not be recommended as current work." + }, + "unsupported_claim_flags": [] + }, + { + "output_id": "scheduled-reject-silent-source-mutation", + "output_kind": "stale_preference_plan_audit", + "text": "Reject silent source-note mutation during scheduled audits and keep the audit output reviewable.", + "evidence_refs": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:18:00Z", + "valid_from": "2026-06-16T09:10:00Z", + "valid_to": "2026-06-17T00:18:00Z", + "last_confirmed_at": "2026-06-17T00:35:00Z", + "superseded_by": ["scheduled-current-reviewable-preference"], + "tombstone_refs": [] + }, + "action": { + "decision": "reject", + "reason_code": "REJECT_SILENT_SOURCE_MUTATION", + "reason": "The current preference requires reviewable derived output rather than silent source rewrites." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-current-trace-plan", + "status": "current", + "reason": "Current trace/readback requirement." + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "status": "current", + "reason": "Current reviewable-output boundary." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "scheduled-stale-old-plan", + "status": "superseded", + "reason": "Replaced by current trace/readback requirement.", + "superseded_by": "scheduled-current-trace-plan" + }, + { + "evidence_id": "scheduled-old-silent-mutation-preference", + "status": "superseded", + "reason": "Replaced by current reviewable-output preference.", + "superseded_by": "scheduled-current-reviewable-preference" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "scheduled-stale-plan-expired", + "status": "tombstoned", + "reason": "TTL invalidation for the old report plan." + } + ], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-stale-plan-audit-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:32:00Z", + "completed_at": "2026-06-17T00:35:00Z", + "output_ref": "scheduled-defer-expired-report-plan", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read old and current plan/preference sources.", + "evidence_refs": [ + "scheduled-stale-old-plan", + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ] + }, + { + "stage_name": "ttl_filter", + "summary": "Detected TTL invalidation before action selection.", + "evidence_refs": ["scheduled-stale-plan-expired"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded reviewable audit output without source mutation.", + "evidence_refs": [ + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-stale-plan-expired", + "ts": "2026-06-17T00:00:00Z", + "actor": "gc_fixture", + "action": "recorded_ttl_invalidation", + "evidence_ids": ["scheduled-stale-plan-expired"], + "summary": "The old scheduled-memory plan expired before the scheduled audit ran." + }, + { + "event_id": "scheduled-stale-audit-output-recorded", + "ts": "2026-06-17T00:35:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ], + "summary": "The stale preference/plan audit was recorded as reviewable output." + } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled stale preference and plan audit.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "do_not_recommend_expired_plans", + "do_not_mutate_source_notes_silently" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_stale_plan_expired", + "text": "The old scheduled-memory report plan is expired and superseded by the trace/readback requirement." + }, + { + "claim_id": "scheduled_silent_mutation_rejected", + "text": "Scheduled audits must not mutate source notes silently; they should produce reviewable derived output." + } + ], + "must_not_include": [ + "publish the scheduled-memory report by reusing proactive-brief fixtures", + "scheduled audits may silently rewrite stale plans" + ], + "evidence_links": { + "scheduled_stale_plan_expired": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "scheduled_silent_mutation_rejected": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "skipping execution trace readback" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "expired at 2026-06-17T00:00:00Z" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "record execution trace/readback" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": "scheduled_silent_mutation_rejected", + "requirement": "cite", + "quote": "must not mutate source notes silently" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-stale-plan-current-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-stale-old-plan"], + "failure_if_used": false + } + ], + "scheduled_memory": { + "required_task_kinds": ["stale_preference_plan_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Audit identifies the expired plan and rejected silent-mutation preference." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit cites old, current, and invalidation evidence." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The expired plan is not treated as current." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Supersession and tombstone markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback", + "source mutation count must remain zero" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current replacement plan is available."], + "fallback_action": "defer_expired_plan" + }, + "tags": ["synthetic", "scheduled_memory", "stale_preference_plan_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json new file mode 100644 index 00000000..ad8fa2ac --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json @@ -0,0 +1,299 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-weekly-project-status-summary-001", + "suite": "scheduled_memory", + "title": "Run a weekly project status summary from current memory", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "kind": "decision", + "text": "Current scheduled-memory gate: run cargo make real-world-memory-scheduled and targeted real_world_job_benchmark tests before any validation-ready claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": "scheduled-weekly-current-gate" + }, + "locator": { + "quote": "real-world-memory-scheduled" + } + }, + "created_at": "2026-06-17T00:10:00Z" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "kind": "plan", + "text": "Current ledger action: update the XY-951 scheduled-memory-task readiness stage with the scheduled benchmark delta and regression analysis.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": "scheduled-weekly-ledger-update" + }, + "locator": { + "quote": "XY-951 scheduled-memory-task readiness stage" + } + }, + "created_at": "2026-06-17T00:12:00Z" + }, + { + "evidence_id": "scheduled-weekly-hosted-parity-trap", + "kind": "note", + "text": "Stale claim: fixture-backed scheduled memory proves parity with ChatGPT Tasks, Pulse, and managed background products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": "scheduled-weekly-hosted-parity-trap" + } + }, + "created_at": "2026-06-16T20:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Weekly scheduled summary: run cargo make real-world-memory-scheduled, update the XY-951 scheduled-memory-task readiness ledger, and do not claim hosted scheduled-product parity from fixture evidence.", + "claims": [ + { + "claim_id": "scheduled_weekly_gate", + "text": "The scheduled-memory validation gate is cargo make real-world-memory-scheduled plus targeted real_world_job_benchmark tests.", + "evidence_ids": ["scheduled-weekly-current-gate"], + "confidence": "high" + }, + { + "claim_id": "scheduled_weekly_ledger", + "text": "The XY-951 scheduled-memory-task readiness stage needs the scheduled benchmark delta and regression analysis.", + "evidence_ids": ["scheduled-weekly-ledger-update"], + "confidence": "high" + } + ], + "evidence_ids": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-weekly-status-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:30:00Z", + "scheduled_for": "2026-06-17T00:25:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "weekly_project_status_summary", + "outputs": [ + { + "output_id": "weekly-summary-validation-ready-next-step", + "output_kind": "weekly_project_status_summary", + "text": "Run the scheduled-memory fixture command, update the XY-951 scheduled-memory-task readiness stage, and keep hosted scheduler parity out of the claim.", + "evidence_refs": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "freshness": { + "status": "current", + "observed_at": "2026-06-17T00:12:00Z", + "valid_from": "2026-06-17T00:10:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-17T00:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_SCHEDULED_GATE", + "reason": "Both selected source refs are current project-memory items and the hosted parity trap was dropped." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "status": "current", + "reason": "Current scheduled-memory validation command." + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "status": "current", + "reason": "Current ledger update requirement." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "scheduled-weekly-hosted-parity-trap", + "status": "stale", + "reason": "Fixture evidence cannot prove hosted scheduled-product parity." + } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-weekly-status-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:25:00Z", + "completed_at": "2026-06-17T00:30:00Z", + "output_ref": "weekly-summary-validation-ready-next-step", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read current validation and ledger sources.", + "evidence_refs": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"] + }, + { + "stage_name": "stale_filter", + "summary": "Dropped hosted parity trap before output.", + "evidence_refs": ["scheduled-weekly-hosted-parity-trap"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded source-linked scheduled output for review.", + "evidence_refs": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-weekly-run-created", + "ts": "2026-06-17T00:25:00Z", + "actor": "scheduler_fixture", + "action": "started_scheduled_task", + "evidence_ids": ["scheduled-weekly-current-gate"], + "summary": "The weekly scheduled task started from current project memory." + }, + { + "event_id": "scheduled-weekly-output-recorded", + "ts": "2026-06-17T00:30:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"], + "summary": "The scheduled output was recorded with readback trace and source refs." + } + ], + "prompt": { + "role": "system", + "content": "Run the weekly project status summary scheduled task.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "record_execution_trace", + "do_not_claim_hosted_scheduler_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_weekly_gate", + "text": "The scheduled-memory validation gate is cargo make real-world-memory-scheduled plus targeted real_world_job_benchmark tests." + }, + { + "claim_id": "scheduled_weekly_ledger", + "text": "The XY-951 scheduled-memory-task readiness stage needs the scheduled benchmark delta and regression analysis." + } + ], + "must_not_include": [ + "fixture-backed scheduled memory proves parity with ChatGPT Tasks", + "fixture-backed scheduled memory proves parity with Pulse", + "fixture-backed scheduled memory proves parity with managed background products" + ], + "evidence_links": { + "scheduled_weekly_gate": ["scheduled-weekly-current-gate"], + "scheduled_weekly_ledger": ["scheduled-weekly-ledger-update"] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite", + "quote": "real-world-memory-scheduled" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite", + "quote": "XY-951 scheduled-memory-task readiness stage" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-weekly-hosted-parity-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-weekly-hosted-parity-trap"], + "failure_if_used": true + } + ], + "scheduled_memory": { + "required_task_kinds": ["weekly_project_status_summary"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Scheduled output names the current scheduled-memory command and ledger update." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Every scheduled output carries source evidence refs." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The task run records execution trace and output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The hosted parity trap is not selected as current evidence." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Freshness and currentness markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "scheduled task output lacks execution trace readback" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No hosted scheduler parity claim is supported by this fixture."], + "fallback_action": "defer_hosted_scheduler_claim" + }, + "tags": ["synthetic", "scheduled_memory", "weekly_project_status_summary", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json b/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json new file mode 100644 index 00000000..174eca2b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/trust/source_of_truth_rebuild.json @@ -0,0 +1,213 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "trust-sot-rebuild-001", + "suite": "trust_source_of_truth", + "title": "Answer from restored Postgres source-of-truth after Qdrant rebuild", + "corpus": { + "corpus_id": "real-world-memory-trust-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "restore-proof-note", + "kind": "runbook", + "text": "Restore proof note: Postgres backup restored note single_user_restore_probe, and the search answer must cite this source note rather than Qdrant payload text.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "restore-proof-note" + }, + "locator": { + "quote": "Postgres backup restored note single_user_restore_probe" + } + }, + "created_at": "2026-06-09T01:00:00Z" + }, + { + "evidence_id": "qdrant-rebuild-proof", + "kind": "trace", + "text": "Qdrant rebuild proof: admin rebuild read vectors from Postgres chunk embeddings and returned rebuilt_count=1, missing_vector_count=0, error_count=0 before the final search.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "qdrant-rebuild-proof" + }, + "locator": { + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + }, + "created_at": "2026-06-09T01:05:00Z" + }, + { + "evidence_id": "index-only-decoy", + "kind": "adapter_state", + "text": "Decoy: a stale Qdrant-only payload says the restore can be trusted without checking Postgres source rows.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "index-only-decoy" + } + }, + "created_at": "2026-06-09T00:50:00Z" + }, + { + "evidence_id": "unsupported-private-manifest-decoy", + "kind": "decision", + "text": "Unsupported prior: the same restore proof also proves a private production corpus passed.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "source_of_truth_rebuild", + "evidence_id": "unsupported-private-manifest-decoy" + } + }, + "created_at": "2026-06-09T00:55:00Z" + } + ], + "adapter_response": { + "adapter_id": "elf_real_world_memory_fixture", + "answer": { + "content": "Use the restored Postgres note as authoritative evidence: Postgres backup restored note single_user_restore_probe. Qdrant was rebuilt from Postgres chunk embeddings before the final search, with rebuilt_count=1, missing_vector_count=0, and error_count=0. This does not prove a private production corpus pass.", + "claims": [ + { + "claim_id": "authoritative_restore_source", + "text": "Postgres backup restored note single_user_restore_probe.", + "evidence_ids": ["restore-proof-note"], + "confidence": "high" + }, + { + "claim_id": "qdrant_rebuilt_from_sot", + "text": "Qdrant was rebuilt from Postgres chunk embeddings before the final search.", + "evidence_ids": ["qdrant-rebuild-proof"], + "confidence": "high" + } + ], + "evidence_ids": ["restore-proof-note", "qdrant-rebuild-proof"], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "restore-postgres", + "ts": "2026-06-09T01:00:00Z", + "actor": "operator", + "action": "restored_postgres_backup", + "evidence_ids": ["restore-proof-note"], + "summary": "The authoritative source note came back from Postgres restore." + }, + { + "event_id": "rebuild-qdrant", + "ts": "2026-06-09T01:05:00Z", + "actor": "tool", + "action": "rebuilt_qdrant_from_postgres_vectors", + "evidence_ids": ["qdrant-rebuild-proof"], + "summary": "Qdrant was rebuilt from Postgres-held chunk embeddings before answering." + } + ], + "prompt": { + "role": "user", + "content": "Can I trust the restored memory answer, and what evidence proves the vector index was rebuilt from source-of-truth state?", + "job_mode": "answer", + "constraints": ["cite_evidence", "avoid_index_only_claims", "state_caveats"] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "authoritative_restore_source", + "text": "Postgres backup restored note single_user_restore_probe." + }, + { + "claim_id": "qdrant_rebuilt_from_sot", + "text": "Qdrant was rebuilt from Postgres chunk embeddings before the final search." + } + ], + "must_not_include": [ + "the restore can be trusted without checking Postgres source rows", + "private production corpus passed" + ], + "evidence_links": { + "authoritative_restore_source": ["restore-proof-note"], + "qdrant_rebuilt_from_sot": ["qdrant-rebuild-proof"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "restore-proof-note", + "claim_id": "authoritative_restore_source", + "requirement": "cite", + "quote": "Postgres backup restored note single_user_restore_probe" + }, + { + "evidence_id": "qdrant-rebuild-proof", + "claim_id": "qdrant_rebuilt_from_sot", + "requirement": "cite", + "quote": "rebuilt_count=1, missing_vector_count=0, error_count=0" + } + ], + "negative_traps": [ + { + "trap_id": "index-only-source-of-truth", + "type": "decoy_evidence", + "evidence_ids": ["index-only-decoy"], + "failure_if_used": true + }, + { + "trap_id": "unsupported-private-corpus-pass", + "type": "unsupported_prior", + "evidence_ids": ["unsupported-private-manifest-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies Postgres as source of truth and Qdrant as derived." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites restore and rebuild evidence with source refs." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not trust index-only or unsupported private-corpus decoys." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Demonstrates rebuild from source-of-truth state before answering." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "tags": ["synthetic", "trust", "source_ref", "qdrant_rebuild", "no_live_claim"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json new file mode 100644 index 00000000..841ace1a --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/capture_integration_boundaries.json @@ -0,0 +1,325 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "capture-integration-boundaries-001", + "suite": "capture_integration", + "title": "Explain cross-tool capture boundaries without claiming live integrations", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "capture_behaviors": { + "real": [ + "The runner validates fixture evidence ids, required evidence links, and inline quote substrings." + ], + "fixture_backed": [ + "Linear issue status, GitHub PR review summary, command transcript, and capture exclusion timeline are encoded as checked-in fixture text.", + "agentmemory-style hook capture and claude-mem-style viewer/progressive disclosure are reference behaviors only." + ], + "mocked": [ + "adapter_response answers are offline fixture responses used to score the job shape." + ], + "blocked": [ + "Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters require separate credentials or runtime wiring." + ], + "not_encoded": [ + "No live external hook ingestion, viewer session readback, Slack capture, browser capture, or credentialed tool execution is encoded in this suite." + ], + "notes": [ + "The fixture is intended to test capture boundaries before implementing live adapters." + ] + }, + "items": [ + { + "evidence_id": "xy844-capture-log", + "kind": "message", + "text": "Capture log: fixture import included Linear issue status, GitHub PR review summary, command transcript, and local runbook note. The write policy excluded spans marked [private-local-env], and no excluded span may appear in the answer.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "xy844-capture-log" + } + }, + "created_at": "2026-06-09T08:25:00Z" + }, + { + "evidence_id": "agentmemory-hook-reference", + "kind": "adapter_state", + "text": "Reference point: agentmemory-style hooks can capture coding-agent events and expose viewer or console readback, but this fixture does not run a live agentmemory hook or durable agentmemory store.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "agentmemory-hook-reference" + } + }, + "created_at": "2026-06-09T08:25:10Z" + }, + { + "evidence_id": "claude-mem-viewer-reference", + "kind": "adapter_state", + "text": "Reference point: claude-mem-style progressive disclosure and local viewer workflows are useful capture and inspection references, but this fixture does not read live claude-mem viewer state.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "claude-mem-viewer-reference" + } + }, + "created_at": "2026-06-09T08:25:20Z" + }, + { + "evidence_id": "live-adapter-follow-up", + "kind": "decision", + "text": "Boundary decision: live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters are separate follow-up work. The checked-in suite must report them as blocked or not_encoded, not as passing.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "live-adapter-follow-up" + } + }, + "created_at": "2026-06-09T08:25:30Z" + }, + { + "evidence_id": "private-span-trap", + "kind": "message", + "text": "Excluded private span trap: [private-local-env] provider credential placeholder and local account-only browser state must not be stored or repeated.", + "capture": { + "action": "exclude", + "source_id": "capture:private-span-trap", + "evidence_binding": "negative_trap" + }, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "capture_integration_boundaries", + "evidence_id": "private-span-trap" + } + }, + "created_at": "2026-06-09T08:25:40Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The encoded capture is fixture-backed: it includes Linear issue status, GitHub PR review summary, command transcript, and a local runbook note, with [private-local-env] spans excluded. agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are fair reference points only. Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters are blocked or not_encoded follow-up work, not passing behavior.", + "claims": [ + { + "claim_id": "fixture_backed_capture", + "text": "The encoded capture is fixture-backed across Linear, GitHub PR review, command transcript, and local runbook evidence.", + "evidence_ids": ["xy844-capture-log"], + "confidence": "high" + }, + { + "claim_id": "reference_points", + "text": "agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are reference points only.", + "evidence_ids": ["agentmemory-hook-reference", "claude-mem-viewer-reference"], + "confidence": "high" + }, + { + "claim_id": "live_adapter_boundary", + "text": "Live external adapters are blocked or not_encoded follow-up work.", + "evidence_ids": ["live-adapter-follow-up"], + "confidence": "high" + }, + { + "claim_id": "privacy_boundary", + "text": "Private spans marked [private-local-env] are excluded and must not be repeated.", + "evidence_ids": ["xy844-capture-log"], + "confidence": "high" + } + ], + "evidence_ids": [ + "xy844-capture-log", + "agentmemory-hook-reference", + "claude-mem-viewer-reference", + "live-adapter-follow-up" + ], + "latency_ms": 2.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-fixture-capture-import", + "ts": "2026-06-09T08:25:00Z", + "actor": "agent", + "action": "captured_fixture_timeline", + "evidence_ids": ["xy844-capture-log"], + "summary": "Fixture-backed import captured issue, PR, command, and runbook surfaces while excluding private spans." + }, + { + "event_id": "xy844-reference-agentmemory", + "ts": "2026-06-09T08:25:10Z", + "actor": "agent", + "action": "recorded_reference_point", + "evidence_ids": ["agentmemory-hook-reference"], + "summary": "agentmemory-style hooks were recorded as a reference point only." + }, + { + "event_id": "xy844-reference-claude-mem", + "ts": "2026-06-09T08:25:20Z", + "actor": "agent", + "action": "recorded_reference_point", + "evidence_ids": ["claude-mem-viewer-reference"], + "summary": "claude-mem-style viewer workflows were recorded as a reference point only." + }, + { + "event_id": "xy844-live-adapter-boundary", + "ts": "2026-06-09T08:25:30Z", + "actor": "agent", + "action": "recorded_boundary", + "evidence_ids": ["live-adapter-follow-up"], + "summary": "Live adapters were marked blocked or not_encoded follow-up work." + } + ], + "prompt": { + "role": "user", + "content": "Explain the cross-tool capture boundaries for this work-resume suite. Which behavior is real, fixture-backed, mocked, blocked, or not encoded?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "do_not_claim_live_integrations", + "respect_private_exclusions" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_backed_capture", + "text": "The encoded capture is fixture-backed across Linear, GitHub PR review, command transcript, and local runbook evidence." + }, + { + "claim_id": "reference_points", + "text": "agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are reference points only." + }, + { + "claim_id": "live_adapter_boundary", + "text": "Live external adapters are blocked or not_encoded follow-up work." + }, + { + "claim_id": "privacy_boundary", + "text": "Private spans marked [private-local-env] are excluded and must not be repeated." + } + ], + "must_not_include": [ + "agentmemory live hooks passed.", + "claude-mem live viewer state was read.", + "Slack live capture passed.", + "provider credential placeholder" + ], + "evidence_links": { + "fixture_backed_capture": ["xy844-capture-log"], + "reference_points": ["agentmemory-hook-reference", "claude-mem-viewer-reference"], + "live_adapter_boundary": ["live-adapter-follow-up"], + "privacy_boundary": ["xy844-capture-log"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-capture-log", + "claim_id": "fixture_backed_capture", + "requirement": "cite", + "quote": "fixture import included Linear issue status, GitHub PR review summary, command transcript, and local runbook note" + }, + { + "evidence_id": "agentmemory-hook-reference", + "claim_id": "reference_points", + "requirement": "use", + "quote": "this fixture does not run a live agentmemory hook or durable agentmemory store" + }, + { + "evidence_id": "claude-mem-viewer-reference", + "claim_id": "reference_points", + "requirement": "use", + "quote": "this fixture does not read live claude-mem viewer state" + }, + { + "evidence_id": "live-adapter-follow-up", + "claim_id": "live_adapter_boundary", + "requirement": "cite", + "quote": "The checked-in suite must report them as blocked or not_encoded, not as passing." + } + ], + "negative_traps": [ + { + "trap_id": "private-span-repeat", + "type": "privacy_leak", + "evidence_ids": ["private-span-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Correctly classifies capture and integration behavior." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites fixture-backed capture and adapter-boundary evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not repeat excluded private spans or claim live adapter passes." + }, + "lifecycle_behavior": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Preserves write-policy exclusion semantics in the answer." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "States which integration work remains follow-up." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed capture classification should be immediate." + } + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "unsafe live/destructive action when the prompt forbids it", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "capture_integration", + "agentmemory_reference", + "claude_mem_reference", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json new file mode 100644 index 00000000..9732671b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_decodex_linear_status.json @@ -0,0 +1,194 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-decodex-linear-status-001", + "suite": "work_resume", + "title": "Recover Decodex and Linear lane ownership before handoff", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-lane-state", + "kind": "issue", + "text": "XY-844 current lane state: Linear status is In Progress, branch is y/elf-xy-844, routed GitHub identity is y, and routed Linear workspace is hackink. Do not move the issue directly to In Review; after validation, push the branch, create or update a non-draft PR, then call issue_review_handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_decodex_linear_status", + "evidence_id": "xy844-current-lane-state" + } + }, + "created_at": "2026-06-09T08:05:00Z" + }, + { + "evidence_id": "xy844-wrong-workspace-trap", + "kind": "decision", + "text": "Old context note: use GitHub identity x and Linear workspace helixbox for this lane, then move the issue directly to In Review after local tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_decodex_linear_status", + "evidence_id": "xy844-wrong-workspace-trap" + } + }, + "created_at": "2026-06-08T08:05:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "Keep XY-844 on branch y/elf-xy-844 with routed GitHub identity y and Linear workspace hackink. Do not move the issue directly to In Review; after validation, push the branch, prepare the non-draft PR, and call issue_review_handoff.", + "claims": [ + { + "claim_id": "ownership", + "text": "Use branch y/elf-xy-844, GitHub identity y, and Linear workspace hackink.", + "evidence_ids": ["xy844-current-lane-state"], + "confidence": "high" + }, + { + "claim_id": "handoff_path", + "text": "Do not move directly to In Review; use PR-backed issue_review_handoff after validation.", + "evidence_ids": ["xy844-current-lane-state"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-lane-state"], + "latency_ms": 2.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-wrong-identity-note", + "ts": "2026-06-08T08:05:00Z", + "actor": "agent", + "action": "recorded_wrong_identity", + "evidence_ids": ["xy844-wrong-workspace-trap"], + "summary": "An old context note pointed at the wrong routed identity and direct tracker transition." + }, + { + "event_id": "xy844-current-identity", + "ts": "2026-06-09T08:05:00Z", + "actor": "agent", + "action": "read_lane_identity", + "evidence_ids": ["xy844-current-lane-state"], + "summary": "The lane readback established the current branch, identity, workspace, and handoff path." + } + ], + "prompt": { + "role": "user", + "content": "Before resuming XY-844, tell me the current Decodex/Linear lane status and ownership path.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "preserve_ownership", + "do_not_mutate_tracker_directly" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "ownership", + "text": "Use branch y/elf-xy-844, GitHub identity y, and Linear workspace hackink." + }, + { + "claim_id": "handoff_path", + "text": "Do not move directly to In Review; use PR-backed issue_review_handoff after validation." + } + ], + "must_not_include": [ + "Use GitHub identity x and Linear workspace helixbox.", + "Move the issue directly to In Review after local tests." + ], + "evidence_links": { + "ownership": ["xy844-current-lane-state"], + "handoff_path": ["xy844-current-lane-state"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "ownership", + "requirement": "cite", + "quote": "branch is y/elf-xy-844, routed GitHub identity is y, and routed Linear workspace is hackink" + }, + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "handoff_path", + "requirement": "use", + "quote": "Do not move the issue directly to In Review" + } + ], + "negative_traps": [ + { + "trap_id": "wrong-routed-identity", + "type": "near_duplicate", + "evidence_ids": ["xy844-wrong-workspace-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States current lane state and handoff path." + }, + "ownership_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the routed branch, GitHub identity, and Linear workspace." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Grounds ownership and handoff claims in current lane evidence." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Avoids wrong identity and direct In Review transition traps." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed status answer should be immediate." + } + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "decodex", + "linear", + "ownership", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json new file mode 100644 index 00000000..5ddbda61 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_failed_command_recovery.json @@ -0,0 +1,203 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-failed-command-recovery-001", + "suite": "work_resume", + "title": "Recover from the current failed command without chasing stale lint evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-command-failure", + "kind": "trace", + "text": "Command log: cargo make smoke-real-world-job failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures. Earlier lint warnings had already been repaired. The exact recovery action is to create the missing fixtures, then rerun cargo make smoke-real-world-job.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_failed_command_recovery", + "evidence_id": "xy844-current-command-failure" + } + }, + "created_at": "2026-06-09T08:10:00Z" + }, + { + "evidence_id": "xy844-stale-lint-failure", + "kind": "trace", + "text": "Old command log: cargo make lint failed and the next action is to repair clippy warnings before touching fixtures.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_failed_command_recovery", + "evidence_id": "xy844-stale-lint-failure" + } + }, + "created_at": "2026-06-08T08:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The current stop is the missing work_resume fixture suite, not the old lint warning. Create the missing fixtures under apps/elf-eval/fixtures/real_world_memory/work_resume/, then rerun `cargo make smoke-real-world-job`.", + "claims": [ + { + "claim_id": "current_stop", + "text": "The current command failed because the required work_resume suite fixtures were missing.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + }, + { + "claim_id": "recovery_action", + "text": "Create the missing fixtures, then rerun `cargo make smoke-real-world-job`.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The older lint warning is stale.", + "evidence_ids": ["xy844-current-command-failure"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-command-failure"], + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-lint-failure", + "ts": "2026-06-08T08:10:00Z", + "actor": "tool", + "action": "ran_command", + "evidence_ids": ["xy844-stale-lint-failure"], + "summary": "An older lint failure was recorded before fixture work began." + }, + { + "event_id": "xy844-current-suite-failure", + "ts": "2026-06-09T08:10:00Z", + "actor": "tool", + "action": "ran_command", + "evidence_ids": ["xy844-current-command-failure"], + "summary": "The current suite command failed because the requested fixture directory lacked the required jobs." + } + ], + "prompt": { + "role": "user", + "content": "The lane failed a command. Tell me what failed now and the exact recovery command path without chasing stale errors.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_stale_logs", + "state_exact_next_action" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_stop", + "text": "The current command failed because the required work_resume suite fixtures were missing." + }, + { + "claim_id": "recovery_action", + "text": "Create the missing fixtures, then rerun `cargo make smoke-real-world-job`." + }, + { + "claim_id": "stale_blocker", + "text": "The older lint warning is stale." + } + ], + "must_not_include": [ + "Repair clippy warnings before touching fixtures.", + "Run `cargo make lint` next." + ], + "evidence_links": { + "current_stop": ["xy844-current-command-failure"], + "recovery_action": ["xy844-current-command-failure"], + "stale_blocker": ["xy844-current-command-failure"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "current_stop", + "requirement": "cite", + "quote": "failed because apps/elf-eval/fixtures/real_world_memory/work_resume/ did not contain the required suite fixtures" + }, + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "recovery_action", + "requirement": "use", + "quote": "The exact recovery action is to create the missing fixtures, then rerun cargo make smoke-real-world-job." + } + ], + "negative_traps": [ + { + "trap_id": "stale-lint-repair", + "type": "stale_fact", + "evidence_ids": ["xy844-stale-lint-failure"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Identifies the current failed command and recovery action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites the current command log." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use stale lint failure evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives the recovery path that advances the benchmark suite." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed command recovery should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "failed_command_recovery", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json new file mode 100644 index 00000000..1f6ae49e --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_next_action_extraction.json @@ -0,0 +1,191 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-next-action-extraction-001", + "suite": "work_resume", + "title": "Extract the exact next action across issue, PR, command, and runbook evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-next-action-handoff", + "kind": "runbook", + "text": "XY-844 handoff note: after adding the six work-resume continuity and capture/integration fixtures, run `cargo make smoke-real-world-job`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing branch y/elf-xy-844.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_next_action_extraction", + "evidence_id": "xy844-next-action-handoff" + } + }, + "created_at": "2026-06-09T08:20:00Z" + }, + { + "evidence_id": "xy844-baseline-decoy", + "kind": "runbook", + "text": "Old benchmark runbook note: the next command for production adoption evidence is `cargo make baseline-live-docker`.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_next_action_extraction", + "evidence_id": "xy844-baseline-decoy" + } + }, + "created_at": "2026-06-08T08:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The exact next action is to add the six work-resume continuity and capture/integration fixtures, run `cargo make smoke-real-world-job`, then run the pre-push gate: `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing y/elf-xy-844.", + "claims": [ + { + "claim_id": "exact_next_action", + "text": "Add the six work-resume continuity and capture/integration fixtures.", + "evidence_ids": ["xy844-next-action-handoff"], + "confidence": "high" + }, + { + "claim_id": "validation_sequence", + "text": "Run `cargo make smoke-real-world-job`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing.", + "evidence_ids": ["xy844-next-action-handoff"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-next-action-handoff"], + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-baseline-decoy-event", + "ts": "2026-06-08T08:20:00Z", + "actor": "agent", + "action": "recorded_old_benchmark_command", + "evidence_ids": ["xy844-baseline-decoy"], + "summary": "An older production-adoption benchmark note mentioned the live baseline command." + }, + { + "event_id": "xy844-current-next-action", + "ts": "2026-06-09T08:20:00Z", + "actor": "agent", + "action": "recorded_current_handoff", + "evidence_ids": ["xy844-next-action-handoff"], + "summary": "The current handoff specifies fixture additions and the validation sequence." + } + ], + "prompt": { + "role": "user", + "content": "Across the issue, PR, log, and runbook evidence, what is the exact next action for XY-844?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "state_exact_next_action", + "avoid_wrong_benchmark_suite" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "exact_next_action", + "text": "Add the six work-resume continuity and capture/integration fixtures." + }, + { + "claim_id": "validation_sequence", + "text": "Run `cargo make smoke-real-world-job`, then `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing." + } + ], + "must_not_include": [ + "Run `cargo make baseline-live-docker` next." + ], + "evidence_links": { + "exact_next_action": ["xy844-next-action-handoff"], + "validation_sequence": ["xy844-next-action-handoff"] + }, + "answer_type": "work_plan", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "exact_next_action", + "requirement": "cite", + "quote": "after adding the six work-resume continuity and capture/integration fixtures" + }, + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "validation_sequence", + "requirement": "use", + "quote": "run `cargo make smoke-real-world-job`, then run `cargo make fmt`, `cargo make lint-fix`, and `cargo make check`" + } + ], + "negative_traps": [ + { + "trap_id": "wrong-live-baseline-command", + "type": "decoy_evidence", + "evidence_ids": ["xy844-baseline-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Extracts the exact next action and validation sequence." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the current handoff evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the live-baseline decoy command." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives an executable sequence without extra discovery." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed next-action extraction should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "next_action", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json new file mode 100644 index 00000000..27e021d5 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_pr_review_blocker.json @@ -0,0 +1,205 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-pr-review-blocker-001", + "suite": "work_resume", + "title": "Recover the current PR review blocker without reviving resolved tracker work", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-pr-review", + "kind": "pr", + "text": "PR review summary for XY-844: the active review blocker is an unsupported live-adapter claim in the real-world job report. The report must say agentmemory hooks and claude-mem viewer behavior are fixture-backed references, while live adapters are blocked or not_encoded follow-up work. After that edit, rerun the real-world job suite before pushing the PR.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_pr_review_blocker", + "evidence_id": "xy844-current-pr-review" + } + }, + "created_at": "2026-06-09T08:15:00Z" + }, + { + "evidence_id": "xy844-resolved-review-blocker", + "kind": "pr", + "text": "Old PR review note: the active blocker is missing issue_transition evidence, and the next action is to move XY-844 to In Progress.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_pr_review_blocker", + "evidence_id": "xy844-resolved-review-blocker" + } + }, + "created_at": "2026-06-08T08:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "The current PR blocker is the unsupported live-adapter claim. Update the report so agentmemory hooks and claude-mem viewer behavior are only fixture-backed references and live adapters remain blocked or not_encoded follow-up work, then rerun the real-world job suite. The old missing issue_transition blocker is resolved.", + "claims": [ + { + "claim_id": "current_review_blocker", + "text": "The active review blocker is an unsupported live-adapter claim in the report.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + }, + { + "claim_id": "review_next_action", + "text": "Mark agentmemory and claude-mem behavior as fixture-backed references while live adapters remain blocked or not_encoded, then rerun the suite.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The missing issue_transition blocker is stale.", + "evidence_ids": ["xy844-current-pr-review"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-pr-review"], + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-tracker-blocker", + "ts": "2026-06-08T08:15:00Z", + "actor": "agent", + "action": "recorded_resolved_review_blocker", + "evidence_ids": ["xy844-resolved-review-blocker"], + "summary": "An old review note identified missing issue_transition evidence." + }, + { + "event_id": "xy844-current-review-blocker", + "ts": "2026-06-09T08:15:00Z", + "actor": "external", + "action": "published_review_summary", + "evidence_ids": ["xy844-current-pr-review"], + "summary": "The current PR review narrowed the blocker to unsupported live-adapter claims in the report." + } + ], + "prompt": { + "role": "user", + "content": "A PR review came in for XY-844. What is the active blocker, what is stale, and what should I do next?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_stale_review_threads", + "state_exact_next_action" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_review_blocker", + "text": "The active review blocker is an unsupported live-adapter claim in the report." + }, + { + "claim_id": "review_next_action", + "text": "Mark agentmemory and claude-mem behavior as fixture-backed references while live adapters remain blocked or not_encoded, then rerun the suite." + }, + { + "claim_id": "stale_blocker", + "text": "The missing issue_transition blocker is stale." + } + ], + "must_not_include": [ + "Move XY-844 to In Progress.", + "agentmemory and claude-mem live adapters passed." + ], + "evidence_links": { + "current_review_blocker": ["xy844-current-pr-review"], + "review_next_action": ["xy844-current-pr-review"], + "stale_blocker": ["xy844-current-pr-review"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "current_review_blocker", + "requirement": "cite", + "quote": "the active review blocker is an unsupported live-adapter claim in the real-world job report" + }, + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "review_next_action", + "requirement": "use", + "quote": "agentmemory hooks and claude-mem viewer behavior are fixture-backed references" + } + ], + "negative_traps": [ + { + "trap_id": "resolved-issue-transition-review", + "type": "stale_fact", + "evidence_ids": ["xy844-resolved-review-blocker"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Names active review blocker and next action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses current PR review evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids resolved issue_transition blocker." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Gives the report edit and rerun path." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed review recovery should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "pr_review", + "agentmemory_reference", + "claude_mem_reference", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json new file mode 100644 index 00000000..32f7852f --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/work_resume/work_resume_stale_worktree.json @@ -0,0 +1,192 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-stale-worktree-001", + "suite": "work_resume", + "title": "Resume a retained lane with stale worktree blocker evidence", + "corpus": { + "corpus_id": "real-world-memory-work-resume-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "xy844-current-worktree", + "kind": "trace", + "text": "Current XY-844 worktree evidence: branch y/elf-xy-844 has only .decodex-run-activity and .decodex-run-control as untracked runtime artifacts. The lane stopped before adding fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/. The exact next action is to add those fixtures and update the real_world_job runner/report tests.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_stale_worktree", + "evidence_id": "xy844-current-worktree" + } + }, + "created_at": "2026-06-09T08:00:00Z" + }, + { + "evidence_id": "xy844-stale-worktree-blocker", + "kind": "trace", + "text": "Old retained-lane note: XY-844 is blocked by untracked Decodex runtime files and should stop for manual cleanup.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_stale_worktree", + "evidence_id": "xy844-stale-worktree-blocker" + } + }, + "created_at": "2026-06-08T08:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_work_resume", + "answer": { + "content": "Resume XY-844 on branch y/elf-xy-844. The stale blocker is the old claim that untracked Decodex runtime files require manual cleanup; current evidence says those files are runtime artifacts. The exact next action is to add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests.", + "claims": [ + { + "claim_id": "next_action", + "text": "Add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests.", + "evidence_ids": ["xy844-current-worktree"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker", + "text": "The old untracked-runtime-files blocker is stale.", + "evidence_ids": ["xy844-current-worktree"], + "confidence": "high" + } + ], + "evidence_ids": ["xy844-current-worktree"], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy844-old-worktree-stop", + "ts": "2026-06-08T08:00:00Z", + "actor": "agent", + "action": "recorded_stale_blocker", + "evidence_ids": ["xy844-stale-worktree-blocker"], + "summary": "An older retained note treated untracked Decodex runtime files as a blocker." + }, + { + "event_id": "xy844-current-worktree-readback", + "ts": "2026-06-09T08:00:00Z", + "actor": "agent", + "action": "read_current_worktree", + "evidence_ids": ["xy844-current-worktree"], + "summary": "The current worktree readback narrowed the real next action to fixture and runner updates." + } + ], + "prompt": { + "role": "user", + "content": "What stopped this XY-844 lane, what is the exact next action, and which blocker is stale?", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_repeating_completed_work", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "next_action", + "text": "Add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests." + }, + { + "claim_id": "stale_blocker", + "text": "The old untracked-runtime-files blocker is stale." + } + ], + "must_not_include": [ + "Stop for manual cleanup of .decodex runtime files.", + "The untracked Decodex runtime files are the current blocker." + ], + "evidence_links": { + "next_action": ["xy844-current-worktree"], + "stale_blocker": ["xy844-current-worktree"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "xy844-current-worktree", + "claim_id": "next_action", + "requirement": "cite", + "quote": "The exact next action is to add those fixtures and update the real_world_job runner/report tests." + }, + { + "evidence_id": "xy844-current-worktree", + "claim_id": "stale_blocker", + "requirement": "use", + "quote": "only .decodex-run-activity and .decodex-run-control as untracked runtime artifacts" + } + ], + "negative_traps": [ + { + "trap_id": "stale-runtime-artifact-blocker", + "type": "stale_fact", + "evidence_ids": ["xy844-stale-worktree-blocker"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Includes what stopped the lane and the exact current next action." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Uses the current worktree evidence for required claims." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids stale blocker evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Advances the lane without asking for unnecessary cleanup." + }, + "latency_resource": { + "weight": 0.1, + "max_points": 1.0, + "criteria": { + "max_latency_ms": 50.0, + "description": "Fixture-backed answer should be immediate." + } + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "fixture_backed", + "worktree_resume", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/app.rs b/apps/elf-eval/src/app.rs new file mode 100644 index 00000000..b5234bc9 --- /dev/null +++ b/apps/elf-eval/src/app.rs @@ -0,0 +1,1576 @@ +use std::{ + cmp::Ordering, + collections::{HashMap, HashSet}, + fs, + path::{Path, PathBuf}, + time::Instant, +}; + +use clap::{Parser, ValueEnum}; +use color_eyre::{Result, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::FromRow; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use tracing_subscriber::EnvFilter; +use uuid::Uuid; + +use elf_config::Config; +use elf_service::{ + ElfService, RankingRequestOverride, SearchIndexItem, SearchIndexResponse, SearchRequest, + search::{self, TraceReplayContext, TraceReplayItem}, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +pub struct Args { + #[arg(long = "config-a", short = 'c', value_name = "FILE", visible_alias = "config")] + pub config_a: PathBuf, + #[arg(long = "config-b", value_name = "FILE")] + pub config_b: Option, + #[arg(long, short = 'd', value_name = "FILE", required_unless_present = "trace_id")] + pub dataset: Option, + #[arg(long, value_name = "N")] + pub top_k: Option, + #[arg(long, value_name = "N")] + pub candidate_k: Option, + #[arg(long, value_name = "N", default_value_t = 1)] + pub runs_per_query: u32, + #[arg(long, value_enum, default_value_t = SearchMode::PlannedSearch)] + pub search_mode: SearchMode, + #[arg(long = "search-mode-b", value_enum)] + pub search_mode_b: Option, + #[arg(long = "trace-id", value_name = "UUID", num_args = 1..)] + pub trace_id: Vec, +} + +#[derive(Clone, Copy, Debug, Deserialize, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum SearchMode { + #[value(name = "quick_find")] + QuickFind, + #[value(name = "planned_search")] + PlannedSearch, +} + +#[derive(Debug, Deserialize)] +struct EvalDataset { + name: Option, + defaults: Option, + queries: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct EvalDefaults { + tenant_id: Option, + project_id: Option, + agent_id: Option, + read_profile: Option, + top_k: Option, + candidate_k: Option, + ranking: Option, +} + +#[derive(Debug, Deserialize)] +struct EvalQuery { + id: Option, + query: String, + tenant_id: Option, + project_id: Option, + agent_id: Option, + read_profile: Option, + top_k: Option, + candidate_k: Option, + #[serde(default)] + expected_note_ids: Vec, + #[serde(default)] + expected_keys: Vec, + ranking: Option, +} + +#[derive(Debug, Serialize)] +struct EvalOutput { + dataset: EvalDatasetInfo, + settings: EvalSettings, + summary: EvalSummary, + queries: Vec, +} + +#[derive(Debug, Serialize)] +struct EvalDatasetInfo { + name: String, + query_count: usize, +} + +#[derive(Debug, Serialize)] +struct EvalSettings { + config_path: String, + search_mode: SearchMode, + candidate_k: u32, + top_k: u32, + #[serde(skip_serializing_if = "Option::is_none")] + runs_per_query: Option, +} + +#[derive(Debug, Serialize)] +struct EvalSummary { + avg_recall_at_k: f64, + avg_precision_at_k: f64, + mean_rr: f64, + mean_ndcg: f64, + latency_ms_p50: f64, + latency_ms_p95: f64, + avg_retrieved_summary_chars: f64, + #[serde(skip_serializing_if = "Option::is_none")] + stability: Option, +} + +#[derive(Debug, Serialize)] +struct StabilitySummary { + runs_per_query: u32, + avg_positional_churn_at_k: f64, + avg_set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct QueryReport { + id: String, + query: String, + trace_id: Uuid, + #[serde(skip_serializing_if = "Option::is_none")] + trace_ids: Option>, + expected_count: usize, + retrieved_count: usize, + relevant_count: usize, + recall_at_k: f64, + precision_at_k: f64, + rr: f64, + ndcg: f64, + latency_ms: f64, + expected_note_ids: Vec, + expected_keys: Vec, + expected_kind: ExpectedKind, + retrieved_note_ids: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + retrieved_keys: Vec>, + retrieved_summary_chars: usize, + #[serde(skip_serializing_if = "Option::is_none")] + stability: Option, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +enum ExpectedKind { + NoteId, + Key, +} + +#[derive(Clone, Copy, Debug, Serialize)] +struct QueryStability { + runs_per_query: u32, + positional_churn_at_k: f64, + set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct CompareOutput { + dataset: EvalDatasetInfo, + settings_a: EvalSettings, + settings_b: EvalSettings, + summary_a: EvalSummary, + summary_b: EvalSummary, + summary_delta: EvalSummaryDelta, + policy_stability: PolicyStabilitySummary, + queries: Vec, +} + +#[derive(Debug, Serialize)] +struct PolicyStabilitySummary { + k: u32, + avg_positional_churn_at_k: f64, + avg_set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct EvalSummaryDelta { + avg_recall_at_k: f64, + avg_precision_at_k: f64, + mean_rr: f64, + mean_ndcg: f64, + latency_ms_p50: f64, + latency_ms_p95: f64, + avg_retrieved_summary_chars: f64, + #[serde(skip_serializing_if = "Option::is_none")] + stability: Option, +} + +#[derive(Debug, Serialize)] +struct StabilitySummaryDelta { + avg_positional_churn_at_k: f64, + avg_set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct CompareQueryReport { + id: String, + query: String, + expected_count: usize, + expected_note_ids: Vec, + a: QueryVariantReport, + b: QueryVariantReport, + delta: QueryVariantDelta, + policy_churn: PolicyChurn, +} + +#[derive(Debug, Serialize)] +struct PolicyChurn { + positional_churn_at_k: f64, + set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct QueryVariantReport { + trace_id: Uuid, + #[serde(skip_serializing_if = "Option::is_none")] + trace_ids: Option>, + retrieved_count: usize, + relevant_count: usize, + recall_at_k: f64, + precision_at_k: f64, + rr: f64, + ndcg: f64, + latency_ms: f64, + retrieved_note_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + stability: Option, +} + +#[derive(Debug, Serialize)] +struct QueryVariantDelta { + retrieved_count: i64, + relevant_count: i64, + recall_at_k: f64, + precision_at_k: f64, + rr: f64, + ndcg: f64, + latency_ms: f64, + #[serde(skip_serializing_if = "Option::is_none")] + stability: Option, +} + +#[derive(Debug, Serialize)] +struct QueryStabilityDelta { + positional_churn_at_k: f64, + set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct TraceCompareOutput { + policies: TraceComparePolicies, + summary: TraceCompareSummary, + traces: Vec, +} + +#[derive(Debug, Serialize)] +struct TraceComparePolicies { + a: TraceComparePolicy, + b: TraceComparePolicy, +} + +#[derive(Debug, Serialize)] +struct TraceComparePolicy { + config_path: String, + policy_id: String, +} + +#[derive(Debug, Serialize)] +struct TraceCompareSummary { + trace_count: usize, + avg_positional_churn_at_k: f64, + avg_set_churn_at_k: f64, + avg_a_retrieval_top3_retention: f64, + avg_b_retrieval_top3_retention: f64, + avg_retrieval_top3_retention_delta: f64, +} + +#[derive(Debug, Serialize)] +struct TraceCompareTrace { + trace_id: Uuid, + query: String, + candidate_count: u32, + top_k: u32, + created_at: String, + a: TraceCompareVariant, + b: TraceCompareVariant, + churn: TraceCompareChurn, + guardrails: TraceCompareGuardrails, + stage_deltas: Vec, + regression_attribution: TraceCompareRegressionAttribution, +} + +#[derive(Debug, Serialize)] +struct TraceCompareVariant { + policy_id: String, + items: Vec, +} + +#[derive(Debug, Serialize)] +struct TraceCompareChurn { + positional_churn_at_k: f64, + set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +struct TraceCompareGuardrails { + retrieval_top3_total: usize, + a_retrieval_top3_retained: usize, + a_retrieval_top3_retention: f64, + b_retrieval_top3_retained: usize, + b_retrieval_top3_retention: f64, + retrieval_top3_retention_delta: f64, +} + +#[derive(Debug, Serialize)] +struct TraceCompareStageDelta { + stage_order: u32, + stage_name: String, + baseline_item_count: u32, + a_item_count: u32, + b_item_count: u32, + item_count_delta: i64, + #[serde(skip_serializing_if = "Option::is_none")] + baseline_stats: Option, +} + +#[derive(Debug, Serialize)] +struct TraceCompareRegressionAttribution { + primary_stage: String, + evidence: String, +} + +#[derive(FromRow)] +struct TraceCompareTraceRow { + trace_id: Uuid, + query: String, + candidate_count: i32, + top_k: i32, + created_at: OffsetDateTime, +} + +#[derive(FromRow)] +struct TraceCompareCandidateRow { + candidate_snapshot: Value, + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + snippet: String, + retrieval_rank: i32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + note_hit_count: i64, + note_last_hit_at: Option, +} + +#[derive(FromRow)] +struct TraceCompareStageRow { + stage_order: i32, + stage_name: String, + stage_payload: Value, + item_count: i64, +} + +struct MergedQuery { + id: String, + query: String, + expected_note_ids: Vec, + expected_keys: Vec, + expected_kind: ExpectedKind, + request: SearchRequest, +} + +struct Metrics { + recall_at_k: f64, + precision_at_k: f64, + rr: f64, + ndcg: f64, + relevant_count: usize, +} + +struct EvalRun { + dataset: EvalDatasetInfo, + settings: EvalSettings, + summary: EvalSummary, + queries: Vec, +} + +pub async fn run(args: Args) -> Result<()> { + let config_a = elf_config::load(&args.config_a)?; + let filter = EnvFilter::new(config_a.service.log_level.clone()); + + tracing_subscriber::fmt().with_env_filter(filter).init(); + + if !args.trace_id.is_empty() { + let Some(config_b_path) = &args.config_b else { + return Err(eyre::eyre!("Trace compare mode requires --config-b.")); + }; + let config_b = elf_config::load(config_b_path)?; + let output = trace_compare( + args.config_a.as_path(), + config_a, + config_b_path.as_path(), + config_b, + &args, + ) + .await?; + let json = serde_json::to_string_pretty(&output)?; + + println!("{json}"); + + return Ok(()); + } + + let dataset_path = + args.dataset.as_ref().ok_or_else(|| eyre::eyre!("--dataset is required."))?; + let dataset = load_dataset(dataset_path.as_path())?; + let run_a = + eval_config(args.config_a.as_path(), config_a, &dataset, &args, args.search_mode).await?; + let search_mode_b = args.search_mode_b.unwrap_or(args.search_mode); + + if let Some(config_b_path) = &args.config_b { + let config_b = elf_config::load(config_b_path)?; + let run_b = + eval_config(config_b_path.as_path(), config_b, &dataset, &args, search_mode_b).await?; + let k = run_a.settings.top_k.min(run_b.settings.top_k).max(1); + let (queries, policy_stability) = build_compare_queries(&run_a.queries, &run_b.queries, k); + let summary_delta = diff_summary(&run_a.summary, &run_b.summary); + let output = CompareOutput { + dataset: run_a.dataset, + settings_a: run_a.settings, + settings_b: run_b.settings, + summary_a: run_a.summary, + summary_b: run_b.summary, + summary_delta, + policy_stability, + queries, + }; + let json = serde_json::to_string_pretty(&output)?; + + println!("{json}"); + + return Ok(()); + } + + let output = EvalOutput { + dataset: run_a.dataset, + settings: run_a.settings, + summary: run_a.summary, + queries: run_a.queries, + }; + let json = serde_json::to_string_pretty(&output)?; + + println!("{json}"); + + Ok(()) +} + +fn retrieval_top_rank_retention( + candidates: &[elf_service::search::TraceReplayCandidate], + note_ids: &[Uuid], + max_retrieval_rank: u32, +) -> (usize, usize, f64) { + let mut top_notes = HashSet::new(); + + for candidate in candidates { + if candidate.retrieval_rank == 0 || candidate.retrieval_rank > max_retrieval_rank { + continue; + } + + top_notes.insert(candidate.note_id); + } + + let total = top_notes.len(); + + if total == 0 { + return (0, 0, 0.0); + } + + let out_set: HashSet = note_ids.iter().copied().collect(); + let retained = top_notes.intersection(&out_set).count(); + let retention = retained as f64 / total as f64; + + (total, retained, retention) +} + +fn load_dataset(path: &Path) -> Result { + let raw = fs::read_to_string(path)?; + let dataset: EvalDataset = serde_json::from_str(&raw)?; + + if dataset.queries.is_empty() { + return Err(eyre::eyre!("Dataset must include at least one query.")); + } + + Ok(dataset) +} + +fn churn_against_baseline_at_k(baseline: &[Uuid], other: &[Uuid], k: usize) -> (f64, f64) { + let k = k.max(1); + let mut positional_diff = 0_usize; + + for idx in 0..k { + let a = baseline.get(idx); + let b = other.get(idx); + + if a != b { + positional_diff += 1; + } + } + + let positional_churn = positional_diff as f64 / k as f64; + let base_set: HashSet = baseline.iter().take(k).copied().collect(); + let other_set: HashSet = other.iter().take(k).copied().collect(); + let overlap = base_set.intersection(&other_set).count(); + let set_churn = 1.0 - (overlap as f64 / k as f64); + + (positional_churn, set_churn) +} + +fn diff_summary(a: &EvalSummary, b: &EvalSummary) -> EvalSummaryDelta { + EvalSummaryDelta { + avg_recall_at_k: b.avg_recall_at_k - a.avg_recall_at_k, + avg_precision_at_k: b.avg_precision_at_k - a.avg_precision_at_k, + mean_rr: b.mean_rr - a.mean_rr, + mean_ndcg: b.mean_ndcg - a.mean_ndcg, + latency_ms_p50: b.latency_ms_p50 - a.latency_ms_p50, + latency_ms_p95: b.latency_ms_p95 - a.latency_ms_p95, + avg_retrieved_summary_chars: b.avg_retrieved_summary_chars - a.avg_retrieved_summary_chars, + stability: match (&a.stability, &b.stability) { + (Some(sa), Some(sb)) => Some(StabilitySummaryDelta { + avg_positional_churn_at_k: sb.avg_positional_churn_at_k + - sa.avg_positional_churn_at_k, + avg_set_churn_at_k: sb.avg_set_churn_at_k - sa.avg_set_churn_at_k, + }), + _ => None, + }, + } +} + +fn build_compare_queries( + a: &[QueryReport], + b: &[QueryReport], + k: u32, +) -> (Vec, PolicyStabilitySummary) { + let k_usize = k.max(1) as usize; + let mut positional_sum = 0.0_f64; + let mut set_sum = 0.0_f64; + let queries: Vec = a + .iter() + .zip(b.iter()) + .map(|(qa, qb)| { + let delta_stability = match (qa.stability, qb.stability) { + (Some(sa), Some(sb)) => Some(QueryStabilityDelta { + positional_churn_at_k: sb.positional_churn_at_k - sa.positional_churn_at_k, + set_churn_at_k: sb.set_churn_at_k - sa.set_churn_at_k, + }), + _ => None, + }; + let (positional_churn_at_k, set_churn_at_k) = churn_against_baseline_at_k( + &qa.retrieved_note_ids, + &qb.retrieved_note_ids, + k_usize, + ); + + positional_sum += positional_churn_at_k; + set_sum += set_churn_at_k; + + CompareQueryReport { + id: qa.id.clone(), + query: qa.query.clone(), + expected_count: qa.expected_count, + expected_note_ids: qa.expected_note_ids.clone(), + a: QueryVariantReport { + trace_id: qa.trace_id, + trace_ids: qa.trace_ids.clone(), + retrieved_count: qa.retrieved_count, + relevant_count: qa.relevant_count, + recall_at_k: qa.recall_at_k, + precision_at_k: qa.precision_at_k, + rr: qa.rr, + ndcg: qa.ndcg, + latency_ms: qa.latency_ms, + retrieved_note_ids: qa.retrieved_note_ids.clone(), + stability: qa.stability, + }, + b: QueryVariantReport { + trace_id: qb.trace_id, + trace_ids: qb.trace_ids.clone(), + retrieved_count: qb.retrieved_count, + relevant_count: qb.relevant_count, + recall_at_k: qb.recall_at_k, + precision_at_k: qb.precision_at_k, + rr: qb.rr, + ndcg: qb.ndcg, + latency_ms: qb.latency_ms, + retrieved_note_ids: qb.retrieved_note_ids.clone(), + stability: qb.stability, + }, + delta: QueryVariantDelta { + retrieved_count: qb.retrieved_count as i64 - qa.retrieved_count as i64, + relevant_count: qb.relevant_count as i64 - qa.relevant_count as i64, + recall_at_k: qb.recall_at_k - qa.recall_at_k, + precision_at_k: qb.precision_at_k - qa.precision_at_k, + rr: qb.rr - qa.rr, + ndcg: qb.ndcg - qa.ndcg, + latency_ms: qb.latency_ms - qa.latency_ms, + stability: delta_stability, + }, + policy_churn: PolicyChurn { positional_churn_at_k, set_churn_at_k }, + } + }) + .collect(); + let count = queries.len().max(1) as f64; + let summary = PolicyStabilitySummary { + k, + avg_positional_churn_at_k: positional_sum / count, + avg_set_churn_at_k: set_sum / count, + }; + + (queries, summary) +} + +fn merge_query( + defaults: &EvalDefaults, + query: &EvalQuery, + args: &Args, + cfg: &Config, + index: usize, +) -> Result { + let expected_kind = + resolve_expected_mode(index, &query.expected_note_ids, &query.expected_keys)?; + let tenant_id = query + .tenant_id + .clone() + .or_else(|| defaults.tenant_id.clone()) + .ok_or_else(|| eyre::eyre!("tenant_id is required for query at index {index}."))?; + let project_id = query + .project_id + .clone() + .or_else(|| defaults.project_id.clone()) + .ok_or_else(|| eyre::eyre!("project_id is required for query at index {index}."))?; + let agent_id = query + .agent_id + .clone() + .or_else(|| defaults.agent_id.clone()) + .ok_or_else(|| eyre::eyre!("agent_id is required for query at index {index}."))?; + let read_profile = query + .read_profile + .clone() + .or_else(|| defaults.read_profile.clone()) + .ok_or_else(|| eyre::eyre!("read_profile is required for query at index {index}."))?; + let top_k = args.top_k.or(query.top_k).or(defaults.top_k).unwrap_or(cfg.memory.top_k).max(1); + let candidate_k = args + .candidate_k + .or(query.candidate_k) + .or(defaults.candidate_k) + .unwrap_or(cfg.memory.candidate_k) + .max(top_k); + let id = query.id.clone().unwrap_or_else(|| format!("query-{index}")); + let ranking = query.ranking.clone().or_else(|| defaults.ranking.clone()); + + Ok(MergedQuery { + id, + query: query.query.clone(), + expected_note_ids: query.expected_note_ids.clone(), + expected_keys: query.expected_keys.clone(), + expected_kind, + request: SearchRequest { + tenant_id, + project_id, + agent_id, + token_id: None, + read_profile, + payload_level: Default::default(), + query: query.query.clone(), + top_k: Some(top_k), + candidate_k: Some(candidate_k), + filter: None, + record_hits: Some(false), + ranking, + }, + }) +} + +fn resolve_expected_mode(index: usize, note_ids: &[Uuid], keys: &[String]) -> Result { + let has_note_ids = !note_ids.is_empty(); + let has_keys = !keys.is_empty(); + + match (has_note_ids, has_keys) { + (true, false) => Ok(ExpectedKind::NoteId), + (false, true) => Ok(ExpectedKind::Key), + (true, true) => Err(eyre::eyre!( + "Query at index {index} must define exactly one expectation mode: expected_note_ids or expected_keys." + )), + (false, false) => Err(eyre::eyre!( + "Query at index {index} must include at least one expected_note_ids or expected_keys." + )), + } +} + +fn unique_items(items: &[SearchIndexItem]) -> Vec { + let mut seen = HashSet::new(); + let mut out = Vec::new(); + + for item in items { + if seen.insert(item.note_id) { + out.push(item.clone()); + } + } + + out +} + +fn compute_metrics(retrieved: &[Uuid], expected: &HashSet) -> Metrics { + let expected_count = expected.len(); + let mut relevant_count = 0_usize; + let mut dcg = 0.0_f64; + let mut rr = 0.0_f64; + let mut first_hit: Option = None; + + for (idx, id) in retrieved.iter().enumerate() { + if expected.contains(id) { + relevant_count += 1; + + let rank = idx + 1; + let denom = (rank as f64 + 1.0).log2(); + + dcg += 1.0 / denom; + + if first_hit.is_none() { + first_hit = Some(rank); + } + } + } + + if let Some(rank) = first_hit { + rr = 1.0 / rank as f64; + } + + let ideal_hits = expected_count.min(retrieved.len()); + let mut idcg = 0.0_f64; + + for idx in 0..ideal_hits { + let rank = idx + 1; + let denom = (rank as f64 + 1.0).log2(); + + idcg += 1.0 / denom; + } + + let ndcg = if idcg > 0.0 { dcg / idcg } else { 0.0 }; + let precision_at_k = + if retrieved.is_empty() { 0.0 } else { relevant_count as f64 / retrieved.len() as f64 }; + let recall_at_k = + if expected_count == 0 { 0.0 } else { relevant_count as f64 / expected_count as f64 }; + + Metrics { recall_at_k, precision_at_k, rr, ndcg, relevant_count } +} + +fn compute_metrics_for_keys(retrieved: &[Option], expected: &HashSet) -> Metrics { + let expected_count = expected.len(); + let mut matched: HashSet = HashSet::new(); + let mut relevant_count = 0_usize; + let mut dcg = 0.0_f64; + let mut rr = 0.0_f64; + let mut first_hit: Option = None; + + for (idx, maybe_key) in retrieved.iter().enumerate() { + let Some(key) = maybe_key else { + continue; + }; + + if expected.contains(key) && !matched.contains(key) { + matched.insert(key.clone()); + + relevant_count += 1; + + let rank = idx + 1; + let denom = (rank as f64 + 1.0).log2(); + + dcg += 1.0 / denom; + + if first_hit.is_none() { + first_hit = Some(rank); + } + } + } + + if let Some(rank) = first_hit { + rr = 1.0 / rank as f64; + } + + let ideal_hits = expected_count.min(retrieved.len()); + let mut idcg = 0.0_f64; + + for idx in 0..ideal_hits { + let rank = idx + 1; + let denom = (rank as f64 + 1.0).log2(); + + idcg += 1.0 / denom; + } + + let ndcg = if idcg > 0.0 { dcg / idcg } else { 0.0 }; + let precision_at_k = + if retrieved.is_empty() { 0.0 } else { relevant_count as f64 / retrieved.len() as f64 }; + let recall_at_k = + if expected_count == 0 { 0.0 } else { relevant_count as f64 / expected_count as f64 }; + + Metrics { recall_at_k, precision_at_k, rr, ndcg, relevant_count } +} + +fn compute_metrics_for_query( + merged: &MergedQuery, + retrieved_note_ids: &[Uuid], + retrieved_keys: &[Option], +) -> (Metrics, usize) { + match merged.expected_kind { + ExpectedKind::NoteId => { + let expected: HashSet = merged.expected_note_ids.iter().copied().collect(); + let expected_count = expected.len(); + + (compute_metrics(retrieved_note_ids, &expected), expected_count) + }, + ExpectedKind::Key => { + let expected: HashSet = merged.expected_keys.iter().cloned().collect(); + let expected_count = expected.len(); + + (compute_metrics_for_keys(retrieved_keys, &expected), expected_count) + }, + } +} + +fn summarize(reports: &[QueryReport], latencies_ms: &[f64]) -> EvalSummary { + let count = reports.len().max(1) as f64; + let avg_recall_at_k = reports.iter().map(|r| r.recall_at_k).sum::() / count; + let avg_precision_at_k = reports.iter().map(|r| r.precision_at_k).sum::() / count; + let mean_rr = reports.iter().map(|r| r.rr).sum::() / count; + let mean_ndcg = reports.iter().map(|r| r.ndcg).sum::() / count; + let avg_retrieved_summary_chars = + reports.iter().map(|r| r.retrieved_summary_chars as f64).sum::() / count; + let mut sorted = latencies_ms.to_vec(); + + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal)); + + let p50 = percentile(&sorted, 0.50); + let p95 = percentile(&sorted, 0.95); + + EvalSummary { + avg_recall_at_k, + avg_precision_at_k, + mean_rr, + mean_ndcg, + latency_ms_p50: p50, + latency_ms_p95: p95, + avg_retrieved_summary_chars, + stability: None, + } +} + +fn percentile(values: &[f64], percentile: f64) -> f64 { + if values.is_empty() { + return 0.0; + } + + let clamped = percentile.clamp(0.0, 1.0); + let pos = clamped * (values.len() as f64 - 1.0); + let lower = pos.floor() as usize; + let upper = pos.ceil() as usize; + + if lower == upper { + values[lower] + } else { + let weight = pos - lower as f64; + + values[lower] * (1.0 - weight) + values[upper] * weight + } +} + +fn decode_trace_replay_candidates( + rows: Vec, +) -> Vec { + rows.into_iter() + .map(|row| { + let decoded = serde_json::from_value::( + row.candidate_snapshot.clone(), + ) + .ok() + .filter(|value| value.note_id != Uuid::nil() && value.chunk_id != Uuid::nil()); + + decoded.unwrap_or_else(|| elf_service::search::TraceReplayCandidate { + note_id: row.note_id, + chunk_id: row.chunk_id, + chunk_index: row.chunk_index, + snippet: row.snippet, + retrieval_rank: u32::try_from(row.retrieval_rank).unwrap_or(0), + retrieval_score: None, + rerank_score: row.rerank_score, + note_scope: row.note_scope, + note_importance: row.note_importance, + note_updated_at: row.note_updated_at, + note_hit_count: row.note_hit_count, + note_last_hit_at: row.note_last_hit_at, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }) + }) + .collect() +} + +fn build_trace_compare_stage_deltas( + stage_rows: &[TraceCompareStageRow], + a_selected_count: u32, + b_selected_count: u32, +) -> Vec { + if stage_rows.is_empty() { + return vec![TraceCompareStageDelta { + stage_order: 1, + stage_name: "selection.final".to_string(), + baseline_item_count: 0, + a_item_count: a_selected_count, + b_item_count: b_selected_count, + item_count_delta: b_selected_count as i64 - a_selected_count as i64, + baseline_stats: None, + }]; + } + + let mut out = Vec::with_capacity(stage_rows.len()); + + for row in stage_rows { + let baseline_item_count = row.item_count.max(0) as u32; + let (a_item_count, b_item_count) = if row.stage_name == "selection.final" { + (a_selected_count, b_selected_count) + } else { + (baseline_item_count, baseline_item_count) + }; + let baseline_stats = row.stage_payload.get("stats").cloned(); + + out.push(TraceCompareStageDelta { + stage_order: row.stage_order.max(0) as u32, + stage_name: row.stage_name.clone(), + baseline_item_count, + a_item_count, + b_item_count, + item_count_delta: b_item_count as i64 - a_item_count as i64, + baseline_stats, + }); + } + + out +} + +fn build_trace_compare_regression_attribution( + churn: &TraceCompareChurn, + guardrails: &TraceCompareGuardrails, + stage_deltas: &[TraceCompareStageDelta], +) -> TraceCompareRegressionAttribution { + let stage_by_name: HashMap<&str, &TraceCompareStageDelta> = + stage_deltas.iter().map(|stage| (stage.stage_name.as_str(), stage)).collect(); + + if guardrails.retrieval_top3_retention_delta < 0.0 { + let recall_count = stage_by_name + .get("recall.candidates") + .map(|stage| stage.baseline_item_count) + .unwrap_or(0); + + return TraceCompareRegressionAttribution { + primary_stage: "selection.final".to_string(), + evidence: format!( + "retrieval_top3_retention dropped by {:.4} (a={:.4}, b={:.4}); recall baseline item_count={recall_count}", + guardrails.retrieval_top3_retention_delta, + guardrails.a_retrieval_top3_retention, + guardrails.b_retrieval_top3_retention + ), + }; + } + if churn.set_churn_at_k > 0.0 || churn.positional_churn_at_k > 0.0 { + return TraceCompareRegressionAttribution { + primary_stage: "rerank.score".to_string(), + evidence: format!( + "top-k churn changed without retrieval-top3 regression (set_churn_at_k={:.4}, positional_churn_at_k={:.4})", + churn.set_churn_at_k, churn.positional_churn_at_k + ), + }; + } + + TraceCompareRegressionAttribution { + primary_stage: "not_applicable".to_string(), + evidence: "No regression signal detected.".to_string(), + } +} + +async fn trace_compare( + config_a_path: &Path, + config_a: Config, + config_b_path: &Path, + config_b: Config, + args: &Args, +) -> Result { + let policy_id_a = + search::ranking_policy_id(&config_a, None).map_err(|err| eyre::eyre!("{err}"))?; + let policy_id_b = + search::ranking_policy_id(&config_b, None).map_err(|err| eyre::eyre!("{err}"))?; + let db = Db::connect(&config_a.storage.postgres).await?; + + db.ensure_schema(config_a.storage.qdrant.vector_dim).await?; + + let mut traces = Vec::with_capacity(args.trace_id.len()); + let mut positional_sum = 0.0_f64; + let mut set_sum = 0.0_f64; + let mut top3_retention_a_sum = 0.0_f64; + let mut top3_retention_b_sum = 0.0_f64; + + for trace_id in &args.trace_id { + let trace = compare_trace_id( + &db, + &config_a, + &config_b, + policy_id_a.as_str(), + policy_id_b.as_str(), + trace_id, + args, + ) + .await?; + + positional_sum += trace.churn.positional_churn_at_k; + set_sum += trace.churn.set_churn_at_k; + top3_retention_a_sum += trace.guardrails.a_retrieval_top3_retention; + top3_retention_b_sum += trace.guardrails.b_retrieval_top3_retention; + + traces.push(trace); + } + + let count = traces.len().max(1) as f64; + let summary = TraceCompareSummary { + trace_count: traces.len(), + avg_positional_churn_at_k: positional_sum / count, + avg_set_churn_at_k: set_sum / count, + avg_a_retrieval_top3_retention: top3_retention_a_sum / count, + avg_b_retrieval_top3_retention: top3_retention_b_sum / count, + avg_retrieval_top3_retention_delta: (top3_retention_b_sum - top3_retention_a_sum) / count, + }; + + Ok(TraceCompareOutput { + policies: TraceComparePolicies { + a: TraceComparePolicy { + config_path: config_a_path.display().to_string(), + policy_id: policy_id_a, + }, + b: TraceComparePolicy { + config_path: config_b_path.display().to_string(), + policy_id: policy_id_b, + }, + }, + summary, + traces, + }) +} + +async fn compare_trace_id( + db: &Db, + config_a: &Config, + config_b: &Config, + policy_id_a: &str, + policy_id_b: &str, + trace_id: &Uuid, + args: &Args, +) -> Result { + let trace_row = fetch_trace_compare_trace_row(db, trace_id).await?; + let candidate_rows = fetch_trace_compare_candidate_rows(db, trace_id).await?; + let stage_rows = fetch_trace_compare_stage_rows(db, trace_id).await?; + let context = TraceReplayContext { + trace_id: trace_row.trace_id, + query: trace_row.query.clone(), + candidate_count: u32::try_from(trace_row.candidate_count).unwrap_or(0), + top_k: u32::try_from(trace_row.top_k).unwrap_or(0), + created_at: trace_row.created_at, + }; + let created_at = context + .created_at + .format(&Rfc3339) + .map_err(|err| eyre::eyre!("Failed to format trace created_at: {err}"))?; + let candidates = decode_trace_replay_candidates(candidate_rows); + let top_k = args.top_k.unwrap_or(context.top_k).max(1); + let items_a = + search::replay_ranking_from_candidates(config_a, &context, None, &candidates, top_k) + .map_err(|err| eyre::eyre!("{err}"))?; + let items_b = + search::replay_ranking_from_candidates(config_b, &context, None, &candidates, top_k) + .map_err(|err| eyre::eyre!("{err}"))?; + let note_ids_a: Vec = items_a.iter().map(|item| item.note_id).collect(); + let note_ids_b: Vec = items_b.iter().map(|item| item.note_id).collect(); + let (positional_churn_at_k, set_churn_at_k) = + churn_against_baseline_at_k(¬e_ids_a, ¬e_ids_b, top_k as usize); + let (retrieval_top3_total, a_retained, a_retention) = + retrieval_top_rank_retention(&candidates, ¬e_ids_a, 3); + let (_, b_retained, b_retention) = retrieval_top_rank_retention(&candidates, ¬e_ids_b, 3); + let churn = TraceCompareChurn { positional_churn_at_k, set_churn_at_k }; + let guardrails = TraceCompareGuardrails { + retrieval_top3_total, + a_retrieval_top3_retained: a_retained, + a_retrieval_top3_retention: a_retention, + b_retrieval_top3_retained: b_retained, + b_retrieval_top3_retention: b_retention, + retrieval_top3_retention_delta: b_retention - a_retention, + }; + let stage_deltas = build_trace_compare_stage_deltas( + stage_rows.as_slice(), + items_a.len() as u32, + items_b.len() as u32, + ); + let regression_attribution = + build_trace_compare_regression_attribution(&churn, &guardrails, stage_deltas.as_slice()); + + Ok(TraceCompareTrace { + trace_id: context.trace_id, + query: context.query, + candidate_count: context.candidate_count, + top_k, + created_at, + a: TraceCompareVariant { policy_id: policy_id_a.to_string(), items: items_a }, + b: TraceCompareVariant { policy_id: policy_id_b.to_string(), items: items_b }, + churn, + guardrails, + stage_deltas, + regression_attribution, + }) +} + +async fn fetch_trace_compare_trace_row(db: &Db, trace_id: &Uuid) -> Result { + let row: TraceCompareTraceRow = sqlx::query_as::<_, TraceCompareTraceRow>( + "\ +SELECT + trace_id, + query, + candidate_count, + top_k, + created_at +FROM search_traces +WHERE trace_id = $1", + ) + .bind(trace_id) + .fetch_one(&db.pool) + .await?; + + Ok(row) +} + +async fn fetch_trace_compare_candidate_rows( + db: &Db, + trace_id: &Uuid, +) -> Result> { + let rows: Vec = sqlx::query_as::<_, TraceCompareCandidateRow>( + "\ +SELECT + candidate_snapshot, + note_id, + chunk_id, + chunk_index, + snippet, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at +FROM search_trace_candidates +WHERE trace_id = $1 +ORDER BY retrieval_rank ASC", + ) + .bind(trace_id) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_trace_compare_stage_rows( + db: &Db, + trace_id: &Uuid, +) -> Result> { + let rows = sqlx::query_as::<_, TraceCompareStageRow>( + "\ +SELECT + s.stage_order, + s.stage_name, + s.stage_payload, + COUNT(i.id)::bigint AS item_count +FROM search_trace_stages s +LEFT JOIN search_trace_stage_items i ON i.stage_id = s.stage_id +WHERE s.trace_id = $1 +GROUP BY s.stage_id, s.stage_order, s.stage_name, s.stage_payload +ORDER BY s.stage_order ASC", + ) + .bind(trace_id) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn eval_config( + config_path: &Path, + config: Config, + dataset: &EvalDataset, + args: &Args, + search_mode: SearchMode, +) -> Result { + let db = Db::connect(&config.storage.postgres).await?; + + db.ensure_schema(config.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&config.storage.qdrant)?; + let service = ElfService::new(config, db, qdrant); + let defaults = dataset.defaults.clone().unwrap_or(EvalDefaults { + tenant_id: None, + project_id: None, + agent_id: None, + read_profile: None, + top_k: None, + candidate_k: None, + ranking: None, + }); + let runs_per_query = args.runs_per_query.max(1); + let mut reports = Vec::with_capacity(dataset.queries.len()); + let mut latencies_ms = Vec::with_capacity(dataset.queries.len()); + let mut stability_positional = Vec::new(); + let mut stability_set = Vec::new(); + + for (index, query) in dataset.queries.iter().enumerate() { + let merged = merge_query(&defaults, query, args, &service.cfg, index)?; + let (first, latency_ms, stability, trace_ids) = + run_query_n_times(&service, merged.request.clone(), runs_per_query, search_mode) + .await?; + let retrieved = unique_items(&first.items); + let retrieved_note_ids: Vec = retrieved.iter().map(|item| item.note_id).collect(); + let retrieved_keys: Vec> = + retrieved.iter().map(|item| item.key.clone()).collect(); + let retrieved_summary_chars = + retrieved.iter().map(|item| item.summary.len()).sum::(); + let (metrics, expected_count) = + compute_metrics_for_query(&merged, &retrieved_note_ids, &retrieved_keys); + + if let Some(s) = stability { + stability_positional.push(s.positional_churn_at_k); + stability_set.push(s.set_churn_at_k); + } + + reports.push(QueryReport { + id: merged.id, + query: merged.query, + trace_id: first.trace_id, + trace_ids: (trace_ids.len() > 1).then_some(trace_ids), + expected_count, + retrieved_count: retrieved_note_ids.len(), + relevant_count: metrics.relevant_count, + recall_at_k: metrics.recall_at_k, + precision_at_k: metrics.precision_at_k, + rr: metrics.rr, + ndcg: metrics.ndcg, + latency_ms, + expected_note_ids: merged.expected_note_ids, + expected_keys: merged.expected_keys, + expected_kind: merged.expected_kind, + retrieved_note_ids, + retrieved_keys: if merged.expected_kind == ExpectedKind::Key { + retrieved_keys + } else { + Vec::new() + }, + retrieved_summary_chars, + stability, + }); + latencies_ms.push(latency_ms); + } + + let mut summary = summarize(&reports, &latencies_ms); + + if runs_per_query > 1 && !stability_positional.is_empty() { + let count = stability_positional.len().max(1) as f64; + let avg_positional_churn_at_k = stability_positional.iter().sum::() / count; + let avg_set_churn_at_k = stability_set.iter().sum::() / count; + + summary.stability = Some(StabilitySummary { + runs_per_query, + avg_positional_churn_at_k, + avg_set_churn_at_k, + }); + } + + let settings = EvalSettings { + config_path: config_path.display().to_string(), + search_mode, + candidate_k: args + .candidate_k + .or(dataset.defaults.as_ref().and_then(|d| d.candidate_k)) + .unwrap_or(service.cfg.memory.candidate_k), + top_k: args + .top_k + .or(dataset.defaults.as_ref().and_then(|d| d.top_k)) + .unwrap_or(service.cfg.memory.top_k), + runs_per_query: (runs_per_query > 1).then_some(runs_per_query), + }; + + Ok(EvalRun { + dataset: EvalDatasetInfo { + name: dataset.name.clone().unwrap_or_else(|| "eval".to_string()), + query_count: reports.len(), + }, + settings, + summary, + queries: reports, + }) +} + +async fn run_query_n_times( + service: &ElfService, + request: SearchRequest, + runs_per_query: u32, + search_mode: SearchMode, +) -> Result<(SearchIndexResponse, f64, Option, Vec)> { + let k = request.top_k.unwrap_or(1).max(1) as usize; + let runs = runs_per_query.max(1); + let mut first_response: Option = None; + let mut first_retrieved_ids: Vec = Vec::new(); + let mut trace_ids: Vec = Vec::with_capacity(runs as usize); + let mut latency_total_ms = 0.0_f64; + let mut positional_churn_sum = 0.0_f64; + let mut set_churn_sum = 0.0_f64; + let mut churn_count = 0_u32; + + for run_idx in 0..runs { + let start = Instant::now(); + let response = search_with_mode(service, request.clone(), search_mode).await?; + let latency_ms = start.elapsed().as_secs_f64() * 1_000.0; + + latency_total_ms += latency_ms; + + trace_ids.push(response.trace_id); + + let retrieved = unique_items(&response.items); + let retrieved_ids = retrieved.iter().map(|item| item.note_id).collect::>(); + + if run_idx == 0 { + first_retrieved_ids = retrieved_ids; + first_response = Some(response); + + continue; + } + + let (positional_churn_at_k, set_churn_at_k) = + churn_against_baseline_at_k(&first_retrieved_ids, &retrieved_ids, k); + + positional_churn_sum += positional_churn_at_k; + set_churn_sum += set_churn_at_k; + churn_count += 1; + } + + let latency_ms_mean = latency_total_ms / runs as f64; + let stability = if churn_count > 0 { + Some(QueryStability { + runs_per_query: runs, + positional_churn_at_k: positional_churn_sum / churn_count as f64, + set_churn_at_k: set_churn_sum / churn_count as f64, + }) + } else { + None + }; + + Ok(( + first_response.ok_or_else(|| eyre::eyre!("No search responses were collected."))?, + latency_ms_mean, + stability, + trace_ids, + )) +} + +async fn search_with_mode( + service: &ElfService, + request: SearchRequest, + search_mode: SearchMode, +) -> Result { + match search_mode { + SearchMode::QuickFind => service.search_quick(request).await.map_err(|err| err.into()), + SearchMode::PlannedSearch => { + let response = service.search_planned(request).await?; + + Ok(SearchIndexResponse { + trace_id: response.trace_id, + search_session_id: response.search_session_id, + expires_at: response.expires_at, + items: response.items, + trajectory_summary: response.trajectory_summary, + }) + }, + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use crate::app::{self, ExpectedKind, OffsetDateTime, Uuid}; + + #[test] + fn resolve_expected_mode_requires_exactly_one_definition() { + let index = 0; + let note_ids = vec![Uuid::new_v4()]; + let expected_keys = vec!["key-1".to_string()]; + let note_only = app::resolve_expected_mode(index, ¬e_ids, &[]); + let key_only = app::resolve_expected_mode(index, &[], &expected_keys); + let none = app::resolve_expected_mode(index, &[], &[]); + let both = app::resolve_expected_mode(index, ¬e_ids, &expected_keys); + + assert!(matches!(note_only.unwrap(), ExpectedKind::NoteId)); + assert!(matches!(key_only.unwrap(), ExpectedKind::Key)); + assert!(none.is_err(), "Expected missing expectations to be rejected"); + assert!(both.is_err(), "Expected both expectation fields to be rejected"); + } + + #[test] + fn compute_metrics_for_keys_counts_first_hit_per_unique_key_and_ignores_missing_keys() { + let expected: HashSet = + ["alpha", "beta", "gamma"].into_iter().map(String::from).collect(); + let retrieved = vec![ + None, + Some("alpha".to_string()), + Some("alpha".to_string()), + Some("gamma".to_string()), + Some("missing".to_string()), + ]; + let metrics = app::compute_metrics_for_keys(&retrieved, &expected); + let expected_dcg = 1.0 / (3.0_f64).log2() + 1.0 / (5.0_f64).log2(); + let expected_idcg = 1.0 + 1.0 / (3.0_f64).log2() + 1.0 / (4.0_f64).log2(); + + assert_eq!(metrics.relevant_count, 2); + assert!((metrics.precision_at_k - (2.0 / 5.0)).abs() < 1e-12); + assert!((metrics.recall_at_k - (2.0 / 3.0)).abs() < 1e-12); + assert!((metrics.rr - (1.0 / 2.0)).abs() < 1e-12); + assert!((metrics.ndcg - (expected_dcg / expected_idcg)).abs() < 1e-12); + } + + #[test] + fn retrieval_top_rank_retention_counts_unique_notes_and_retained_notes() { + let now = OffsetDateTime::from_unix_timestamp(0).expect("Valid timestamp."); + let note_a = Uuid::new_v4(); + let note_b = Uuid::new_v4(); + let note_c = Uuid::new_v4(); + let candidates = vec![ + elf_service::search::TraceReplayCandidate { + note_id: note_a, + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "a".to_string(), + retrieval_rank: 1, + retrieval_score: None, + rerank_score: 0.1, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + elf_service::search::TraceReplayCandidate { + note_id: note_a, + chunk_id: Uuid::new_v4(), + chunk_index: 1, + snippet: "a".to_string(), + retrieval_rank: 2, + retrieval_score: None, + rerank_score: 0.2, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + elf_service::search::TraceReplayCandidate { + note_id: note_b, + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "b".to_string(), + retrieval_rank: 3, + retrieval_score: None, + rerank_score: 0.3, + note_scope: "org_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + elf_service::search::TraceReplayCandidate { + note_id: note_c, + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "c".to_string(), + retrieval_rank: 4, + retrieval_score: None, + rerank_score: 0.4, + note_scope: "org_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + ]; + let note_ids = vec![note_a, note_c]; + let (total, retained, retention) = + app::retrieval_top_rank_retention(&candidates, ¬e_ids, 3); + + assert_eq!(total, 2); + assert_eq!(retained, 1); + assert!((retention - 0.5).abs() < 1e-12, "Unexpected retention: {retention}"); + } +} diff --git a/apps/elf-eval/src/bin/agentmemory_fixture_adapter.rs b/apps/elf-eval/src/bin/agentmemory_fixture_adapter.rs new file mode 100644 index 00000000..91479958 --- /dev/null +++ b/apps/elf-eval/src/bin/agentmemory_fixture_adapter.rs @@ -0,0 +1,639 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Offline adapter for agentmemory-style fixture exports. + +use std::{collections::HashMap, fs, path::PathBuf}; + +use clap::Parser; +use color_eyre; +use serde::{Deserialize, Serialize}; +use serde_json::{self, Value}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use uuid::Uuid; + +const OUTPUT_SCHEMA: &str = "elf.agentmemory_adapter/v1"; +const FIXTURE_RESOLVER: &str = "agentmemory_fixture/v1"; +const DEFAULT_IMPORTANCE: f32 = 0.5; +const DEFAULT_CONFIDENCE: f32 = 0.5; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + /// Path to a sanitized agentmemory-style JSON fixture. + #[arg(long, short = 'f', value_name = "FILE")] + fixture: PathBuf, + /// Write adapter JSON to this file (defaults to stdout). + #[arg(long, value_name = "FILE")] + out: Option, + /// ELF write scope to attach to emitted note and doc candidates. + #[arg(long, default_value = "agent_private")] + scope: String, + /// Maximum note text length accepted for note candidates. + #[arg(long, default_value_t = 240)] + max_note_chars: usize, +} + +#[derive(Debug, Deserialize)] +struct AgentmemoryFixture { + schema: Option, + + fixture_id: Option, + #[serde(default)] + source: FixtureSource, + #[serde(default)] + sessions: Vec, +} + +#[derive(Debug, Default, Deserialize)] +struct FixtureSource { + system: Option, + + version: Option, + + export_id: Option, + + exported_at: Option, +} + +#[derive(Debug, Deserialize)] +struct AgentmemorySession { + session_id: String, + + agent: Option, + + project: Option, + + started_at: Option, + + ended_at: Option, + #[serde(default)] + observations: Vec, + #[serde(default)] + memories: Vec, + #[serde(default)] + retrieval_cases: Vec, +} + +#[derive(Debug, Deserialize)] +struct AgentmemoryObservation { + observation_id: String, + + ts: Option, + + role: Option, + + kind: Option, + text: String, + #[serde(default)] + metadata: Value, +} + +#[derive(Debug, Deserialize)] +struct AgentmemoryMemory { + memory_id: String, + + kind: Option, + + key: Option, + text: String, + + importance: Option, + + confidence: Option, + + ttl_days: Option, + + created_at: Option, + + updated_at: Option, + #[serde(default)] + source_observation_ids: Vec, + #[serde(default)] + metadata: Value, +} + +#[derive(Debug, Deserialize)] +struct AgentmemoryRetrievalCase { + query_id: String, + query: String, + #[serde(default)] + expected_memory_ids: Vec, + #[serde(default)] + agentmemory_results: Vec, + #[serde(default)] + metadata: Value, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AgentmemorySearchResult { + memory_id: String, + #[serde(skip_serializing_if = "Option::is_none")] + rank: Option, + #[serde(skip_serializing_if = "Option::is_none")] + score: Option, +} + +#[derive(Debug, Serialize)] +struct AdapterOutput { + schema: &'static str, + fixture_id: String, + source: AdapterSource, + summary: AdapterSummary, + note_candidates: Vec, + doc_candidates: Vec, + baseline_queries: Vec, + ignored_items: Vec, +} + +#[derive(Debug, Serialize)] +struct AdapterSource { + system: String, + #[serde(skip_serializing_if = "Option::is_none")] + version: Option, + #[serde(skip_serializing_if = "Option::is_none")] + export_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + exported_at: Option, + #[serde(skip_serializing_if = "Option::is_none")] + fixture_schema: Option, +} + +#[derive(Debug, Serialize)] +struct AdapterSummary { + session_count: usize, + observation_count: usize, + memory_count: usize, + note_candidate_count: usize, + doc_candidate_count: usize, + baseline_query_count: usize, + ignored_count: usize, +} + +#[derive(Clone, Debug, Serialize)] +struct NoteCandidate { + candidate_id: Uuid, + scope: String, + session_id: String, + source_memory_id: String, + source_observation_ids: Vec, + notes_ingest_item: ElfNoteCandidate, + #[serde(skip_serializing_if = "Value::is_null")] + source_metadata: Value, +} + +#[derive(Clone, Debug, Serialize)] +struct ElfNoteCandidate { + #[serde(rename = "type")] + note_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + key: Option, + text: String, + importance: f32, + confidence: f32, + #[serde(skip_serializing_if = "Option::is_none")] + ttl_days: Option, + source_ref: Value, +} + +#[derive(Debug, Serialize)] +struct DocCandidate { + candidate_id: Uuid, + scope: String, + session_id: String, + source_observation_id: String, + docs_put: DocsPutCandidate, + #[serde(skip_serializing_if = "Value::is_null")] + source_metadata: Value, +} + +#[derive(Debug, Serialize)] +struct DocsPutCandidate { + scope: String, + doc_type: &'static str, + title: String, + source_ref: Value, + content: String, +} + +#[derive(Debug, Serialize)] +struct BaselineQuery { + query_id: String, + session_id: String, + query: String, + expected_source_memory_ids: Vec, + expected_candidate_ids: Vec, + expected_keys: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + agentmemory_results: Vec, + #[serde(skip_serializing_if = "Value::is_null")] + source_metadata: Value, +} + +#[derive(Debug, Serialize)] +struct IgnoredItem { + item_kind: &'static str, + session_id: String, + source_id: String, + reason: &'static str, + #[serde(skip_serializing_if = "Option::is_none")] + detail: Option, +} + +#[derive(Clone)] +struct FixtureContext { + fixture_id: String, + source_system: String, + source_version: Option, + exported_at: Option, + scope: String, + max_note_chars: usize, +} + +fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + let args = Args::parse(); + let raw = fs::read_to_string(&args.fixture)?; + let fixture: AgentmemoryFixture = serde_json::from_str(&raw)?; + let output = adapt_fixture(&fixture, args.scope.as_str(), args.max_note_chars); + let json = serde_json::to_string_pretty(&output)?; + + if let Some(path) = args.out { + write_output(path, json.as_str())?; + } else { + println!("{json}"); + } + + Ok(()) +} + +fn write_output(path: PathBuf, json: &str) -> color_eyre::Result<()> { + if let Some(parent) = path.parent() + && !parent.as_os_str().is_empty() + { + fs::create_dir_all(parent)?; + } + + fs::write(path, json)?; + + Ok(()) +} + +fn adapt_fixture( + fixture: &AgentmemoryFixture, + scope: &str, + max_note_chars: usize, +) -> AdapterOutput { + let source = adapter_source(fixture); + let fixture_id = fixture_id(fixture, source.system.as_str()); + let ctx = FixtureContext { + fixture_id: fixture_id.clone(), + source_system: source.system.clone(), + source_version: source.version.clone(), + exported_at: source.exported_at.clone(), + scope: scope.to_string(), + max_note_chars, + }; + let mut notes = Vec::new(); + let mut docs = Vec::new(); + let mut baselines = Vec::new(); + let mut ignored = Vec::new(); + let mut memory_map = HashMap::new(); + + for session in &fixture.sessions { + map_observations(session, &ctx, &mut docs, &mut ignored); + map_memories(session, &ctx, &mut notes, &mut memory_map, &mut ignored); + map_baselines(session, &memory_map, &mut baselines, &mut ignored); + } + + AdapterOutput { + schema: OUTPUT_SCHEMA, + fixture_id, + source, + summary: AdapterSummary { + session_count: fixture.sessions.len(), + observation_count: fixture + .sessions + .iter() + .map(|session| session.observations.len()) + .sum(), + memory_count: fixture.sessions.iter().map(|session| session.memories.len()).sum(), + note_candidate_count: notes.len(), + doc_candidate_count: docs.len(), + baseline_query_count: baselines.len(), + ignored_count: ignored.len(), + }, + note_candidates: notes, + doc_candidates: docs, + baseline_queries: baselines, + ignored_items: ignored, + } +} + +fn adapter_source(fixture: &AgentmemoryFixture) -> AdapterSource { + AdapterSource { + system: clean_string(fixture.source.system.as_deref()) + .unwrap_or_else(|| "agentmemory".to_string()), + version: clean_string(fixture.source.version.as_deref()), + export_id: clean_string(fixture.source.export_id.as_deref()), + exported_at: clean_string(fixture.source.exported_at.as_deref()), + fixture_schema: clean_string(fixture.schema.as_deref()), + } +} + +fn fixture_id(fixture: &AgentmemoryFixture, source_system: &str) -> String { + clean_string(fixture.fixture_id.as_deref()) + .or_else(|| clean_string(fixture.source.export_id.as_deref())) + .unwrap_or_else(|| stable_uuid("fixture", &[source_system]).to_string()) +} + +fn map_observations( + session: &AgentmemorySession, + ctx: &FixtureContext, + docs: &mut Vec, + ignored: &mut Vec, +) { + for observation in &session.observations { + match doc_candidate(session, observation, ctx) { + Ok(candidate) => docs.push(candidate), + Err(reason) => ignored.push(IgnoredItem { + item_kind: "observation", + session_id: session.session_id.clone(), + source_id: observation.observation_id.clone(), + reason, + detail: None, + }), + } + } +} + +fn map_memories( + session: &AgentmemorySession, + ctx: &FixtureContext, + notes: &mut Vec, + memory_map: &mut HashMap, + ignored: &mut Vec, +) { + for memory in &session.memories { + match note_candidate(session, memory, ctx) { + Ok(candidate) => { + memory_map.insert(memory.memory_id.clone(), candidate.clone()); + notes.push(candidate); + }, + Err(reason) => ignored.push(IgnoredItem { + item_kind: "memory", + session_id: session.session_id.clone(), + source_id: memory.memory_id.clone(), + reason, + detail: None, + }), + } + } +} + +fn map_baselines( + session: &AgentmemorySession, + memory_map: &HashMap, + baselines: &mut Vec, + ignored: &mut Vec, +) { + for case in &session.retrieval_cases { + match baseline_query(session, case, memory_map) { + Some(baseline) => baselines.push(baseline), + None => ignored.push(IgnoredItem { + item_kind: "retrieval_case", + session_id: session.session_id.clone(), + source_id: case.query_id.clone(), + reason: "no_mapped_expected_memories", + detail: None, + }), + } + } +} + +fn doc_candidate( + session: &AgentmemorySession, + observation: &AgentmemoryObservation, + ctx: &FixtureContext, +) -> std::result::Result { + let text = observation.text.trim(); + + if text.is_empty() { + return Err("empty_text"); + } + + let Some(ts) = observation_timestamp(session, observation, ctx) else { + return Err("missing_or_invalid_timestamp"); + }; + let candidate_id = stable_uuid( + "observation", + &[ + ctx.fixture_id.as_str(), + session.session_id.as_str(), + observation.observation_id.as_str(), + ], + ); + let role = clean_string(observation.role.as_deref()) + .or_else(|| clean_string(observation.kind.as_deref())) + .unwrap_or_else(|| "observation".to_string()); + let title = format!("agentmemory observation {}", observation.observation_id); + let source_ref = serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": ts, + "thread_id": session.session_id, + "role": role, + "message_id": observation.observation_id, + "agentmemory_fixture_id": ctx.fixture_id, + "agentmemory_source_system": ctx.source_system, + "agentmemory_observation_kind": clean_string(observation.kind.as_deref()), + "agent": clean_string(session.agent.as_deref()), + "project": clean_string(session.project.as_deref()), + }); + + Ok(DocCandidate { + candidate_id, + scope: ctx.scope.clone(), + session_id: session.session_id.clone(), + source_observation_id: observation.observation_id.clone(), + docs_put: DocsPutCandidate { + scope: ctx.scope.clone(), + doc_type: "chat", + title, + source_ref, + content: observation.text.clone(), + }, + source_metadata: observation.metadata.clone(), + }) +} + +fn note_candidate( + session: &AgentmemorySession, + memory: &AgentmemoryMemory, + ctx: &FixtureContext, +) -> std::result::Result { + let text = memory.text.trim(); + + if text.is_empty() { + return Err("empty_text"); + } + if text.chars().count() > ctx.max_note_chars { + return Err("note_text_too_long"); + } + + let Some(note_type) = memory.kind.as_deref().and_then(map_note_type) else { + return Err("unsupported_memory_kind"); + }; + let Some(importance) = score_or_default(memory.importance, DEFAULT_IMPORTANCE) else { + return Err("invalid_importance"); + }; + let Some(confidence) = score_or_default(memory.confidence, DEFAULT_CONFIDENCE) else { + return Err("invalid_confidence"); + }; + let candidate_id = stable_uuid( + "memory", + &[ctx.fixture_id.as_str(), session.session_id.as_str(), memory.memory_id.as_str()], + ); + let source_ref = note_source_ref(session, memory, ctx); + + Ok(NoteCandidate { + candidate_id, + scope: ctx.scope.clone(), + session_id: session.session_id.clone(), + source_memory_id: memory.memory_id.clone(), + source_observation_ids: memory.source_observation_ids.clone(), + notes_ingest_item: ElfNoteCandidate { + note_type: note_type.to_string(), + key: clean_string(memory.key.as_deref()), + text: memory.text.clone(), + importance, + confidence, + ttl_days: memory.ttl_days.filter(|days| *days > 0), + source_ref, + }, + source_metadata: memory.metadata.clone(), + }) +} + +fn note_source_ref( + session: &AgentmemorySession, + memory: &AgentmemoryMemory, + ctx: &FixtureContext, +) -> Value { + serde_json::json!({ + "schema": "source_ref/v1", + "resolver": FIXTURE_RESOLVER, + "ref": { + "fixture_id": ctx.fixture_id, + "session_id": session.session_id, + "memory_id": memory.memory_id, + "observation_ids": memory.source_observation_ids, + }, + "state": { + "source_system": ctx.source_system, + "source_version": ctx.source_version, + "exported_at": ctx.exported_at, + "session_started_at": session.started_at, + "session_ended_at": session.ended_at, + "memory_created_at": memory.created_at, + "memory_updated_at": memory.updated_at, + }, + "locator": { + "memory_id": memory.memory_id, + "observation_ids": memory.source_observation_ids, + }, + "hints": { + "agent": session.agent, + "project": session.project, + "origin_kind": memory.kind, + }, + }) +} + +fn baseline_query( + session: &AgentmemorySession, + case: &AgentmemoryRetrievalCase, + memory_map: &HashMap, +) -> Option { + if case.query.trim().is_empty() || case.expected_memory_ids.is_empty() { + return None; + } + + let expected: Vec<&NoteCandidate> = + case.expected_memory_ids.iter().filter_map(|id| memory_map.get(id)).collect(); + + if expected.is_empty() { + return None; + } + + Some(BaselineQuery { + query_id: case.query_id.clone(), + session_id: session.session_id.clone(), + query: case.query.clone(), + expected_source_memory_ids: expected + .iter() + .map(|candidate| candidate.source_memory_id.clone()) + .collect(), + expected_candidate_ids: expected.iter().map(|candidate| candidate.candidate_id).collect(), + expected_keys: expected + .iter() + .filter_map(|candidate| candidate.notes_ingest_item.key.clone()) + .collect(), + agentmemory_results: case.agentmemory_results.clone(), + source_metadata: case.metadata.clone(), + }) +} + +fn observation_timestamp( + session: &AgentmemorySession, + observation: &AgentmemoryObservation, + ctx: &FixtureContext, +) -> Option { + [observation.ts.as_deref(), session.started_at.as_deref(), ctx.exported_at.as_deref()] + .into_iter() + .flatten() + .find_map(normalize_rfc3339) +} + +fn normalize_rfc3339(value: &str) -> Option { + OffsetDateTime::parse(value, &Rfc3339) + .ok() + .and_then(|timestamp| timestamp.format(&Rfc3339).ok()) +} + +fn map_note_type(kind: &str) -> Option<&'static str> { + match kind.trim().to_ascii_lowercase().as_str() { + "preference" => Some("preference"), + "constraint" => Some("constraint"), + "decision" => Some("decision"), + "profile" => Some("profile"), + "fact" => Some("fact"), + "plan" => Some("plan"), + _ => None, + } +} + +fn score_or_default(score: Option, default: f32) -> Option { + let score = score.unwrap_or(default); + + if score.is_finite() && (0.0..=1.0).contains(&score) { Some(score) } else { None } +} + +fn clean_string(value: Option<&str>) -> Option { + value.map(str::trim).filter(|value| !value.is_empty()).map(str::to_string) +} + +fn stable_uuid(kind: &str, parts: &[&str]) -> Uuid { + let mut key = format!("https://hack.ink/elf/{OUTPUT_SCHEMA}/{kind}"); + + for part in parts { + key.push('/'); + key.push_str(part); + } + + Uuid::new_v5(&Uuid::NAMESPACE_URL, key.as_bytes()) +} diff --git a/apps/elf-eval/src/bin/external_memory_pattern_radar.rs b/apps/elf-eval/src/bin/external_memory_pattern_radar.rs new file mode 100644 index 00000000..9a843a7b --- /dev/null +++ b/apps/elf-eval/src/bin/external_memory_pattern_radar.rs @@ -0,0 +1,821 @@ +#![allow(unused_crate_dependencies)] + +//! Weekly external memory pattern radar runner. + +use std::{ + collections::BTreeSet, + env, fs, + path::{Path, PathBuf}, +}; + +use clap::{Parser, Subcommand, ValueEnum}; +use color_eyre::{Result, eyre}; +use reqwest::{ + Client, StatusCode, + header::{ACCEPT, AUTHORIZATION, HeaderMap, HeaderValue, USER_AGENT}, +}; +use serde::{Deserialize, Serialize}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + +const CURSOR_SCHEMA: &str = "elf.external_memory_pattern_radar_cursor/v1"; +const RUN_SCHEMA: &str = "elf.external_memory_pattern_radar_run/v1"; +const DEFAULT_CURSOR: &str = "docs/research/external_memory_pattern_radar/cursor.json"; +const DEFAULT_SUMMARY: &str = "docs/research/external_memory_pattern_radar/latest.md"; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Parser)] +struct RunArgs { + /// Existing radar cursor file. + #[arg(long, value_name = "FILE", default_value = DEFAULT_CURSOR)] + cursor: PathBuf, + /// Output cursor path. Defaults to updating --cursor. + #[arg(long, value_name = "FILE")] + out_cursor: Option, + /// Output Markdown summary path. + #[arg(long, value_name = "FILE", default_value = DEFAULT_SUMMARY)] + summary: PathBuf, + /// Observation mode. Use offline for deterministic dry runs. + #[arg(long, value_enum, default_value_t = RadarMode::Live)] + mode: RadarMode, + /// Stable run id. Defaults to external-memory-pattern-radar-YYYY-MM-DD. + #[arg(long)] + run_id: Option, + /// Environment variable containing a GitHub token for live mode. + #[arg(long, default_value = "GITHUB_TOKEN")] + github_token_env: String, +} + +#[derive(Debug, Parser)] +struct ValidateArgs { + /// Cursor file to validate. + #[arg(long, value_name = "FILE", default_value = DEFAULT_CURSOR)] + cursor: PathBuf, +} + +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct RadarCursor { + schema: String, + cadence: String, + generated_at: String, + source_docs: Vec, + projects: Vec, + last_run: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct RadarProject { + id: String, + name: String, + repo: String, + homepage: String, + watch_focus: Vec, + primary_references: Vec, + coverage_evidence: Vec, + last_seen: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct EvidenceRef { + label: String, + path: String, + summary: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct ProjectObservation { + observed_at: String, + source_url: String, + default_branch: Option, + pushed_at: Option, + updated_at: Option, + latest_release: Option, + stars: Option, + open_issues: Option, + description: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct ReleaseObservation { + tag_name: String, + url: String, + published_at: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct RadarRun { + schema: String, + run_id: String, + generated_at: String, + mode: RadarMode, + summary: RunSummary, + decisions: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct RunSummary { + project_count: usize, + covered_count: usize, + rejected_count: usize, + gap_count: usize, + create_issue_count: usize, + defer_count: usize, + no_issue_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct RadarDecision { + project_id: String, + upstream_change: String, + reusable_pattern: String, + elf_verdict: ElfVerdict, + product_value: String, + duplicate_coverage_evidence: Vec, + safety_boundary: String, + issue_decision: IssueDecision, + acceptance_evidence: Vec, + source_links: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct IssueDecision { + action: IssueAction, + rationale: String, + duplicate_search: DuplicateSearchEvidence, + proposed_issue: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct DuplicateSearchEvidence { + queried: bool, + query: String, + result: DuplicateSearchResult, + evidence: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +struct ProposedIssue { + title: String, + source_links: Vec, + repo_evidence: Vec, + non_goals: Vec, + validation_criteria: Vec, +} + +#[derive(Debug, Deserialize)] +struct GithubRepoResponse { + html_url: String, + default_branch: Option, + pushed_at: Option, + updated_at: Option, + stargazers_count: Option, + open_issues_count: Option, + description: Option, +} + +#[derive(Debug, Deserialize)] +struct GithubReleaseResponse { + tag_name: String, + html_url: String, + published_at: Option, +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum Command { + /// Run the external memory radar and write cursor plus Markdown summary. + Run(RunArgs), + /// Validate a radar cursor and its latest decision records. + Validate(ValidateArgs), +} + +#[derive(Clone, Copy, Debug, Deserialize, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum RadarMode { + Live, + Offline, +} +impl RadarMode { + fn as_str(self) -> &'static str { + match self { + Self::Live => "live", + Self::Offline => "offline", + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ElfVerdict { + Covered, + Reject, + Gap, +} +impl ElfVerdict { + fn as_str(self) -> &'static str { + match self { + Self::Covered => "covered", + Self::Reject => "reject", + Self::Gap => "gap", + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum IssueAction { + NoIssue, + Defer, + CreateIssue, +} +impl IssueAction { + fn as_str(self) -> &'static str { + match self { + Self::NoIssue => "no_issue", + Self::Defer => "defer", + Self::CreateIssue => "create_issue", + } + } +} + +#[derive(Clone, Copy, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum DuplicateSearchResult { + NotRequiredNoIssue, + NoDuplicateFound, + DuplicateFound, +} + +fn validate_command(path: &Path) -> Result<()> { + let cursor = read_cursor(path)?; + + validate_cursor(&cursor) +} + +fn read_cursor(path: &Path) -> Result { + let raw = fs::read_to_string(path) + .map_err(|err| eyre::eyre!("failed to read cursor {}: {err}", path.display()))?; + let cursor = serde_json::from_str(&raw) + .map_err(|err| eyre::eyre!("failed to parse cursor {}: {err}", path.display()))?; + + Ok(cursor) +} + +fn write_json(path: &Path, value: &T) -> Result<()> +where + T: Serialize, +{ + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let raw = serde_json::to_string_pretty(value)?; + + fs::write(path, format!("{raw}\n"))?; + + Ok(()) +} + +fn write_text(path: &Path, content: &str) -> Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + fs::write(path, content)?; + + Ok(()) +} + +fn github_client(token_env: &str) -> Result> { + let mut headers = HeaderMap::new(); + + headers.insert(USER_AGENT, HeaderValue::from_static("elf-external-memory-pattern-radar")); + headers.insert(ACCEPT, HeaderValue::from_static("application/vnd.github+json")); + + if let Ok(token) = env::var(token_env) + && !token.trim().is_empty() + { + let value = format!("Bearer {}", token.trim()).parse()?; + + headers.insert(AUTHORIZATION, value); + } + + Ok(Some(Client::builder().default_headers(headers).build()?)) +} + +fn fallback_observation(project: &RadarProject, generated_at: &str) -> ProjectObservation { + ProjectObservation { + observed_at: generated_at.to_string(), + source_url: project.homepage.clone(), + default_branch: None, + pushed_at: None, + updated_at: None, + latest_release: None, + stars: None, + open_issues: None, + description: None, + } +} + +fn decide_project( + project: &RadarProject, + prior: Option<&ProjectObservation>, + observed: &ProjectObservation, + mode: RadarMode, +) -> RadarDecision { + let source_links = source_links(project, observed); + let evidence = project.coverage_evidence.clone(); + let changed = prior.map(|previous| observation_changed(previous, observed)).unwrap_or(false); + + if changed { + return RadarDecision { + project_id: project.id.clone(), + upstream_change: metadata_delta(prior, observed), + reusable_pattern: "No reusable pattern is claimed from metadata alone; source review is required before a pattern can become a gap." + .to_string(), + elf_verdict: ElfVerdict::Reject, + product_value: "Metadata movement is useful as a review trigger, but it has no product value until source evidence identifies a reusable pattern." + .to_string(), + duplicate_coverage_evidence: evidence, + safety_boundary: "Reject issue creation from activity, star counts, release tags, or push timestamps alone." + .to_string(), + issue_decision: IssueDecision { + action: IssueAction::NoIssue, + rationale: "No issue was created because this run only proved a metadata delta; the Codex review step must gather source links, repo evidence, and Linear duplicate search first." + .to_string(), + duplicate_search: DuplicateSearchEvidence { + queried: false, + query: String::new(), + result: DuplicateSearchResult::NotRequiredNoIssue, + evidence: vec![ + "No Linear search is required when the issue decision is no_issue.".to_string(), + ], + }, + proposed_issue: None, + }, + acceptance_evidence: vec![ + "Metadata delta recorded in the structured cursor.".to_string(), + "No parity or adoption claim was made from activity alone.".to_string(), + ], + source_links, + }; + } + + let upstream_change = if prior.is_none() { + metadata_delta(None, observed) + } else { + match mode { + RadarMode::Live => + "No GitHub metadata delta was observed since the prior cursor.".to_string(), + RadarMode::Offline => + "No upstream fetch was performed; the dry run replayed the checked-in cursor." + .to_string(), + } + }; + + RadarDecision { + project_id: project.id.clone(), + upstream_change, + reusable_pattern: "No new candidate pattern was identified in this run.".to_string(), + elf_verdict: ElfVerdict::Covered, + product_value: "Current ELF coverage remains represented by the comparison and inventory evidence." + .to_string(), + duplicate_coverage_evidence: evidence, + safety_boundary: "No external runtime is adopted by default; existing ELF evidence remains authoritative." + .to_string(), + issue_decision: IssueDecision { + action: IssueAction::NoIssue, + rationale: "No issue was created because the run found no source-backed gap.".to_string(), + duplicate_search: DuplicateSearchEvidence { + queried: false, + query: String::new(), + result: DuplicateSearchResult::NotRequiredNoIssue, + evidence: vec![ + "No Linear search is required when the issue decision is no_issue.".to_string(), + ], + }, + proposed_issue: None, + }, + acceptance_evidence: vec![ + "No-issue decision recorded in the cursor.".to_string(), + "Coverage evidence points at checked-in ELF research docs.".to_string(), + ], + source_links, + } +} + +fn source_links(project: &RadarProject, observed: &ProjectObservation) -> Vec { + let mut links = BTreeSet::new(); + + links.insert(project.homepage.clone()); + links.insert(observed.source_url.clone()); + + if let Some(release) = &observed.latest_release { + links.insert(release.url.clone()); + } + + links.into_iter().collect() +} + +fn observation_changed(previous: &ProjectObservation, observed: &ProjectObservation) -> bool { + previous.pushed_at != observed.pushed_at + || previous.updated_at != observed.updated_at + || previous.latest_release.as_ref().map(|release| &release.tag_name) + != observed.latest_release.as_ref().map(|release| &release.tag_name) +} + +fn metadata_delta(prior: Option<&ProjectObservation>, observed: &ProjectObservation) -> String { + let Some(previous) = prior else { + return "First cursor observation recorded; no prior state exists for comparison." + .to_string(); + }; + let previous_release = + previous.latest_release.as_ref().map(|release| release.tag_name.as_str()).unwrap_or("none"); + let observed_release = + observed.latest_release.as_ref().map(|release| release.tag_name.as_str()).unwrap_or("none"); + + format!( + "Repository metadata changed: pushed_at {} -> {}, latest_release {} -> {}.", + previous.pushed_at.as_deref().unwrap_or("unknown"), + observed.pushed_at.as_deref().unwrap_or("unknown"), + previous_release, + observed_release + ) +} + +fn summarize_decisions(decisions: &[RadarDecision]) -> RunSummary { + let mut summary = RunSummary { project_count: decisions.len(), ..RunSummary::default() }; + + for decision in decisions { + match decision.elf_verdict { + ElfVerdict::Covered => summary.covered_count += 1, + ElfVerdict::Reject => summary.rejected_count += 1, + ElfVerdict::Gap => summary.gap_count += 1, + } + match decision.issue_decision.action { + IssueAction::NoIssue => summary.no_issue_count += 1, + IssueAction::Defer => summary.defer_count += 1, + IssueAction::CreateIssue => summary.create_issue_count += 1, + } + } + + summary +} + +fn validate_cursor(cursor: &RadarCursor) -> Result<()> { + let mut errors = Vec::new(); + + if cursor.schema != CURSOR_SCHEMA { + errors.push(format!("cursor schema must be {CURSOR_SCHEMA}")); + } + if cursor.projects.is_empty() { + errors.push("cursor must include at least one project".to_string()); + } + + let project_ids = + cursor.projects.iter().map(|project| project.id.as_str()).collect::>(); + + if project_ids.len() != cursor.projects.len() { + errors.push("project ids must be unique".to_string()); + } + + for project in &cursor.projects { + validate_project(project, &mut errors); + } + + if let Some(run) = &cursor.last_run { + validate_run(run, &project_ids, &mut errors); + } + + if errors.is_empty() { + Ok(()) + } else { + Err(eyre::eyre!("radar cursor validation failed:\n{}", errors.join("\n"))) + } +} + +fn validate_project(project: &RadarProject, errors: &mut Vec) { + if project.id.trim().is_empty() { + errors.push("project id must not be empty".to_string()); + } + if !project.repo.contains('/') { + errors.push(format!("project {} repo must be owner/name", project.id)); + } + if project.coverage_evidence.is_empty() { + errors.push(format!("project {} must include duplicate/coverage evidence", project.id)); + } +} + +fn validate_run(run: &RadarRun, project_ids: &BTreeSet<&str>, errors: &mut Vec) { + if run.schema != RUN_SCHEMA { + errors.push(format!("run schema must be {RUN_SCHEMA}")); + } + if run.decisions.len() != project_ids.len() { + errors.push("latest run must include one decision per project".to_string()); + } + + for decision in &run.decisions { + validate_decision(decision, project_ids, errors); + } +} + +fn validate_decision( + decision: &RadarDecision, + project_ids: &BTreeSet<&str>, + errors: &mut Vec, +) { + if !project_ids.contains(decision.project_id.as_str()) { + errors.push(format!("decision references unknown project {}", decision.project_id)); + } + + for (field, value) in [ + ("upstream_change", &decision.upstream_change), + ("reusable_pattern", &decision.reusable_pattern), + ("product_value", &decision.product_value), + ("safety_boundary", &decision.safety_boundary), + ] { + if value.trim().is_empty() { + errors.push(format!("decision {} has empty {field}", decision.project_id)); + } + } + + if decision.duplicate_coverage_evidence.is_empty() { + errors.push(format!( + "decision {} must include duplicate/coverage evidence", + decision.project_id + )); + } + if decision.acceptance_evidence.is_empty() { + errors.push(format!("decision {} must include acceptance evidence", decision.project_id)); + } + if decision.source_links.is_empty() { + errors.push(format!("decision {} must include source links", decision.project_id)); + } + + validate_issue_decision(decision, errors); +} + +fn validate_issue_decision(decision: &RadarDecision, errors: &mut Vec) { + let issue_decision = &decision.issue_decision; + + if issue_decision.rationale.trim().is_empty() { + errors.push(format!("decision {} issue rationale must not be empty", decision.project_id)); + } + + match issue_decision.action { + IssueAction::CreateIssue => validate_create_issue(decision, errors), + IssueAction::NoIssue => + if issue_decision.proposed_issue.is_some() { + errors.push(format!( + "decision {} must not include proposed_issue for no_issue", + decision.project_id + )); + }, + IssueAction::Defer => {}, + } +} + +fn validate_create_issue(decision: &RadarDecision, errors: &mut Vec) { + let issue_decision = &decision.issue_decision; + + if decision.elf_verdict != ElfVerdict::Gap { + errors.push(format!( + "decision {} can create issues only for gap verdicts", + decision.project_id + )); + } + if !issue_decision.duplicate_search.queried { + errors.push(format!( + "decision {} must search Linear before issue creation", + decision.project_id + )); + } + + let Some(proposed_issue) = &issue_decision.proposed_issue else { + errors.push(format!( + "decision {} create_issue must include proposed_issue", + decision.project_id + )); + + return; + }; + + if proposed_issue.source_links.is_empty() + || proposed_issue.repo_evidence.is_empty() + || proposed_issue.non_goals.is_empty() + || proposed_issue.validation_criteria.is_empty() + { + errors.push(format!( + "decision {} proposed issue must include source links, repo evidence, non-goals, and validation criteria", + decision.project_id + )); + } +} + +fn render_summary(cursor: &RadarCursor) -> Result { + let run = cursor.last_run.as_ref().ok_or_else(|| eyre::eyre!("cursor has no last_run"))?; + let mut out = String::new(); + + out.push_str("# External Memory Pattern Radar Summary\n\n"); + out.push_str("Goal: Preserve the latest weekly ELF external memory pattern radar outcome.\n"); + out.push_str("Read this when: Feeding the next full comparison report or deciding whether a watched upstream memory project created an ELF follow-up.\n"); + out.push_str("Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes.\n"); + out.push_str("Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/guide/research/external_memory_pattern_radar.md`.\n"); + out.push_str("Outputs: Latest no-issue, rejection, or issue-ready radar decisions.\n\n"); + out.push_str(&format!("- Run id: `{}`\n", run.run_id)); + out.push_str(&format!("- Generated at: `{}`\n", run.generated_at)); + out.push_str(&format!("- Mode: `{}`\n", run.mode.as_str())); + out.push_str(&format!( + "- Projects: `{}`; covered: `{}`; rejected: `{}`; gaps: `{}`; create_issue: `{}`\n\n", + run.summary.project_count, + run.summary.covered_count, + run.summary.rejected_count, + run.summary.gap_count, + run.summary.create_issue_count + )); + out.push_str("## Decisions\n\n"); + out.push_str( + "| Project | Upstream change | ELF verdict | Issue decision | Acceptance evidence |\n", + ); + out.push_str("| --- | --- | --- | --- | --- |\n"); + + for decision in &run.decisions { + out.push_str(&format!( + "| `{}` | {} | `{}` | `{}` | {} |\n", + decision.project_id, + escape_markdown_table(&decision.upstream_change), + decision.elf_verdict.as_str(), + decision.issue_decision.action.as_str(), + escape_markdown_table(&decision.acceptance_evidence.join("; ")) + )); + } + + out.push_str("\n## Safety Boundary\n\n"); + out.push_str("- The radar records upstream movement as a trigger for source review, not as proof of parity or a reason to adopt an external runtime.\n"); + out.push_str("- `create_issue` decisions are valid only when the cursor includes source links, repo evidence, non-goals, validation criteria, and Linear duplicate-search evidence.\n"); + out.push_str("- No-issue runs remain useful because each project records why ELF is already covered or why metadata-only movement was rejected.\n"); + + Ok(out) +} + +fn escape_markdown_table(value: &str) -> String { + value.replace('|', "\\|").replace('\n', " ") +} + +fn format_rfc3339(value: OffsetDateTime) -> Result { + Ok(value.format(&Rfc3339)?) +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + + match Args::parse().command { + Command::Run(args) => run_radar(args).await, + Command::Validate(args) => validate_command(&args.cursor), + } +} + +async fn run_radar(args: RunArgs) -> Result<()> { + let now = OffsetDateTime::now_utc(); + let generated_at = format_rfc3339(now)?; + let run_id = + args.run_id.unwrap_or_else(|| format!("external-memory-pattern-radar-{}", now.date())); + let client = github_client(&args.github_token_env)?; + let mut cursor = read_cursor(&args.cursor)?; + let mut decisions = Vec::with_capacity(cursor.projects.len()); + + for project in &mut cursor.projects { + let prior = project.last_seen.clone(); + let observed = observe_project(project, args.mode, client.as_ref(), &generated_at).await?; + + decisions.push(decide_project(project, prior.as_ref(), &observed, args.mode)); + + project.last_seen = Some(observed); + } + + let summary = summarize_decisions(&decisions); + + cursor.generated_at = generated_at.clone(); + cursor.last_run = Some(RadarRun { + schema: RUN_SCHEMA.to_string(), + run_id, + generated_at, + mode: args.mode, + summary, + decisions, + }); + + validate_cursor(&cursor)?; + + let out_cursor = args.out_cursor.unwrap_or(args.cursor); + + write_json(&out_cursor, &cursor)?; + write_text(&args.summary, &render_summary(&cursor)?)?; + + Ok(()) +} + +async fn observe_project( + project: &RadarProject, + mode: RadarMode, + client: Option<&Client>, + generated_at: &str, +) -> Result { + match mode { + RadarMode::Offline => Ok(project + .last_seen + .clone() + .unwrap_or_else(|| fallback_observation(project, generated_at))), + RadarMode::Live => + fetch_project( + project, + client.ok_or_else(|| eyre::eyre!("missing GitHub client"))?, + generated_at, + ) + .await, + } +} + +async fn fetch_project( + project: &RadarProject, + client: &Client, + generated_at: &str, +) -> Result { + let repo = fetch_repo(project, client).await?; + let latest_release = fetch_latest_release(project, client).await?; + + Ok(ProjectObservation { + observed_at: generated_at.to_string(), + source_url: repo.html_url, + default_branch: repo.default_branch, + pushed_at: repo.pushed_at, + updated_at: repo.updated_at, + latest_release, + stars: repo.stargazers_count, + open_issues: repo.open_issues_count, + description: repo.description, + }) +} + +async fn fetch_repo(project: &RadarProject, client: &Client) -> Result { + let url = format!("https://api.github.com/repos/{}", project.repo); + let response = client.get(url).send().await?; + + if !response.status().is_success() { + return Err(eyre::eyre!( + "GitHub repo metadata fetch failed for {} with status {}", + project.repo, + response.status() + )); + } + + Ok(response.json().await?) +} + +async fn fetch_latest_release( + project: &RadarProject, + client: &Client, +) -> Result> { + let url = format!("https://api.github.com/repos/{}/releases/latest", project.repo); + let response = client.get(url).send().await?; + + if response.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + if !response.status().is_success() { + return Err(eyre::eyre!( + "GitHub release metadata fetch failed for {} with status {}", + project.repo, + response.status() + )); + } + + let release: GithubReleaseResponse = response.json().await?; + + Ok(Some(ReleaseObservation { + tag_name: release.tag_name, + url: release.html_url, + published_at: release.published_at, + })) +} diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs new file mode 100644 index 00000000..d20ea4dd --- /dev/null +++ b/apps/elf-eval/src/bin/live_baseline_elf.rs @@ -0,0 +1,2522 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Docker live-baseline runner for ELF's own same-corpus retrieval path. + +use std::{ + collections::{BTreeMap, HashSet}, + env, fs, + path::{Path, PathBuf}, + process::Command, + sync::Arc, + time::{Duration, Instant}, +}; + +use blake3::Hasher; +use clap::Parser; +use color_eyre::{Report, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tokio::{task::JoinSet, time}; +use uuid::Uuid; + +use elf_chunking::ChunkingConfig; +use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_service::{ + AddNoteInput, AddNoteRequest, BoxFuture, DeleteRequest, ElfService, EmbeddingProvider, + ExtractorProvider, NoteOp, PayloadLevel, Providers, RerankProvider, SearchRequest, + UpdateRequest, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const TENANT_ID: &str = "elf-live-baseline"; +const PROJECT_ID: &str = "shared-corpus"; +const AGENT_ID: &str = "elf-bench-agent"; +const SCOPE: &str = "agent_private"; +const BACKFILL_CHECKPOINT_SCHEMA: &str = "elf.live_baseline.backfill_checkpoint/v1"; + +#[derive(Debug, Parser)] +#[command(version = elf_cli::VERSION, rename_all = "kebab", styles = elf_cli::styles())] +struct Args { + /// Base ELF config to load before Docker runtime overrides are applied. + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + + /// Directory containing the generated benchmark corpus markdown files. + #[arg(long, value_name = "DIR")] + corpus: PathBuf, + + /// Query manifest generated by the live-baseline harness. + #[arg(long, value_name = "FILE")] + queries: PathBuf, + + /// Write ELF result JSON to this file. + #[arg(long, value_name = "FILE")] + out: PathBuf, +} + +#[derive(Debug, Deserialize)] +struct QueryManifest { + queries: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct QueryCase { + id: String, + task: Option, + query: String, + expected_doc: String, + expected_terms: Vec, + #[serde(default)] + allowed_alternate_docs: Vec, + #[serde(default)] + expected_evidence_ids: Vec, + #[serde(default)] + allowed_alternate_evidence_ids: Vec, +} +impl QueryCase { + fn generated( + id: String, + query: String, + expected_doc: String, + expected_terms: Vec, + ) -> Self { + Self { + id, + task: None, + query, + expected_evidence_ids: vec![evidence_id_for_doc(&expected_doc)], + allowed_alternate_docs: Vec::new(), + allowed_alternate_evidence_ids: Vec::new(), + expected_doc, + expected_terms, + } + } +} + +#[derive(Debug)] +struct CorpusNote { + key: String, + title: String, + text: String, + source_doc: String, +} + +#[derive(Debug)] +struct BackfillOutcome { + report: BackfillReport, + note_ids: Vec, +} + +#[derive(Debug)] +struct ExistingBackfillNote { + note_id: Uuid, + source_hash: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct BackfillCheckpoint { + schema: String, + corpus_hash: String, + completed: BTreeMap, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct BackfillCheckpointEntry { + note_id: Uuid, + key: String, + source_hash: String, + op: String, +} + +#[derive(Debug, Serialize)] +struct BackfillReport { + checkpoint_path: String, + corpus_hash: String, + source_count: usize, + completed_count: usize, + batch_size: usize, + worker_concurrency: usize, + elapsed_seconds: f64, + attempted_writes: usize, + skipped_completed: usize, + duplicate_source_notes: Vec, + resume: BackfillResumeReport, + attempts: Vec, +} + +#[derive(Debug, Serialize)] +struct BackfillResumeReport { + enabled: bool, + interrupted: bool, + interrupt_after: Option, + resume_attempts: usize, + completed_before_resume: usize, + completed_after_resume: usize, +} + +#[derive(Debug, Serialize)] +struct BackfillAttemptEvidence { + attempt: usize, + resumed: bool, + interrupt_after: Option, + skipped_completed: usize, + attempted_writes: usize, + completed_writes: usize, + checkpoint_completed: usize, + interrupted: bool, +} + +#[derive(Debug, Serialize)] +struct DuplicateSourceNote { + source_doc: String, + count: i64, + note_ids: Vec, +} + +#[derive(Debug)] +struct BaselineRuntime { + config_path: PathBuf, + dsn: String, + qdrant_url: String, + collection: String, + docs_collection: String, +} + +#[derive(Debug, Serialize)] +struct WorkerRunEvidence { + label: String, + expected_note_count: usize, + concurrency: usize, + iterations: usize, + before: BTreeMap, + after: BTreeMap, + chunk_rows: i64, + chunk_embedding_rows: i64, + failed_jobs: Vec, +} + +#[derive(Debug, Serialize)] +struct FailedOutboxJob { + note_id: Uuid, + note_key: Option, + op: String, + attempts: i32, + last_error: Option, +} + +#[derive(Debug, Serialize)] +struct ResourceEnvelopeEvidence { + elapsed_seconds: f64, + max_elapsed_seconds: f64, + rss_kb: Option, + max_rss_kb: u64, + postgres_database_bytes: Option, + corpus_dir_bytes: u64, + report_dir_bytes: Option, + checkpoint_file_bytes: Option, +} + +#[derive(Debug, Serialize)] +struct CostProxyReport { + schema: &'static str, + scope: &'static str, + embedding_mode: EmbeddingMode, + estimated_input_chars: usize, + estimated_input_tokens: usize, + token_estimation: &'static str, + configured_usd_per_1k_tokens: Option, + estimated_usd: Option, + document_count: usize, + query_count: usize, +} + +#[derive(Debug, Serialize)] +struct EmbeddingRuntimeReport { + mode: EmbeddingMode, + provider_id: String, + model: String, + dimensions: u32, + timeout_ms: u64, + api_base: String, + path: String, +} + +#[derive(Debug, Serialize)] +struct SoakConfig { + target_seconds: u64, + write_rounds: usize, + probe_interval_millis: u64, +} + +#[derive(Debug, Serialize)] +struct ElfBaselineReport { + schema: &'static str, + status: &'static str, + retrieval_status: &'static str, + reason: String, + head: String, + embedding: EmbeddingRuntimeReport, + cost_proxy: CostProxyReport, + backfill: BackfillReport, + indexing: IndexingReport, + summary: QuerySummary, + check_summary: CheckSummary, + checks: Vec, + queries: Vec, + ops_cases: Vec, +} + +#[derive(Debug, Serialize)] +struct IndexingReport { + note_count: usize, + rebuild_rebuilt_count: u64, + rebuild_missing_vector_count: u64, + rebuild_error_count: u64, +} + +#[derive(Debug, Serialize)] +struct QuerySummary { + total: usize, + pass: usize, + fail: usize, + wrong_result_count: usize, + latency_ms_total: f64, + latency_ms_mean: f64, + latency_ms_p50: f64, + latency_ms_p95: f64, + latency_ms_p99: f64, + latency_ms_max: f64, +} + +#[derive(Debug, Serialize)] +struct OperationalCase { + name: &'static str, + default_status: &'static str, + operator_status: &'static str, + command: &'static str, + evidence: &'static str, + safety: &'static str, +} + +#[derive(Debug, Serialize)] +struct CheckSummary { + total: usize, + pass: usize, + fail: usize, + wrong_result: usize, + lifecycle_fail: usize, + incomplete: usize, + blocked: usize, + not_encoded: usize, +} + +#[derive(Debug, Serialize)] +struct CheckResult { + name: &'static str, + status: &'static str, + reason: String, + evidence: Value, +} + +#[derive(Debug, Serialize)] +struct QueryResult { + id: String, + task: Option, + trace_id: Uuid, + query: String, + expected_doc: String, + allowed_alternate_docs: Vec, + expected_terms: Vec, + expected_evidence_ids: Vec, + allowed_alternate_evidence_ids: Vec, + matched: bool, + matched_terms: Vec, + top_evidence_id: Option, + matched_evidence_id: Option, + top_note_key: Option, + top_snippet: Option, + latency_ms: f64, + returned_count: usize, +} + +#[derive(Debug)] +struct DeterministicEmbedding { + vector_dim: u32, +} +impl EmbeddingProvider for DeterministicEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>>> { + let dim = self.vector_dim; + let vectors = texts.iter().map(|text| embed_text(text, dim)).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +#[derive(Debug)] +struct TokenOverlapRerank; +impl RerankProvider for TokenOverlapRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>> { + let query_terms = terms(query); + let scores = docs + .iter() + .map(|doc| { + let doc_terms = terms(doc); + let hits = query_terms.intersection(&doc_terms).count() as f32; + + hits / query_terms.len().max(1) as f32 + }) + .collect(); + + Box::pin(async move { Ok(scores) }) + } +} + +#[derive(Debug)] +struct NoopExtractor; +impl ExtractorProvider for NoopExtractor { + fn extract<'a>( + &'a self, + _cfg: &'a LlmProviderConfig, + _messages: &'a [Value], + ) -> BoxFuture<'a, elf_service::Result> { + Box::pin(async move { Ok(serde_json::json!({ "notes": [] })) }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +enum EmbeddingMode { + Local, + Provider, +} + +fn runtime_config(runtime: &BaselineRuntime) -> color_eyre::Result { + let embedding_mode = embedding_mode()?; + let mut cfg = elf_config::load(&runtime.config_path)?; + + cfg.storage.postgres.dsn = runtime.dsn.clone(); + cfg.storage.postgres.pool_max_conns = 12; + cfg.storage.qdrant.url = runtime.qdrant_url.clone(); + cfg.storage.qdrant.collection = runtime.collection.clone(); + cfg.storage.qdrant.docs_collection = runtime.docs_collection.clone(); + + if embedding_mode == EmbeddingMode::Provider { + apply_provider_embedding_overrides(&mut cfg)?; + + cfg.storage.qdrant.vector_dim = cfg.providers.embedding.dimensions; + } else { + cfg.providers.embedding.provider_id = "local".to_string(); + cfg.providers.embedding.model = "local-hash".to_string(); + cfg.providers.embedding.dimensions = cfg.storage.qdrant.vector_dim; + } + + cfg.providers.rerank.provider_id = "local".to_string(); + cfg.providers.rerank.model = "local-token-overlap".to_string(); + cfg.providers.llm_extractor.provider_id = "disabled".to_string(); + cfg.providers.llm_extractor.model = "disabled".to_string(); + cfg.context = None; + + Ok(cfg) +} + +fn deterministic_providers(vector_dim: u32) -> Providers { + Providers::new( + Arc::new(DeterministicEmbedding { vector_dim }), + Arc::new(TokenOverlapRerank), + Arc::new(NoopExtractor), + ) +} + +fn embedding_mode() -> color_eyre::Result { + let raw = env::var("ELF_BASELINE_ELF_EMBEDDING_MODE") + .unwrap_or_else(|_| "local".to_string()) + .to_ascii_lowercase(); + + match raw.as_str() { + "local" | "deterministic" => Ok(EmbeddingMode::Local), + "provider" | "production" => Ok(EmbeddingMode::Provider), + _ => Err(eyre::eyre!( + "Unsupported ELF_BASELINE_ELF_EMBEDDING_MODE={raw:?}; use local or provider." + )), + } +} + +fn apply_provider_embedding_overrides(cfg: &mut Config) -> color_eyre::Result<()> { + apply_env_string( + &mut cfg.providers.embedding.provider_id, + &[ + "ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID", + "QWEN_EMBEDDING_PROVIDER_ID", + "EMBEDDING_PROVIDER_ID", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.api_base, + &[ + "ELF_BASELINE_ELF_EMBEDDING_API_BASE", + "QWEN_EMBEDDING_API_BASE", + "DASHSCOPE_API_BASE", + "EMBEDDING_API_BASE", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.api_key, + &[ + "ELF_BASELINE_ELF_EMBEDDING_API_KEY", + "QWEN_API_KEY", + "DASHSCOPE_API_KEY", + "EMBEDDING_API_KEY", + ], + ); + apply_env_string( + &mut cfg.providers.embedding.path, + &["ELF_BASELINE_ELF_EMBEDDING_PATH", "QWEN_EMBEDDING_PATH", "EMBEDDING_PATH"], + ); + apply_env_string( + &mut cfg.providers.embedding.model, + &["ELF_BASELINE_ELF_EMBEDDING_MODEL", "QWEN_EMBEDDING_MODEL", "EMBEDDING_MODEL"], + ); + + if let Some(dimensions) = env_u32(&[ + "ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS", + "QWEN_EMBEDDING_DIMENSIONS", + "DASHSCOPE_EMBEDDING_DIMENSIONS", + "EMBEDDING_DIMENSIONS", + ]) { + cfg.providers.embedding.dimensions = dimensions; + } + if let Some(timeout_ms) = env_u64(&[ + "ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS", + "QWEN_EMBEDDING_TIMEOUT_MS", + "EMBEDDING_TIMEOUT_MS", + ]) { + cfg.providers.embedding.timeout_ms = timeout_ms; + } else { + cfg.providers.embedding.timeout_ms = cfg.providers.embedding.timeout_ms.max(30_000); + } + + if cfg.providers.embedding.provider_id == "local" { + if env_string(&["ELF_BASELINE_ELF_EMBEDDING_API_KEY", "QWEN_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "qwen".to_string(); + } else if env_string(&["DASHSCOPE_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "dashscope".to_string(); + } else if env_string(&["EMBEDDING_API_KEY"]).is_some() { + cfg.providers.embedding.provider_id = "provider".to_string(); + } + } + if cfg.providers.embedding.provider_id == "local" { + return Err(eyre::eyre!( + "Provider embedding mode requires a non-local provider id or QWEN_API_KEY/DASHSCOPE_API_KEY/EMBEDDING_API_KEY." + )); + } + if cfg.providers.embedding.api_base.trim().is_empty() + || cfg.providers.embedding.api_base == "http://127.0.0.1" + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_API_BASE, QWEN_EMBEDDING_API_BASE, DASHSCOPE_API_BASE, or EMBEDDING_API_BASE." + )); + } + if cfg.providers.embedding.api_key.trim().is_empty() + || cfg.providers.embedding.api_key == "local-dev-placeholder" + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_API_KEY, QWEN_API_KEY, DASHSCOPE_API_KEY, or EMBEDDING_API_KEY." + )); + } + if cfg.providers.embedding.model == "local-hash" + || cfg.providers.embedding.model.trim().is_empty() + { + return Err(eyre::eyre!( + "Provider embedding mode requires ELF_BASELINE_ELF_EMBEDDING_MODEL, QWEN_EMBEDDING_MODEL, or EMBEDDING_MODEL." + )); + } + if cfg.providers.embedding.dimensions == 0 { + return Err(eyre::eyre!( + "Provider embedding dimensions must be greater than zero; set ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS, QWEN_EMBEDDING_DIMENSIONS, DASHSCOPE_EMBEDDING_DIMENSIONS, or EMBEDDING_DIMENSIONS." + )); + } + + Ok(()) +} + +fn embedding_runtime_report(cfg: &Config) -> EmbeddingRuntimeReport { + EmbeddingRuntimeReport { + mode: embedding_mode().unwrap_or(EmbeddingMode::Local), + provider_id: cfg.providers.embedding.provider_id.clone(), + model: cfg.providers.embedding.model.clone(), + dimensions: cfg.providers.embedding.dimensions, + timeout_ms: cfg.providers.embedding.timeout_ms, + api_base: cfg.providers.embedding.api_base.clone(), + path: cfg.providers.embedding.path.clone(), + } +} + +fn apply_env_string(target: &mut String, names: &[&str]) { + if let Some(value) = env_string(names) { + *target = value; + } +} + +fn env_string(names: &[&str]) -> Option { + names.iter().find_map(|name| { + env::var(name).ok().map(|value| value.trim().to_string()).filter(|value| !value.is_empty()) + }) +} + +fn env_u32(names: &[&str]) -> Option { + env_string(names).and_then(|value| value.parse::().ok()) +} + +fn env_u64(names: &[&str]) -> Option { + env_string(names).and_then(|value| value.parse::().ok()) +} + +fn load_corpus_notes(corpus_dir: &Path) -> color_eyre::Result> { + let mut paths = fs::read_dir(corpus_dir)? + .map(|entry| entry.map(|entry| entry.path())) + .collect::>>()?; + + paths.retain(|path| { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("md")) + }); + paths.sort(); + + let mut out = Vec::with_capacity(paths.len()); + + for path in paths { + let source_doc = path + .file_name() + .and_then(|name| name.to_str()) + .ok_or_else(|| { + eyre::eyre!("Corpus path has no valid UTF-8 file name: {}", path.display()) + })? + .to_string(); + let raw = fs::read_to_string(&path)?; + let title = title_from_markdown(&raw, &source_doc); + let text = raw + .lines() + .filter(|line| !line.trim_start().starts_with('#')) + .collect::>() + .join(" ") + .split_whitespace() + .collect::>() + .join(" "); + + out.push(CorpusNote { key: key_for_doc(&source_doc), title, text, source_doc }); + } + + if out.is_empty() { + return Err(eyre::eyre!("No markdown corpus files found in {}.", corpus_dir.display())); + } + + Ok(out) +} + +fn load_queries(path: &PathBuf) -> color_eyre::Result { + let raw = fs::read_to_string(path)?; + + Ok(serde_json::from_str(&raw)?) +} + +fn worker_max_iterations(note_count: usize) -> usize { + env::var("ELF_BASELINE_WORKER_MAX_ITERATIONS") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or_else(|| note_count.saturating_mul(3).saturating_add(32)) +} + +fn outbox_done(counts: &BTreeMap, expected_note_count: usize) -> bool { + let done = counts.get("DONE").copied().unwrap_or_default(); + let expected = i64::try_from(expected_note_count).unwrap_or(i64::MAX); + let pending = counts.get("PENDING").copied().unwrap_or_default(); + let failed = counts.get("FAILED").copied().unwrap_or_default(); + let claimed = counts.get("CLAIMED").copied().unwrap_or_default(); + + done >= expected && pending == 0 && failed == 0 && claimed == 0 +} + +fn retrieval_check(query_results: &[QueryResult]) -> CheckResult { + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let fail_count = query_results.len().saturating_sub(pass_count); + let expected_evidence_ids = query_results + .iter() + .map(|result| { + serde_json::json!({ + "query_id": result.id, + "expected": result.expected_evidence_ids, + "allowed_alternates": result.allowed_alternate_evidence_ids, + }) + }) + .collect::>(); + + CheckResult { + name: "same_corpus_retrieval", + status: if fail_count == 0 { "pass" } else { "wrong_result" }, + reason: if fail_count == 0 { + "All same-corpus retrieval queries returned expected evidence.".to_string() + } else { + format!("{fail_count} same-corpus retrieval query case(s) missed expected evidence.") + }, + evidence: serde_json::json!({ + "total": query_results.len(), + "pass": pass_count, + "fail": fail_count, + "wrong_result_count": fail_count, + "expected_evidence_ids": expected_evidence_ids, + }), + } +} + +fn worker_indexing_check(evidence: WorkerRunEvidence) -> CheckResult { + let pass = outbox_done(&evidence.after, evidence.expected_note_count) + && evidence.chunk_rows >= i64::try_from(evidence.expected_note_count).unwrap_or(i64::MAX) + && evidence.chunk_embedding_rows >= evidence.chunk_rows; + + CheckResult { + name: "async_worker_indexing_e2e", + status: if pass { "pass" } else { "lifecycle_fail" }, + reason: if pass { + "ELF worker processed corpus outbox jobs into persisted chunks and embeddings." + .to_string() + } else { + "ELF worker did not fully process corpus outbox jobs into searchable chunks." + .to_string() + }, + evidence: serde_json::json!(evidence), + } +} + +fn resumable_backfill_check(report: &BackfillReport) -> CheckResult { + let resume_pass = !report.resume.enabled + || (report.resume.interrupted + && report.resume.resume_attempts >= 2 + && report.skipped_completed > 0); + let pass = report.completed_count == report.source_count + && report.duplicate_source_notes.is_empty() + && resume_pass; + + CheckResult { + name: "resumable_backfill_no_duplicates", + status: if pass { "pass" } else { "lifecycle_fail" }, + reason: if pass { + "Checkpointed backfill resumed from durable progress and did not duplicate source documents." + .to_string() + } else { + "Checkpointed backfill did not complete cleanly, did not prove resume, or duplicated source documents." + .to_string() + }, + evidence: serde_json::json!(report), + } +} + +fn backfill_batch_size() -> usize { + parse_env_usize("ELF_BASELINE_BACKFILL_BATCH_SIZE").unwrap_or(32).max(1) +} + +fn worker_concurrency() -> usize { + let default = match env::var("ELF_BASELINE_PROFILE").as_deref() { + Ok("backfill" | "large") => 4, + Ok("stress") => 4, + Ok("scale" | "full") => 2, + _ => 1, + }; + + parse_env_usize("ELF_BASELINE_WORKER_CONCURRENCY").unwrap_or(default).clamp(1, 32) +} + +fn backfill_resume_probe_enabled() -> bool { + env::var("ELF_BASELINE_BACKFILL_RESUME_PROBE") + .map(|value| value != "0" && !value.eq_ignore_ascii_case("false")) + .unwrap_or(true) +} + +fn backfill_interrupt_after(source_count: usize) -> Option { + if !backfill_resume_probe_enabled() || source_count <= 1 { + return None; + } + + let configured = parse_env_usize("ELF_BASELINE_BACKFILL_INTERRUPT_AFTER"); + let default = (source_count / 2).max(1); + + Some(configured.unwrap_or(default).clamp(1, source_count.saturating_sub(1))) +} + +fn backfill_checkpoint_path(out: &Path) -> PathBuf { + env_string(&["ELF_BASELINE_BACKFILL_CHECKPOINT"]) + .map(PathBuf::from) + .unwrap_or_else(|| out.with_file_name("elf-backfill-checkpoint.json")) +} + +fn empty_backfill_checkpoint(corpus_hash: &str) -> BackfillCheckpoint { + BackfillCheckpoint { + schema: BACKFILL_CHECKPOINT_SCHEMA.to_string(), + corpus_hash: corpus_hash.to_string(), + completed: BTreeMap::new(), + } +} + +fn load_backfill_checkpoint( + path: &Path, + corpus_hash: &str, +) -> color_eyre::Result { + if !path.exists() { + return Ok(empty_backfill_checkpoint(corpus_hash)); + } + + let raw = fs::read_to_string(path)?; + let checkpoint = serde_json::from_str::(&raw)?; + + if checkpoint.schema == BACKFILL_CHECKPOINT_SCHEMA && checkpoint.corpus_hash == corpus_hash { + Ok(checkpoint) + } else { + Ok(empty_backfill_checkpoint(corpus_hash)) + } +} + +fn write_backfill_checkpoint( + path: &Path, + checkpoint: &BackfillCheckpoint, +) -> color_eyre::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let raw = serde_json::to_string_pretty(checkpoint)?; + let tmp_path = path.with_extension("json.tmp"); + + fs::write(&tmp_path, raw)?; + fs::rename(tmp_path, path)?; + + Ok(()) +} + +fn source_hash(note: &CorpusNote) -> String { + let mut hasher = Hasher::new(); + + hasher.update(note.source_doc.as_bytes()); + hasher.update(b"\0"); + hasher.update(note.key.as_bytes()); + hasher.update(b"\0"); + hasher.update(note.text.as_bytes()); + + hasher.finalize().to_hex().to_string() +} + +fn corpus_hash(notes: &[CorpusNote]) -> String { + let mut hasher = Hasher::new(); + + for note in notes { + hasher.update(note.source_doc.as_bytes()); + hasher.update(b"\0"); + hasher.update(source_hash(note).as_bytes()); + hasher.update(b"\0"); + } + + hasher.finalize().to_hex().to_string() +} + +fn checkpoint_entry_valid( + note: &CorpusNote, + entry: &BackfillCheckpointEntry, + existing: &BTreeMap, +) -> bool { + let expected_hash = source_hash(note); + + if entry.source_hash != expected_hash { + return false; + } + + existing.get(¬e.source_doc).is_some_and(|stored| { + stored.note_id == entry.note_id + && stored.source_hash.as_deref() == Some(expected_hash.as_str()) + }) +} + +fn note_input(note: &CorpusNote) -> AddNoteInput { + let hash = source_hash(note); + + AddNoteInput { + r#type: "fact".to_string(), + key: Some(note.key.clone()), + text: note.text.clone(), + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline corpus", + "title": note.title, + "document": note.source_doc, + "source_hash": hash, + }), + write_policy: None, + } +} + +fn note_op_string(op: NoteOp) -> color_eyre::Result { + let value = serde_json::to_value(op)?; + + value + .as_str() + .map(ToString::to_string) + .ok_or_else(|| eyre::eyre!("Serialized note op was not a string.")) +} + +fn concurrent_note_count() -> usize { + if let Ok(value) = env::var("ELF_BASELINE_CONCURRENT_NOTES") + && let Ok(parsed) = value.parse::() + { + return parsed.max(1); + } + + match env::var("ELF_BASELINE_PROFILE").as_deref() { + Ok("backfill" | "large") => 32, + Ok("stress") => 32, + Ok("scale" | "full") => 16, + _ => 4, + } +} + +fn concurrent_add_request(index: usize) -> AddNoteRequest { + let marker = concurrent_marker(index); + + AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(format!("concurrent_{index:03}")), + text: format!( + "Concurrent benchmark note {index:03} records marker `{marker}` for write race validation." + ), + structured: None, + importance: 0.91, + confidence: 0.96, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline concurrent write check", + "document": format!("concurrent-{index:03}.md"), + }), + write_policy: None, + }], + } +} + +fn concurrent_query_case(index: usize) -> QueryCase { + let marker = concurrent_marker(index); + + QueryCase::generated( + format!("concurrent-{index:03}"), + format!("Find the concurrent benchmark note containing marker {marker}."), + format!("concurrent-{index:03}.md"), + vec![marker], + ) +} + +fn concurrent_marker(index: usize) -> String { + format!("concurrency-{}-{index:03}", marker_word(index)) +} + +fn soak_config() -> SoakConfig { + let profile = env::var("ELF_BASELINE_PROFILE").ok(); + let (default_seconds, default_rounds) = match profile.as_deref() { + Some("backfill" | "large") => (60, 6), + Some("stress") => (60, 6), + Some("scale" | "full") => (15, 3), + _ => (0, 0), + }; + + SoakConfig { + target_seconds: parse_env_u64("ELF_BASELINE_SOAK_SECONDS").unwrap_or(default_seconds), + write_rounds: parse_env_usize("ELF_BASELINE_SOAK_ROUNDS").unwrap_or(default_rounds), + probe_interval_millis: parse_env_u64("ELF_BASELINE_SOAK_PROBE_INTERVAL_MS") + .unwrap_or(1_000) + .max(100), + } +} + +fn parse_env_u64(name: &str) -> Option { + env::var(name).ok()?.parse::().ok() +} + +fn parse_env_usize(name: &str) -> Option { + env::var(name).ok()?.parse::().ok() +} + +fn soak_add_request(index: usize) -> AddNoteRequest { + let marker = soak_marker(index); + let (topic, detail) = soak_topic(index); + + AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(format!("soak_{index:03}")), + text: format!( + "Soak benchmark note {index:03} covers {topic}. {detail} It records stability marker `{marker}` for repeated worker and search probes." + ), + structured: None, + importance: 0.92, + confidence: 0.97, + ttl_days: None, + source_ref: serde_json::json!({ + "source": "ELF live baseline soak stability check", + "document": format!("soak-{index:03}.md"), + }), + write_policy: None, + }], + } +} + +fn soak_query_case(index: usize) -> QueryCase { + let marker = soak_marker(index); + let (topic, _) = soak_topic(index); + + QueryCase::generated( + format!("soak-{index:03}"), + format!("Find the soak benchmark note about {topic} containing marker {marker}."), + format!("soak-{index:03}.md"), + vec![marker], + ) +} + +fn soak_marker(index: usize) -> String { + format!("soak-stability-{}-{index:03}", marker_word(index)) +} + +fn marker_word(index: usize) -> &'static str { + const WORDS: &[&str] = &[ + "aurora", "banyan", "cobalt", "delta", "ember", "fennel", "granite", "harbor", "indigo", + "jasper", "keystone", "lantern", "meridian", "nebula", "onyx", "prairie", "quartz", + "raven", "solstice", "topaz", "umbra", "verdant", "willow", "xenon", "yarrow", "zephyr", + "atlas", "beacon", "citadel", "drift", "equinox", "forge", + ]; + + WORDS[index % WORDS.len()] +} + +fn soak_topic(index: usize) -> (&'static str, &'static str) { + const TOPICS: &[(&str, &str)] = &[ + ( + "release rollback fencing", + "The rollback controller waits for a signed deploy fence before the next canary.", + ), + ( + "invoice export batching", + "The exporter groups invoice CSV rows by merchant ledger before upload.", + ), + ("search shard warming", "The search router warms tenant shard caches before rank probes."), + ( + "incident pager routing", + "The incident desk routes page ownership through the release captain.", + ), + ( + "backup restore rehearsal", + "The restore rehearsal checks WAL freshness before dry-run recovery.", + ), + ( + "feature flag expiry", + "The flag sweeper archives expired toggles before deleting rollout rules.", + ), + ( + "support queue triage", + "The support classifier separates billing tickets from access tickets.", + ), + ( + "analytics job watermark", + "The analytics worker stores a warehouse watermark after each import.", + ), + ]; + + TOPICS[index % TOPICS.len()] +} + +fn concurrency_probe_indexes(note_count: usize) -> Vec { + let mut indexes = vec![0, note_count / 2, note_count.saturating_sub(1)]; + + indexes.sort_unstable(); + indexes.dedup(); + + indexes +} + +fn current_rss_kb() -> Option { + let status = fs::read_to_string("/proc/self/status").ok()?; + + status.lines().find_map(|line| { + let rest = line.strip_prefix("VmHWM:")?.trim(); + let value = rest.split_whitespace().next()?; + + value.parse::().ok() + }) +} + +fn path_size_bytes(path: &Path) -> color_eyre::Result { + let metadata = fs::metadata(path)?; + + if metadata.is_file() { + return Ok(metadata.len()); + } + if !metadata.is_dir() { + return Ok(0); + } + + let mut bytes = 0_u64; + + for entry in fs::read_dir(path)? { + let entry = entry?; + + bytes = bytes.saturating_add(path_size_bytes(&entry.path())?); + } + + Ok(bytes) +} + +fn cost_proxy_report( + notes: &[CorpusNote], + queries: &[QueryResult], + embedding: &EmbeddingRuntimeReport, +) -> CostProxyReport { + let note_chars = notes.iter().map(|note| note.text.len()).sum::(); + let query_chars = queries.iter().map(|query| query.query.len()).sum::(); + let estimated_input_chars = note_chars.saturating_add(query_chars); + let estimated_input_tokens = estimated_input_chars.saturating_add(3) / 4; + let configured_usd_per_1k_tokens = env::var("ELF_BASELINE_COST_PER_1K_TOKENS_USD") + .ok() + .and_then(|value| value.parse::().ok()); + let estimated_usd = + configured_usd_per_1k_tokens.map(|rate| estimated_input_tokens as f64 / 1_000.0 * rate); + + CostProxyReport { + schema: "elf.live_baseline.cost_proxy/v1", + scope: "primary corpus note text plus declared same-corpus query text", + embedding_mode: embedding.mode, + estimated_input_chars, + estimated_input_tokens, + token_estimation: "ceil(ascii_utf8_chars / 4)", + configured_usd_per_1k_tokens, + estimated_usd, + document_count: notes.len(), + query_count: queries.len(), + } +} + +fn latency_percentile(latencies: &[f64], percentile: f64) -> f64 { + if latencies.is_empty() { + return 0.0; + } + + let mut sorted = latencies.to_vec(); + + sorted.sort_by(f64::total_cmp); + + let rank = ((sorted.len().saturating_sub(1)) as f64 * percentile).ceil() as usize; + + sorted[rank.min(sorted.len().saturating_sub(1))] +} + +fn operational_case( + name: &'static str, + default_status: &'static str, + operator_status: &'static str, + command: &'static str, + evidence: &'static str, + safety: &'static str, +) -> OperationalCase { + OperationalCase { name, default_status, operator_status, command, evidence, safety } +} + +fn operational_cases() -> Vec { + vec![ + operational_case( + "private_corpus_addendum", + "fails_closed_without_manifest", + "opt_in", + "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json cargo make baseline-production-private-addendum", + "tmp/live-baseline/private-production-addendum.md", + "Markdown addendum reports manifest id, evidence ids, tasks, checks, latency, resource, and cost proxy fields; private text remains in tmp JSON/logs only.", + ), + operational_case( + "backfill_10k_resume", + "not_run", + "opt_in", + "cargo make baseline-backfill-10k-docker", + "tmp/live-baseline/live-baseline-report.json", + "Runs Docker-owned dependencies and records checkpoint resume, duplicates, latency percentiles, resource usage, and cost proxy fields.", + ), + operational_case( + "backfill_100k_resume", + "guarded", + "expensive_opt_in", + "ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker", + "tmp/live-baseline/live-baseline-report.json", + "Fails closed unless the expensive-run guard is explicitly enabled.", + ), + operational_case( + "provider_outage", + "not_run", + "documented_operator_probe", + "ELF_BASELINE_ELF_EMBEDDING_MODE=provider with an unavailable embedding endpoint and cargo make baseline-production-synthetic", + "ELF project status incomplete or blocked with provider failure in tmp/live-baseline/ELF.log", + "Use only synthetic or sanitized manifests; do not place provider keys in committed files.", + ), + operational_case( + "compose_start_stop_upgrade", + "documented", + "runbook", + "docs/guide/single_user_production.md Sections 2, 4, and 5", + "storage health, API health, migration check, and post-upgrade search smoke", + "Backup Postgres before binary/config upgrade; rollback restores the previous backup and rebuilds Qdrant.", + ), + operational_case( + "postgres_restore_qdrant_rebuild", + "documented", + "runbook_or_clean_volume_proof", + "docs/guide/single_user_production.md Sections 6 through 9", + "Postgres restored row count, admin qdrant rebuild counts, and search-after-restore response", + "Qdrant remains derived and rebuild uses Postgres-held vectors without embedding provider calls.", + ), + operational_case( + "migration_rollback", + "documented", + "runbook", + "docs/guide/single_user_production.md Section 5 rollback path", + "pre-upgrade backup path, restored source rows, qdrant rebuild, and health check", + "No reverse migration is claimed; rollback means previous binary/config plus restored Postgres backup.", + ), + operational_case( + "unattended_soak", + "bounded", + "opt_in", + "ELF_BASELINE_PROJECTS=ELF ELF_BASELINE_PROFILE=stress ELF_BASELINE_SOAK_SECONDS=3600 cargo make baseline-live-docker", + "soak_stability_e2e check and resource_envelope check in tmp/live-baseline/live-baseline-report.json", + "Long soak duration is env-controlled and not part of the default smoke profile.", + ), + ] +} + +fn incomplete_check(name: &'static str, reason: &str) -> CheckResult { + CheckResult { + name, + status: "incomplete", + reason: reason.to_string(), + evidence: serde_json::json!({}), + } +} + +fn summarize_checks(checks: &[CheckResult]) -> CheckSummary { + let wrong_result = checks.iter().filter(|check| check.status == "wrong_result").count(); + let lifecycle_fail = checks.iter().filter(|check| check.status == "lifecycle_fail").count(); + + CheckSummary { + total: checks.len(), + pass: checks.iter().filter(|check| check.status == "pass").count(), + fail: wrong_result + lifecycle_fail, + wrong_result, + lifecycle_fail, + incomplete: checks.iter().filter(|check| check.status == "incomplete").count(), + blocked: checks.iter().filter(|check| check.status == "blocked").count(), + not_encoded: checks.iter().filter(|check| check.status == "not_encoded").count(), + } +} + +fn project_status_from_summary(summary: &CheckSummary) -> &'static str { + if summary.wrong_result > 0 { + "wrong_result" + } else if summary.lifecycle_fail > 0 { + "lifecycle_fail" + } else if summary.blocked > 0 { + "blocked" + } else if summary.incomplete > 0 { + "incomplete" + } else if summary.not_encoded > 0 { + "not_encoded" + } else { + "pass" + } +} + +fn title_from_markdown(raw: &str, source_doc: &str) -> String { + raw.lines() + .find_map(|line| line.trim_start().strip_prefix("# ")) + .map(str::trim) + .filter(|title| !title.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| source_doc.to_string()) +} + +fn key_for_doc(doc: &str) -> String { + let stem = Path::new(doc).file_stem().and_then(|stem| stem.to_str()).unwrap_or(doc); + let mut key = String::with_capacity(stem.len()); + let mut last_was_separator = false; + + for ch in stem.chars() { + if ch.is_ascii_alphanumeric() { + key.push(ch.to_ascii_lowercase()); + + last_was_separator = false; + } else if !last_was_separator && !key.is_empty() { + key.push('_'); + + last_was_separator = true; + } + } + + if key.ends_with('_') { + key.pop(); + } + + if key.is_empty() { "doc".to_string() } else { key } +} + +fn evidence_id_for_doc(doc: &str) -> String { + Path::new(doc).file_stem().and_then(|stem| stem.to_str()).unwrap_or(doc).to_string() +} + +fn expected_docs_for_case(case: &QueryCase) -> Vec { + let mut docs = Vec::with_capacity(case.allowed_alternate_docs.len().saturating_add(1)); + + docs.push(case.expected_doc.clone()); + docs.extend(case.allowed_alternate_docs.iter().cloned()); + + docs +} + +fn embed_text(text: &str, vector_dim: u32) -> Vec { + let dim = vector_dim as usize; + let mut vector = vec![0.0_f32; dim]; + + if dim == 0 { + return vector; + } + + let normalized = normalize_ascii_alnum_lowercase(text); + + for term in normalized.split_whitespace() { + if term.len() < 2 { + continue; + } + + let hash = blake3::hash(term.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + let sign = if bytes[4] & 1 == 0 { 1.0 } else { -1.0 }; + + vector[idx] += sign; + } + + if vector.iter().all(|value| *value == 0.0) { + let hash = blake3::hash(text.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + + vector[idx] = 1.0; + } + + let norm = vector.iter().map(|value| value * value).sum::().sqrt(); + + if norm > 0.0 { + for value in &mut vector { + *value /= norm; + } + } + + vector +} + +fn normalize_ascii_alnum_lowercase(text: &str) -> String { + let mut normalized = String::with_capacity(text.len()); + + for ch in text.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + normalized +} + +fn terms(text: &str) -> HashSet { + text.split(|ch: char| !ch.is_ascii_alphanumeric()) + .map(str::trim) + .filter(|term| !term.is_empty()) + .map(str::to_ascii_lowercase) + .collect() +} + +fn distinctive_terms(text: &str, limit: usize) -> Vec { + let stop_words = [ + "the", "and", "for", "with", "that", "this", "from", "into", "must", "uses", "after", + "before", "query", "memory", "note", + ]; + let stop_words = stop_words.into_iter().collect::>(); + let mut out = Vec::new(); + + for raw in text.split(|ch: char| !ch.is_ascii_alphanumeric()) { + let term = raw.trim(); + + if term.len() < 5 { + continue; + } + + let lowered = term.to_ascii_lowercase(); + + if stop_words.contains(lowered.as_str()) || out.iter().any(|existing| existing == term) { + continue; + } + + out.push(term.to_string()); + + if out.len() >= limit { + break; + } + } + + out +} + +fn contains_case_insensitive(haystack: &str, needle: &str) -> bool { + haystack.to_ascii_lowercase().contains(&needle.to_ascii_lowercase()) +} + +fn git_head() -> color_eyre::Result { + if let Ok(head) = env::var("ELF_BASELINE_ELF_HEAD") { + let head = head.trim(); + + if !head.is_empty() { + return Ok(head.to_string()); + } + } + + let output = Command::new("git").args(["rev-parse", "HEAD"]).output()?; + + if !output.status.success() { + return Err(eyre::eyre!("git rev-parse HEAD failed.")); + } + + Ok(String::from_utf8(output.stdout)?.trim().to_string()) +} + +async fn resource_envelope_check( + service: &ElfService, + corpus_dir: &Path, + report_path: &Path, + checkpoint_path: &Path, + elapsed_seconds: f64, +) -> CheckResult { + let max_elapsed_seconds = env::var("ELF_BASELINE_MAX_ELF_SECONDS") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(600.0); + let max_rss_kb = env::var("ELF_BASELINE_MAX_ELF_RSS_KB") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(1_500_000); + let rss_kb = current_rss_kb(); + let pass = elapsed_seconds <= max_elapsed_seconds && rss_kb.is_none_or(|rss| rss <= max_rss_kb); + let postgres_database_bytes = postgres_database_bytes(service).await.ok(); + let corpus_dir_bytes = path_size_bytes(corpus_dir).unwrap_or_default(); + let report_dir_bytes = report_path.parent().and_then(|path| path_size_bytes(path).ok()); + let checkpoint_file_bytes = checkpoint_path.metadata().ok().map(|metadata| metadata.len()); + + CheckResult { + name: "resource_envelope", + status: if pass { "pass" } else { "lifecycle_fail" }, + reason: if pass { + "ELF live-baseline runtime stayed within the configured local resource envelope." + .to_string() + } else { + "ELF live-baseline runtime exceeded the configured local resource envelope.".to_string() + }, + evidence: serde_json::json!(ResourceEnvelopeEvidence { + elapsed_seconds, + max_elapsed_seconds, + rss_kb, + max_rss_kb, + postgres_database_bytes, + corpus_dir_bytes, + report_dir_bytes, + checkpoint_file_bytes, + }), + } +} + +async fn postgres_database_bytes(service: &ElfService) -> color_eyre::Result { + let bytes = sqlx::query_scalar::<_, i64>("SELECT pg_database_size(current_database())::bigint") + .fetch_one(&service.db.pool) + .await?; + + Ok(bytes) +} + +async fn load_existing_backfill_notes( + service: &ElfService, +) -> color_eyre::Result> { + let rows = sqlx::query_as::<_, (Uuid, String, Option)>( + "\ +SELECT note_id, source_ref->>'document' AS source_doc, source_ref->>'source_hash' AS source_hash +FROM memory_notes +WHERE tenant_id = $1 + AND project_id = $2 + AND agent_id = $3 + AND scope = $4 + AND status = 'active' + AND source_ref->>'source' = 'ELF live baseline corpus' + AND source_ref->>'document' IS NOT NULL +ORDER BY updated_at DESC", + ) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(SCOPE) + .fetch_all(&service.db.pool) + .await?; + let mut out = BTreeMap::new(); + + for (note_id, source_doc, hash) in rows { + out.entry(source_doc).or_insert(ExistingBackfillNote { note_id, source_hash: hash }); + } + + Ok(out) +} + +async fn duplicate_source_notes( + service: &ElfService, +) -> color_eyre::Result> { + let rows = sqlx::query_as::<_, (String, i64, Vec)>( + "\ +SELECT + source_ref->>'document' AS source_doc, + COUNT(*)::bigint AS count, + array_agg(note_id ORDER BY note_id)::uuid[] AS note_ids +FROM memory_notes +WHERE tenant_id = $1 + AND project_id = $2 + AND agent_id = $3 + AND scope = $4 + AND status = 'active' + AND source_ref->>'source' = 'ELF live baseline corpus' + AND source_ref->>'document' IS NOT NULL +GROUP BY source_ref->>'document' +HAVING COUNT(*) > 1 +ORDER BY source_doc", + ) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(SCOPE) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|(source_doc, count, note_ids)| DuplicateSourceNote { source_doc, count, note_ids }) + .collect()) +} + +async fn run_resumable_backfill( + service: &ElfService, + notes: &[CorpusNote], + checkpoint_path: &Path, +) -> color_eyre::Result { + let started_at = Instant::now(); + let corpus_hash = corpus_hash(notes); + let batch_size = backfill_batch_size(); + let interrupt_after = backfill_interrupt_after(notes.len()); + let first_attempt = run_backfill_attempt( + service, + notes, + checkpoint_path, + &corpus_hash, + batch_size, + 1, + interrupt_after, + ) + .await?; + let interrupted = first_attempt.interrupted; + let completed_before_resume = first_attempt.checkpoint_completed; + let mut attempts = Vec::new(); + + attempts.push(first_attempt); + + if interrupted { + attempts.push( + run_backfill_attempt( + service, + notes, + checkpoint_path, + &corpus_hash, + batch_size, + 2, + None, + ) + .await?, + ); + } + + let checkpoint = load_backfill_checkpoint(checkpoint_path, &corpus_hash)?; + let existing = load_existing_backfill_notes(service).await?; + let mut note_ids = Vec::with_capacity(notes.len()); + + for note in notes { + let Some(entry) = checkpoint.completed.get(¬e.source_doc) else { + return Err(eyre::eyre!( + "Backfill checkpoint missing completed source {}.", + note.source_doc + )); + }; + + if !checkpoint_entry_valid(note, entry, &existing) { + return Err(eyre::eyre!( + "Backfill checkpoint entry for {} does not match Postgres state.", + note.source_doc + )); + } + + note_ids.push(entry.note_id); + } + + let duplicate_source_notes = duplicate_source_notes(service).await?; + let attempted_writes = attempts.iter().map(|attempt| attempt.attempted_writes).sum(); + let skipped_completed = attempts.iter().map(|attempt| attempt.skipped_completed).sum(); + let completed_after_resume = checkpoint.completed.len(); + let report = BackfillReport { + checkpoint_path: checkpoint_path.display().to_string(), + corpus_hash, + source_count: notes.len(), + completed_count: note_ids.len(), + batch_size, + worker_concurrency: worker_concurrency(), + elapsed_seconds: started_at.elapsed().as_secs_f64(), + attempted_writes, + skipped_completed, + duplicate_source_notes, + resume: BackfillResumeReport { + enabled: interrupt_after.is_some(), + interrupted, + interrupt_after, + resume_attempts: attempts.len(), + completed_before_resume, + completed_after_resume, + }, + attempts, + }; + + Ok(BackfillOutcome { report, note_ids }) +} + +async fn run_backfill_attempt( + service: &ElfService, + notes: &[CorpusNote], + checkpoint_path: &Path, + corpus_hash: &str, + batch_size: usize, + attempt: usize, + interrupt_after: Option, +) -> color_eyre::Result { + let mut checkpoint = load_backfill_checkpoint(checkpoint_path, corpus_hash)?; + let existing = load_existing_backfill_notes(service).await?; + let notes_by_source = + notes.iter().map(|note| (note.source_doc.as_str(), note)).collect::>(); + let checkpoint_len_before_prune = checkpoint.completed.len(); + + checkpoint.completed.retain(|source_doc, entry| { + notes_by_source + .get(source_doc.as_str()) + .is_some_and(|note| checkpoint_entry_valid(note, entry, &existing)) + }); + + if checkpoint.completed.len() != checkpoint_len_before_prune { + write_backfill_checkpoint(checkpoint_path, &checkpoint)?; + } + + let mut pending = Vec::new(); + let mut skipped_completed = 0_usize; + + for note in notes { + if checkpoint.completed.contains_key(¬e.source_doc) { + skipped_completed += 1; + } else { + pending.push(note); + } + } + + let max_writes = interrupt_after.unwrap_or(usize::MAX); + let mut attempted_writes = 0_usize; + let mut completed_writes = 0_usize; + let mut cursor = 0_usize; + + while cursor < pending.len() && attempted_writes < max_writes { + let remaining_budget = max_writes.saturating_sub(attempted_writes); + let take = batch_size.min(remaining_budget).min(pending.len() - cursor); + let batch = &pending[cursor..cursor + take]; + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: batch.iter().map(|note| note_input(note)).collect(), + }) + .await?; + + if response.results.len() != batch.len() { + return Err(eyre::eyre!( + "Backfill add_note returned {} results for {} inputs.", + response.results.len(), + batch.len() + )); + } + + for (note, result) in batch.iter().zip(response.results) { + let op = note_op_string(result.op)?; + + if op == "REJECTED" { + return Err(eyre::eyre!( + "Backfill note {} was rejected: {:?}.", + note.source_doc, + result.reason_code + )); + } + + let note_id = result.note_id.ok_or_else(|| { + eyre::eyre!("Backfill note {} did not return a note_id.", note.source_doc) + })?; + + checkpoint.completed.insert( + note.source_doc.clone(), + BackfillCheckpointEntry { + note_id, + key: note.key.clone(), + source_hash: source_hash(note), + op, + }, + ); + + completed_writes += 1; + } + + attempted_writes += batch.len(); + cursor += batch.len(); + + write_backfill_checkpoint(checkpoint_path, &checkpoint)?; + } + + let interrupted = cursor < pending.len(); + + Ok(BackfillAttemptEvidence { + attempt, + resumed: skipped_completed > 0, + interrupt_after, + skipped_completed, + attempted_writes, + completed_writes, + checkpoint_completed: checkpoint.completed.len(), + interrupted, + }) +} + +#[tokio::main] +async fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + let args = Args::parse(); + let out = args.out.clone(); + let report = run(args).await?; + let raw = serde_json::to_string_pretty(&report)?; + + fs::write(out, raw)?; + + Ok(()) +} + +async fn run(args: Args) -> color_eyre::Result { + let started_at = Instant::now(); + let base_dsn = env::var("ELF_PG_DSN") + .map_err(|_| eyre::eyre!("ELF_PG_DSN must be set for live ELF baseline."))?; + let qdrant_url = env::var("ELF_QDRANT_GRPC_URL") + .or_else(|_| env::var("ELF_QDRANT_URL")) + .map_err(|_| eyre::eyre!("ELF_QDRANT_GRPC_URL or ELF_QDRANT_URL must be set."))?; + let test_db = TestDatabase::new(&base_dsn).await?; + let collection = test_db.collection_name("elf_live_baseline_notes"); + let docs_collection = test_db.collection_name("elf_live_baseline_docs"); + let runtime = BaselineRuntime { + config_path: args.config.clone(), + dsn: test_db.dsn().to_string(), + qdrant_url, + collection, + docs_collection, + }; + let service = Arc::new(build_service(&runtime).await?); + let notes = load_corpus_notes(&args.corpus)?; + let backfill_checkpoint_path = backfill_checkpoint_path(&args.out); + let backfill = run_resumable_backfill(&service, ¬es, &backfill_checkpoint_path).await?; + let note_ids = backfill.note_ids; + let initial_worker = + run_worker_until_indexed(&runtime, &service, ¬e_ids, "corpus_upsert").await?; + let rebuild = service.rebuild_qdrant().await?; + let query_manifest = load_queries(&args.queries)?; + let query_results = run_queries(&service, query_manifest.queries).await?; + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let fail_count = query_results.len().saturating_sub(pass_count); + let latency_ms_total = query_results.iter().map(|result| result.latency_ms).sum::(); + let latency_ms_mean = latency_ms_total / query_results.len().max(1) as f64; + let latency_values = query_results.iter().map(|result| result.latency_ms).collect::>(); + let latency_ms_p50 = latency_percentile(&latency_values, 0.50); + let latency_ms_p95 = latency_percentile(&latency_values, 0.95); + let latency_ms_p99 = latency_percentile(&latency_values, 0.99); + let latency_ms_max = latency_values.iter().copied().fold(0.0_f64, f64::max); + let retrieval_status = + if fail_count == 0 { "retrieval_pass" } else { "retrieval_wrong_result" }; + let mut checks = vec![ + resumable_backfill_check(&backfill.report), + retrieval_check(&query_results), + worker_indexing_check(initial_worker), + ]; + + checks.extend(run_lifecycle_checks(&runtime, &service, ¬es, ¬e_ids).await?); + checks.push(run_concurrent_write_check(&runtime, Arc::clone(&service)).await?); + + if let Some(soak_check) = run_soak_stability_check(&runtime, Arc::clone(&service)).await? { + checks.push(soak_check); + } + + checks.push( + resource_envelope_check( + &service, + &args.corpus, + &args.out, + &backfill_checkpoint_path, + started_at.elapsed().as_secs_f64(), + ) + .await, + ); + + let check_summary = summarize_checks(&checks); + let status = project_status_from_summary(&check_summary); + let reason = if status == "pass" { + "ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query" + .to_string() + } else { + format!( + "ELF reported {} wrong-result, {} lifecycle-failure, {} blocked, {} incomplete, and {} not-encoded live-baseline check(s)", + check_summary.wrong_result, + check_summary.lifecycle_fail, + check_summary.blocked, + check_summary.incomplete, + check_summary.not_encoded + ) + }; + let embedding = embedding_runtime_report(&service.cfg); + let cost_proxy = cost_proxy_report(¬es, &query_results, &embedding); + let report = ElfBaselineReport { + schema: "elf.live_baseline.elf_result/v1", + status, + retrieval_status, + reason, + head: git_head().unwrap_or_else(|_| "unknown".to_string()), + embedding, + cost_proxy, + backfill: backfill.report, + indexing: IndexingReport { + note_count: notes.len(), + rebuild_rebuilt_count: rebuild.rebuilt_count, + rebuild_missing_vector_count: rebuild.missing_vector_count, + rebuild_error_count: rebuild.error_count, + }, + summary: QuerySummary { + total: query_results.len(), + pass: pass_count, + fail: fail_count, + wrong_result_count: fail_count, + latency_ms_total, + latency_ms_mean, + latency_ms_p50, + latency_ms_p95, + latency_ms_p99, + latency_ms_max, + }, + check_summary, + checks, + queries: query_results, + ops_cases: operational_cases(), + }; + + drop(service); + + test_db.cleanup().await?; + + Ok(report) +} + +async fn build_service(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let embedding_mode = embedding_mode()?; + let vector_dim = cfg.storage.qdrant.vector_dim; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + if embedding_mode == EmbeddingMode::Provider { + Ok(ElfService::new(cfg, db, qdrant)) + } else { + Ok(ElfService::with_providers(cfg, db, qdrant, deterministic_providers(vector_dim))) + } +} + +async fn build_worker_state(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + let docs_qdrant = + QdrantStore::new_with_collection(&cfg.storage.qdrant, &cfg.storage.qdrant.docs_collection)?; + + docs_qdrant.ensure_collection().await?; + + let tokenizer = elf_chunking::load_tokenizer(&cfg.chunking.tokenizer_repo) + .map_err(|err| eyre::eyre!("Failed to load tokenizer for live baseline worker: {err}"))?; + let chunking = ChunkingConfig { + max_tokens: cfg.chunking.max_tokens, + overlap_tokens: cfg.chunking.overlap_tokens, + }; + + Ok(WorkerState { + db, + qdrant, + docs_qdrant, + embedding: cfg.providers.embedding, + chunking, + tokenizer, + }) +} + +async fn run_worker_until_indexed( + runtime: &BaselineRuntime, + service: &ElfService, + note_ids: &[Uuid], + label: &str, +) -> color_eyre::Result { + let concurrency = worker_concurrency(); + let mut states = Vec::with_capacity(concurrency); + + for _ in 0..concurrency { + states.push(Arc::new(build_worker_state(runtime).await?)); + } + + let before = outbox_status_counts(service, note_ids).await?; + let max_iterations = worker_max_iterations(note_ids.len()); + let mut iterations = 0_usize; + + while iterations < max_iterations { + let after = outbox_status_counts(service, note_ids).await?; + + if outbox_done(&after, note_ids.len()) { + let (chunk_rows, chunk_embedding_rows) = chunk_counts(service, note_ids).await?; + let failed_jobs = failed_outbox_jobs(service, note_ids).await?; + + return Ok(WorkerRunEvidence { + label: label.to_string(), + expected_note_count: note_ids.len(), + concurrency, + iterations, + before, + after, + chunk_rows, + chunk_embedding_rows, + failed_jobs, + }); + } + + let mut set = JoinSet::new(); + + for state in &states { + let state = Arc::clone(state); + + set.spawn(async move { + worker::process_once(&state) + .await + .map_err(|err| eyre::eyre!("Worker process_once failed: {err}")) + }); + } + + while let Some(joined) = set.join_next().await { + joined??; + } + + iterations = iterations.saturating_add(concurrency); + } + + let after = outbox_status_counts(service, note_ids).await?; + let (chunk_rows, chunk_embedding_rows) = chunk_counts(service, note_ids).await?; + let failed_jobs = failed_outbox_jobs(service, note_ids).await?; + + Ok(WorkerRunEvidence { + label: label.to_string(), + expected_note_count: note_ids.len(), + concurrency, + iterations, + before, + after, + chunk_rows, + chunk_embedding_rows, + failed_jobs, + }) +} + +async fn outbox_status_counts( + service: &ElfService, + note_ids: &[Uuid], +) -> color_eyre::Result> { + if note_ids.is_empty() { + return Ok(BTreeMap::new()); + } + + let rows = sqlx::query_as::<_, (String, i64)>( + "\ +SELECT status, COUNT(*)::bigint +FROM indexing_outbox +WHERE note_id = ANY($1) +GROUP BY status +ORDER BY status", + ) + .bind(note_ids) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows.into_iter().collect()) +} + +async fn chunk_counts(service: &ElfService, note_ids: &[Uuid]) -> color_eyre::Result<(i64, i64)> { + if note_ids.is_empty() { + return Ok((0, 0)); + } + + let chunk_rows = sqlx::query_scalar::<_, i64>( + "\ +SELECT COUNT(*)::bigint +FROM memory_note_chunks +WHERE note_id = ANY($1)", + ) + .bind(note_ids) + .fetch_one(&service.db.pool) + .await?; + let chunk_embedding_rows = sqlx::query_scalar::<_, i64>( + "\ +SELECT COUNT(*)::bigint +FROM memory_note_chunks c +JOIN note_chunk_embeddings e ON e.chunk_id = c.chunk_id +WHERE c.note_id = ANY($1)", + ) + .bind(note_ids) + .fetch_one(&service.db.pool) + .await?; + + Ok((chunk_rows, chunk_embedding_rows)) +} + +async fn failed_outbox_jobs( + service: &ElfService, + note_ids: &[Uuid], +) -> color_eyre::Result> { + if note_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, (Uuid, Option, String, i32, Option)>( + "\ +SELECT o.note_id, n.key, o.op, o.attempts, o.last_error +FROM indexing_outbox o +LEFT JOIN memory_notes n ON n.note_id = o.note_id +WHERE o.note_id = ANY($1) + AND o.status = 'FAILED' +ORDER BY n.key NULLS LAST, o.note_id", + ) + .bind(note_ids) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|(note_id, note_key, op, attempts, last_error)| FailedOutboxJob { + note_id, + note_key, + op, + attempts, + last_error, + }) + .collect()) +} + +async fn run_queries( + service: &ElfService, + queries: Vec, +) -> color_eyre::Result> { + let mut out = Vec::with_capacity(queries.len()); + + for case in queries { + out.push(run_single_query(service, case).await?); + } + + Ok(out) +} + +async fn run_single_query( + service: &ElfService, + case: QueryCase, +) -> color_eyre::Result { + let top_k = env::var("ELF_BASELINE_TOP_K") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(10); + let started_at = Instant::now(); + let response = service + .search_raw(SearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::L2, + read_profile: "private_only".to_string(), + query: case.query.clone(), + top_k: Some(top_k), + candidate_k: Some(top_k.max(20).saturating_mul(4)), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let top = response.items.first(); + let top_text = top.map(|item| item.snippet.clone()).unwrap_or_default(); + let matched_terms = case + .expected_terms + .iter() + .filter(|term| contains_case_insensitive(&top_text, term)) + .cloned() + .collect::>(); + let top_key = top.and_then(|item| item.key.clone()); + let expected_docs = expected_docs_for_case(&case); + let matched_doc = + top_key.as_deref().and_then(|key| expected_docs.iter().find(|doc| key_for_doc(doc) == key)); + let top_evidence_id = top.and_then(|item| { + item.source_ref.get("document").and_then(Value::as_str).map(evidence_id_for_doc) + }); + let matched_evidence_id = matched_doc.map(|doc| evidence_id_for_doc(doc)); + let matched = matched_terms.len() == case.expected_terms.len() || matched_doc.is_some(); + let expected_evidence_ids = if case.expected_evidence_ids.is_empty() { + vec![evidence_id_for_doc(&case.expected_doc)] + } else { + case.expected_evidence_ids.clone() + }; + let allowed_alternate_evidence_ids = if case.allowed_alternate_evidence_ids.is_empty() { + case.allowed_alternate_docs.iter().map(|doc| evidence_id_for_doc(doc)).collect() + } else { + case.allowed_alternate_evidence_ids.clone() + }; + + Ok(QueryResult { + id: case.id, + task: case.task, + trace_id: response.trace_id, + query: case.query, + expected_doc: case.expected_doc, + allowed_alternate_docs: case.allowed_alternate_docs, + expected_terms: case.expected_terms, + expected_evidence_ids, + allowed_alternate_evidence_ids, + matched, + matched_terms, + top_evidence_id, + matched_evidence_id, + top_note_key: top_key, + top_snippet: top.map(|item| item.snippet.clone()), + latency_ms, + returned_count: response.items.len(), + }) +} + +async fn run_lifecycle_checks( + runtime: &BaselineRuntime, + service: &ElfService, + notes: &[CorpusNote], + note_ids: &[Uuid], +) -> color_eyre::Result> { + let Some(update_note) = notes.first() else { + return Ok(vec![incomplete_check( + "update_replaces_note_text", + "Corpus has no note to update.", + )]); + }; + let Some(update_note_id) = note_ids.first().copied() else { + return Ok(vec![incomplete_check( + "update_replaces_note_text", + "ELF add_note returned no note_id for lifecycle update.", + )]); + }; + let Some(delete_note) = notes.get(1) else { + return Ok(vec![incomplete_check( + "delete_suppresses_retrieval", + "Corpus has no note to delete.", + )]); + }; + let Some(delete_note_id) = note_ids.get(1).copied() else { + return Ok(vec![incomplete_check( + "delete_suppresses_retrieval", + "ELF add_note returned no note_id for lifecycle delete.", + )]); + }; + let Some(recovery_note) = notes.get(2) else { + return Ok(vec![incomplete_check( + "cold_start_recovery_search", + "Corpus has no stable note for recovery search.", + )]); + }; + + Ok(vec![ + run_update_replacement_check(runtime, service, update_note, update_note_id).await?, + run_delete_suppression_check(runtime, service, delete_note, delete_note_id).await?, + run_cold_start_recovery_check(runtime, service, recovery_note).await?, + ]) +} + +async fn run_update_replacement_check( + runtime: &BaselineRuntime, + service: &ElfService, + update_note: &CorpusNote, + update_note_id: Uuid, +) -> color_eyre::Result { + let update_text = "\ + Rotated auth middleware validates JWT tokens with key id `kid-v4` under \ + `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment \ + operations after the emergency key rotation." + .to_string(); + let update_response = service + .update(UpdateRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + note_id: update_note_id, + text: Some(update_text.clone()), + importance: None, + confidence: None, + ttl_days: None, + }) + .await?; + let update_worker = + run_worker_until_indexed(runtime, service, &[update_note_id], "lifecycle_update").await?; + let update_query = run_single_query( + service, + QueryCase::generated( + "lifecycle-update-new-marker".to_string(), + "Which rotated JWT key id does the auth middleware require?".to_string(), + update_note.source_doc.clone(), + vec!["kid-v4".to_string(), "RotatedJwtKeyPlan".to_string()], + ), + ) + .await?; + let old_marker_absent = update_query + .top_snippet + .as_deref() + .is_some_and(|snippet| !contains_case_insensitive(snippet, "kid-v3")); + let update_pass = update_query.matched + && old_marker_absent + && outbox_done(&update_worker.after, update_worker.expected_note_count); + + Ok(CheckResult { + name: "update_replaces_note_text", + status: if update_pass { "pass" } else { "lifecycle_fail" }, + reason: if update_pass { + "Service update plus worker indexing returned the new marker and removed the old marker from the top snippet.".to_string() + } else { + "Service update plus worker indexing did not produce a clean search result for the replacement marker.".to_string() + }, + evidence: serde_json::json!({ + "note_id": update_note_id, + "op": update_response.op, + "worker": update_worker, + "query": update_query, + "old_marker_absent": old_marker_absent, + }), + }) +} + +async fn run_delete_suppression_check( + runtime: &BaselineRuntime, + service: &ElfService, + delete_note: &CorpusNote, + delete_note_id: Uuid, +) -> color_eyre::Result { + let delete_response = service + .delete(DeleteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + note_id: delete_note_id, + }) + .await?; + let delete_worker = + run_worker_until_indexed(runtime, service, &[delete_note_id], "lifecycle_delete").await?; + let delete_query = run_single_query( + service, + QueryCase::generated( + "lifecycle-delete-suppresses-note".to_string(), + delete_note.text.clone(), + delete_note.source_doc.clone(), + distinctive_terms(&delete_note.text, 2), + ), + ) + .await?; + let delete_pass = !delete_query.matched + && outbox_done(&delete_worker.after, delete_worker.expected_note_count); + + Ok(CheckResult { + name: "delete_suppresses_retrieval", + status: if delete_pass { "pass" } else { "lifecycle_fail" }, + reason: if delete_pass { + "Service delete suppressed the deleted note from subsequent search results.".to_string() + } else { + "Deleted note was still retrievable after service delete and worker indexing." + .to_string() + }, + evidence: serde_json::json!({ + "note_id": delete_note_id, + "op": delete_response.op, + "worker": delete_worker, + "query": delete_query, + }), + }) +} + +async fn run_cold_start_recovery_check( + runtime: &BaselineRuntime, + service: &ElfService, + recovery_note: &CorpusNote, +) -> color_eyre::Result { + let recovery_service = build_service(runtime).await?; + let recovery_query = run_single_query( + &recovery_service, + QueryCase::generated( + "lifecycle-cold-start-recovery".to_string(), + recovery_note.text.clone(), + recovery_note.source_doc.clone(), + distinctive_terms(&recovery_note.text, 2), + ), + ) + .await?; + let outbox_counts = pending_outbox_counts(service).await?; + + Ok(CheckResult { + name: "cold_start_recovery_search", + status: if recovery_query.matched { "pass" } else { "lifecycle_fail" }, + reason: if recovery_query.matched { + "A newly constructed service over the same Postgres and Qdrant stores retrieved persisted evidence.".to_string() + } else { + "A newly constructed service over the same stores could not retrieve persisted evidence.".to_string() + }, + evidence: serde_json::json!({ + "query": recovery_query, + "pending_outbox_by_op": outbox_counts, + "note": recovery_note.source_doc, + }), + }) +} + +async fn pending_outbox_counts(service: &ElfService) -> color_eyre::Result> { + let rows = sqlx::query_as::<_, (String, i64)>( + "\ +SELECT op, COUNT(*)::bigint +FROM indexing_outbox +WHERE status = 'PENDING' +GROUP BY op +ORDER BY op", + ) + .fetch_all(&service.db.pool) + .await?; + + Ok(rows.into_iter().collect()) +} + +async fn run_concurrent_write_check( + runtime: &BaselineRuntime, + service: Arc, +) -> color_eyre::Result { + let note_count = concurrent_note_count(); + let mut set = JoinSet::new(); + + for index in 0..note_count { + let request = concurrent_add_request(index); + let service_ref = Arc::clone(&service); + + set.spawn(async move { + let response = service_ref.add_note(request).await?; + let note_id = response + .results + .first() + .and_then(|result| result.note_id) + .ok_or_else(|| eyre::eyre!("Concurrent add_note did not return a note_id."))?; + + Ok::(note_id) + }); + } + + let mut note_ids = Vec::with_capacity(note_count); + + while let Some(joined) = set.join_next().await { + note_ids.push(joined??); + } + + let worker_evidence = + run_worker_until_indexed(runtime, &service, ¬e_ids, "concurrent_upsert").await?; + let probe_indexes = concurrency_probe_indexes(note_count); + let mut query_results = Vec::new(); + + for index in probe_indexes { + query_results.push(run_single_query(&service, concurrent_query_case(index)).await?); + } + + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let pass = outbox_done(&worker_evidence.after, worker_evidence.expected_note_count) + && pass_count == query_results.len(); + + Ok(CheckResult { + name: "concurrent_write_search_e2e", + status: if pass { "pass" } else { "lifecycle_fail" }, + reason: if pass { + "Concurrent add_note calls were indexed by the worker and remained searchable." + .to_string() + } else { + "Concurrent add_note calls did not all become searchable after worker indexing." + .to_string() + }, + evidence: serde_json::json!({ + "note_count": note_count, + "worker": worker_evidence, + "query_summary": { + "total": query_results.len(), + "pass": pass_count, + "fail": query_results.len().saturating_sub(pass_count), + }, + "queries": query_results, + }), + }) +} + +async fn run_soak_stability_check( + runtime: &BaselineRuntime, + service: Arc, +) -> color_eyre::Result> { + let config = soak_config(); + + if config.target_seconds == 0 && config.write_rounds == 0 { + return Ok(None); + } + + let target_duration = Duration::from_secs(config.target_seconds); + let started_at = Instant::now(); + let write_rounds = config.write_rounds.max(if config.target_seconds > 0 { 1 } else { 0 }); + let mut note_ids = Vec::with_capacity(write_rounds); + let mut worker_runs = Vec::with_capacity(write_rounds); + let mut query_results = Vec::new(); + + for index in 0..write_rounds { + let response = service.add_note(soak_add_request(index)).await?; + let note_id = response + .results + .first() + .and_then(|result| result.note_id) + .ok_or_else(|| eyre::eyre!("Soak add_note did not return a note_id."))?; + + note_ids.push(note_id); + worker_runs + .push(run_worker_until_indexed(runtime, &service, &[note_id], "soak_upsert").await?); + query_results.push(run_single_query(&service, soak_query_case(index)).await?); + + if config.target_seconds > 0 && write_rounds > 1 { + let target_elapsed = target_duration.mul_f64((index + 1) as f64 / write_rounds as f64); + + if started_at.elapsed() < target_elapsed { + time::sleep(target_elapsed.saturating_sub(started_at.elapsed())).await; + } + } + } + + let mut probe_index = 0; + + while started_at.elapsed() < target_duration { + let index = probe_index % write_rounds; + + query_results.push(run_single_query(&service, soak_query_case(index)).await?); + + probe_index += 1; + + let sleep_for = Duration::from_millis(config.probe_interval_millis) + .min(target_duration.saturating_sub(started_at.elapsed())); + + if !sleep_for.is_zero() { + time::sleep(sleep_for).await; + } + } + + let elapsed_seconds = started_at.elapsed().as_secs_f64(); + let pass_count = query_results.iter().filter(|result| result.matched).count(); + let query_fail_count = query_results.len().saturating_sub(pass_count); + let worker_pass = + worker_runs.iter().all(|run| outbox_done(&run.after, run.expected_note_count)); + let duration_pass = target_duration.is_zero() || started_at.elapsed() >= target_duration; + let pass = worker_pass && duration_pass && query_fail_count == 0; + let failed_queries = query_results.iter().filter(|result| !result.matched).collect::>(); + + Ok(Some(CheckResult { + name: "soak_stability_e2e", + status: if pass { "pass" } else { "lifecycle_fail" }, + reason: if pass { + "ELF sustained repeated write, worker indexing, and search probes for the configured soak window.".to_string() + } else { + "ELF did not sustain the configured soak write/search window without a failed worker or retrieval probe.".to_string() + }, + evidence: serde_json::json!({ + "config": config, + "elapsed_seconds": elapsed_seconds, + "duration_met": duration_pass, + "worker_pass": worker_pass, + "write_note_ids": note_ids, + "worker_runs": worker_runs, + "query_summary": { + "total": query_results.len(), + "pass": pass_count, + "fail": query_fail_count, + }, + "failed_queries": failed_queries, + }), + })) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs new file mode 100644 index 00000000..eae9659f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -0,0 +1,7819 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Offline runner and publisher for real-world job benchmark fixtures. + +use std::{ + collections::{BTreeMap, BTreeSet}, + fs, + path::{Path, PathBuf}, +}; + +use clap::{Parser, Subcommand}; +use color_eyre::{Result, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + +use elf_cli::VERSION; + +const JOB_SCHEMA: &str = "elf.real_world_job/v1"; +const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; +const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_manifest/v1"; +const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1"; +const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; +const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; +const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; +const DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH: &str = + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json"; +const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; +const DEFAULT_ADAPTER_ID: &str = "fixture_smoke"; +const DEFAULT_ADAPTER_NAME: &str = "ELF fixture smoke"; +const DEFAULT_ADAPTER_BEHAVIOR: &str = "offline_fixture_response"; +const DEFAULT_ADAPTER_STORAGE_STATUS: &str = "not_encoded"; +const DEFAULT_ADAPTER_RUNTIME_STATUS: &str = "not_encoded"; +const DEFAULT_ADAPTER_NOTES: &str = "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter."; +const NOT_ENCODED_REASON: &str = "No checked-in real_world_job fixture is encoded for this suite."; +const FORBIDDEN_SOURCE_MUTATION_KEYS: [&str; 7] = [ + "delete_source", + "delete_sources", + "source_delete", + "source_mutation", + "source_mutations", + "source_note_updates", + "overwrite_source", +]; +const SUITES: &[&str] = &[ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "memory_evolution", + "consolidation", + "memory_summary", + "proactive_brief", + "scheduled_memory", + "knowledge_compilation", + "operator_debugging_ux", + "capture_integration", + "production_ops", + "personalization", + "core_archival_memory", + "context_trajectory", +]; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum Command { + /// Parse and score real_world_job fixtures, then emit a JSON report. + Run(RunArgs), + /// Render Markdown from a generated real_world_job JSON report. + Publish(PublishArgs), +} + +#[derive(Debug, Parser)] +struct RunArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)] + fixtures: PathBuf, + /// Write report JSON to this file. Omit to print to stdout. + #[arg(long, value_name = "FILE")] + out: Option, + /// Stable run id recorded in the generated report. + #[arg(long, default_value = DEFAULT_RUN_ID)] + run_id: String, + /// Adapter id recorded for the offline smoke response. + #[arg(long, default_value = DEFAULT_ADAPTER_ID)] + adapter_id: String, + /// Human-readable adapter name recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_NAME)] + adapter_name: String, + /// Adapter behavior label recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_BEHAVIOR)] + adapter_behavior: String, + /// Adapter storage typed status recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_STORAGE_STATUS)] + adapter_storage_status: String, + /// Adapter runtime typed status recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_RUNTIME_STATUS)] + adapter_runtime_status: String, + /// Adapter notes recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_NOTES)] + adapter_notes: String, + /// Real-world external adapter manifest to include in report coverage. + #[arg(long, value_name = "FILE", default_value = DEFAULT_EXTERNAL_ADAPTER_MANIFEST_PATH)] + external_adapter_manifest: PathBuf, + /// Skip loading the real-world external adapter coverage manifest. + #[arg(long)] + skip_external_adapter_manifest: bool, +} + +#[derive(Debug, Parser)] +struct PublishArgs { + /// Generated real_world_job JSON report. + #[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)] + report: PathBuf, + /// Write Markdown to this file. Omit to print to stdout. + #[arg(long, value_name = "FILE", default_value = DEFAULT_MARKDOWN_PATH)] + out: Option, +} + +#[derive(Debug, Deserialize)] +struct RealWorldJob { + schema: String, + job_id: String, + suite: String, + title: String, + corpus: Corpus, + #[serde(default)] + timeline: Vec, + prompt: Prompt, + expected_answer: ExpectedAnswer, + #[serde(default)] + required_evidence: Vec, + #[serde(default)] + negative_traps: Vec, + scoring_rubric: ScoringRubric, + allowed_uncertainty: AllowedUncertainty, + operator_debug: Option, + #[serde(default)] + tags: Vec, + #[serde(default)] + encoding: JobEncoding, + memory_evolution: Option, + memory_summary: Option, + proactive_brief: Option, + scheduled_memory: Option, +} + +#[derive(Debug, Deserialize)] +struct Corpus { + corpus_id: String, + profile: CorpusProfile, + #[serde(default)] + items: Vec, + #[serde(default)] + capture_behaviors: CaptureIntegrationReport, + + adapter_response: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum CorpusProfile { + Synthetic, + PrivateSanitized, + GeneratedPublic, + ExternalAdapter, +} +impl CorpusProfile { + fn as_str(&self) -> &'static str { + match self { + Self::Synthetic => "synthetic", + Self::PrivateSanitized => "private_sanitized", + Self::GeneratedPublic => "generated_public", + Self::ExternalAdapter => "external_adapter", + } + } +} + +#[derive(Debug, Deserialize)] +struct CorpusItem { + evidence_id: String, + kind: String, + + text: Option, + + local_ref: Option, + #[serde(default)] + source_ref: Value, + + created_at: Option, +} + +#[derive(Debug, Deserialize)] +struct TimelineEvent { + event_id: String, + ts: String, + actor: String, + action: String, + #[serde(default)] + evidence_ids: Vec, + summary: String, +} + +#[derive(Debug, Deserialize)] +struct Prompt { + role: String, + content: String, + job_mode: String, + #[serde(default)] + constraints: Vec, +} + +#[derive(Debug, Deserialize)] +struct ExpectedAnswer { + #[serde(default)] + must_include: Vec, + #[serde(default)] + must_not_include: Vec, + #[serde(default)] + evidence_links: BTreeMap, + answer_type: String, + #[serde(default)] + accepted_alternates: Vec, + #[serde(default)] + requires_caveat: bool, + #[serde(default)] + requires_refusal: bool, +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum ExpectedClaim { + Text(String), + Object { claim_id: Option, text: String }, +} +impl ExpectedClaim { + fn claim_id(&self) -> Option<&str> { + match self { + Self::Text(_) => None, + Self::Object { claim_id, .. } => claim_id.as_deref(), + } + } + + fn text(&self) -> &str { + match self { + Self::Text(text) => text, + Self::Object { text, .. } => text, + } + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum EvidenceLink { + One(String), + Many(Vec), +} +impl EvidenceLink { + fn ids(&self) -> BTreeSet { + match self { + Self::One(id) => BTreeSet::from([id.clone()]), + Self::Many(ids) => ids.iter().cloned().collect(), + } + } +} + +#[derive(Debug, Deserialize)] +struct RequiredEvidence { + evidence_id: String, + claim_id: String, + requirement: String, + + quote: Option, + + selector: Option, +} + +#[derive(Debug, Deserialize)] +struct NegativeTrap { + trap_id: String, + #[serde(rename = "type")] + trap_type: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(default)] + failure_if_used: bool, +} + +#[derive(Debug, Default, Deserialize)] +struct JobEncoding { + status: Option, + reason: Option, + follow_up: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct FollowUpInput { + title: String, + reason: String, +} + +#[derive(Debug, Deserialize)] +struct MemoryEvolution { + #[serde(default)] + current_evidence_ids: Vec, + #[serde(default)] + historical_evidence_ids: Vec, + #[serde(default)] + tombstone_evidence_ids: Vec, + #[serde(default)] + invalidation_evidence_ids: Vec, + #[serde(default)] + stale_trap_ids: Vec, + #[serde(default)] + conflicts: Vec, + update_rationale: Option, + temporal_validity: Option, + history_readback: Option, +} + +#[derive(Debug, Deserialize)] +struct EvolutionConflict { + conflict_id: String, + claim_id: String, + current_evidence_id: String, + historical_evidence_id: String, + resolved_by_evidence_id: Option, +} + +#[derive(Debug, Deserialize)] +struct UpdateRationale { + claim_id: String, + #[serde(default)] + evidence_ids: Vec, + available: bool, +} + +#[derive(Debug, Deserialize)] +struct TemporalValidity { + required: bool, + encoded: bool, + follow_up: Option, +} + +#[derive(Debug, Deserialize)] +struct HistoryReadback { + encoded: bool, + #[serde(default)] + required_event_types: Vec, + requires_note_version_links: bool, +} + +#[derive(Debug, Deserialize)] +struct MemorySummaryExpectation { + #[serde(default)] + required_categories: Vec, +} + +#[derive(Debug, Deserialize)] +struct ProactiveBriefExpectation { + #[serde(default)] + required_suggestion_kinds: Vec, +} + +#[derive(Debug, Deserialize)] +struct ScheduledMemoryExpectation { + #[serde(default)] + required_task_kinds: Vec, +} + +#[derive(Debug, Deserialize)] +struct ScoringRubric { + #[serde(default)] + dimensions: BTreeMap, + pass_threshold: f64, + #[serde(default)] + hard_fail_rules: Vec, +} + +#[derive(Debug, Deserialize)] +struct RubricDimension { + weight: f64, + max_points: f64, + criteria: Value, +} + +#[derive(Debug, Deserialize)] +struct AllowedUncertainty { + can_answer_unknown: bool, + #[serde(default)] + acceptable_phrases: Vec, + fallback_action: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdapterResponse { + adapter_id: Option, + answer: ProducedAnswer, + consolidation: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProducedAnswer { + content: String, + #[serde(default)] + claims: Vec, + #[serde(default)] + evidence_ids: Vec, + #[serde(default)] + pages: Vec, + #[serde(default)] + memory_summaries: Vec, + #[serde(default)] + proactive_briefs: Vec, + #[serde(default)] + scheduled_tasks: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + latency_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + cost: Option, + #[serde(skip_serializing_if = "Option::is_none")] + trace_explainability: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProducedClaim { + #[serde(skip_serializing_if = "Option::is_none")] + claim_id: Option, + text: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + confidence: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageArtifact { + page_id: String, + page_type: String, + title: String, + #[serde(skip_serializing_if = "Option::is_none")] + path: Option, + #[serde(default)] + sections: Vec, + #[serde(default)] + backlinks: Vec, + #[serde(default)] + lint_findings: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + rebuild: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageSection { + section_id: String, + heading: String, + role: String, + content: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(default)] + timeline_event_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + unsupported_reason: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageLintFinding { + finding_id: String, + finding_type: String, + severity: String, + text: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + trap_id: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct DerivedPageRebuild { + first_hash: String, + second_hash: String, + deterministic: bool, + #[serde(default)] + allowed_variance: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryArtifact { + summary_id: String, + contract_schema: String, + generated_at: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + #[serde(default)] + entries: Vec, + source_trace: MemorySummarySourceTrace, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryEntry { + entry_id: String, + category: String, + text: String, + #[serde(default)] + source_refs: Vec, + freshness: MemorySummaryFreshness, + rationale: MemorySummaryRationale, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryFreshness { + status: String, + #[serde(skip_serializing_if = "Option::is_none")] + observed_at: Option, + #[serde(skip_serializing_if = "Option::is_none")] + valid_from: Option, + #[serde(skip_serializing_if = "Option::is_none")] + valid_to: Option, + #[serde(skip_serializing_if = "Option::is_none")] + last_confirmed_at: Option, + #[serde(default)] + superseded_by: Vec, + #[serde(default)] + tombstone_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummaryRationale { + decision: String, + reason_code: String, + reason: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct MemorySummarySourceTrace { + #[serde(default)] + selected_source_refs: Vec, + #[serde(default)] + dropped_source_refs: Vec, + #[serde(default)] + stale_source_refs: Vec, + #[serde(default)] + superseded_source_refs: Vec, + #[serde(default)] + tombstone_source_refs: Vec, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct MemorySummarySourceTraceItem { + evidence_id: String, + #[serde(skip_serializing_if = "Option::is_none")] + status: Option, + #[serde(skip_serializing_if = "Option::is_none")] + reason: Option, + #[serde(skip_serializing_if = "Option::is_none")] + superseded_by: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveBriefArtifact { + brief_id: String, + contract_schema: String, + generated_at: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + brief_kind: String, + #[serde(default)] + suggestions: Vec, + source_trace: MemorySummarySourceTrace, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveSuggestion { + suggestion_id: String, + suggestion_kind: String, + title: String, + body: String, + #[serde(default)] + evidence_refs: Vec, + freshness: MemorySummaryFreshness, + action: ProactiveSuggestionAction, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProactiveSuggestionAction { + decision: String, + reason_code: String, + reason: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryTaskArtifact { + task_run_id: String, + contract_schema: String, + generated_at: String, + scheduled_for: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + task_kind: String, + #[serde(default)] + outputs: Vec, + source_trace: MemorySummarySourceTrace, + #[serde(skip_serializing_if = "Option::is_none")] + execution_trace: Option, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryOutput { + output_id: String, + output_kind: String, + text: String, + #[serde(default)] + evidence_refs: Vec, + freshness: MemorySummaryFreshness, + action: ProactiveSuggestionAction, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryExecutionTrace { + trace_id: String, + trigger_kind: String, + status: String, + started_at: String, + completed_at: String, + output_ref: String, + #[serde(default)] + stages: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryTraceStage { + stage_name: String, + summary: String, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationFixture { + #[serde(default)] + proposals: Vec, + #[serde(default)] + executable_gaps: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationProposalFixture { + proposal_id: String, + proposal_kind: String, + #[serde(default)] + source_refs: Vec, + #[serde(default)] + expected_source_refs: Vec, + usefulness_score: f64, + min_usefulness_score: f64, + expected_review_action: ConsolidationReviewAction, + actual_review_action: ConsolidationReviewAction, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_count: usize, + #[serde(default)] + unsupported_claim_flags: Vec, + #[serde(default)] + diff: Value, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ConsolidationReviewAction { + Apply, + Discard, + Defer, +} + +#[derive(Clone, Debug, Deserialize)] +struct ConsolidationExecutableGap { + primitive: String, + follow_up_issue: String, + reason: String, + #[serde(default)] + blocks_fixture_pass: bool, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct CostReport { + #[serde(skip_serializing_if = "Option::is_none")] + currency: Option, + #[serde(skip_serializing_if = "Option::is_none")] + amount: Option, + #[serde(skip_serializing_if = "Option::is_none")] + input_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + output_tokens: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct OperatorDebugEvidence { + failure_mode: String, + #[serde(skip_serializing_if = "Option::is_none")] + trace_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + viewer_url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + admin_trace_bundle_url: Option, + root_cause: String, + steps_to_root_cause: u32, + raw_sql_needed: bool, + dropped_candidate_visibility: String, + trace_completeness: String, + repair_action_clarity: String, + #[serde(skip_serializing_if = "Option::is_none")] + trace_available: Option, + #[serde(skip_serializing_if = "Option::is_none")] + replay_command_available: Option, + #[serde(skip_serializing_if = "Option::is_none")] + replay_command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + replay_artifact: Option, + #[serde(default)] + viewer_panels: Vec, + #[serde(default)] + cli_steps: Vec, + #[serde(default)] + trace_evidence: Vec, + #[serde(default)] + ux_gaps: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct OperatorUxGap { + gap_id: String, + severity: String, + description: String, + follow_up_issue: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceExplainability { + #[serde(skip_serializing_if = "Option::is_none")] + trace_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + failure_stage: Option, + #[serde(skip_serializing_if = "Option::is_none")] + failure_reason: Option, + #[serde(default)] + stages: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceStageExplainability { + stage_name: String, + #[serde(default)] + kept_evidence: Vec, + #[serde(default)] + dropped_evidence: Vec, + #[serde(default)] + demoted_evidence: Vec, + #[serde(default)] + distractor_evidence: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + notes: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum TypedStatus { + Pass, + WrongResult, + LifecycleFail, + Incomplete, + Blocked, + NotEncoded, + UnsupportedClaim, +} + +#[derive(Debug, Deserialize, Serialize)] +struct RealWorldReport { + schema: String, + run_id: String, + generated_at: String, + runner_version: String, + corpus_profile: String, + adapter: AdapterReport, + #[serde(default)] + external_adapters: ExternalAdapterSection, + capture_integration: CaptureIntegrationReport, + summary: ReportSummary, + suites: Vec, + jobs: Vec, + unsupported_claims: Vec, + not_encoded_suites: Vec, + private_corpus_redaction: PrivateCorpusRedaction, + #[serde(default)] + evolution: EvolutionSummary, + #[serde(default)] + follow_ups: Vec, +} + +#[derive(Debug, Deserialize, Serialize)] +struct AdapterReport { + adapter_id: String, + name: String, + behavior: String, + storage: TypedStatus, + runtime: TypedStatus, + notes: String, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum AdapterCoverageStatus { + Real, + Mocked, + Unsupported, + Blocked, + Incomplete, + WrongResult, + LifecycleFail, + Pass, + NotEncoded, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ElfScenarioPosition { + Wins, + Ties, + Loses, + Untested, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ScenarioComparisonOutcome { + Win, + Tie, + Loss, + NotTested, + Blocked, + NonGoal, +} + +#[derive(Debug, Deserialize)] +struct ExternalAdapterManifest { + schema: String, + manifest_id: String, + docker_isolation: ExternalDockerIsolation, + #[serde(default)] + adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalAdapterSection { + schema: String, + manifest_id: String, + docker_isolation: ExternalDockerIsolation, + summary: ExternalAdapterSummary, + #[serde(default)] + adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalDockerIsolation { + default: bool, + compose_file: String, + runner: String, + artifact_dir: String, + host_global_installs_required: bool, + #[serde(default)] + notes: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ExternalAdapterReport { + adapter_id: String, + project: String, + adapter_kind: String, + evidence_class: String, + docker_default: bool, + host_global_installs_required: bool, + overall_status: AdapterCoverageStatus, + setup: AdapterExecutionEvidence, + run: AdapterExecutionEvidence, + result: AdapterExecutionEvidence, + #[serde(default)] + capabilities: Vec, + #[serde(default)] + suites: Vec, + #[serde(default)] + scenarios: Vec, + #[serde(default)] + evidence: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + execution_metadata: Option, + #[serde(default)] + notes: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + follow_up: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterExecutionEvidence { + status: AdapterCoverageStatus, + evidence: String, + #[serde(skip_serializing_if = "Option::is_none")] + command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + artifact: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterCapabilityCoverage { + capability: String, + status: AdapterCoverageStatus, + evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterSuiteCoverage { + suite_id: String, + status: AdapterCoverageStatus, + evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterScenarioJudgment { + scenario_id: String, + #[serde(skip_serializing_if = "Option::is_none")] + suite_id: Option, + status: AdapterCoverageStatus, + elf_position: ElfScenarioPosition, + #[serde(skip_serializing_if = "Option::is_none")] + comparison_outcome: Option, + evidence: String, + #[serde(skip_serializing_if = "Option::is_none")] + command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + artifact: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterEvidencePointer { + kind: String, + #[serde(rename = "ref")] + reference: String, + status: AdapterCoverageStatus, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterExecutionMetadata { + #[serde(default)] + sources: Vec, + setup_path: String, + runtime_boundary: String, + resource_expectation: String, + #[serde(default)] + retry_guidance: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + research_depth: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AdapterSource { + label: String, + url: String, + evidence: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ExternalAdapterSummary { + adapter_count: usize, + external_project_count: usize, + docker_default_count: usize, + host_global_install_required_count: usize, + fixture_backed_count: usize, + live_baseline_only_count: usize, + live_real_world_count: usize, + #[serde(default)] + research_gate_count: usize, + overall_status_counts: AdapterStatusCounts, + capability_status_counts: AdapterStatusCounts, + suite_status_counts: AdapterStatusCounts, + #[serde(default)] + scenario_status_counts: AdapterStatusCounts, + #[serde(default)] + scenario_position_counts: ScenarioPositionCounts, + #[serde(default)] + scenario_outcome_counts: ScenarioOutcomeCounts, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct AdapterStatusCounts { + real: usize, + mocked: usize, + unsupported: usize, + blocked: usize, + incomplete: usize, + wrong_result: usize, + lifecycle_fail: usize, + pass: usize, + not_encoded: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScenarioPositionCounts { + wins: usize, + ties: usize, + loses: usize, + untested: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScenarioOutcomeCounts { + win: usize, + tie: usize, + loss: usize, + not_tested: usize, + blocked: usize, + non_goal: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct CaptureIntegrationReport { + #[serde(default)] + real: Vec, + #[serde(default)] + fixture_backed: Vec, + #[serde(default)] + mocked: Vec, + #[serde(default)] + blocked: Vec, + #[serde(default)] + not_encoded: Vec, + #[serde(default)] + notes: Vec, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +struct ReportSummary { + job_count: usize, + encoded_suite_count: usize, + pass: usize, + wrong_result: usize, + lifecycle_fail: usize, + incomplete: usize, + blocked: usize, + not_encoded: usize, + unsupported_claim: usize, + unsupported_claim_count: usize, + wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available_count: usize, + #[serde(default)] + temporal_validity_not_encoded_count: usize, + #[serde(default)] + history_readback_encoded_count: usize, + expected_evidence_total: usize, + expected_evidence_matched: usize, + expected_evidence_recall: f64, + irrelevant_context_count: usize, + irrelevant_context_ratio: f64, + trace_explainability_count: usize, + wrong_result_stage_attribution_count: usize, + mean_score: f64, + mean_latency_ms: Option, + total_cost: Option, + #[serde(default)] + evidence_required_count: usize, + #[serde(default)] + evidence_covered_count: usize, + #[serde(default)] + evidence_coverage: f64, + #[serde(default)] + source_ref_required_count: usize, + #[serde(default)] + source_ref_covered_count: usize, + #[serde(default)] + source_ref_coverage: f64, + #[serde(default)] + quote_required_count: usize, + #[serde(default)] + quote_covered_count: usize, + #[serde(default)] + quote_coverage: f64, + #[serde(default)] + stale_retrieval_count: usize, + #[serde(default)] + scope_check_count: usize, + #[serde(default)] + scope_correct_count: usize, + #[serde(default)] + scope_correctness: f64, + #[serde(default)] + scope_violation_count: usize, + #[serde(default)] + redaction_leak_count: usize, + #[serde(default)] + qdrant_rebuild_case_count: usize, + #[serde(default)] + qdrant_rebuild_pass_count: usize, + #[serde(default)] + operator_debug_job_count: usize, + #[serde(default)] + raw_sql_needed_count: usize, + #[serde(default)] + trace_incomplete_count: usize, + #[serde(default)] + operator_ux_gap_count: usize, + #[serde(default)] + consolidation: ConsolidationSummaryReport, + #[serde(skip_serializing_if = "Option::is_none")] + memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] + scheduled_memory: Option, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +struct ConsolidationSummaryReport { + proposal_count: usize, + proposal_usefulness: Option, + lineage_completeness: Option, + review_action_correctness: Option, + source_mutation_count: usize, + proposal_unsupported_claim_count: usize, + executable_gap_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct MemorySummaryReport { + job_count: usize, + summary_count: usize, + entry_count: usize, + required_category_count: usize, + covered_required_category_count: usize, + missing_required_category_count: usize, + top_of_mind_count: usize, + background_count: usize, + stale_count: usize, + superseded_count: usize, + tombstone_count: usize, + derived_project_profile_count: usize, + source_ref_required_count: usize, + source_ref_entry_count: usize, + source_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + rationale_count: usize, + rationale_coverage: f64, + invalid_top_of_mind_count: usize, + untraced_entry_count: usize, + derived_with_source_or_unsupported_count: usize, + derived_missing_source_or_unsupported_count: usize, + unsupported_derived_entry_count: usize, + unsupported_current_entry_count: usize, + tombstone_ref_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ProactiveBriefSummaryReport { + job_count: usize, + brief_count: usize, + suggestion_count: usize, + required_suggestion_kind_count: usize, + covered_required_suggestion_kind_count: usize, + missing_required_suggestion_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_suggestion_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + recommended_count: usize, + deferred_count: usize, + rejected_count: usize, + current_suggestion_count: usize, + non_current_suggestion_count: usize, + stale_warning_count: usize, + invalid_current_suggestion_count: usize, + untraced_suggestion_count: usize, + unsupported_current_suggestion_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScheduledMemorySummaryReport { + job_count: usize, + task_run_count: usize, + output_count: usize, + required_task_kind_count: usize, + covered_required_task_kind_count: usize, + missing_required_task_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_output_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + trace_required_count: usize, + trace_complete_count: usize, + trace_coverage: f64, + source_mutation_count: usize, + current_output_count: usize, + non_current_output_count: usize, + invalid_current_output_count: usize, + untraced_output_count: usize, + unsupported_current_output_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct KnowledgeSummary { + job_count: usize, + page_count: usize, + section_count: usize, + backlink_count: usize, + pages_with_backlinks: usize, + citation_coverage: f64, + stale_claim_detection: f64, + rebuild_determinism: f64, + backlink_coverage: f64, + page_usefulness: f64, + unsupported_summary_count: usize, + untraced_section_count: usize, + allowed_variance_count: usize, +} + +#[derive(Debug, Deserialize, Serialize)] +struct SuiteReport { + suite_id: String, + status: TypedStatus, + encoded_job_count: usize, + score_mean: Option, + unsupported_claim_count: usize, + wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available_count: usize, + #[serde(default)] + temporal_validity_not_encoded_count: usize, + #[serde(default)] + history_readback_encoded_count: usize, + expected_evidence_recall: Option, + irrelevant_context_ratio: Option, + trace_explainability_count: usize, + reason: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct JobReport { + suite_id: String, + job_id: String, + title: String, + status: TypedStatus, + answer_type: String, + requires_caveat: bool, + requires_refusal: bool, + can_answer_unknown: bool, + normalized_score: f64, + hard_fail_hits: Vec, + expected_evidence: Vec, + produced_answer: String, + produced_evidence: Vec, + unsupported_claim_count: usize, + wrong_result_count: usize, + #[serde(default)] + stale_answer_count: usize, + #[serde(default)] + conflict_detection_count: usize, + #[serde(default)] + update_rationale_available: bool, + #[serde(default)] + temporal_validity_not_encoded: bool, + #[serde(default)] + history_readback_encoded: bool, + retrieval_quality: RetrievalQualityReport, + latency_ms: Option, + cost: Option, + trace_explainability: Option, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, + #[serde(skip_serializing_if = "Option::is_none")] + memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] + scheduled_memory: Option, + trap_ids_used: Vec, + dimension_scores: Vec, + reason: String, + #[serde(default)] + evidence_required_count: usize, + #[serde(default)] + evidence_covered_count: usize, + #[serde(default)] + source_ref_required_count: usize, + #[serde(default)] + source_ref_covered_count: usize, + #[serde(default)] + quote_required_count: usize, + #[serde(default)] + quote_covered_count: usize, + #[serde(default)] + stale_retrieval_count: usize, + #[serde(default)] + scope_check_count: usize, + #[serde(default)] + scope_correct_count: usize, + #[serde(default)] + scope_violation_count: usize, + #[serde(default)] + redaction_leak_count: usize, + #[serde(default)] + qdrant_rebuild_case: bool, + #[serde(skip_serializing_if = "Option::is_none")] + operator_debug: Option, + #[serde(skip_serializing_if = "Option::is_none")] + evolution: Option, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct ExpectedEvidenceReport { + evidence_id: String, + claim_id: String, + requirement: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct DimensionScoreReport { + dimension: String, + score: f64, + max_points: f64, + weight: f64, +} + +#[derive(Debug, Deserialize, Serialize)] +struct RetrievalQualityReport { + expected_evidence_total: usize, + expected_evidence_matched: usize, + expected_evidence_recall: f64, + produced_evidence_total: usize, + irrelevant_context_count: usize, + irrelevant_context_ratio: f64, + trap_context_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationJobReport { + proposal_count: usize, + proposal_usefulness: Option, + lineage_completeness: Option, + review_action_correctness: Option, + source_mutation_count: usize, + proposal_unsupported_claim_count: usize, + executable_gaps: Vec, + proposals: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationProposalReport { + proposal_id: String, + proposal_kind: String, + usefulness_score: f64, + min_usefulness_score: f64, + lineage_completeness: f64, + expected_review_action: ConsolidationReviewAction, + actual_review_action: ConsolidationReviewAction, + review_action_correct: bool, + source_mutation_count: usize, + unsupported_claim_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ConsolidationExecutableGapReport { + primitive: String, + follow_up_issue: String, + reason: String, + blocks_fixture_pass: bool, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct UnsupportedClaimReport { + suite_id: String, + job_id: String, + claim_id: Option, + claim_text: String, + reason: String, + evidence_ids: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct KnowledgeJobMetrics { + page_count: usize, + section_count: usize, + traced_section_count: usize, + flagged_unsupported_section_count: usize, + untraced_section_count: usize, + unsupported_summary_count: usize, + backlink_count: usize, + pages_with_backlinks: usize, + stale_trap_count: usize, + stale_traps_detected: usize, + rebuild_page_count: usize, + deterministic_rebuild_count: usize, + rebuild_failure_count: usize, + allowed_variance_count: usize, + citation_coverage: f64, + stale_claim_detection: f64, + rebuild_determinism: f64, + backlink_coverage: f64, + page_usefulness: f64, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct MemorySummaryJobMetrics { + summary_count: usize, + entry_count: usize, + required_category_count: usize, + covered_required_category_count: usize, + missing_required_category_count: usize, + top_of_mind_count: usize, + background_count: usize, + stale_count: usize, + superseded_count: usize, + tombstone_count: usize, + derived_project_profile_count: usize, + source_ref_required_count: usize, + source_ref_entry_count: usize, + source_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + rationale_count: usize, + rationale_coverage: f64, + invalid_top_of_mind_count: usize, + untraced_entry_count: usize, + derived_with_source_or_unsupported_count: usize, + derived_missing_source_or_unsupported_count: usize, + unsupported_derived_entry_count: usize, + unsupported_current_entry_count: usize, + tombstone_ref_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ProactiveBriefJobMetrics { + brief_count: usize, + suggestion_count: usize, + required_suggestion_kind_count: usize, + covered_required_suggestion_kind_count: usize, + missing_required_suggestion_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_suggestion_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + recommended_count: usize, + deferred_count: usize, + rejected_count: usize, + current_suggestion_count: usize, + non_current_suggestion_count: usize, + stale_warning_count: usize, + invalid_current_suggestion_count: usize, + untraced_suggestion_count: usize, + unsupported_current_suggestion_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScheduledMemoryJobMetrics { + task_run_count: usize, + output_count: usize, + required_task_kind_count: usize, + covered_required_task_kind_count: usize, + missing_required_task_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_output_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + trace_required_count: usize, + trace_complete_count: usize, + trace_coverage: f64, + source_mutation_count: usize, + current_output_count: usize, + non_current_output_count: usize, + invalid_current_output_count: usize, + untraced_output_count: usize, + unsupported_current_output_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct EvolutionSummary { + stale_answer_count: usize, + conflict_detection_count: usize, + update_rationale_available_count: usize, + temporal_validity_not_encoded_count: usize, + history_readback_encoded_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct EvolutionJobReport { + current_evidence: Vec, + historical_evidence: Vec, + tombstone_evidence: Vec, + invalidation_evidence: Vec, + selected_current_evidence: Vec, + selected_historical_evidence: Vec, + selected_rationale_evidence: Vec, + selected_tombstone_evidence: Vec, + selected_invalidation_evidence: Vec, + conflict_candidate_evidence: Vec, + retrieved_but_dropped_evidence: Vec, + selected_but_not_narrated_evidence: Vec, + stale_trap_ids_used: Vec, + stale_answer_count: usize, + conflict_count: usize, + conflict_detection_count: usize, + update_rationale_available: bool, + temporal_validity_required: bool, + temporal_validity_encoded: bool, + temporal_validity_not_encoded: bool, + history_readback_encoded: bool, + history_event_types: Vec, + history_requires_note_version_links: bool, + #[serde(skip_serializing_if = "Option::is_none")] + follow_up: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct FollowUpReport { + suite_id: String, + job_id: String, + title: String, + reason: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PrivateCorpusRedaction { + policy: String, + private_fixture_count: usize, +} + +#[derive(Debug)] +struct JobScoring { + status: TypedStatus, + normalized_score: f64, + hard_fail_hits: Vec, + unsupported_claims: Vec, + wrong_result_count: usize, + knowledge: Option, + trap_ids_used: Vec, + dimension_scores: Vec, + reason: String, + evolution: Option, + consolidation: Option, + memory_summary: Option, + proactive_brief: Option, + scheduled_memory: Option, +} + +#[derive(Debug, Default)] +struct FailureCounts { + missing_claims: usize, + forbidden_claims: usize, + missing_evidence: usize, + trap_uses: usize, + unsupported_claims: usize, + operator_debug_missing: usize, + operator_debug_raw_sql: usize, + operator_debug_trace_gaps: usize, + operator_debug_repair_unclear: usize, + stale_answers: usize, + conflict_detection_missing: usize, + update_rationale_missing: usize, + latency_violations: usize, + proposal_usefulness_failures: usize, + lineage_failures: usize, + review_action_failures: usize, + source_mutations: usize, + blocking_executable_gaps: usize, + memory_summary_invalid_current_entries: usize, + memory_summary_untraced_entries: usize, + memory_summary_missing_freshness: usize, + memory_summary_missing_rationale: usize, + memory_summary_missing_categories: usize, + memory_summary_unsupported_current_entries: usize, + proactive_brief_invalid_current_suggestions: usize, + proactive_brief_untraced_suggestions: usize, + proactive_brief_missing_freshness: usize, + proactive_brief_missing_action_rationale: usize, + proactive_brief_missing_kinds: usize, + proactive_brief_unsupported_current_suggestions: usize, + proactive_brief_tombstone_violations: usize, + scheduled_memory_invalid_current_outputs: usize, + scheduled_memory_untraced_outputs: usize, + scheduled_memory_missing_freshness: usize, + scheduled_memory_missing_action_rationale: usize, + scheduled_memory_missing_task_kinds: usize, + scheduled_memory_unsupported_current_outputs: usize, + scheduled_memory_tombstone_violations: usize, + scheduled_memory_missing_trace: usize, + untraced_page_sections: usize, + missed_stale_findings: usize, + rebuild_failures: usize, + page_usefulness_failures: usize, +} + +#[derive(Debug, Default)] +struct JobMetrics { + evidence_required_count: usize, + evidence_covered_count: usize, + source_ref_required_count: usize, + source_ref_covered_count: usize, + quote_required_count: usize, + quote_covered_count: usize, + stale_retrieval_count: usize, + scope_check_count: usize, + scope_correct_count: usize, + scope_violation_count: usize, + redaction_leak_count: usize, + qdrant_rebuild_case: bool, +} + +fn main() -> Result<()> { + color_eyre::install()?; + + match Args::parse().command { + Command::Run(args) => run_command(args), + Command::Publish(args) => publish_command(args), + } +} + +fn run_command(args: RunArgs) -> Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let report = build_report(&jobs, &args)?; + let json = serde_json::to_string_pretty(&report)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + +fn publish_command(args: PublishArgs) -> Result<()> { + let raw = fs::read_to_string(&args.report)?; + let report = serde_json::from_str::(&raw)?; + let markdown = render_markdown(&report, &args.report); + + write_or_print(args.out.as_deref(), markdown.as_str()) +} + +fn load_jobs(path: &Path) -> Result> { + let paths = fixture_paths(path)?; + let mut jobs = Vec::with_capacity(paths.len()); + + for fixture in paths { + let raw = fs::read_to_string(&fixture)?; + let job = serde_json::from_str::(&raw) + .map_err(|err| eyre::eyre!("Failed to parse {}: {err}", fixture.display()))?; + + validate_job(&job, &fixture)?; + + jobs.push(job); + } + + Ok(jobs) +} + +fn fixture_paths(path: &Path) -> Result> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + if !path.is_dir() { + return Err(eyre::eyre!("Fixture path does not exist: {}", path.display())); + } + + let mut paths = Vec::new(); + + collect_fixture_paths(path, &mut paths)?; + + paths.sort(); + + if paths.is_empty() { + return Err(eyre::eyre!("No JSON fixtures found in {}.", path.display())); + } + + Ok(paths) +} + +fn collect_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + for entry in fs::read_dir(path)? { + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_dir() { + collect_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.schema != JOB_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {JOB_SCHEMA}.", + path.display(), + job.schema + )); + } + + validate_job_identity(job, path)?; + + if !SUITES.contains(&job.suite.as_str()) { + return Err(eyre::eyre!("{} uses unknown suite {}.", path.display(), job.suite)); + } + + validate_corpus_items(job, path)?; + validate_timeline(job, path)?; + validate_prompt(job, path)?; + validate_expected_answer(job, path)?; + validate_required_evidence(job, path)?; + validate_consolidation_fixture(job, path)?; + validate_adapter_response(job, path)?; + validate_scoring_rubric(job, path)?; + validate_allowed_uncertainty(job, path)?; + validate_operator_debug(job, path)?; + validate_job_encoding(job, path)?; + validate_memory_evolution(job, path)?; + validate_memory_summary_expectation(job, path)?; + validate_proactive_brief_expectation(job, path)?; + validate_scheduled_memory_expectation(job, path)?; + validate_trace_explainability(job, path)?; + + Ok(()) +} + +fn validate_job_identity(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.job_id.trim().is_empty() + || job.suite.trim().is_empty() + || job.title.trim().is_empty() + || job.corpus.corpus_id.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete job identity.", path.display())); + } + + for tag in &job.tags { + if tag.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty tag.", path.display())); + } + } + + if let Some(adapter_response) = &job.corpus.adapter_response + && adapter_response.adapter_id.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!("{} has an empty adapter_response adapter_id.", path.display())); + } + + Ok(()) +} + +fn validate_corpus_items(job: &RealWorldJob, path: &Path) -> Result<()> { + let mut evidence_ids = BTreeSet::new(); + + for item in &job.corpus.items { + if item.evidence_id.trim().is_empty() { + return Err(eyre::eyre!( + "{} has a corpus item with an empty evidence_id.", + path.display() + )); + } + if item.kind.trim().is_empty() { + return Err(eyre::eyre!( + "{} has corpus item {} with an empty kind.", + path.display(), + item.evidence_id + )); + } + if item.text.is_none() && item.local_ref.is_none() { + return Err(eyre::eyre!( + "{} corpus item {} must provide text or local_ref.", + path.display(), + item.evidence_id + )); + } + if !item.source_ref.is_object() { + return Err(eyre::eyre!( + "{} corpus item {} must provide an object source_ref.", + path.display(), + item.evidence_id + )); + } + + if let Some(created_at) = &item.created_at { + validate_optional_rfc3339(created_at, path, item.evidence_id.as_str())?; + } + + evidence_ids.insert(item.evidence_id.clone()); + } + for trap in &job.negative_traps { + if trap.trap_id.trim().is_empty() || trap.trap_type.trim().is_empty() { + return Err(eyre::eyre!("{} has an incomplete negative trap.", path.display())); + } + + for evidence_id in &trap.evidence_ids { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + +fn validate_timeline(job: &RealWorldJob, path: &Path) -> Result<()> { + let evidence_ids = corpus_evidence_ids(job); + + for event in &job.timeline { + if event.event_id.trim().is_empty() + || event.actor.trim().is_empty() + || event.action.trim().is_empty() + || event.summary.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete timeline event.", path.display())); + } + + validate_required_rfc3339(event.ts.as_str(), path, event.event_id.as_str())?; + + for evidence_id in &event.evidence_ids { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + +fn validate_prompt(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.prompt.role.trim().is_empty() + || job.prompt.content.trim().is_empty() + || job.prompt.job_mode.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete prompt.", path.display())); + } + + for constraint in &job.prompt.constraints { + if constraint.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty prompt constraint.", path.display())); + } + } + + Ok(()) +} + +fn validate_expected_answer(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.expected_answer.answer_type.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty expected answer type.", path.display())); + } + + for claim in &job.expected_answer.must_include { + if claim.text().trim().is_empty() { + return Err(eyre::eyre!("{} has an empty expected claim.", path.display())); + } + } + for claim in &job.expected_answer.must_not_include { + if claim.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty forbidden claim.", path.display())); + } + } + for phrase in &job.expected_answer.accepted_alternates { + if phrase.is_null() { + return Err(eyre::eyre!("{} has a null accepted alternate.", path.display())); + } + } + + Ok(()) +} + +fn validate_required_evidence(job: &RealWorldJob, path: &Path) -> Result<()> { + let evidence_ids = corpus_evidence_ids(job); + let corpus_text = corpus_text_by_id(job); + + for evidence in &job.required_evidence { + if evidence.claim_id.trim().is_empty() || evidence.requirement.trim().is_empty() { + return Err(eyre::eyre!("{} has incomplete required evidence.", path.display())); + } + + ensure_known_evidence(path, &evidence_ids, evidence.evidence_id.as_str())?; + + if evidence.quote.is_none() && evidence.selector.is_none() { + return Err(eyre::eyre!( + "{} required evidence {} must provide quote or selector.", + path.display(), + evidence.evidence_id + )); + } + + if let Some(quote) = &evidence.quote + && let Some(text) = corpus_text.get(evidence.evidence_id.as_str()) + && !text.contains(quote) + { + return Err(eyre::eyre!( + "{} required evidence quote for {} is not present in corpus text.", + path.display(), + evidence.evidence_id + )); + } + } + for (claim_id, link) in &job.expected_answer.evidence_links { + if claim_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty evidence link claim id.", path.display())); + } + + for evidence_id in link.ids() { + ensure_known_evidence(path, &evidence_ids, evidence_id.as_str())?; + } + } + + Ok(()) +} + +fn validate_consolidation_fixture(job: &RealWorldJob, path: &Path) -> Result<()> { + let consolidation = + job.corpus.adapter_response.as_ref().and_then(|response| response.consolidation.as_ref()); + + if job.suite == "consolidation" && consolidation.is_none() && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} consolidation jobs must provide adapter_response.consolidation.", + path.display() + )); + } + + let Some(consolidation) = consolidation else { + return Ok(()); + }; + + if consolidation.proposals.is_empty() && consolidation.executable_gaps.is_empty() { + return Err(eyre::eyre!( + "{} consolidation fixture must provide proposals or executable_gaps.", + path.display() + )); + } + + for proposal in &consolidation.proposals { + validate_consolidation_proposal(proposal, path)?; + } + for gap in &consolidation.executable_gaps { + if gap.primitive.trim().is_empty() + || gap.follow_up_issue.trim().is_empty() + || gap.reason.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete consolidation executable gap.", + path.display() + )); + } + } + + Ok(()) +} + +fn validate_consolidation_proposal( + proposal: &ConsolidationProposalFixture, + path: &Path, +) -> Result<()> { + if proposal.proposal_id.trim().is_empty() + || proposal.proposal_kind.trim().is_empty() + || proposal.source_refs.is_empty() + || proposal.expected_source_refs.is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete consolidation proposal fixture.", + path.display() + )); + } + if !proposal.usefulness_score.is_finite() + || !proposal.min_usefulness_score.is_finite() + || !(0.0..=1.0).contains(&proposal.usefulness_score) + || !(0.0..=1.0).contains(&proposal.min_usefulness_score) + { + return Err(eyre::eyre!( + "{} has invalid consolidation proposal usefulness scores.", + path.display() + )); + } + if !proposal.diff.is_null() && !proposal.diff.is_object() { + return Err(eyre::eyre!( + "{} consolidation proposal diff must be a JSON object when present.", + path.display() + )); + } + if proposal.unsupported_claim_flags.iter().any(|flag| !flag.is_object()) { + return Err(eyre::eyre!( + "{} consolidation unsupported-claim flags must be JSON objects.", + path.display() + )); + } + + Ok(()) +} + +fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(adapter_response) = &job.corpus.adapter_response else { + return Ok(()); + }; + let evidence_ids = corpus_evidence_ids(job); + let event_ids = timeline_event_ids(job); + + for page in &adapter_response.answer.pages { + validate_page_artifact(page, path, &evidence_ids, &event_ids)?; + } + for summary in &adapter_response.answer.memory_summaries { + validate_memory_summary_artifact(summary, path, &evidence_ids)?; + } + for brief in &adapter_response.answer.proactive_briefs { + validate_proactive_brief_artifact(brief, path, &evidence_ids)?; + } + for task in &adapter_response.answer.scheduled_tasks { + validate_scheduled_memory_artifact(task, path, &evidence_ids)?; + } + + if job.suite == "memory_summary" + && adapter_response.answer.memory_summaries.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} memory_summary jobs must provide adapter_response.answer.memory_summaries.", + path.display() + )); + } + if job.suite == "proactive_brief" + && adapter_response.answer.proactive_briefs.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} proactive_brief jobs must provide adapter_response.answer.proactive_briefs.", + path.display() + )); + } + if job.suite == "scheduled_memory" + && adapter_response.answer.scheduled_tasks.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} scheduled_memory jobs must provide adapter_response.answer.scheduled_tasks.", + path.display() + )); + } + + Ok(()) +} + +fn validate_page_artifact( + page: &DerivedPageArtifact, + path: &Path, + evidence_ids: &BTreeSet, + event_ids: &BTreeSet, +) -> Result<()> { + if page.page_id.trim().is_empty() + || page.page_type.trim().is_empty() + || page.title.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete derived page.", path.display())); + } + + for section in &page.sections { + if section.section_id.trim().is_empty() + || section.heading.trim().is_empty() + || section.role.trim().is_empty() + || section.content.trim().is_empty() + { + return Err(eyre::eyre!( + "{} page {} has an incomplete section.", + path.display(), + page.page_id + )); + } + + for evidence_id in §ion.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for event_id in §ion.timeline_event_ids { + ensure_known_event(path, event_ids, event_id)?; + } + } + for backlink in &page.backlinks { + if backlink.trim().is_empty() { + return Err(eyre::eyre!( + "{} page {} has an empty backlink.", + path.display(), + page.page_id + )); + } + } + for finding in &page.lint_findings { + if finding.finding_id.trim().is_empty() + || finding.finding_type.trim().is_empty() + || finding.severity.trim().is_empty() + || finding.text.trim().is_empty() + { + return Err(eyre::eyre!( + "{} page {} has an incomplete lint finding.", + path.display(), + page.page_id + )); + } + + for evidence_id in &finding.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + } + + if let Some(rebuild) = &page.rebuild + && (rebuild.first_hash.trim().is_empty() || rebuild.second_hash.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} page {} has an incomplete rebuild record.", + path.display(), + page.page_id + )); + } + + Ok(()) +} + +fn validate_memory_summary_artifact( + summary: &MemorySummaryArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if summary.summary_id.trim().is_empty() + || summary.contract_schema != "elf.memory_summary/v1" + || summary.generated_at.trim().is_empty() + || summary.tenant_id.trim().is_empty() + || summary.project_id.trim().is_empty() + || summary.agent_id.trim().is_empty() + || summary.read_profile.trim().is_empty() + || summary.entries.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete memory summary.", path.display())); + } + + validate_optional_rfc3339(&summary.generated_at, path, summary.summary_id.as_str())?; + + for entry in &summary.entries { + validate_memory_summary_entry(entry, path, evidence_ids)?; + } + + validate_memory_summary_source_trace(&summary.source_trace, path, evidence_ids)?; + + Ok(()) +} + +fn validate_memory_summary_entry( + entry: &MemorySummaryEntry, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if entry.entry_id.trim().is_empty() + || entry.category.trim().is_empty() + || entry.text.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete memory summary entry.", path.display())); + } + if !is_memory_summary_category(entry.category.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary category {}.", + path.display(), + entry.category + )); + } + if !is_memory_summary_freshness_status(entry.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary freshness status {}.", + path.display(), + entry.freshness.status + )); + } + if !is_memory_summary_rationale_decision(entry.rationale.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown memory summary rationale decision {}.", + path.display(), + entry.rationale.decision + )); + } + + for evidence_id in &entry.source_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &entry.freshness.tombstone_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &entry.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} memory summary unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + entry.freshness.observed_at.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + entry.freshness.valid_from.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + entry.freshness.valid_to.as_deref(), + entry.entry_id.as_str(), + )?; + validate_optional_summary_time( + path, + entry.freshness.last_confirmed_at.as_deref(), + entry.entry_id.as_str(), + )?; + + Ok(()) +} + +fn validate_memory_summary_source_trace( + trace: &MemorySummarySourceTrace, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + for item in trace + .selected_source_refs + .iter() + .chain(trace.dropped_source_refs.iter()) + .chain(trace.stale_source_refs.iter()) + .chain(trace.superseded_source_refs.iter()) + .chain(trace.tombstone_source_refs.iter()) + { + if item.evidence_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty memory summary trace item.", path.display())); + } + + ensure_known_evidence(path, evidence_ids, item.evidence_id.as_str())?; + } + for flag in &trace.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} memory summary source-trace unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + Ok(()) +} + +fn validate_proactive_brief_artifact( + brief: &ProactiveBriefArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if brief.brief_id.trim().is_empty() + || brief.contract_schema != "elf.proactive_project_brief/v1" + || brief.generated_at.trim().is_empty() + || brief.tenant_id.trim().is_empty() + || brief.project_id.trim().is_empty() + || brief.agent_id.trim().is_empty() + || brief.read_profile.trim().is_empty() + || brief.brief_kind.trim().is_empty() + || brief.suggestions.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete proactive brief.", path.display())); + } + + validate_optional_rfc3339(&brief.generated_at, path, brief.brief_id.as_str())?; + + for suggestion in &brief.suggestions { + validate_proactive_suggestion(suggestion, path, evidence_ids)?; + } + + validate_memory_summary_source_trace(&brief.source_trace, path, evidence_ids)?; + + Ok(()) +} + +fn validate_proactive_suggestion( + suggestion: &ProactiveSuggestion, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if suggestion.suggestion_id.trim().is_empty() + || suggestion.suggestion_kind.trim().is_empty() + || suggestion.title.trim().is_empty() + || suggestion.body.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete proactive suggestion.", path.display())); + } + if !is_proactive_suggestion_kind(suggestion.suggestion_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive suggestion kind {}.", + path.display(), + suggestion.suggestion_kind + )); + } + if !is_memory_summary_freshness_status(suggestion.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive freshness status {}.", + path.display(), + suggestion.freshness.status + )); + } + if !is_proactive_action_decision(suggestion.action.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown proactive action decision {}.", + path.display(), + suggestion.action.decision + )); + } + if suggestion.action.reason_code.trim().is_empty() || suggestion.action.reason.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete proactive action rationale.", path.display())); + } + + for evidence_id in &suggestion.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &suggestion.freshness.tombstone_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &suggestion.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} proactive unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + suggestion.freshness.observed_at.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.valid_from.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.valid_to.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + validate_optional_summary_time( + path, + suggestion.freshness.last_confirmed_at.as_deref(), + suggestion.suggestion_id.as_str(), + )?; + + Ok(()) +} + +fn validate_scheduled_memory_artifact( + task: &ScheduledMemoryTaskArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if task.task_run_id.trim().is_empty() + || task.contract_schema != "elf.scheduled_memory_task/v1" + || task.generated_at.trim().is_empty() + || task.scheduled_for.trim().is_empty() + || task.tenant_id.trim().is_empty() + || task.project_id.trim().is_empty() + || task.agent_id.trim().is_empty() + || task.read_profile.trim().is_empty() + || task.task_kind.trim().is_empty() + || task.outputs.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete scheduled memory task.", path.display())); + } + if !is_scheduled_task_kind(task.task_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled task kind {}.", + path.display(), + task.task_kind + )); + } + + validate_optional_rfc3339(&task.generated_at, path, task.task_run_id.as_str())?; + validate_optional_rfc3339(&task.scheduled_for, path, task.task_run_id.as_str())?; + + for output in &task.outputs { + validate_scheduled_memory_output(output, path, evidence_ids)?; + } + for mutation in &task.source_mutations { + if !mutation.is_object() { + return Err(eyre::eyre!( + "{} scheduled memory source mutations must be JSON objects.", + path.display() + )); + } + } + for flag in &task.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} scheduled memory unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_memory_summary_source_trace(&task.source_trace, path, evidence_ids)?; + + if let Some(trace) = &task.execution_trace { + validate_scheduled_memory_trace(trace, path, evidence_ids)?; + } + + Ok(()) +} + +fn validate_scheduled_memory_output( + output: &ScheduledMemoryOutput, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if output.output_id.trim().is_empty() + || output.output_kind.trim().is_empty() + || output.text.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete scheduled memory output.", path.display())); + } + if !is_scheduled_task_kind(output.output_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output kind {}.", + path.display(), + output.output_kind + )); + } + if !is_memory_summary_freshness_status(output.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output freshness status {}.", + path.display(), + output.freshness.status + )); + } + if !is_proactive_action_decision(output.action.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output action decision {}.", + path.display(), + output.action.decision + )); + } + if output.action.reason_code.trim().is_empty() || output.action.reason.trim().is_empty() { + return Err(eyre::eyre!( + "{} has incomplete scheduled output action rationale.", + path.display() + )); + } + + for evidence_id in &output.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &output.freshness.tombstone_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &output.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} scheduled output unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + output.freshness.observed_at.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.valid_from.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.valid_to.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.last_confirmed_at.as_deref(), + output.output_id.as_str(), + )?; + + Ok(()) +} + +fn validate_scheduled_memory_trace( + trace: &ScheduledMemoryExecutionTrace, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if trace.trace_id.trim().is_empty() + || trace.trigger_kind.trim().is_empty() + || trace.status.trim().is_empty() + || trace.started_at.trim().is_empty() + || trace.completed_at.trim().is_empty() + || trace.output_ref.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete scheduled memory execution trace.", + path.display() + )); + } + + validate_optional_rfc3339(&trace.started_at, path, trace.trace_id.as_str())?; + validate_optional_rfc3339(&trace.completed_at, path, trace.trace_id.as_str())?; + + for stage in &trace.stages { + if stage.stage_name.trim().is_empty() || stage.summary.trim().is_empty() { + return Err(eyre::eyre!( + "{} has an incomplete scheduled memory trace stage.", + path.display() + )); + } + + for evidence_id in &stage.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + +fn validate_optional_summary_time(path: &Path, value: Option<&str>, id: &str) -> Result<()> { + if let Some(value) = value { + validate_optional_rfc3339(value, path, id)?; + } + + Ok(()) +} + +fn is_memory_summary_category(category: &str) -> bool { + matches!( + category, + "top_of_mind" + | "background" + | "stale" | "superseded" + | "tombstone" + | "derived_project_profile" + ) +} + +fn is_memory_summary_freshness_status(status: &str) -> bool { + matches!( + status, + "current" + | "background" + | "historical" + | "stale" | "superseded" + | "tombstoned" + | "unsupported" + ) +} + +fn is_memory_summary_rationale_decision(decision: &str) -> bool { + matches!(decision, "included" | "downgraded" | "excluded") +} + +fn is_proactive_suggestion_kind(kind: &str) -> bool { + matches!( + kind, + "daily_project_brief" + | "resume_work" + | "stale_decision_audit" + | "stale_plan_preference_warning" + | "private_corpus_refresh" + ) +} + +fn is_scheduled_task_kind(kind: &str) -> bool { + matches!( + kind, + "weekly_project_status_summary" + | "stale_preference_plan_audit" + | "stale_decision_audit" + | "knowledge_page_refresh_suggestion" + | "private_provider_scheduler" + ) +} + +fn is_proactive_action_decision(decision: &str) -> bool { + matches!(decision, "recommend" | "defer" | "reject") +} + +fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { + if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) { + return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); + } + if job.scoring_rubric.dimensions.is_empty() { + return Err(eyre::eyre!("{} has no scoring dimensions.", path.display())); + } + + for (dimension_id, dimension) in &job.scoring_rubric.dimensions { + if dimension_id.trim().is_empty() + || !dimension.weight.is_finite() + || !dimension.max_points.is_finite() + || dimension.weight <= 0.0 + || dimension.max_points <= 0.0 + || dimension.criteria.is_null() + { + return Err(eyre::eyre!( + "{} has invalid scoring dimension {}.", + path.display(), + dimension_id + )); + } + } + for rule in &job.scoring_rubric.hard_fail_rules { + if rule.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty hard fail rule.", path.display())); + } + } + + Ok(()) +} + +fn validate_allowed_uncertainty(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.allowed_uncertainty.fallback_action.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty fallback action.", path.display())); + } + if job.allowed_uncertainty.can_answer_unknown + && job.allowed_uncertainty.acceptable_phrases.is_empty() + { + return Err(eyre::eyre!( + "{} allows unknown answers but defines no acceptable uncertainty phrase.", + path.display() + )); + } + + for phrase in &job.allowed_uncertainty.acceptable_phrases { + if phrase.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty uncertainty phrase.", path.display())); + } + } + + Ok(()) +} + +fn validate_operator_debug(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(debug) = &job.operator_debug else { + if job.suite == "operator_debugging_ux" { + return Err(eyre::eyre!( + "{} operator_debugging_ux job must include operator_debug.", + path.display() + )); + } + + return Ok(()); + }; + + if debug.failure_mode.trim().is_empty() + || debug.root_cause.trim().is_empty() + || debug.dropped_candidate_visibility.trim().is_empty() + || debug.trace_completeness.trim().is_empty() + || debug.repair_action_clarity.trim().is_empty() + || debug.steps_to_root_cause == 0 + { + return Err(eyre::eyre!("{} has incomplete operator_debug evidence.", path.display())); + } + + validate_optional_debug_field(path, debug.trace_id.as_deref(), "trace_id")?; + validate_optional_debug_field(path, debug.viewer_url.as_deref(), "viewer_url")?; + validate_optional_debug_field( + path, + debug.admin_trace_bundle_url.as_deref(), + "admin_trace_bundle_url", + )?; + validate_optional_debug_field(path, debug.replay_command.as_deref(), "replay_command")?; + validate_optional_debug_field(path, debug.replay_artifact.as_deref(), "replay_artifact")?; + validate_non_empty_debug_list(path, &debug.viewer_panels, "viewer_panels")?; + validate_non_empty_debug_list(path, &debug.cli_steps, "cli_steps")?; + validate_non_empty_debug_list(path, &debug.trace_evidence, "trace_evidence")?; + + for gap in &debug.ux_gaps { + if gap.gap_id.trim().is_empty() + || gap.severity.trim().is_empty() + || gap.description.trim().is_empty() + || gap.follow_up_issue.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete operator_debug ux_gaps.", path.display())); + } + } + + Ok(()) +} + +fn validate_job_encoding(job: &RealWorldJob, path: &Path) -> Result<()> { + if let Some(status) = job.encoding.status { + if !matches!( + status, + TypedStatus::NotEncoded | TypedStatus::Blocked | TypedStatus::Incomplete + ) { + return Err(eyre::eyre!( + "{} job {} uses encoding.status {}; only not_encoded, blocked, or incomplete are allowed.", + path.display(), + job.job_id, + status_str(status) + )); + } + if job.encoding.reason.as_deref().is_none_or(|reason| reason.trim().is_empty()) { + return Err(eyre::eyre!( + "{} job {} declares encoding.status but no reason.", + path.display(), + job.job_id + )); + } + } + if let Some(follow_up) = &job.encoding.follow_up + && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} job {} has an incomplete encoding follow-up.", + path.display(), + job.job_id + )); + } + + Ok(()) +} + +fn validate_memory_evolution(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(evolution) = &job.memory_evolution else { + return Ok(()); + }; + let evidence_ids = corpus_evidence_ids(job); + let trap_ids = + job.negative_traps.iter().map(|trap| trap.trap_id.as_str()).collect::>(); + + for evidence_id in evolution + .current_evidence_ids + .iter() + .chain(evolution.historical_evidence_ids.iter()) + .chain(evolution.tombstone_evidence_ids.iter()) + .chain(evolution.invalidation_evidence_ids.iter()) + { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + for trap_id in &evolution.stale_trap_ids { + if !trap_ids.contains(trap_id.as_str()) { + return Err(eyre::eyre!( + "{} job {} references unknown stale trap id {}.", + path.display(), + job.job_id, + trap_id + )); + } + } + for conflict in &evolution.conflicts { + validate_evolution_conflict(path, &evidence_ids, conflict)?; + } + + if let Some(rationale) = &evolution.update_rationale { + validate_update_rationale(path, &evidence_ids, rationale)?; + } + if let Some(temporal) = &evolution.temporal_validity { + validate_temporal_validity(job, path, temporal)?; + } + + Ok(()) +} + +fn validate_memory_summary_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(summary) = &job.memory_summary else { + if job.suite == "memory_summary" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} memory_summary jobs must provide memory_summary expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for category in &summary.required_categories { + if !is_memory_summary_category(category.as_str()) { + return Err(eyre::eyre!( + "{} memory_summary expectation references unknown category {}.", + path.display(), + category + )); + } + } + + Ok(()) +} + +fn validate_proactive_brief_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(brief) = &job.proactive_brief else { + if job.suite == "proactive_brief" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} proactive_brief jobs must provide proactive_brief expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for kind in &brief.required_suggestion_kinds { + if !is_proactive_suggestion_kind(kind.as_str()) { + return Err(eyre::eyre!( + "{} proactive_brief expectation references unknown suggestion kind {}.", + path.display(), + kind + )); + } + } + + Ok(()) +} + +fn validate_scheduled_memory_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(scheduled) = &job.scheduled_memory else { + if job.suite == "scheduled_memory" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} scheduled_memory jobs must provide scheduled_memory expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for kind in &scheduled.required_task_kinds { + if !is_scheduled_task_kind(kind.as_str()) { + return Err(eyre::eyre!( + "{} scheduled_memory expectation references unknown task kind {}.", + path.display(), + kind + )); + } + } + + Ok(()) +} + +fn validate_evolution_conflict( + path: &Path, + evidence_ids: &BTreeSet, + conflict: &EvolutionConflict, +) -> Result<()> { + if conflict.conflict_id.trim().is_empty() || conflict.claim_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an incomplete evolution conflict.", path.display())); + } + + ensure_known_evidence(path, evidence_ids, conflict.current_evidence_id.as_str())?; + ensure_known_evidence(path, evidence_ids, conflict.historical_evidence_id.as_str())?; + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + + Ok(()) +} + +fn validate_update_rationale( + path: &Path, + evidence_ids: &BTreeSet, + rationale: &UpdateRationale, +) -> Result<()> { + if rationale.claim_id.trim().is_empty() { + return Err(eyre::eyre!( + "{} has an update rationale with an empty claim_id.", + path.display() + )); + } + + for evidence_id in &rationale.evidence_ids { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + + Ok(()) +} + +fn validate_temporal_validity( + job: &RealWorldJob, + path: &Path, + temporal: &TemporalValidity, +) -> Result<()> { + if temporal.follow_up.as_deref().is_some_and(|follow_up| follow_up.trim().is_empty()) { + return Err(eyre::eyre!( + "{} job {} has an empty temporal validity follow-up.", + path.display(), + job.job_id + )); + } + if temporal.required + && !temporal.encoded + && !matches!(job.encoding.status, Some(TypedStatus::NotEncoded | TypedStatus::Blocked)) + { + return Err(eyre::eyre!( + "{} job {} requires temporal validity but does not declare a not_encoded or blocked encoding status.", + path.display(), + job.job_id + )); + } + + Ok(()) +} + +fn validate_trace_explainability(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(trace) = job + .corpus + .adapter_response + .as_ref() + .and_then(|response| response.answer.trace_explainability.as_ref()) + else { + return Ok(()); + }; + let known = corpus_evidence_ids(job); + let stage_names = + trace.stages.iter().map(|stage| stage.stage_name.as_str()).collect::>(); + + if trace.trace_id.as_deref().is_some_and(str::is_empty) { + return Err(eyre::eyre!("{} has an empty trace_explainability trace_id.", path.display())); + } + if trace.failure_stage.as_deref().is_some_and(str::is_empty) { + return Err(eyre::eyre!( + "{} has an empty trace_explainability failure_stage.", + path.display() + )); + } + + if let Some(failure_stage) = trace.failure_stage.as_deref() + && !stage_names.is_empty() + && !stage_names.contains(failure_stage) + { + return Err(eyre::eyre!( + "{} trace_explainability failure_stage {} is not present in stages.", + path.display(), + failure_stage + )); + } + + for stage in &trace.stages { + validate_trace_stage(stage, &known, path)?; + } + + Ok(()) +} + +fn validate_optional_debug_field(path: &Path, value: Option<&str>, field: &str) -> Result<()> { + if value.is_some_and(|value| value.trim().is_empty()) { + return Err(eyre::eyre!("{} has empty operator_debug {field}.", path.display())); + } + + Ok(()) +} + +fn validate_non_empty_debug_list(path: &Path, values: &[String], field: &str) -> Result<()> { + if values.iter().any(|value| value.trim().is_empty()) { + return Err(eyre::eyre!("{} has empty operator_debug {field} entry.", path.display())); + } + + Ok(()) +} + +fn validate_trace_stage( + stage: &TraceStageExplainability, + known: &BTreeSet, + path: &Path, +) -> Result<()> { + if stage.stage_name.trim().is_empty() { + return Err(eyre::eyre!("{} has a trace stage with an empty stage_name.", path.display())); + } + + for evidence_id in stage + .kept_evidence + .iter() + .chain(stage.dropped_evidence.iter()) + .chain(stage.demoted_evidence.iter()) + .chain(stage.distractor_evidence.iter()) + { + ensure_known_evidence(path, known, evidence_id)?; + } + + Ok(()) +} + +fn validate_required_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { + if OffsetDateTime::parse(value, &Rfc3339).is_err() { + return Err(eyre::eyre!("{} has invalid RFC3339 timestamp for {}.", path.display(), id)); + } + + Ok(()) +} + +fn validate_optional_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { + if !value.trim().is_empty() { + validate_required_rfc3339(value, path, id)?; + } + + Ok(()) +} + +fn ensure_known_evidence(path: &Path, known: &BTreeSet, evidence_id: &str) -> Result<()> { + if !known.contains(evidence_id) { + return Err(eyre::eyre!( + "{} references unknown evidence id {}.", + path.display(), + evidence_id + )); + } + + Ok(()) +} + +fn corpus_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.corpus.items.iter().map(|item| item.evidence_id.clone()).collect() +} + +fn corpus_text_by_id(job: &RealWorldJob) -> BTreeMap<&str, &str> { + job.corpus + .items + .iter() + .filter_map(|item| item.text.as_deref().map(|text| (item.evidence_id.as_str(), text))) + .collect() +} + +fn timeline_event_ids(job: &RealWorldJob) -> BTreeSet { + job.timeline.iter().map(|event| event.event_id.clone()).collect() +} + +fn ensure_known_event(path: &Path, known: &BTreeSet, event_id: &str) -> Result<()> { + if !known.contains(event_id) { + return Err(eyre::eyre!( + "{} references unknown timeline event id {}.", + path.display(), + event_id + )); + } + + Ok(()) +} + +fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result { + if jobs.is_empty() { + return Err(eyre::eyre!("At least one real_world_job fixture is required.")); + } + + let mut job_reports = Vec::with_capacity(jobs.len()); + let mut unsupported_claims = Vec::new(); + + for job in jobs { + let scoring = score_job(job); + + unsupported_claims.extend(scoring.unsupported_claims.clone()); + job_reports.push(job_report(job, scoring)); + } + + let suites = suite_reports(&job_reports); + let not_encoded_suites = suites + .iter() + .filter(|suite| suite.status == TypedStatus::NotEncoded) + .map(|suite| suite.suite_id.clone()) + .collect::>(); + let summary = report_summary(&job_reports, &suites); + let evolution = evolution_summary(&job_reports); + let follow_ups = follow_up_reports(jobs); + let external_adapters = external_adapter_section( + &args.external_adapter_manifest, + args.skip_external_adapter_manifest, + )?; + + Ok(RealWorldReport { + schema: REPORT_SCHEMA.to_string(), + run_id: args.run_id.clone(), + generated_at: OffsetDateTime::now_utc().format(&Rfc3339)?, + runner_version: VERSION.to_string(), + corpus_profile: corpus_profile(jobs), + adapter: adapter_report(args)?, + external_adapters, + capture_integration: capture_integration_report(jobs), + summary, + suites, + jobs: job_reports, + unsupported_claims, + not_encoded_suites, + private_corpus_redaction: private_corpus_redaction(jobs), + evolution, + follow_ups, + }) +} + +fn score_job(job: &RealWorldJob) -> JobScoring { + let answer = produced_answer(job); + let produced_evidence = produced_evidence_ids(answer); + let trap_ids_used = trap_ids_used(job, &produced_evidence); + let consolidation = consolidation_job_report(job); + + if let Some(status) = job.encoding.status { + let evolution = evolution_job_report(job, answer, &trap_ids_used, 0); + + return score_declared_job(job, status, trap_ids_used, evolution, consolidation); + } + + let missing_claims = missing_required_claims(job, answer); + let forbidden_claims = forbidden_claim_hits(job, answer); + let missing_evidence = missing_required_evidence(job, &produced_evidence); + let knowledge = knowledge_metrics(job, answer); + let memory_summary = memory_summary_metrics(job, answer); + let proactive_brief = proactive_brief_metrics(job, answer); + let scheduled_memory = scheduled_memory_metrics(job, answer); + let mut unsupported_claims = unsupported_claims(job, answer); + + unsupported_claims.extend(unsupported_page_claims(answer)); + unsupported_claims.extend(unsupported_memory_summary_claims(job, answer)); + unsupported_claims.extend(unsupported_proactive_suggestions(job, answer)); + unsupported_claims.extend(unsupported_scheduled_outputs(job, answer)); + + let operator_counts = operator_debug_failure_counts(job); + let latency_violations = latency_violations(job, answer); + let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used); + let evolution = evolution_job_report(job, answer, &trap_ids_used, forbidden_claims.len()); + let stale_answers = evolution.as_ref().map_or(0, |report| report.stale_answer_count); + let conflict_detection_missing = evolution + .as_ref() + .map_or(0, |report| report.conflict_count - report.conflict_detection_count); + let update_rationale_missing = evolution.as_ref().map_or(0, update_rationale_missing_count); + let mut counts = FailureCounts { + missing_claims: missing_claims.len(), + forbidden_claims: forbidden_claims.len(), + missing_evidence: missing_evidence.len(), + trap_uses: trap_ids_used.len(), + unsupported_claims: unsupported_claims.len(), + operator_debug_missing: operator_counts.operator_debug_missing, + operator_debug_raw_sql: operator_counts.operator_debug_raw_sql, + operator_debug_trace_gaps: operator_counts.operator_debug_trace_gaps, + operator_debug_repair_unclear: operator_counts.operator_debug_repair_unclear, + stale_answers, + conflict_detection_missing, + update_rationale_missing, + latency_violations, + proposal_usefulness_failures: proposal_usefulness_failures(consolidation.as_ref()), + lineage_failures: lineage_failures(consolidation.as_ref()), + review_action_failures: review_action_failures(consolidation.as_ref()), + source_mutations: consolidation.as_ref().map_or(0, |report| report.source_mutation_count), + blocking_executable_gaps: blocking_executable_gaps(consolidation.as_ref()), + untraced_page_sections: knowledge + .as_ref() + .map_or(0, |metrics| metrics.untraced_section_count), + missed_stale_findings: knowledge.as_ref().map_or(0, missed_stale_finding_count), + rebuild_failures: knowledge.as_ref().map_or(0, |metrics| metrics.rebuild_failure_count), + page_usefulness_failures: knowledge.as_ref().map_or(0, page_usefulness_failure_count), + ..FailureCounts::default() + }; + + apply_memory_summary_failure_counts(&mut counts, memory_summary.as_ref()); + apply_proactive_brief_failure_counts(&mut counts, proactive_brief.as_ref()); + apply_scheduled_memory_failure_counts(&mut counts, scheduled_memory.as_ref()); + + let dimension_scores = dimension_scores(job, &counts); + let normalized_score = normalized_score(&dimension_scores); + let wrong_result_count = wrong_result_count(&counts); + let status = job_status( + normalized_score, + job.scoring_rubric.pass_threshold, + wrong_result_count, + unsupported_claims.len(), + counts.source_mutations, + counts.blocking_executable_gaps, + ); + let reason = job_reason(status, &counts, normalized_score); + + for claim in &mut unsupported_claims { + claim.suite_id = job.suite.clone(); + claim.job_id = job.job_id.clone(); + } + + JobScoring { + status, + normalized_score, + hard_fail_hits, + unsupported_claims, + wrong_result_count, + knowledge, + trap_ids_used, + dimension_scores, + reason, + evolution, + consolidation, + memory_summary, + proactive_brief, + scheduled_memory, + } +} + +fn apply_memory_summary_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&MemorySummaryJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.memory_summary_invalid_current_entries = metrics.invalid_top_of_mind_count; + counts.memory_summary_untraced_entries = metrics.untraced_entry_count; + counts.memory_summary_missing_freshness = + metrics.entry_count.saturating_sub(metrics.freshness_marker_count); + counts.memory_summary_missing_rationale = + metrics.entry_count.saturating_sub(metrics.rationale_count); + counts.memory_summary_missing_categories = metrics.missing_required_category_count; + counts.memory_summary_unsupported_current_entries = metrics.unsupported_current_entry_count; +} + +fn apply_proactive_brief_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&ProactiveBriefJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.proactive_brief_invalid_current_suggestions = metrics.invalid_current_suggestion_count; + counts.proactive_brief_untraced_suggestions = metrics.untraced_suggestion_count; + counts.proactive_brief_missing_freshness = + metrics.suggestion_count.saturating_sub(metrics.freshness_marker_count); + counts.proactive_brief_missing_action_rationale = + metrics.suggestion_count.saturating_sub(metrics.action_rationale_count); + counts.proactive_brief_missing_kinds = metrics.missing_required_suggestion_kind_count; + counts.proactive_brief_unsupported_current_suggestions = + metrics.unsupported_current_suggestion_count; + counts.proactive_brief_tombstone_violations = metrics.tombstone_violation_count; +} + +fn apply_scheduled_memory_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&ScheduledMemoryJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.scheduled_memory_invalid_current_outputs = metrics.invalid_current_output_count; + counts.scheduled_memory_untraced_outputs = metrics.untraced_output_count; + counts.scheduled_memory_missing_freshness = + metrics.output_count.saturating_sub(metrics.freshness_marker_count); + counts.scheduled_memory_missing_action_rationale = + metrics.output_count.saturating_sub(metrics.action_rationale_count); + counts.scheduled_memory_missing_task_kinds = metrics.missing_required_task_kind_count; + counts.scheduled_memory_unsupported_current_outputs = metrics.unsupported_current_output_count; + counts.scheduled_memory_tombstone_violations = metrics.tombstone_violation_count; + counts.scheduled_memory_missing_trace = + metrics.trace_required_count.saturating_sub(metrics.trace_complete_count); + counts.source_mutations += metrics.source_mutation_count; +} + +fn score_declared_job( + job: &RealWorldJob, + status: TypedStatus, + trap_ids_used: Vec, + evolution: Option, + consolidation: Option, +) -> JobScoring { + JobScoring { + status, + normalized_score: 0.0, + hard_fail_hits: Vec::new(), + unsupported_claims: Vec::new(), + wrong_result_count: 0, + knowledge: None, + trap_ids_used, + dimension_scores: declared_not_encoded_dimension_scores(job), + reason: job + .encoding + .reason + .clone() + .unwrap_or_else(|| "Job did not reach a runnable scoring state.".to_string()), + evolution, + consolidation, + memory_summary: None, + proactive_brief: None, + scheduled_memory: None, + } +} + +fn wrong_result_count(counts: &FailureCounts) -> usize { + counts.missing_claims + + counts.forbidden_claims + + counts.missing_evidence + + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures + + counts.memory_summary_invalid_current_entries + + counts.memory_summary_untraced_entries + + counts.memory_summary_missing_freshness + + counts.memory_summary_missing_rationale + + counts.memory_summary_missing_categories + + counts.memory_summary_unsupported_current_entries + + counts.proactive_brief_invalid_current_suggestions + + counts.proactive_brief_untraced_suggestions + + counts.proactive_brief_missing_freshness + + counts.proactive_brief_missing_action_rationale + + counts.proactive_brief_missing_kinds + + counts.proactive_brief_unsupported_current_suggestions + + counts.proactive_brief_tombstone_violations + + counts.scheduled_memory_invalid_current_outputs + + counts.scheduled_memory_untraced_outputs + + counts.scheduled_memory_missing_freshness + + counts.scheduled_memory_missing_action_rationale + + counts.scheduled_memory_missing_task_kinds + + counts.scheduled_memory_unsupported_current_outputs + + counts.scheduled_memory_tombstone_violations + + counts.scheduled_memory_missing_trace + + counts.untraced_page_sections + + counts.missed_stale_findings + + counts.rebuild_failures + + counts.page_usefulness_failures +} + +fn operator_debug_failure_counts(job: &RealWorldJob) -> FailureCounts { + let Some(debug) = &job.operator_debug else { + return FailureCounts { + operator_debug_missing: usize::from(job.suite == "operator_debugging_ux"), + ..FailureCounts::default() + }; + }; + + FailureCounts { + operator_debug_raw_sql: usize::from(debug.raw_sql_needed), + operator_debug_trace_gaps: usize::from(debug.trace_completeness != "complete"), + operator_debug_repair_unclear: usize::from(debug.repair_action_clarity != "clear"), + ..FailureCounts::default() + } +} + +fn declared_not_encoded_dimension_scores(job: &RealWorldJob) -> Vec { + job.scoring_rubric + .dimensions + .iter() + .map(|(dimension_id, dimension)| DimensionScoreReport { + dimension: dimension_id.clone(), + score: 0.0, + max_points: dimension.max_points, + weight: dimension.weight, + }) + .collect() +} + +fn produced_answer(job: &RealWorldJob) -> &ProducedAnswer { + job.corpus + .adapter_response + .as_ref() + .map(|response| &response.answer) + .unwrap_or_else(|| synthetic_answer(job)) +} + +fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { + let _ = job; + + static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); + + EMPTY_ANSWER.get_or_init(|| ProducedAnswer { + content: String::new(), + claims: Vec::new(), + evidence_ids: Vec::new(), + pages: Vec::new(), + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), + latency_ms: None, + cost: None, + trace_explainability: None, + }) +} + +fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { + let mut evidence = answer.evidence_ids.iter().cloned().collect::>(); + + for claim in &answer.claims { + evidence.extend(claim.evidence_ids.iter().cloned()); + } + for brief in &answer.proactive_briefs { + for suggestion in &brief.suggestions { + evidence.extend(suggestion.evidence_refs.iter().cloned()); + } + } + for task in &answer.scheduled_tasks { + for output in &task.outputs { + evidence.extend(output.evidence_refs.iter().cloned()); + } + } + + evidence +} + +fn missing_required_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + job.expected_answer + .must_include + .iter() + .filter(|claim| !claim_is_present(claim, answer)) + .map(|claim| claim.text().to_string()) + .collect() +} + +fn claim_is_present(claim: &ExpectedClaim, answer: &ProducedAnswer) -> bool { + if let Some(claim_id) = claim.claim_id() + && answer.claims.iter().any(|produced| produced.claim_id.as_deref() == Some(claim_id)) + { + return true; + } + + answer.content.contains(claim.text()) +} + +fn forbidden_claim_hits(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + job.expected_answer + .must_not_include + .iter() + .filter(|claim| answer.content.contains(claim.as_str())) + .cloned() + .collect() +} + +fn missing_required_evidence( + job: &RealWorldJob, + produced_evidence: &BTreeSet, +) -> Vec { + job.required_evidence + .iter() + .filter(|evidence| { + is_required_use(evidence) && !produced_evidence.contains(&evidence.evidence_id) + }) + .map(|evidence| evidence.evidence_id.clone()) + .collect() +} + +fn is_required_use(evidence: &RequiredEvidence) -> bool { + matches!(evidence.requirement.as_str(), "cite" | "use" | "explain") +} + +fn trap_ids_used(job: &RealWorldJob, produced_evidence: &BTreeSet) -> Vec { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used) + .filter(|trap| { + trap.evidence_ids.iter().any(|evidence_id| produced_evidence.contains(evidence_id)) + }) + .map(|trap| trap.trap_id.clone()) + .collect() +} + +fn evolution_job_report( + job: &RealWorldJob, + answer: &ProducedAnswer, + trap_ids_used: &[String], + forbidden_claim_count: usize, +) -> Option { + let evolution = job.memory_evolution.as_ref()?; + let produced = produced_evidence_ids(answer); + let stale_trap_ids_used = stale_trap_ids_used(job, evolution, trap_ids_used); + let stale_answer_count = + stale_answer_count(job, evolution, &stale_trap_ids_used, forbidden_claim_count); + let conflict_detection_count = evolution + .conflicts + .iter() + .filter(|conflict| conflict_is_detected(conflict, answer)) + .count(); + let update_rationale_available = evolution + .update_rationale + .as_ref() + .is_some_and(|rationale| update_rationale_is_available(rationale, answer)); + let temporal_validity_required = + evolution.temporal_validity.as_ref().is_some_and(|temporal| temporal.required); + let temporal_validity_encoded = + evolution.temporal_validity.as_ref().is_some_and(|temporal| temporal.encoded); + let temporal_validity_not_encoded = temporal_validity_required && !temporal_validity_encoded; + let history_readback_encoded = + evolution.history_readback.as_ref().is_some_and(|history| history.encoded); + let history_event_types = evolution + .history_readback + .as_ref() + .map_or_else(Vec::new, |history| history.required_event_types.clone()); + let history_requires_note_version_links = evolution + .history_readback + .as_ref() + .is_some_and(|history| history.requires_note_version_links); + let follow_up = evolution + .temporal_validity + .as_ref() + .and_then(|temporal| temporal.follow_up.clone()) + .or_else(|| job.encoding.follow_up.as_ref().map(|follow_up| follow_up.title.clone())); + + Some(EvolutionJobReport { + current_evidence: evolution.current_evidence_ids.clone(), + historical_evidence: evolution.historical_evidence_ids.clone(), + tombstone_evidence: evolution.tombstone_evidence_ids.clone(), + invalidation_evidence: evolution.invalidation_evidence_ids.clone(), + selected_current_evidence: selected_evolution_evidence( + &evolution.current_evidence_ids, + &produced, + ), + selected_historical_evidence: selected_evolution_evidence( + &evolution.historical_evidence_ids, + &produced, + ), + selected_rationale_evidence: selected_rationale_evidence(evolution, &produced), + selected_tombstone_evidence: selected_evolution_evidence( + &evolution.tombstone_evidence_ids, + &produced, + ), + selected_invalidation_evidence: selected_evolution_evidence( + &evolution.invalidation_evidence_ids, + &produced, + ), + conflict_candidate_evidence: selected_conflict_candidate_evidence(evolution, &produced), + retrieved_but_dropped_evidence: trace_dropped_evidence(answer), + selected_but_not_narrated_evidence: selected_but_not_narrated_evidence(answer), + stale_answer_count, + stale_trap_ids_used, + conflict_count: evolution.conflicts.len(), + conflict_detection_count, + update_rationale_available, + temporal_validity_required, + temporal_validity_encoded, + temporal_validity_not_encoded, + history_readback_encoded, + history_event_types, + history_requires_note_version_links, + follow_up, + }) +} + +fn stale_answer_count( + job: &RealWorldJob, + evolution: &MemoryEvolution, + stale_trap_ids_used: &[String], + forbidden_claim_count: usize, +) -> usize { + let stale_trap_count = if evolution.stale_trap_ids.is_empty() { + job.negative_traps.iter().filter(|trap| trap.trap_type == "stale_fact").count() + } else { + evolution.stale_trap_ids.len() + }; + let stale_forbidden_claims = if stale_trap_count > 0 { forbidden_claim_count } else { 0 }; + + stale_trap_ids_used.len().max(stale_forbidden_claims) +} + +fn selected_evolution_evidence( + evidence_ids: &[String], + produced: &BTreeSet, +) -> Vec { + evidence_ids.iter().filter(|evidence_id| produced.contains(*evidence_id)).cloned().collect() +} + +fn selected_rationale_evidence( + evolution: &MemoryEvolution, + produced: &BTreeSet, +) -> Vec { + evolution.update_rationale.as_ref().map_or_else(Vec::new, |rationale| { + selected_evolution_evidence(&rationale.evidence_ids, produced) + }) +} + +fn selected_conflict_candidate_evidence( + evolution: &MemoryEvolution, + produced: &BTreeSet, +) -> Vec { + let mut evidence_ids = Vec::new(); + + for conflict in &evolution.conflicts { + push_if_produced(&mut evidence_ids, conflict.current_evidence_id.as_str(), produced); + push_if_produced(&mut evidence_ids, conflict.historical_evidence_id.as_str(), produced); + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + push_if_produced(&mut evidence_ids, evidence_id.as_str(), produced); + } + } + + evidence_ids +} + +fn push_if_produced(out: &mut Vec, evidence_id: &str, produced: &BTreeSet) { + if produced.contains(evidence_id) && !out.iter().any(|id| id == evidence_id) { + out.push(evidence_id.to_string()); + } +} + +fn trace_dropped_evidence(answer: &ProducedAnswer) -> Vec { + let mut evidence = Vec::new(); + + if let Some(trace) = &answer.trace_explainability { + for stage in &trace.stages { + for evidence_id in &stage.dropped_evidence { + if !evidence.iter().any(|id| id == evidence_id) { + evidence.push(evidence_id.clone()); + } + } + } + } + + evidence +} + +fn selected_but_not_narrated_evidence(answer: &ProducedAnswer) -> Vec { + let narrated = answer + .claims + .iter() + .flat_map(|claim| claim.evidence_ids.iter().map(String::as_str)) + .collect::>(); + + answer + .evidence_ids + .iter() + .filter(|evidence_id| !narrated.contains(evidence_id.as_str())) + .cloned() + .collect() +} + +fn stale_trap_ids_used( + job: &RealWorldJob, + evolution: &MemoryEvolution, + trap_ids_used: &[String], +) -> Vec { + let declared_stale_traps = if evolution.stale_trap_ids.is_empty() { + job.negative_traps + .iter() + .filter(|trap| trap.trap_type == "stale_fact") + .map(|trap| trap.trap_id.as_str()) + .collect::>() + } else { + evolution.stale_trap_ids.iter().map(String::as_str).collect::>() + }; + + trap_ids_used + .iter() + .filter(|trap_id| declared_stale_traps.contains(trap_id.as_str())) + .cloned() + .collect() +} + +fn conflict_is_detected(conflict: &EvolutionConflict, answer: &ProducedAnswer) -> bool { + let mut required_evidence = + vec![conflict.current_evidence_id.as_str(), conflict.historical_evidence_id.as_str()]; + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + required_evidence.push(evidence_id.as_str()); + } + + answer.claims.iter().any(|claim| { + claim.claim_id.as_deref() == Some(conflict.claim_id.as_str()) + && required_evidence + .iter() + .all(|evidence_id| claim.evidence_ids.iter().any(|id| id == evidence_id)) + }) +} + +fn update_rationale_is_available(rationale: &UpdateRationale, answer: &ProducedAnswer) -> bool { + if !rationale.available { + return false; + } + + answer.claims.iter().any(|claim| { + claim.claim_id.as_deref() == Some(rationale.claim_id.as_str()) + && !claim.evidence_ids.is_empty() + && rationale.evidence_ids.iter().any(|evidence_id| { + claim.evidence_ids.iter().any(|produced| produced == evidence_id) + }) + }) +} + +fn update_rationale_missing_count(report: &EvolutionJobReport) -> usize { + if report.update_rationale_available || report.temporal_validity_not_encoded { + 0 + } else if report.conflict_count > 0 { + 1 + } else { + 0 + } +} + +fn unsupported_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + answer.claims.iter().filter_map(|claim| unsupported_claim(job, claim)).collect() +} + +fn unsupported_claim(job: &RealWorldJob, claim: &ProducedClaim) -> Option { + let Some(claim_id) = claim.claim_id.as_deref() else { + return Some(unsupported_claim_report(claim, "claim has no claim_id")); + }; + let Some(allowed) = job.expected_answer.evidence_links.get(claim_id).map(EvidenceLink::ids) + else { + return Some(unsupported_claim_report( + claim, + "claim_id is not present in expected_answer.evidence_links", + )); + }; + + if claim.evidence_ids.is_empty() { + return Some(unsupported_claim_report(claim, "claim has no produced evidence ids")); + } + if !claim.evidence_ids.iter().any(|evidence_id| allowed.contains(evidence_id)) { + return Some(unsupported_claim_report( + claim, + "claim evidence is not allowed for this claim_id", + )); + } + + None +} + +fn unsupported_claim_report(claim: &ProducedClaim, reason: &str) -> UnsupportedClaimReport { + UnsupportedClaimReport { + suite_id: String::new(), + job_id: String::new(), + claim_id: claim.claim_id.clone(), + claim_text: bounded_text(claim.text.as_str(), 240), + reason: reason.to_string(), + evidence_ids: claim.evidence_ids.clone(), + } +} + +fn unsupported_page_claims(answer: &ProducedAnswer) -> Vec { + answer + .pages + .iter() + .flat_map(|page| { + page.sections.iter().filter_map(|section| { + if section_is_traced(section) || section_is_flagged_unsupported(section) { + return None; + } + + Some(UnsupportedClaimReport { + suite_id: String::new(), + job_id: String::new(), + claim_id: Some(format!("{}:{}", page.page_id, section.section_id)), + claim_text: bounded_text(section.content.as_str(), 240), + reason: + "derived page section has no source evidence and is not flagged unsupported" + .to_string(), + evidence_ids: section.evidence_ids.clone(), + }) + }) + }) + .collect() +} + +fn knowledge_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> Option { + if answer.pages.is_empty() { + return None; + } + + let mut metrics = KnowledgeJobMetrics { + page_count: answer.pages.len(), + stale_trap_count: stale_traps(job).len(), + ..KnowledgeJobMetrics::default() + }; + + for page in &answer.pages { + accumulate_page_metrics(page, &mut metrics); + } + + metrics.stale_traps_detected = stale_traps(job) + .iter() + .filter(|trap| page_artifacts_detect_stale_trap(&answer.pages, trap)) + .count(); + metrics.citation_coverage = ratio(metrics.traced_section_count, metrics.section_count); + metrics.stale_claim_detection = + ratio_or_full(metrics.stale_traps_detected, metrics.stale_trap_count); + metrics.rebuild_determinism = ratio(metrics.deterministic_rebuild_count, metrics.page_count); + metrics.backlink_coverage = ratio(metrics.pages_with_backlinks, metrics.page_count); + metrics.page_usefulness = round3( + (metrics.citation_coverage + + metrics.stale_claim_detection + + metrics.rebuild_determinism + + metrics.backlink_coverage) + / 4.0, + ); + + Some(metrics) +} + +fn stale_traps(job: &RealWorldJob) -> Vec<&NegativeTrap> { + job.negative_traps + .iter() + .filter(|trap| trap.trap_type == "stale_fact" && trap.failure_if_used) + .collect() +} + +fn accumulate_page_metrics(page: &DerivedPageArtifact, metrics: &mut KnowledgeJobMetrics) { + if !page.backlinks.is_empty() { + metrics.pages_with_backlinks += 1; + } + + metrics.backlink_count += page.backlinks.len(); + + for section in &page.sections { + metrics.section_count += 1; + + if section_is_traced(section) { + metrics.traced_section_count += 1; + } else if section_is_flagged_unsupported(section) { + metrics.flagged_unsupported_section_count += 1; + + if section.role == "summary" { + metrics.unsupported_summary_count += 1; + } + } else { + metrics.untraced_section_count += 1; + } + } + + if let Some(rebuild) = &page.rebuild { + if !rebuild.allowed_variance.is_empty() { + metrics.allowed_variance_count += 1; + } + if rebuild_is_acceptable(rebuild) { + metrics.deterministic_rebuild_count += 1; + } else { + metrics.rebuild_failure_count += 1; + } + } else { + metrics.rebuild_failure_count += 1; + } + + metrics.rebuild_page_count += 1; +} + +fn section_is_traced(section: &DerivedPageSection) -> bool { + !section.evidence_ids.is_empty() || !section.timeline_event_ids.is_empty() +} + +fn section_is_flagged_unsupported(section: &DerivedPageSection) -> bool { + section.unsupported_reason.as_ref().is_some_and(|reason| !reason.trim().is_empty()) +} + +fn rebuild_is_acceptable(rebuild: &DerivedPageRebuild) -> bool { + (rebuild.deterministic && rebuild.first_hash == rebuild.second_hash) + || !rebuild.allowed_variance.is_empty() +} + +fn page_artifacts_detect_stale_trap(pages: &[DerivedPageArtifact], trap: &NegativeTrap) -> bool { + pages.iter().any(|page| { + page.lint_findings.iter().any(|finding| { + finding.trap_id.as_deref() == Some(trap.trap_id.as_str()) + || finding + .evidence_ids + .iter() + .any(|evidence_id| trap.evidence_ids.contains(evidence_id)) + }) + }) +} + +fn missed_stale_finding_count(metrics: &KnowledgeJobMetrics) -> usize { + metrics.stale_trap_count.saturating_sub(metrics.stale_traps_detected) +} + +fn page_usefulness_failure_count(metrics: &KnowledgeJobMetrics) -> usize { + if metrics.page_usefulness < 0.8 { 1 } else { 0 } +} + +fn memory_summary_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.memory_summaries.is_empty() { + return None; + } + + let mut metrics = MemorySummaryJobMetrics { + summary_count: answer.memory_summaries.len(), + required_category_count: job + .memory_summary + .as_ref() + .map_or(0, |summary| summary.required_categories.len()), + ..MemorySummaryJobMetrics::default() + }; + let mut categories = BTreeSet::new(); + + for summary in &answer.memory_summaries { + accumulate_memory_summary_metrics(summary, &mut metrics, &mut categories); + } + + let covered_required_category_count = job.memory_summary.as_ref().map_or(0, |summary| { + summary.required_categories.iter().filter(|category| categories.contains(*category)).count() + }); + + metrics.covered_required_category_count = covered_required_category_count; + metrics.missing_required_category_count = + metrics.required_category_count.saturating_sub(covered_required_category_count); + metrics.source_ref_coverage = + ratio(metrics.source_ref_entry_count, metrics.source_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.entry_count); + metrics.rationale_coverage = ratio(metrics.rationale_count, metrics.entry_count); + + Some(metrics) +} + +fn accumulate_memory_summary_metrics( + summary: &MemorySummaryArtifact, + metrics: &mut MemorySummaryJobMetrics, + categories: &mut BTreeSet, +) { + metrics.source_trace_selected_count += summary.source_trace.selected_source_refs.len(); + metrics.source_trace_dropped_count += summary.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += summary.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += summary.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += summary.source_trace.tombstone_source_refs.len(); + + let non_current_source_refs = memory_summary_non_current_trace_refs(&summary.source_trace); + + for entry in &summary.entries { + metrics.entry_count += 1; + + categories.insert(entry.category.clone()); + + accumulate_memory_summary_category(entry.category.as_str(), metrics); + + if memory_summary_entry_requires_source_ref(entry) { + metrics.source_ref_required_count += 1; + + if entry.source_refs.is_empty() { + metrics.untraced_entry_count += 1; + } + } + if !entry.source_refs.is_empty() { + metrics.source_ref_entry_count += 1; + } + if memory_summary_entry_has_freshness(entry) { + metrics.freshness_marker_count += 1; + } + if memory_summary_entry_has_rationale(entry) { + metrics.rationale_count += 1; + } + if memory_summary_entry_is_invalid_top_of_mind(entry, &non_current_source_refs) { + metrics.invalid_top_of_mind_count += 1; + } + if entry.category == "derived_project_profile" { + let has_support = + !entry.source_refs.is_empty() || !entry.unsupported_claim_flags.is_empty(); + + if has_support { + metrics.derived_with_source_or_unsupported_count += 1; + } else { + metrics.derived_missing_source_or_unsupported_count += 1; + } + if !entry.unsupported_claim_flags.is_empty() { + metrics.unsupported_derived_entry_count += 1; + } + if memory_summary_entry_includes_unsupported_current_claim(entry) { + metrics.unsupported_current_entry_count += 1; + } + } + + metrics.tombstone_ref_count += entry.freshness.tombstone_refs.len(); + } +} + +fn memory_summary_non_current_trace_refs(trace: &MemorySummarySourceTrace) -> BTreeSet<&str> { + trace + .stale_source_refs + .iter() + .chain(trace.superseded_source_refs.iter()) + .chain(trace.tombstone_source_refs.iter()) + .map(|item| item.evidence_id.as_str()) + .collect() +} + +fn accumulate_memory_summary_category(category: &str, metrics: &mut MemorySummaryJobMetrics) { + match category { + "top_of_mind" => metrics.top_of_mind_count += 1, + "background" => metrics.background_count += 1, + "stale" => metrics.stale_count += 1, + "superseded" => metrics.superseded_count += 1, + "tombstone" => metrics.tombstone_count += 1, + "derived_project_profile" => metrics.derived_project_profile_count += 1, + _ => {}, + } +} + +fn memory_summary_entry_requires_source_ref(entry: &MemorySummaryEntry) -> bool { + !(entry.category == "derived_project_profile" + && entry.source_refs.is_empty() + && !entry.unsupported_claim_flags.is_empty() + && entry.rationale.decision == "excluded") +} + +fn memory_summary_entry_is_invalid_top_of_mind( + entry: &MemorySummaryEntry, + non_current_source_refs: &BTreeSet<&str>, +) -> bool { + entry.category == "top_of_mind" + && (entry.freshness.status != "current" + || entry.rationale.decision != "included" + || !entry.freshness.superseded_by.is_empty() + || !entry.freshness.tombstone_refs.is_empty() + || entry + .source_refs + .iter() + .any(|source_ref| non_current_source_refs.contains(source_ref.as_str()))) +} + +fn memory_summary_entry_has_freshness(entry: &MemorySummaryEntry) -> bool { + if entry.freshness.status.trim().is_empty() { + return false; + } + + match entry.category.as_str() { + "superseded" => !entry.freshness.superseded_by.is_empty(), + "tombstone" => + entry.freshness.status == "tombstoned" && !entry.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn memory_summary_entry_has_rationale(entry: &MemorySummaryEntry) -> bool { + !entry.rationale.decision.trim().is_empty() + && !entry.rationale.reason_code.trim().is_empty() + && !entry.rationale.reason.trim().is_empty() +} + +fn memory_summary_entry_includes_unsupported_current_claim(entry: &MemorySummaryEntry) -> bool { + !entry.unsupported_claim_flags.is_empty() + && (entry.rationale.decision != "excluded" || entry.freshness.status == "current") +} + +fn unsupported_memory_summary_claims( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .memory_summaries + .iter() + .flat_map(|summary| { + summary.entries.iter().filter_map(|entry| { + if entry.category != "derived_project_profile" + || !entry.source_refs.is_empty() + || !entry.unsupported_claim_flags.is_empty() + { + return None; + } + + Some(UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", summary.summary_id, entry.entry_id)), + claim_text: bounded_text(entry.text.as_str(), 240), + reason: + "derived memory summary entry has no source refs and no unsupported-claim flags" + .to_string(), + evidence_ids: entry.source_refs.clone(), + }) + }) + }) + .collect() +} + +fn proactive_brief_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.proactive_briefs.is_empty() { + return None; + } + + let mut metrics = ProactiveBriefJobMetrics { + brief_count: answer.proactive_briefs.len(), + required_suggestion_kind_count: job + .proactive_brief + .as_ref() + .map_or(0, |brief| brief.required_suggestion_kinds.len()), + ..ProactiveBriefJobMetrics::default() + }; + let mut suggestion_kinds = BTreeSet::new(); + + for brief in &answer.proactive_briefs { + accumulate_proactive_brief_metrics(brief, &mut metrics, &mut suggestion_kinds); + } + + let covered_required_suggestion_kind_count = job.proactive_brief.as_ref().map_or(0, |brief| { + brief + .required_suggestion_kinds + .iter() + .filter(|kind| suggestion_kinds.contains(*kind)) + .count() + }); + + metrics.covered_required_suggestion_kind_count = covered_required_suggestion_kind_count; + metrics.missing_required_suggestion_kind_count = metrics + .required_suggestion_kind_count + .saturating_sub(covered_required_suggestion_kind_count); + metrics.evidence_ref_coverage = + ratio(metrics.evidence_ref_suggestion_count, metrics.evidence_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.suggestion_count); + metrics.action_rationale_coverage = + ratio(metrics.action_rationale_count, metrics.suggestion_count); + + Some(metrics) +} + +fn accumulate_proactive_brief_metrics( + brief: &ProactiveBriefArtifact, + metrics: &mut ProactiveBriefJobMetrics, + suggestion_kinds: &mut BTreeSet, +) { + metrics.source_trace_selected_count += brief.source_trace.selected_source_refs.len(); + metrics.source_trace_dropped_count += brief.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += brief.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += brief.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += brief.source_trace.tombstone_source_refs.len(); + + let non_current_refs = memory_summary_non_current_trace_refs(&brief.source_trace); + let tombstone_refs = proactive_tombstone_trace_refs(&brief.source_trace); + + for suggestion in &brief.suggestions { + metrics.suggestion_count += 1; + metrics.evidence_ref_required_count += 1; + + suggestion_kinds.insert(suggestion.suggestion_kind.clone()); + + if suggestion.evidence_refs.is_empty() { + metrics.untraced_suggestion_count += 1; + } else { + metrics.evidence_ref_suggestion_count += 1; + } + if proactive_suggestion_has_freshness(suggestion) { + metrics.freshness_marker_count += 1; + } + if proactive_suggestion_has_action_rationale(suggestion) { + metrics.action_rationale_count += 1; + } + + accumulate_proactive_action_decision(suggestion.action.decision.as_str(), metrics); + + if suggestion.freshness.status == "current" { + metrics.current_suggestion_count += 1; + } else { + metrics.non_current_suggestion_count += 1; + } + if proactive_suggestion_is_stale_warning(suggestion) { + metrics.stale_warning_count += 1; + } + if proactive_suggestion_is_invalid_current(suggestion, &non_current_refs) { + metrics.invalid_current_suggestion_count += 1; + } + if proactive_suggestion_is_unsupported_current(suggestion) { + metrics.unsupported_current_suggestion_count += 1; + } + if proactive_suggestion_is_tombstone_violation(suggestion, &tombstone_refs) { + metrics.tombstone_violation_count += 1; + } + } +} + +fn proactive_tombstone_trace_refs(trace: &MemorySummarySourceTrace) -> BTreeSet<&str> { + trace.tombstone_source_refs.iter().map(|item| item.evidence_id.as_str()).collect() +} + +fn accumulate_proactive_action_decision(decision: &str, metrics: &mut ProactiveBriefJobMetrics) { + match decision { + "recommend" => metrics.recommended_count += 1, + "defer" => metrics.deferred_count += 1, + "reject" => metrics.rejected_count += 1, + _ => {}, + } +} + +fn proactive_suggestion_has_freshness(suggestion: &ProactiveSuggestion) -> bool { + if suggestion.freshness.status.trim().is_empty() { + return false; + } + + match suggestion.freshness.status.as_str() { + "superseded" => !suggestion.freshness.superseded_by.is_empty(), + "tombstoned" => !suggestion.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn proactive_suggestion_has_action_rationale(suggestion: &ProactiveSuggestion) -> bool { + !suggestion.action.decision.trim().is_empty() + && !suggestion.action.reason_code.trim().is_empty() + && !suggestion.action.reason.trim().is_empty() +} + +fn proactive_suggestion_is_stale_warning(suggestion: &ProactiveSuggestion) -> bool { + matches!( + suggestion.suggestion_kind.as_str(), + "stale_decision_audit" | "stale_plan_preference_warning" + ) && suggestion.freshness.status != "current" +} + +fn proactive_suggestion_is_invalid_current( + suggestion: &ProactiveSuggestion, + non_current_refs: &BTreeSet<&str>, +) -> bool { + suggestion.freshness.status == "current" + && (!suggestion.freshness.superseded_by.is_empty() + || !suggestion.freshness.tombstone_refs.is_empty() + || suggestion + .evidence_refs + .iter() + .any(|evidence_id| non_current_refs.contains(evidence_id.as_str()))) +} + +fn proactive_suggestion_is_unsupported_current(suggestion: &ProactiveSuggestion) -> bool { + !suggestion.unsupported_claim_flags.is_empty() + && (suggestion.action.decision == "recommend" || suggestion.freshness.status == "current") +} + +fn proactive_suggestion_is_tombstone_violation( + suggestion: &ProactiveSuggestion, + tombstone_refs: &BTreeSet<&str>, +) -> bool { + suggestion.freshness.status == "current" + && (!suggestion.freshness.tombstone_refs.is_empty() + || suggestion + .evidence_refs + .iter() + .any(|evidence_id| tombstone_refs.contains(evidence_id.as_str()))) +} + +fn unsupported_proactive_suggestions( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .proactive_briefs + .iter() + .flat_map(|brief| { + brief.suggestions.iter().filter_map(|suggestion| { + if suggestion.evidence_refs.is_empty() { + return Some(proactive_unsupported_claim_report( + job, + brief, + suggestion, + "proactive suggestion has no evidence refs", + )); + } + if proactive_suggestion_is_unsupported_current(suggestion) { + return Some(proactive_unsupported_claim_report( + job, + brief, + suggestion, + "unsupported proactive claim is still recommended or marked current", + )); + } + + None + }) + }) + .collect() +} + +fn proactive_unsupported_claim_report( + job: &RealWorldJob, + brief: &ProactiveBriefArtifact, + suggestion: &ProactiveSuggestion, + reason: &str, +) -> UnsupportedClaimReport { + UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", brief.brief_id, suggestion.suggestion_id)), + claim_text: bounded_text(suggestion.body.as_str(), 240), + reason: reason.to_string(), + evidence_ids: suggestion.evidence_refs.clone(), + } +} + +fn scheduled_memory_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.scheduled_tasks.is_empty() { + return None; + } + + let mut metrics = ScheduledMemoryJobMetrics { + task_run_count: answer.scheduled_tasks.len(), + required_task_kind_count: job + .scheduled_memory + .as_ref() + .map_or(0, |scheduled| scheduled.required_task_kinds.len()), + ..ScheduledMemoryJobMetrics::default() + }; + let mut task_kinds = BTreeSet::new(); + + for task in &answer.scheduled_tasks { + accumulate_scheduled_memory_metrics(task, &mut metrics, &mut task_kinds); + } + + let covered_required_task_kind_count = job.scheduled_memory.as_ref().map_or(0, |scheduled| { + scheduled.required_task_kinds.iter().filter(|kind| task_kinds.contains(*kind)).count() + }); + + metrics.covered_required_task_kind_count = covered_required_task_kind_count; + metrics.missing_required_task_kind_count = + metrics.required_task_kind_count.saturating_sub(covered_required_task_kind_count); + metrics.evidence_ref_coverage = + ratio(metrics.evidence_ref_output_count, metrics.evidence_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.output_count); + metrics.action_rationale_coverage = ratio(metrics.action_rationale_count, metrics.output_count); + metrics.trace_coverage = ratio(metrics.trace_complete_count, metrics.trace_required_count); + + Some(metrics) +} + +fn accumulate_scheduled_memory_metrics( + task: &ScheduledMemoryTaskArtifact, + metrics: &mut ScheduledMemoryJobMetrics, + task_kinds: &mut BTreeSet, +) { + metrics.source_trace_selected_count += task.source_trace.selected_source_refs.len(); + metrics.source_trace_dropped_count += task.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += task.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += task.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += task.source_trace.tombstone_source_refs.len(); + metrics.trace_required_count += 1; + metrics.source_mutation_count += task.source_mutations.len() + + task.source_mutations.iter().map(forbidden_diff_key_count).sum::(); + + task_kinds.insert(task.task_kind.clone()); + + if scheduled_trace_is_complete(task.execution_trace.as_ref()) { + metrics.trace_complete_count += 1; + } + + let non_current_refs = memory_summary_non_current_trace_refs(&task.source_trace); + let tombstone_refs = proactive_tombstone_trace_refs(&task.source_trace); + + for output in &task.outputs { + metrics.output_count += 1; + metrics.evidence_ref_required_count += 1; + + if output.evidence_refs.is_empty() { + metrics.untraced_output_count += 1; + } else { + metrics.evidence_ref_output_count += 1; + } + if scheduled_output_has_freshness(output) { + metrics.freshness_marker_count += 1; + } + if scheduled_output_has_action_rationale(output) { + metrics.action_rationale_count += 1; + } + if output.freshness.status == "current" { + metrics.current_output_count += 1; + } else { + metrics.non_current_output_count += 1; + } + if scheduled_output_is_invalid_current(output, &non_current_refs) { + metrics.invalid_current_output_count += 1; + } + if scheduled_output_is_unsupported_current(output) { + metrics.unsupported_current_output_count += 1; + } + if scheduled_output_is_tombstone_violation(output, &tombstone_refs) { + metrics.tombstone_violation_count += 1; + } + } +} + +fn scheduled_trace_is_complete(trace: Option<&ScheduledMemoryExecutionTrace>) -> bool { + let Some(trace) = trace else { + return false; + }; + + trace.status == "completed" + && !trace.trace_id.trim().is_empty() + && !trace.output_ref.trim().is_empty() + && !trace.stages.is_empty() + && trace + .stages + .iter() + .any(|stage| stage.stage_name == "output_readback" && !stage.evidence_refs.is_empty()) +} + +fn scheduled_output_has_freshness(output: &ScheduledMemoryOutput) -> bool { + if output.freshness.status.trim().is_empty() { + return false; + } + + match output.freshness.status.as_str() { + "superseded" => !output.freshness.superseded_by.is_empty(), + "tombstoned" => !output.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn scheduled_output_has_action_rationale(output: &ScheduledMemoryOutput) -> bool { + !output.action.decision.trim().is_empty() + && !output.action.reason_code.trim().is_empty() + && !output.action.reason.trim().is_empty() +} + +fn scheduled_output_is_invalid_current( + output: &ScheduledMemoryOutput, + non_current_refs: &BTreeSet<&str>, +) -> bool { + output.freshness.status == "current" + && (!output.freshness.superseded_by.is_empty() + || !output.freshness.tombstone_refs.is_empty() + || output + .evidence_refs + .iter() + .any(|evidence_id| non_current_refs.contains(evidence_id.as_str()))) +} + +fn scheduled_output_is_unsupported_current(output: &ScheduledMemoryOutput) -> bool { + !output.unsupported_claim_flags.is_empty() + && (output.action.decision == "recommend" || output.freshness.status == "current") +} + +fn scheduled_output_is_tombstone_violation( + output: &ScheduledMemoryOutput, + tombstone_refs: &BTreeSet<&str>, +) -> bool { + output.freshness.status == "current" + && (!output.freshness.tombstone_refs.is_empty() + || output + .evidence_refs + .iter() + .any(|evidence_id| tombstone_refs.contains(evidence_id.as_str()))) +} + +fn unsupported_scheduled_outputs( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .scheduled_tasks + .iter() + .flat_map(|task| { + task.outputs.iter().filter_map(|output| { + if output.evidence_refs.is_empty() { + return Some(scheduled_unsupported_claim_report( + job, + task, + output, + "scheduled task output has no evidence refs", + )); + } + if scheduled_output_is_unsupported_current(output) { + return Some(scheduled_unsupported_claim_report( + job, + task, + output, + "unsupported scheduled task claim is still recommended or marked current", + )); + } + + None + }) + }) + .collect() +} + +fn scheduled_unsupported_claim_report( + job: &RealWorldJob, + task: &ScheduledMemoryTaskArtifact, + output: &ScheduledMemoryOutput, + reason: &str, +) -> UnsupportedClaimReport { + UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", task.task_run_id, output.output_id)), + claim_text: bounded_text(output.text.as_str(), 240), + reason: reason.to_string(), + evidence_ids: output.evidence_refs.clone(), + } +} + +fn hard_fail_hits( + job: &RealWorldJob, + unsupported_claims: &[UnsupportedClaimReport], + trap_ids_used: &[String], +) -> Vec { + let mut hits = Vec::new(); + + if !unsupported_claims.is_empty() { + hits.push( + "unsupported high-confidence claim about a required decision or fact".to_string(), + ); + } + if !trap_ids_used.is_empty() { + hits.push("use of a negative trap marked failure_if_used = true".to_string()); + } + if job.expected_answer.requires_caveat && !answer_has_required_caveat(job, produced_answer(job)) + { + hits.push("missing required caveat".to_string()); + } + if job.expected_answer.requires_refusal && !answer_looks_like_refusal(produced_answer(job)) { + hits.push("missing required refusal".to_string()); + } + + if let Some(consolidation) = consolidation_job_report(job) { + if consolidation.source_mutation_count > 0 { + hits.push( + "source mutation count must remain zero for proposal-only consolidation cases" + .to_string(), + ); + } + if consolidation.executable_gaps.iter().any(|gap| gap.blocks_fixture_pass) { + hits.push( + "missing consolidation primitive requires a precise follow-up issue".to_string(), + ); + } + } + + hits +} + +fn answer_has_required_caveat(job: &RealWorldJob, answer: &ProducedAnswer) -> bool { + job.allowed_uncertainty.acceptable_phrases.iter().any(|phrase| answer.content.contains(phrase)) +} + +fn answer_looks_like_refusal(answer: &ProducedAnswer) -> bool { + let lower = answer.content.to_ascii_lowercase(); + + lower.contains("cannot") || lower.contains("can't") || lower.contains("refuse") +} + +fn dimension_scores(job: &RealWorldJob, counts: &FailureCounts) -> Vec { + job.scoring_rubric + .dimensions + .iter() + .map(|(dimension_id, dimension)| DimensionScoreReport { + dimension: dimension_id.clone(), + score: dimension_score(dimension_id, dimension.max_points, counts), + max_points: dimension.max_points, + weight: dimension.weight, + }) + .collect() +} + +fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) -> f64 { + let failed = match dimension_id { + "answer_correctness" | "workflow_helpfulness" => + counts.missing_claims > 0 + || counts.forbidden_claims > 0 + || counts.operator_debug_repair_unclear > 0 + || counts.conflict_detection_missing > 0 + || counts.proposal_usefulness_failures > 0 + || counts.review_action_failures > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.memory_summary_missing_categories > 0 + || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_missing_kinds > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_missing_task_kinds > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 + || counts.scheduled_memory_missing_trace > 0 + || counts.page_usefulness_failures > 0, + "evidence_grounding" => + counts.missing_evidence > 0 + || counts.unsupported_claims > 0 + || counts.lineage_failures > 0 + || counts.memory_summary_untraced_entries > 0 + || counts.proactive_brief_untraced_suggestions > 0 + || counts.scheduled_memory_untraced_outputs > 0 + || counts.scheduled_memory_missing_trace > 0 + || counts.untraced_page_sections > 0, + "trap_avoidance" => + counts.trap_uses > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 + || counts.missed_stale_findings > 0, + "uncertainty_handling" => + counts.unsupported_claims > 0 + || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0, + "lifecycle_behavior" => + counts.stale_answers > 0 + || counts.conflict_detection_missing > 0 + || counts.update_rationale_missing > 0 + || counts.source_mutations > 0 + || counts.memory_summary_invalid_current_entries > 0 + || counts.memory_summary_missing_freshness > 0 + || counts.memory_summary_missing_rationale > 0 + || counts.memory_summary_unsupported_current_entries > 0 + || counts.proactive_brief_invalid_current_suggestions > 0 + || counts.proactive_brief_missing_freshness > 0 + || counts.proactive_brief_missing_action_rationale > 0 + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_missing_freshness > 0 + || counts.scheduled_memory_missing_action_rationale > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 + || counts.scheduled_memory_missing_trace > 0 + || counts.rebuild_failures > 0, + "source_immutability" => counts.source_mutations > 0, + "proposal_usefulness" => counts.proposal_usefulness_failures > 0, + "lineage_completeness" => counts.lineage_failures > 0, + "review_action_correctness" => counts.review_action_failures > 0, + "debuggability" => + counts.missing_claims > 0 + || counts.unsupported_claims > 0 + || counts.operator_debug_missing > 0 + || counts.operator_debug_raw_sql > 0 + || counts.operator_debug_trace_gaps > 0 + || counts.scheduled_memory_missing_trace > 0, + "trace_readback" => counts.scheduled_memory_missing_trace > 0, + "latency_resource" => counts.latency_violations > 0, + "personalization_fit" | "ownership_correctness" => + counts.missing_claims > 0 || counts.unsupported_claims > 0, + _ => counts.missing_claims > 0 || counts.unsupported_claims > 0 || counts.trap_uses > 0, + }; + + if failed { 0.0 } else { max_points } +} + +fn latency_violations(job: &RealWorldJob, answer: &ProducedAnswer) -> usize { + let Some(max_latency_ms) = latency_threshold_ms(job) else { + return 0; + }; + let Some(latency_ms) = answer.latency_ms else { + return 1; + }; + + usize::from(latency_ms > max_latency_ms) +} + +fn latency_threshold_ms(job: &RealWorldJob) -> Option { + job.scoring_rubric + .dimensions + .get("latency_resource") + .and_then(|dimension| dimension.criteria.get("max_latency_ms")) + .and_then(Value::as_f64) +} + +fn normalized_score(scores: &[DimensionScoreReport]) -> f64 { + let total_weight = scores.iter().map(|score| score.weight).sum::(); + + if total_weight == 0.0 { + return 0.0; + } + + scores.iter().map(|score| (score.score / score.max_points) * score.weight).sum::() + / total_weight +} + +fn job_status( + normalized_score: f64, + pass_threshold: f64, + wrong_result_count: usize, + unsupported_claim_count: usize, + source_mutation_count: usize, + blocking_executable_gap_count: usize, +) -> TypedStatus { + if unsupported_claim_count > 0 { + TypedStatus::UnsupportedClaim + } else if source_mutation_count > 0 { + TypedStatus::LifecycleFail + } else if blocking_executable_gap_count > 0 { + TypedStatus::Blocked + } else if wrong_result_count > 0 { + TypedStatus::WrongResult + } else if normalized_score >= pass_threshold { + TypedStatus::Pass + } else { + TypedStatus::WrongResult + } +} + +fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64) -> String { + let wrong_result_signal_count = wrong_result_signal_count(counts); + + match status { + TypedStatus::Pass => format!("Job passed with normalized_score {normalized_score:.3}."), + TypedStatus::UnsupportedClaim => format!( + "Job produced {} unsupported claim(s), {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", + counts.unsupported_claims, wrong_result_signal_count, counts.latency_violations + ), + TypedStatus::WrongResult => format!( + "Job produced {} wrong-result signal(s), {} latency violation(s), and normalized_score {normalized_score:.3}.", + wrong_result_signal_count, counts.latency_violations + ), + TypedStatus::LifecycleFail => format!( + "Job produced {} source mutation(s) and normalized_score {normalized_score:.3}.", + counts.source_mutations + ), + TypedStatus::Blocked => format!( + "Job has {} blocking executable gap(s) and normalized_score {normalized_score:.3}.", + counts.blocking_executable_gaps + ), + _ => "Job did not reach a runnable scoring state.".to_string(), + } +} + +fn wrong_result_signal_count(counts: &FailureCounts) -> usize { + counts.missing_claims + + counts.forbidden_claims + + counts.missing_evidence + + counts.trap_uses + + counts.operator_debug_missing + + counts.operator_debug_raw_sql + + counts.operator_debug_trace_gaps + + counts.operator_debug_repair_unclear + + counts.conflict_detection_missing + + counts.update_rationale_missing + + counts.proposal_usefulness_failures + + counts.lineage_failures + + counts.review_action_failures + + counts.memory_summary_invalid_current_entries + + counts.memory_summary_untraced_entries + + counts.memory_summary_missing_freshness + + counts.memory_summary_missing_rationale + + counts.memory_summary_missing_categories + + counts.memory_summary_unsupported_current_entries + + counts.proactive_brief_invalid_current_suggestions + + counts.proactive_brief_untraced_suggestions + + counts.proactive_brief_missing_freshness + + counts.proactive_brief_missing_action_rationale + + counts.proactive_brief_missing_kinds + + counts.proactive_brief_unsupported_current_suggestions + + counts.proactive_brief_tombstone_violations + + counts.scheduled_memory_invalid_current_outputs + + counts.scheduled_memory_untraced_outputs + + counts.scheduled_memory_missing_freshness + + counts.scheduled_memory_missing_action_rationale + + counts.scheduled_memory_missing_task_kinds + + counts.scheduled_memory_unsupported_current_outputs + + counts.scheduled_memory_tombstone_violations + + counts.scheduled_memory_missing_trace + + counts.untraced_page_sections + + counts.missed_stale_findings + + counts.rebuild_failures + + counts.page_usefulness_failures +} + +fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { + let answer = produced_answer(job); + let metrics = job_metrics(job, answer); + let retrieval_quality = retrieval_quality_report(job, answer); + + JobReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + title: job.title.clone(), + status: scoring.status, + answer_type: job.expected_answer.answer_type.clone(), + requires_caveat: job.expected_answer.requires_caveat, + requires_refusal: job.expected_answer.requires_refusal, + can_answer_unknown: job.allowed_uncertainty.can_answer_unknown, + normalized_score: round3(scoring.normalized_score), + hard_fail_hits: scoring.hard_fail_hits, + expected_evidence: expected_evidence_report(job), + produced_answer: answer.content.clone(), + produced_evidence: produced_evidence_ids(answer).into_iter().collect(), + unsupported_claim_count: scoring.unsupported_claims.len(), + wrong_result_count: scoring.wrong_result_count, + stale_answer_count: scoring + .evolution + .as_ref() + .map_or(0, |report| report.stale_answer_count), + conflict_detection_count: scoring + .evolution + .as_ref() + .map_or(0, |report| report.conflict_detection_count), + update_rationale_available: scoring + .evolution + .as_ref() + .is_some_and(|report| report.update_rationale_available), + temporal_validity_not_encoded: scoring + .evolution + .as_ref() + .is_some_and(|report| report.temporal_validity_not_encoded), + history_readback_encoded: scoring + .evolution + .as_ref() + .is_some_and(|report| report.history_readback_encoded), + retrieval_quality, + latency_ms: answer.latency_ms, + cost: answer.cost.clone(), + trace_explainability: answer.trace_explainability.clone(), + knowledge: scoring.knowledge, + memory_summary: scoring.memory_summary, + proactive_brief: scoring.proactive_brief, + scheduled_memory: scoring.scheduled_memory, + trap_ids_used: scoring.trap_ids_used, + dimension_scores: scoring.dimension_scores, + reason: scoring.reason, + evidence_required_count: metrics.evidence_required_count, + evidence_covered_count: metrics.evidence_covered_count, + source_ref_required_count: metrics.source_ref_required_count, + source_ref_covered_count: metrics.source_ref_covered_count, + quote_required_count: metrics.quote_required_count, + quote_covered_count: metrics.quote_covered_count, + stale_retrieval_count: metrics.stale_retrieval_count, + scope_check_count: metrics.scope_check_count, + scope_correct_count: metrics.scope_correct_count, + scope_violation_count: metrics.scope_violation_count, + redaction_leak_count: metrics.redaction_leak_count, + qdrant_rebuild_case: metrics.qdrant_rebuild_case, + operator_debug: job.operator_debug.clone(), + evolution: scoring.evolution, + consolidation: scoring.consolidation, + } +} + +fn consolidation_job_report(job: &RealWorldJob) -> Option { + let fixture = job.corpus.adapter_response.as_ref()?.consolidation.as_ref()?; + let proposals = fixture.proposals.iter().map(consolidation_proposal_report).collect::>(); + let executable_gaps = fixture + .executable_gaps + .iter() + .map(|gap| ConsolidationExecutableGapReport { + primitive: gap.primitive.clone(), + follow_up_issue: gap.follow_up_issue.clone(), + reason: gap.reason.clone(), + blocks_fixture_pass: gap.blocks_fixture_pass, + }) + .collect::>(); + let proposal_count = proposals.len(); + let source_mutation_count = + proposals.iter().map(|proposal| proposal.source_mutation_count).sum(); + let proposal_unsupported_claim_count = + proposals.iter().map(|proposal| proposal.unsupported_claim_count).sum(); + + Some(ConsolidationJobReport { + proposal_count, + proposal_usefulness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.usefulness_score), + ), + lineage_completeness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.lineage_completeness), + ), + review_action_correctness: mean_proposal_metric( + proposals.iter().map(|proposal| if proposal.review_action_correct { 1.0 } else { 0.0 }), + ), + source_mutation_count, + proposal_unsupported_claim_count, + executable_gaps, + proposals, + }) +} + +fn consolidation_proposal_report( + proposal: &ConsolidationProposalFixture, +) -> ConsolidationProposalReport { + ConsolidationProposalReport { + proposal_id: proposal.proposal_id.clone(), + proposal_kind: proposal.proposal_kind.clone(), + usefulness_score: round3(proposal.usefulness_score), + min_usefulness_score: round3(proposal.min_usefulness_score), + lineage_completeness: round3(lineage_completeness(proposal)), + expected_review_action: proposal.expected_review_action, + actual_review_action: proposal.actual_review_action, + review_action_correct: proposal.expected_review_action == proposal.actual_review_action, + source_mutation_count: proposal.source_mutations.len() + + forbidden_diff_key_count(&proposal.diff), + unsupported_claim_count: proposal + .unsupported_claim_count + .max(proposal.unsupported_claim_flags.len()), + } +} + +fn lineage_completeness(proposal: &ConsolidationProposalFixture) -> f64 { + let expected = proposal.expected_source_refs.iter().collect::>(); + let actual = proposal.source_refs.iter().collect::>(); + let matched = expected.iter().filter(|source_ref| actual.contains(**source_ref)).count(); + + matched as f64 / expected.len() as f64 +} + +fn forbidden_diff_key_count(value: &Value) -> usize { + match value { + Value::Object(map) => map + .iter() + .map(|(key, nested)| { + usize::from(FORBIDDEN_SOURCE_MUTATION_KEYS.contains(&key.as_str())) + + forbidden_diff_key_count(nested) + }) + .sum(), + Value::Array(items) => items.iter().map(forbidden_diff_key_count).sum(), + _ => 0, + } +} + +fn proposal_usefulness_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report + .proposals + .iter() + .filter(|proposal| proposal.usefulness_score < proposal.min_usefulness_score) + .count() + }) +} + +fn lineage_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.proposals.iter().filter(|proposal| proposal.lineage_completeness < 1.0).count() + }) +} + +fn review_action_failures(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.proposals.iter().filter(|proposal| !proposal.review_action_correct).count() + }) +} + +fn blocking_executable_gaps(consolidation: Option<&ConsolidationJobReport>) -> usize { + consolidation.map_or(0, |report| { + report.executable_gaps.iter().filter(|gap| gap.blocks_fixture_pass).count() + }) +} + +fn mean_proposal_metric(values: impl Iterator) -> Option { + let values = values.collect::>(); + + if values.is_empty() { + None + } else { + Some(round3(values.iter().sum::() / values.len() as f64)) + } +} + +fn job_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> JobMetrics { + let produced_evidence = produced_evidence_ids(answer); + let source_ref_by_evidence = source_ref_by_evidence(job); + let evidence_required_count = + job.required_evidence.iter().filter(|evidence| is_required_use(evidence)).count(); + let evidence_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .count(); + let source_ref_required_count = evidence_required_count; + let source_ref_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .filter(|evidence| { + source_ref_by_evidence.get(evidence.evidence_id.as_str()).is_some_and(|source_ref| { + source_ref.as_object().is_some_and(|object| !object.is_empty()) + }) + }) + .count(); + let quote_required_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence) && evidence.quote.is_some()) + .count(); + let quote_covered_count = job + .required_evidence + .iter() + .filter(|evidence| is_required_use(evidence) && evidence.quote.is_some()) + .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) + .count(); + let stale_retrieval_count = trap_use_count(job, &produced_evidence, "stale_fact", answer); + let scope_violation_count = ["near_duplicate", "scope_leak"] + .into_iter() + .map(|trap_type| trap_use_count(job, &produced_evidence, trap_type, answer)) + .sum(); + let scope_check_count = job + .negative_traps + .iter() + .filter(|trap| is_scope_trap_type(trap.trap_type.as_str())) + .count(); + let redaction_leak_count = trap_use_count(job, &produced_evidence, "privacy_leak", answer); + let scope_correct_count = scope_check_count.saturating_sub(scope_violation_count); + let qdrant_rebuild_case = job.tags.iter().any(|tag| tag == "qdrant_rebuild"); + + JobMetrics { + evidence_required_count, + evidence_covered_count, + source_ref_required_count, + source_ref_covered_count, + quote_required_count, + quote_covered_count, + stale_retrieval_count, + scope_check_count, + scope_correct_count, + scope_violation_count, + redaction_leak_count, + qdrant_rebuild_case, + } +} + +fn source_ref_by_evidence(job: &RealWorldJob) -> BTreeMap<&str, &Value> { + job.corpus.items.iter().map(|item| (item.evidence_id.as_str(), &item.source_ref)).collect() +} + +fn is_scope_trap_type(trap_type: &str) -> bool { + matches!(trap_type, "near_duplicate" | "scope_leak") +} + +fn trap_use_count( + job: &RealWorldJob, + produced_evidence: &BTreeSet, + trap_type: &str, + answer: &ProducedAnswer, +) -> usize { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used && trap.trap_type == trap_type) + .filter(|trap| trap_was_used(job, trap, produced_evidence, answer)) + .count() +} + +fn trap_was_used( + job: &RealWorldJob, + trap: &NegativeTrap, + produced_evidence: &BTreeSet, + answer: &ProducedAnswer, +) -> bool { + trap.evidence_ids.iter().any(|evidence_id| { + produced_evidence.contains(evidence_id) + || answer_contains_corpus_item(job, evidence_id, answer) + }) +} + +fn answer_contains_corpus_item( + job: &RealWorldJob, + evidence_id: &str, + answer: &ProducedAnswer, +) -> bool { + job.corpus + .items + .iter() + .find(|item| item.evidence_id == evidence_id) + .and_then(|item| item.text.as_deref()) + .is_some_and(|text| !text.trim().is_empty() && answer.content.contains(text)) +} + +fn retrieval_quality_report(job: &RealWorldJob, answer: &ProducedAnswer) -> RetrievalQualityReport { + let expected = expected_evidence_ids(job); + let allowed = allowed_evidence_ids(job); + let produced = produced_evidence_ids(answer); + let trap_evidence = trap_evidence_ids(job); + let expected_evidence_matched = + expected.iter().filter(|evidence_id| produced.contains(evidence_id.as_str())).count(); + let irrelevant_context_count = + produced.iter().filter(|evidence_id| !allowed.contains(evidence_id.as_str())).count(); + let trap_context_count = + produced.iter().filter(|evidence_id| trap_evidence.contains(evidence_id.as_str())).count(); + + RetrievalQualityReport { + expected_evidence_total: expected.len(), + expected_evidence_matched, + expected_evidence_recall: ratio_or(expected_evidence_matched, expected.len(), 1.0), + produced_evidence_total: produced.len(), + irrelevant_context_count, + irrelevant_context_ratio: ratio_or(irrelevant_context_count, produced.len(), 0.0), + trap_context_count, + } +} + +fn expected_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.required_evidence + .iter() + .filter(|evidence| is_required_use(evidence)) + .map(|evidence| evidence.evidence_id.clone()) + .collect() +} + +fn allowed_evidence_ids(job: &RealWorldJob) -> BTreeSet { + let mut allowed = expected_evidence_ids(job); + + for link in job.expected_answer.evidence_links.values() { + allowed.extend(link.ids()); + } + + allowed +} + +fn trap_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.negative_traps.iter().flat_map(|trap| trap.evidence_ids.iter().cloned()).collect() +} + +fn expected_evidence_report(job: &RealWorldJob) -> Vec { + job.required_evidence + .iter() + .map(|evidence| ExpectedEvidenceReport { + evidence_id: evidence.evidence_id.clone(), + claim_id: evidence.claim_id.clone(), + requirement: evidence.requirement.clone(), + }) + .collect() +} + +fn suite_reports(jobs: &[JobReport]) -> Vec { + SUITES.iter().map(|suite_id| suite_report(suite_id, jobs)).collect() +} + +fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport { + let suite_jobs = jobs.iter().filter(|job| job.suite_id == suite_id).collect::>(); + + if suite_jobs.is_empty() { + return SuiteReport { + suite_id: suite_id.to_string(), + status: TypedStatus::NotEncoded, + encoded_job_count: 0, + score_mean: None, + unsupported_claim_count: 0, + wrong_result_count: 0, + stale_answer_count: 0, + conflict_detection_count: 0, + update_rationale_available_count: 0, + temporal_validity_not_encoded_count: 0, + history_readback_encoded_count: 0, + expected_evidence_recall: None, + irrelevant_context_ratio: None, + trace_explainability_count: 0, + reason: NOT_ENCODED_REASON.to_string(), + }; + } + + let status = aggregate_status(&suite_jobs); + let score_sum = suite_jobs.iter().map(|job| job.normalized_score).sum::(); + let unsupported_claim_count = suite_jobs.iter().map(|job| job.unsupported_claim_count).sum(); + let wrong_result_count = suite_jobs.iter().map(|job| job.wrong_result_count).sum(); + let stale_answer_count = suite_jobs.iter().map(|job| job.stale_answer_count).sum(); + let conflict_detection_count = suite_jobs.iter().map(|job| job.conflict_detection_count).sum(); + let update_rationale_available_count = + suite_jobs.iter().filter(|job| job.update_rationale_available).count(); + let temporal_validity_not_encoded_count = + suite_jobs.iter().filter(|job| job.temporal_validity_not_encoded).count(); + let history_readback_encoded_count = + suite_jobs.iter().filter(|job| job.history_readback_encoded).count(); + let trace_explainability_count = + suite_jobs.iter().filter(|job| job.trace_explainability.is_some()).count(); + + SuiteReport { + suite_id: suite_id.to_string(), + status, + encoded_job_count: suite_jobs.len(), + score_mean: Some(round3(score_sum / suite_jobs.len() as f64)), + unsupported_claim_count, + wrong_result_count, + stale_answer_count, + conflict_detection_count, + update_rationale_available_count, + temporal_validity_not_encoded_count, + history_readback_encoded_count, + expected_evidence_recall: Some(expected_evidence_recall_for_jobs(&suite_jobs)), + irrelevant_context_ratio: Some(irrelevant_context_ratio_for_jobs(&suite_jobs)), + trace_explainability_count, + reason: suite_reason(status, suite_jobs.len()), + } +} + +fn aggregate_status(jobs: &[&JobReport]) -> TypedStatus { + let statuses = jobs.iter().map(|job| job.status).collect::>(); + + if statuses.contains(&TypedStatus::UnsupportedClaim) { + TypedStatus::UnsupportedClaim + } else if statuses.contains(&TypedStatus::LifecycleFail) { + TypedStatus::LifecycleFail + } else if statuses.contains(&TypedStatus::WrongResult) { + TypedStatus::WrongResult + } else if statuses.contains(&TypedStatus::Incomplete) { + TypedStatus::Incomplete + } else if statuses.contains(&TypedStatus::Blocked) { + TypedStatus::Blocked + } else if statuses.contains(&TypedStatus::NotEncoded) { + TypedStatus::NotEncoded + } else if statuses.contains(&TypedStatus::Pass) { + TypedStatus::Pass + } else { + TypedStatus::NotEncoded + } +} + +fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String { + match status { + TypedStatus::Pass => format!("All {encoded_job_count} encoded job(s) passed."), + TypedStatus::UnsupportedClaim => + "At least one encoded job produced an unsupported claim.".to_string(), + TypedStatus::WrongResult => "At least one encoded job returned a wrong result.".to_string(), + TypedStatus::LifecycleFail => + "At least one encoded lifecycle-scored job failed lifecycle behavior.".to_string(), + TypedStatus::Incomplete => "At least one encoded job could not complete.".to_string(), + TypedStatus::Blocked => "At least one encoded job is blocked.".to_string(), + TypedStatus::NotEncoded => + if encoded_job_count == 0 { + NOT_ENCODED_REASON.to_string() + } else { + "At least one encoded fixture declares a not_encoded limitation.".to_string() + }, + } +} + +fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { + let job_refs = jobs.iter().collect::>(); + let evidence_required_count = jobs.iter().map(|job| job.evidence_required_count).sum(); + let evidence_covered_count = jobs.iter().map(|job| job.evidence_covered_count).sum(); + let source_ref_required_count = jobs.iter().map(|job| job.source_ref_required_count).sum(); + let source_ref_covered_count = jobs.iter().map(|job| job.source_ref_covered_count).sum(); + let quote_required_count = jobs.iter().map(|job| job.quote_required_count).sum(); + let quote_covered_count = jobs.iter().map(|job| job.quote_covered_count).sum(); + let scope_check_count = jobs.iter().map(|job| job.scope_check_count).sum(); + let scope_correct_count = jobs.iter().map(|job| job.scope_correct_count).sum(); + let mut summary = ReportSummary { + job_count: jobs.len(), + encoded_suite_count: suites.iter().filter(|suite| suite.encoded_job_count > 0).count(), + not_encoded: 0, + unsupported_claim_count: jobs.iter().map(|job| job.unsupported_claim_count).sum(), + wrong_result_count: jobs.iter().map(|job| job.wrong_result_count).sum(), + stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), + conflict_detection_count: jobs.iter().map(|job| job.conflict_detection_count).sum(), + update_rationale_available_count: jobs + .iter() + .filter(|job| job.update_rationale_available) + .count(), + temporal_validity_not_encoded_count: jobs + .iter() + .filter(|job| job.temporal_validity_not_encoded) + .count(), + history_readback_encoded_count: jobs + .iter() + .filter(|job| job.history_readback_encoded) + .count(), + expected_evidence_total: jobs + .iter() + .map(|job| job.retrieval_quality.expected_evidence_total) + .sum(), + expected_evidence_matched: jobs + .iter() + .map(|job| job.retrieval_quality.expected_evidence_matched) + .sum(), + expected_evidence_recall: expected_evidence_recall_for_jobs(&job_refs), + irrelevant_context_count: jobs + .iter() + .map(|job| job.retrieval_quality.irrelevant_context_count) + .sum(), + irrelevant_context_ratio: irrelevant_context_ratio_for_jobs(&job_refs), + trace_explainability_count: jobs + .iter() + .filter(|job| job.trace_explainability.is_some()) + .count(), + wrong_result_stage_attribution_count: jobs + .iter() + .filter(|job| { + job.status == TypedStatus::WrongResult + && trace_failure_stage(job.trace_explainability.as_ref()).is_some() + }) + .count(), + mean_score: mean_score(jobs), + mean_latency_ms: mean_latency(jobs), + total_cost: total_cost(jobs), + evidence_required_count, + evidence_covered_count, + evidence_coverage: ratio(evidence_covered_count, evidence_required_count), + source_ref_required_count, + source_ref_covered_count, + source_ref_coverage: ratio(source_ref_covered_count, source_ref_required_count), + quote_required_count, + quote_covered_count, + quote_coverage: ratio(quote_covered_count, quote_required_count), + stale_retrieval_count: jobs.iter().map(|job| job.stale_retrieval_count).sum(), + scope_check_count, + scope_correct_count, + scope_correctness: ratio(scope_correct_count, scope_check_count), + scope_violation_count: jobs.iter().map(|job| job.scope_violation_count).sum(), + redaction_leak_count: jobs.iter().map(|job| job.redaction_leak_count).sum(), + qdrant_rebuild_case_count: jobs.iter().filter(|job| job.qdrant_rebuild_case).count(), + qdrant_rebuild_pass_count: jobs + .iter() + .filter(|job| job.qdrant_rebuild_case && job.status == TypedStatus::Pass) + .count(), + operator_debug_job_count: jobs.iter().filter(|job| job.operator_debug.is_some()).count(), + raw_sql_needed_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .filter(|debug| debug.raw_sql_needed) + .count(), + trace_incomplete_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .filter(|debug| debug.trace_completeness != "complete") + .count(), + operator_ux_gap_count: jobs + .iter() + .filter_map(|job| job.operator_debug.as_ref()) + .map(|debug| debug.ux_gaps.len()) + .sum(), + consolidation: consolidation_summary(jobs), + memory_summary: memory_summary_summary(jobs), + proactive_brief: proactive_brief_summary(jobs), + scheduled_memory: scheduled_memory_summary(jobs), + knowledge: knowledge_summary(jobs), + ..ReportSummary::default() + }; + + for job in jobs { + match job.status { + TypedStatus::Pass => summary.pass += 1, + TypedStatus::WrongResult => summary.wrong_result += 1, + TypedStatus::LifecycleFail => summary.lifecycle_fail += 1, + TypedStatus::Incomplete => summary.incomplete += 1, + TypedStatus::Blocked => summary.blocked += 1, + TypedStatus::NotEncoded => summary.not_encoded += 1, + TypedStatus::UnsupportedClaim => summary.unsupported_claim += 1, + } + } + + summary +} + +fn evolution_summary(jobs: &[JobReport]) -> EvolutionSummary { + EvolutionSummary { + stale_answer_count: jobs.iter().map(|job| job.stale_answer_count).sum(), + conflict_detection_count: jobs.iter().map(|job| job.conflict_detection_count).sum(), + update_rationale_available_count: jobs + .iter() + .filter(|job| job.update_rationale_available) + .count(), + temporal_validity_not_encoded_count: jobs + .iter() + .filter(|job| job.temporal_validity_not_encoded) + .count(), + history_readback_encoded_count: jobs + .iter() + .filter(|job| job.history_readback_encoded) + .count(), + } +} + +fn follow_up_reports(jobs: &[RealWorldJob]) -> Vec { + jobs.iter() + .filter_map(|job| { + job.encoding.follow_up.as_ref().map(|follow_up| FollowUpReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + title: follow_up.title.clone(), + reason: follow_up.reason.clone(), + }) + }) + .collect() +} + +fn ratio(numerator: usize, denominator: usize) -> f64 { + if denominator == 0 { + return 0.0; + } + + round3(numerator as f64 / denominator as f64) +} + +fn expected_evidence_recall_for_jobs(jobs: &[&JobReport]) -> f64 { + let total = jobs.iter().map(|job| job.retrieval_quality.expected_evidence_total).sum::(); + let matched = + jobs.iter().map(|job| job.retrieval_quality.expected_evidence_matched).sum::(); + + ratio_or(matched, total, 1.0) +} + +fn irrelevant_context_ratio_for_jobs(jobs: &[&JobReport]) -> f64 { + let total = jobs.iter().map(|job| job.retrieval_quality.produced_evidence_total).sum::(); + let irrelevant = + jobs.iter().map(|job| job.retrieval_quality.irrelevant_context_count).sum::(); + + ratio_or(irrelevant, total, 0.0) +} + +fn ratio_or(numerator: usize, denominator: usize, empty_value: f64) -> f64 { + if denominator == 0 { empty_value } else { round3(numerator as f64 / denominator as f64) } +} + +fn ratio_or_full(numerator: usize, denominator: usize) -> f64 { + ratio_or(numerator, denominator, 1.0) +} + +fn consolidation_summary(jobs: &[JobReport]) -> ConsolidationSummaryReport { + let reports = jobs.iter().filter_map(|job| job.consolidation.as_ref()).collect::>(); + + if reports.is_empty() { + return ConsolidationSummaryReport::default(); + } + + let proposals = reports.iter().flat_map(|report| report.proposals.iter()).collect::>(); + let executable_gap_count = reports.iter().map(|report| report.executable_gaps.len()).sum(); + + ConsolidationSummaryReport { + proposal_count: proposals.len(), + proposal_usefulness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.usefulness_score), + ), + lineage_completeness: mean_proposal_metric( + proposals.iter().map(|proposal| proposal.lineage_completeness), + ), + review_action_correctness: mean_proposal_metric( + proposals.iter().map(|proposal| if proposal.review_action_correct { 1.0 } else { 0.0 }), + ), + source_mutation_count: proposals + .iter() + .map(|proposal| proposal.source_mutation_count) + .sum(), + proposal_unsupported_claim_count: proposals + .iter() + .map(|proposal| proposal.unsupported_claim_count) + .sum(), + executable_gap_count, + } +} + +fn memory_summary_summary(jobs: &[JobReport]) -> Option { + let memory_jobs = jobs.iter().filter_map(|job| job.memory_summary.as_ref()).collect::>(); + + if memory_jobs.is_empty() { + return None; + } + + let job_count = memory_jobs.len(); + let summary_count = memory_jobs.iter().map(|metrics| metrics.summary_count).sum(); + let entry_count = memory_jobs.iter().map(|metrics| metrics.entry_count).sum(); + let required_category_count = + memory_jobs.iter().map(|metrics| metrics.required_category_count).sum(); + let covered_required_category_count = + memory_jobs.iter().map(|metrics| metrics.covered_required_category_count).sum(); + let source_ref_required_count = + memory_jobs.iter().map(|metrics| metrics.source_ref_required_count).sum(); + let source_ref_entry_count = + memory_jobs.iter().map(|metrics| metrics.source_ref_entry_count).sum(); + let freshness_marker_count = + memory_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let rationale_count = memory_jobs.iter().map(|metrics| metrics.rationale_count).sum(); + + Some(MemorySummaryReport { + job_count, + summary_count, + entry_count, + required_category_count, + covered_required_category_count, + missing_required_category_count: memory_jobs + .iter() + .map(|metrics| metrics.missing_required_category_count) + .sum(), + top_of_mind_count: memory_jobs.iter().map(|metrics| metrics.top_of_mind_count).sum(), + background_count: memory_jobs.iter().map(|metrics| metrics.background_count).sum(), + stale_count: memory_jobs.iter().map(|metrics| metrics.stale_count).sum(), + superseded_count: memory_jobs.iter().map(|metrics| metrics.superseded_count).sum(), + tombstone_count: memory_jobs.iter().map(|metrics| metrics.tombstone_count).sum(), + derived_project_profile_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_project_profile_count) + .sum(), + source_ref_required_count, + source_ref_entry_count, + source_ref_coverage: ratio(source_ref_entry_count, source_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, entry_count), + rationale_count, + rationale_coverage: ratio(rationale_count, entry_count), + invalid_top_of_mind_count: memory_jobs + .iter() + .map(|metrics| metrics.invalid_top_of_mind_count) + .sum(), + untraced_entry_count: memory_jobs.iter().map(|metrics| metrics.untraced_entry_count).sum(), + derived_with_source_or_unsupported_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_with_source_or_unsupported_count) + .sum(), + derived_missing_source_or_unsupported_count: memory_jobs + .iter() + .map(|metrics| metrics.derived_missing_source_or_unsupported_count) + .sum(), + unsupported_derived_entry_count: memory_jobs + .iter() + .map(|metrics| metrics.unsupported_derived_entry_count) + .sum(), + unsupported_current_entry_count: memory_jobs + .iter() + .map(|metrics| metrics.unsupported_current_entry_count) + .sum(), + tombstone_ref_count: memory_jobs.iter().map(|metrics| metrics.tombstone_ref_count).sum(), + source_trace_selected_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + .sum(), + source_trace_tombstone_count: memory_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + +fn proactive_brief_summary(jobs: &[JobReport]) -> Option { + let proactive_jobs = + jobs.iter().filter_map(|job| job.proactive_brief.as_ref()).collect::>(); + + if proactive_jobs.is_empty() { + return None; + } + + let job_count = proactive_jobs.len(); + let suggestion_count = + proactive_jobs.iter().map(|metrics| metrics.suggestion_count).sum::(); + let evidence_ref_required_count = + proactive_jobs.iter().map(|metrics| metrics.evidence_ref_required_count).sum(); + let evidence_ref_suggestion_count = + proactive_jobs.iter().map(|metrics| metrics.evidence_ref_suggestion_count).sum(); + let freshness_marker_count = + proactive_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let action_rationale_count = + proactive_jobs.iter().map(|metrics| metrics.action_rationale_count).sum(); + + Some(ProactiveBriefSummaryReport { + job_count, + brief_count: proactive_jobs.iter().map(|metrics| metrics.brief_count).sum(), + suggestion_count, + required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.required_suggestion_kind_count) + .sum(), + covered_required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.covered_required_suggestion_kind_count) + .sum(), + missing_required_suggestion_kind_count: proactive_jobs + .iter() + .map(|metrics| metrics.missing_required_suggestion_kind_count) + .sum(), + evidence_ref_required_count, + evidence_ref_suggestion_count, + evidence_ref_coverage: ratio(evidence_ref_suggestion_count, evidence_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, suggestion_count), + action_rationale_count, + action_rationale_coverage: ratio(action_rationale_count, suggestion_count), + recommended_count: proactive_jobs.iter().map(|metrics| metrics.recommended_count).sum(), + deferred_count: proactive_jobs.iter().map(|metrics| metrics.deferred_count).sum(), + rejected_count: proactive_jobs.iter().map(|metrics| metrics.rejected_count).sum(), + current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.current_suggestion_count) + .sum(), + non_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.non_current_suggestion_count) + .sum(), + stale_warning_count: proactive_jobs.iter().map(|metrics| metrics.stale_warning_count).sum(), + invalid_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.invalid_current_suggestion_count) + .sum(), + untraced_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.untraced_suggestion_count) + .sum(), + unsupported_current_suggestion_count: proactive_jobs + .iter() + .map(|metrics| metrics.unsupported_current_suggestion_count) + .sum(), + tombstone_violation_count: proactive_jobs + .iter() + .map(|metrics| metrics.tombstone_violation_count) + .sum(), + source_trace_selected_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + .sum(), + source_trace_tombstone_count: proactive_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + +fn scheduled_memory_summary(jobs: &[JobReport]) -> Option { + let scheduled_jobs = + jobs.iter().filter_map(|job| job.scheduled_memory.as_ref()).collect::>(); + + if scheduled_jobs.is_empty() { + return None; + } + + let job_count = scheduled_jobs.len(); + let output_count = scheduled_jobs.iter().map(|metrics| metrics.output_count).sum::(); + let evidence_ref_required_count = + scheduled_jobs.iter().map(|metrics| metrics.evidence_ref_required_count).sum(); + let evidence_ref_output_count = + scheduled_jobs.iter().map(|metrics| metrics.evidence_ref_output_count).sum(); + let freshness_marker_count = + scheduled_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let action_rationale_count = + scheduled_jobs.iter().map(|metrics| metrics.action_rationale_count).sum(); + let trace_required_count = + scheduled_jobs.iter().map(|metrics| metrics.trace_required_count).sum(); + let trace_complete_count = + scheduled_jobs.iter().map(|metrics| metrics.trace_complete_count).sum(); + + Some(ScheduledMemorySummaryReport { + job_count, + task_run_count: scheduled_jobs.iter().map(|metrics| metrics.task_run_count).sum(), + output_count, + required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.required_task_kind_count) + .sum(), + covered_required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.covered_required_task_kind_count) + .sum(), + missing_required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.missing_required_task_kind_count) + .sum(), + evidence_ref_required_count, + evidence_ref_output_count, + evidence_ref_coverage: ratio(evidence_ref_output_count, evidence_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, output_count), + action_rationale_count, + action_rationale_coverage: ratio(action_rationale_count, output_count), + trace_required_count, + trace_complete_count, + trace_coverage: ratio(trace_complete_count, trace_required_count), + source_mutation_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_mutation_count) + .sum(), + current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.current_output_count) + .sum(), + non_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.non_current_output_count) + .sum(), + invalid_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.invalid_current_output_count) + .sum(), + untraced_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.untraced_output_count) + .sum(), + unsupported_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.unsupported_current_output_count) + .sum(), + tombstone_violation_count: scheduled_jobs + .iter() + .map(|metrics| metrics.tombstone_violation_count) + .sum(), + source_trace_selected_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + .sum(), + source_trace_tombstone_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + +fn knowledge_summary(jobs: &[JobReport]) -> Option { + let knowledge_jobs = jobs.iter().filter_map(|job| job.knowledge.as_ref()).collect::>(); + + if knowledge_jobs.is_empty() { + return None; + } + + let job_count = knowledge_jobs.len(); + let page_count = knowledge_jobs.iter().map(|metrics| metrics.page_count).sum::(); + let section_count = knowledge_jobs.iter().map(|metrics| metrics.section_count).sum::(); + let traced_section_count = + knowledge_jobs.iter().map(|metrics| metrics.traced_section_count).sum::(); + let stale_trap_count = + knowledge_jobs.iter().map(|metrics| metrics.stale_trap_count).sum::(); + let stale_traps_detected = + knowledge_jobs.iter().map(|metrics| metrics.stale_traps_detected).sum::(); + let deterministic_rebuild_count = + knowledge_jobs.iter().map(|metrics| metrics.deterministic_rebuild_count).sum::(); + let rebuild_page_count = + knowledge_jobs.iter().map(|metrics| metrics.rebuild_page_count).sum::(); + let backlink_count = knowledge_jobs.iter().map(|metrics| metrics.backlink_count).sum::(); + let pages_with_backlinks = + knowledge_jobs.iter().map(|metrics| metrics.pages_with_backlinks).sum::(); + let page_usefulness = round3( + knowledge_jobs.iter().map(|metrics| metrics.page_usefulness).sum::() + / job_count as f64, + ); + + Some(KnowledgeSummary { + job_count, + page_count, + section_count, + backlink_count, + pages_with_backlinks, + citation_coverage: ratio(traced_section_count, section_count), + stale_claim_detection: ratio_or_full(stale_traps_detected, stale_trap_count), + rebuild_determinism: ratio(deterministic_rebuild_count, rebuild_page_count), + backlink_coverage: ratio(pages_with_backlinks, page_count), + page_usefulness, + unsupported_summary_count: knowledge_jobs + .iter() + .map(|metrics| metrics.unsupported_summary_count) + .sum(), + untraced_section_count: knowledge_jobs + .iter() + .map(|metrics| metrics.untraced_section_count) + .sum(), + allowed_variance_count: knowledge_jobs + .iter() + .map(|metrics| metrics.allowed_variance_count) + .sum(), + }) +} + +fn mean_score(jobs: &[JobReport]) -> f64 { + if jobs.is_empty() { + return 0.0; + } + + round3(jobs.iter().map(|job| job.normalized_score).sum::() / jobs.len() as f64) +} + +fn mean_latency(jobs: &[JobReport]) -> Option { + let latencies = jobs.iter().filter_map(|job| job.latency_ms).collect::>(); + + if latencies.is_empty() { + return None; + } + + Some(round3(latencies.iter().sum::() / latencies.len() as f64)) +} + +fn total_cost(jobs: &[JobReport]) -> Option { + let costs = jobs.iter().filter_map(|job| job.cost.as_ref()).collect::>(); + + if costs.is_empty() { + return None; + } + + let currency = costs.iter().find_map(|cost| cost.currency.clone()); + let amount = sum_optional_f64(costs.iter().filter_map(|cost| cost.amount)); + let input_tokens = sum_optional_u64(costs.iter().filter_map(|cost| cost.input_tokens)); + let output_tokens = sum_optional_u64(costs.iter().filter_map(|cost| cost.output_tokens)); + + Some(CostReport { currency, amount, input_tokens, output_tokens }) +} + +fn sum_optional_f64(values: impl Iterator) -> Option { + let values = values.collect::>(); + + if values.is_empty() { None } else { Some(round3(values.iter().sum())) } +} + +fn sum_optional_u64(values: impl Iterator) -> Option { + let values = values.collect::>(); + + if values.is_empty() { None } else { Some(values.iter().sum()) } +} + +fn corpus_profile(jobs: &[RealWorldJob]) -> String { + let profiles = jobs.iter().map(|job| job.corpus.profile.as_str()).collect::>(); + + if profiles.len() == 1 { + profiles.into_iter().next().unwrap_or("unknown").to_string() + } else { + "mixed".to_string() + } +} + +fn adapter_report(args: &RunArgs) -> Result { + Ok(AdapterReport { + adapter_id: args.adapter_id.clone(), + name: args.adapter_name.clone(), + behavior: args.adapter_behavior.clone(), + storage: typed_status_from_arg( + args.adapter_storage_status.as_str(), + "--adapter-storage-status", + )?, + runtime: typed_status_from_arg( + args.adapter_runtime_status.as_str(), + "--adapter-runtime-status", + )?, + notes: args.adapter_notes.clone(), + }) +} + +fn typed_status_from_arg(raw: &str, flag: &str) -> Result { + match raw { + "pass" => Ok(TypedStatus::Pass), + "wrong_result" => Ok(TypedStatus::WrongResult), + "lifecycle_fail" => Ok(TypedStatus::LifecycleFail), + "incomplete" => Ok(TypedStatus::Incomplete), + "blocked" => Ok(TypedStatus::Blocked), + "not_encoded" => Ok(TypedStatus::NotEncoded), + "unsupported_claim" => Ok(TypedStatus::UnsupportedClaim), + _ => Err(eyre::eyre!( + "{flag} must be one of pass, wrong_result, lifecycle_fail, incomplete, blocked, not_encoded, or unsupported_claim." + )), + } +} + +fn external_adapter_section( + manifest_path: &Path, + skip_manifest: bool, +) -> Result { + if skip_manifest { + return Ok(empty_external_adapter_section("skipped")); + } + + let manifest_path = resolve_external_adapter_manifest_path(manifest_path); + + if !manifest_path.exists() { + return Ok(empty_external_adapter_section("missing")); + } + + let raw = fs::read_to_string(&manifest_path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse external adapter manifest {}: {err}", manifest_path.display()) + })?; + + validate_external_adapter_manifest(&manifest, &manifest_path)?; + + let summary = external_adapter_summary(&manifest.adapters); + + Ok(ExternalAdapterSection { + schema: EXTERNAL_ADAPTER_REPORT_SCHEMA.to_string(), + manifest_id: manifest.manifest_id, + docker_isolation: manifest.docker_isolation, + summary, + adapters: manifest.adapters, + }) +} + +fn empty_external_adapter_section(reason: &str) -> ExternalAdapterSection { + ExternalAdapterSection { + schema: EXTERNAL_ADAPTER_REPORT_SCHEMA.to_string(), + manifest_id: reason.to_string(), + docker_isolation: ExternalDockerIsolation::default(), + summary: ExternalAdapterSummary::default(), + adapters: Vec::new(), + } +} + +fn resolve_external_adapter_manifest_path(path: &Path) -> PathBuf { + if path.exists() || path.is_absolute() { + return path.to_path_buf(); + } + + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let Some(workspace_root) = manifest_dir.parent().and_then(Path::parent) else { + return path.to_path_buf(); + }; + let workspace_candidate = workspace_root.join(path); + + if workspace_candidate.exists() { workspace_candidate } else { path.to_path_buf() } +} + +fn validate_external_adapter_manifest( + manifest: &ExternalAdapterManifest, + path: &Path, +) -> Result<()> { + if manifest.schema != EXTERNAL_ADAPTER_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {EXTERNAL_ADAPTER_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + + validate_external_docker_isolation(path, &manifest.docker_isolation)?; + + validate_external_adapters(path, &manifest.adapters) +} + +fn validate_external_docker_isolation(path: &Path, docker: &ExternalDockerIsolation) -> Result<()> { + if docker.compose_file.trim().is_empty() + || docker.runner.trim().is_empty() + || docker.artifact_dir.trim().is_empty() + { + return Err(eyre::eyre!("{} has incomplete docker_isolation metadata.", path.display())); + } + if !docker.default { + return Err(eyre::eyre!( + "{} external adapter manifest must default to Docker isolation.", + path.display() + )); + } + if docker.host_global_installs_required { + return Err(eyre::eyre!( + "{} external adapter manifest must not require host-global installs by default.", + path.display() + )); + } + + Ok(()) +} + +fn validate_external_adapters(path: &Path, adapters: &[ExternalAdapterReport]) -> Result<()> { + if adapters.is_empty() { + return Err(eyre::eyre!("{} declares no external adapters.", path.display())); + } + + let mut seen = BTreeSet::new(); + + for adapter in adapters { + validate_external_adapter(path, adapter)?; + + if !seen.insert(adapter.adapter_id.as_str()) { + return Err(eyre::eyre!( + "{} declares duplicate adapter_id {}.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + if adapter.adapter_id.trim().is_empty() + || adapter.project.trim().is_empty() + || adapter.adapter_kind.trim().is_empty() + || adapter.evidence_class.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete external adapter.", path.display())); + } + if !matches!( + adapter.evidence_class.as_str(), + "fixture_backed" | "live_baseline_only" | "live_real_world" | "research_gate" + ) { + return Err(eyre::eyre!( + "{} adapter {} has unsupported evidence_class {}.", + path.display(), + adapter.adapter_id, + adapter.evidence_class + )); + } + if adapter.docker_default && adapter.host_global_installs_required { + return Err(eyre::eyre!( + "{} adapter {} is Docker-default but requires host-global installs.", + path.display(), + adapter.adapter_id + )); + } + + validate_adapter_execution(path, adapter)?; + validate_adapter_capabilities(path, adapter)?; + validate_adapter_suites(path, adapter)?; + validate_adapter_scenarios(path, adapter)?; + validate_adapter_evidence(path, adapter)?; + validate_adapter_execution_metadata(path, adapter)?; + + if let Some(follow_up) = &adapter.follow_up + && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) + { + return Err(eyre::eyre!( + "{} adapter {} has an incomplete follow_up.", + path.display(), + adapter.adapter_id + )); + } + + Ok(()) +} + +fn validate_adapter_execution(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for evidence in [&adapter.setup, &adapter.run, &adapter.result] { + if evidence.evidence.trim().is_empty() + || evidence.command.as_deref().is_some_and(str::is_empty) + || evidence.artifact.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete setup/run/result evidence.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_capabilities(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for capability in &adapter.capabilities { + if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete capability coverage.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_suites(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for suite in &adapter.suites { + if !SUITES.contains(&suite.suite_id.as_str()) { + return Err(eyre::eyre!( + "{} adapter {} references unknown suite {}.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + if suite.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has suite {} without evidence.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for scenario in &adapter.scenarios { + if scenario.scenario_id.trim().is_empty() + || scenario.evidence.trim().is_empty() + || scenario.command.as_deref().is_some_and(str::is_empty) + || scenario.artifact.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete scenario judgment.", + path.display(), + adapter.adapter_id + )); + } + + if let Some(suite_id) = &scenario.suite_id + && !SUITES.contains(&suite_id.as_str()) + { + return Err(eyre::eyre!( + "{} adapter {} scenario {} references unknown suite {}.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + suite_id + )); + } + + let outcome = scenario_comparison_outcome(scenario); + + if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses blocked status without blocked comparison outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id + )); + } + if unmeasured_status_has_measured_outcome(scenario.status, outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + adapter_status_str(scenario.status), + scenario_comparison_outcome_str(outcome) + )); + } + if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} position.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + adapter_status_str(scenario.status), + scenario_position_str(scenario.elf_position) + )); + } + if explicit_outcome_conflicts_with_position(scenario) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} position with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + scenario_position_str(scenario.elf_position), + scenario_comparison_outcome_str(outcome) + )); + } + } + + Ok(()) +} + +fn blocked_status_missing_blocked_outcome( + status: AdapterCoverageStatus, + outcome: Option, +) -> bool { + status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked) +} + +fn unmeasured_status_has_measured_outcome( + status: AdapterCoverageStatus, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + outcome, + ScenarioComparisonOutcome::Win + | ScenarioComparisonOutcome::Tie + | ScenarioComparisonOutcome::Loss + ) +} + +fn unmeasured_status_has_measured_position( + status: AdapterCoverageStatus, + position: ElfScenarioPosition, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + position, + ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses + ) +} + +fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool { + let Some(outcome) = scenario.comparison_outcome else { + return false; + }; + + !position_supports_outcome(scenario.elf_position, outcome) +} + +fn position_supports_outcome( + position: ElfScenarioPosition, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + (position, outcome), + (ElfScenarioPosition::Wins, ScenarioComparisonOutcome::Win) + | (ElfScenarioPosition::Ties, ScenarioComparisonOutcome::Tie) + | (ElfScenarioPosition::Loses, ScenarioComparisonOutcome::Loss) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NotTested) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::Blocked) + | (ElfScenarioPosition::Untested, ScenarioComparisonOutcome::NonGoal) + ) +} + +fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + for evidence in &adapter.evidence { + if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete evidence pointers.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn validate_adapter_execution_metadata(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { + let Some(metadata) = &adapter.execution_metadata else { + return Ok(()); + }; + + if metadata.setup_path.trim().is_empty() + || metadata.runtime_boundary.trim().is_empty() + || metadata.resource_expectation.trim().is_empty() + || metadata.retry_guidance.iter().any(|guidance| guidance.trim().is_empty()) + || metadata.sources.is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete execution metadata.", + path.display(), + adapter.adapter_id + )); + } + + for source in &metadata.sources { + if source.label.trim().is_empty() + || source.url.trim().is_empty() + || source.evidence.trim().is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete source metadata.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +fn external_adapter_summary(adapters: &[ExternalAdapterReport]) -> ExternalAdapterSummary { + let external_projects = adapters + .iter() + .filter_map(|adapter| (adapter.project != "ELF").then_some(adapter.project.as_str())) + .collect::>(); + let mut summary = ExternalAdapterSummary { + adapter_count: adapters.len(), + external_project_count: external_projects.len(), + ..ExternalAdapterSummary::default() + }; + + for adapter in adapters { + accumulate_adapter_summary(&mut summary, adapter); + } + + summary +} + +fn accumulate_adapter_summary( + summary: &mut ExternalAdapterSummary, + adapter: &ExternalAdapterReport, +) { + summary.docker_default_count += usize::from(adapter.docker_default); + summary.host_global_install_required_count += + usize::from(adapter.host_global_installs_required); + summary.fixture_backed_count += usize::from(adapter.evidence_class == "fixture_backed"); + summary.live_baseline_only_count += usize::from(adapter.evidence_class == "live_baseline_only"); + summary.live_real_world_count += usize::from(adapter.evidence_class == "live_real_world"); + summary.research_gate_count += usize::from(adapter.evidence_class == "research_gate"); + + increment_adapter_status_count(&mut summary.overall_status_counts, adapter.overall_status); + + for capability in &adapter.capabilities { + increment_adapter_status_count(&mut summary.capability_status_counts, capability.status); + } + for suite in &adapter.suites { + increment_adapter_status_count(&mut summary.suite_status_counts, suite.status); + } + for scenario in &adapter.scenarios { + increment_adapter_status_count(&mut summary.scenario_status_counts, scenario.status); + increment_scenario_position_count( + &mut summary.scenario_position_counts, + scenario.elf_position, + ); + increment_scenario_outcome_count( + &mut summary.scenario_outcome_counts, + scenario_comparison_outcome(scenario), + ); + } +} + +fn increment_adapter_status_count(counts: &mut AdapterStatusCounts, status: AdapterCoverageStatus) { + match status { + AdapterCoverageStatus::Real => counts.real += 1, + AdapterCoverageStatus::Mocked => counts.mocked += 1, + AdapterCoverageStatus::Unsupported => counts.unsupported += 1, + AdapterCoverageStatus::Blocked => counts.blocked += 1, + AdapterCoverageStatus::Incomplete => counts.incomplete += 1, + AdapterCoverageStatus::WrongResult => counts.wrong_result += 1, + AdapterCoverageStatus::LifecycleFail => counts.lifecycle_fail += 1, + AdapterCoverageStatus::Pass => counts.pass += 1, + AdapterCoverageStatus::NotEncoded => counts.not_encoded += 1, + } +} + +fn increment_scenario_position_count( + counts: &mut ScenarioPositionCounts, + position: ElfScenarioPosition, +) { + match position { + ElfScenarioPosition::Wins => counts.wins += 1, + ElfScenarioPosition::Ties => counts.ties += 1, + ElfScenarioPosition::Loses => counts.loses += 1, + ElfScenarioPosition::Untested => counts.untested += 1, + } +} + +fn scenario_comparison_outcome(scenario: &AdapterScenarioJudgment) -> ScenarioComparisonOutcome { + scenario.comparison_outcome.unwrap_or(match scenario.elf_position { + ElfScenarioPosition::Wins => ScenarioComparisonOutcome::Win, + ElfScenarioPosition::Ties => ScenarioComparisonOutcome::Tie, + ElfScenarioPosition::Loses => ScenarioComparisonOutcome::Loss, + ElfScenarioPosition::Untested => ScenarioComparisonOutcome::NotTested, + }) +} + +fn increment_scenario_outcome_count( + counts: &mut ScenarioOutcomeCounts, + outcome: ScenarioComparisonOutcome, +) { + match outcome { + ScenarioComparisonOutcome::Win => counts.win += 1, + ScenarioComparisonOutcome::Tie => counts.tie += 1, + ScenarioComparisonOutcome::Loss => counts.loss += 1, + ScenarioComparisonOutcome::NotTested => counts.not_tested += 1, + ScenarioComparisonOutcome::Blocked => counts.blocked += 1, + ScenarioComparisonOutcome::NonGoal => counts.non_goal += 1, + } +} + +fn capture_integration_report(jobs: &[RealWorldJob]) -> CaptureIntegrationReport { + let mut report = CaptureIntegrationReport::default(); + + for job in jobs { + extend_unique(&mut report.real, &job.corpus.capture_behaviors.real); + extend_unique(&mut report.fixture_backed, &job.corpus.capture_behaviors.fixture_backed); + extend_unique(&mut report.mocked, &job.corpus.capture_behaviors.mocked); + extend_unique(&mut report.blocked, &job.corpus.capture_behaviors.blocked); + extend_unique(&mut report.not_encoded, &job.corpus.capture_behaviors.not_encoded); + extend_unique(&mut report.notes, &job.corpus.capture_behaviors.notes); + } + + if report.real.is_empty() + && report.fixture_backed.is_empty() + && report.mocked.is_empty() + && report.blocked.is_empty() + && report.not_encoded.is_empty() + { + report + .not_encoded + .push("No capture/integration behavior was declared by encoded fixtures.".to_string()); + } + + report +} + +fn extend_unique(target: &mut Vec, values: &[String]) { + let mut seen = target.iter().cloned().collect::>(); + + for value in values { + if seen.insert(value.clone()) { + target.push(value.clone()); + } + } +} + +fn private_corpus_redaction(jobs: &[RealWorldJob]) -> PrivateCorpusRedaction { + let private_fixture_count = jobs + .iter() + .filter(|job| matches!(job.corpus.profile, CorpusProfile::PrivateSanitized)) + .count(); + let policy = if private_fixture_count == 0 { + "no_private_corpus".to_string() + } else { + "publish evidence ids and bounded score summaries only; do not publish private text" + .to_string() + }; + + PrivateCorpusRedaction { policy, private_fixture_count } +} + +fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { + let report_path = report_path.display().to_string(); + let mut out = String::new(); + + render_markdown_header(&mut out, report, report_path.as_str()); + render_markdown_external_adapters(&mut out, report); + render_markdown_capture_integration(&mut out, report); + render_markdown_suites(&mut out, report); + render_markdown_jobs(&mut out, report); + render_markdown_operator_debugging(&mut out, report); + render_markdown_evolution(&mut out, report); + render_markdown_trace_explainability(&mut out, report); + render_markdown_consolidation(&mut out, report); + render_markdown_memory_summary(&mut out, report); + render_markdown_proactive_brief(&mut out, report); + render_markdown_scheduled_memory(&mut out, report); + render_markdown_knowledge(&mut out, report); + render_markdown_unsupported_claims(&mut out, report); + render_markdown_follow_ups(&mut out, report); + render_markdown_semantics(&mut out, report); + + out +} + +fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { + out.push_str("## Capture And Integration Coverage\n\n"); + + if report.adapter.behavior == DEFAULT_ADAPTER_BEHAVIOR { + out.push_str("The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims.\n\n"); + } else { + out.push_str("This report scores materialized adapter responses. Capture and integration classes still describe the job corpus, not broad external adapter coverage.\n\n"); + } + + out.push_str("| Class | Behaviors |\n"); + out.push_str("| --- | --- |\n"); + out.push_str(&format!("| real | {} |\n", md_list(report.capture_integration.real.as_slice()))); + out.push_str(&format!( + "| fixture-backed | {} |\n", + md_list(report.capture_integration.fixture_backed.as_slice()) + )); + out.push_str(&format!( + "| mocked | {} |\n", + md_list(report.capture_integration.mocked.as_slice()) + )); + out.push_str(&format!( + "| blocked | {} |\n", + md_list(report.capture_integration.blocked.as_slice()) + )); + out.push_str(&format!( + "| not encoded | {} |\n", + md_list(report.capture_integration.not_encoded.as_slice()) + )); + + if !report.capture_integration.notes.is_empty() { + out.push_str("\nNotes:\n"); + + for note in &report.capture_integration.notes { + out.push_str(&format!("- {}\n", md_cell(note.as_str()))); + } + } + + out.push('\n'); +} + +fn render_markdown_external_adapters(out: &mut String, report: &RealWorldReport) { + out.push_str("## External Adapter Coverage\n\n"); + + if report.external_adapters.adapters.is_empty() { + out.push_str("No external adapter coverage manifest was loaded for this report.\n\n"); + + return; + } + + let summary = &report.external_adapters.summary; + + out.push_str("This section is manifest-backed. It records external adapter coverage and blockers, but it does not convert live-baseline retrieval results into real-world suite wins.\n\n"); + out.push_str(&format!( + "- Manifest: `{}`\n", + md_inline(report.external_adapters.manifest_id.as_str()) + )); + out.push_str(&format!( + "- Docker default: `{}` via `{}`; artifact dir `{}`\n", + report.external_adapters.docker_isolation.default, + md_inline(report.external_adapters.docker_isolation.compose_file.as_str()), + md_inline(report.external_adapters.docker_isolation.artifact_dir.as_str()) + )); + out.push_str(&format!( + "- Adapter records: `{}` total, `{}` external project(s), `{}` Docker-default, `{}` requiring host-global installs\n", + summary.adapter_count, + summary.external_project_count, + summary.docker_default_count, + summary.host_global_install_required_count + )); + out.push_str(&format!( + "- Evidence classes: `{}` fixture-backed, `{}` live-baseline-only, `{}` live real-world, `{}` research-gate\n", + summary.fixture_backed_count, + summary.live_baseline_only_count, + summary.live_real_world_count, + summary.research_gate_count + )); + out.push_str(&format!( + "- Overall statuses: `{}`\n", + adapter_status_counts_display(&summary.overall_status_counts) + )); + out.push_str(&format!( + "- Capability coverage statuses: `{}`\n", + adapter_status_counts_display(&summary.capability_status_counts) + )); + out.push_str(&format!( + "- Real-world suite statuses: `{}`\n", + adapter_status_counts_display(&summary.suite_status_counts) + )); + + if has_adapter_scenarios(report.external_adapters.adapters.as_slice()) { + out.push_str(&format!( + "- Scenario coverage statuses: `{}`\n", + adapter_status_counts_display(&summary.scenario_status_counts) + )); + out.push_str(&format!( + "- ELF scenario positions: `{}`\n", + scenario_position_counts_display(&summary.scenario_position_counts) + )); + out.push_str(&format!( + "- Scenario comparison outcomes: `{}`\n", + scenario_outcome_counts_display(&summary.scenario_outcome_counts) + )); + } + + out.push('\n'); + out.push_str("| Project | Adapter | Evidence Class | Overall | Setup | Run | Result | Docker | Suites | Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"); + + for adapter in &report.external_adapters.adapters { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} |\n", + md_cell(adapter.project.as_str()), + md_inline(adapter.adapter_id.as_str()), + md_inline(adapter.evidence_class.as_str()), + adapter_status_str(adapter.overall_status), + adapter_status_str(adapter.setup.status), + adapter_status_str(adapter.run.status), + adapter_status_str(adapter.result.status), + adapter.docker_default, + adapter_suite_cell(adapter.suites.as_slice()), + adapter_evidence_cell(adapter) + )); + } + + out.push_str("\n### Adapter Capability Details\n\n"); + out.push_str("| Adapter | Capability | Status | Evidence |\n"); + out.push_str("| --- | --- | --- | --- |\n"); + + for adapter in &report.external_adapters.adapters { + for capability in &adapter.capabilities { + out.push_str(&format!( + "| `{}` | {} | `{}` | {} |\n", + md_inline(adapter.adapter_id.as_str()), + md_cell(capability.capability.as_str()), + adapter_status_str(capability.status), + md_cell(capability.evidence.as_str()) + )); + } + } + + render_markdown_adapter_scenarios(out, report.external_adapters.adapters.as_slice()); + render_markdown_adapter_execution_metadata(out, report.external_adapters.adapters.as_slice()); + + out.push('\n'); +} + +fn render_markdown_adapter_scenarios(out: &mut String, adapters: &[ExternalAdapterReport]) { + if !has_adapter_scenarios(adapters) { + return; + } + + out.push_str("\n### Adapter Scenario Judgments\n\n"); + out.push_str("| Adapter | Scenario | Suite | Status | Outcome | Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- |\n"); + + for adapter in adapters { + for scenario in &adapter.scenarios { + out.push_str(&format!( + "| `{}` | `{}` | {} | `{}` | `{}` | {} |\n", + md_inline(adapter.adapter_id.as_str()), + md_inline(scenario.scenario_id.as_str()), + scenario + .suite_id + .as_deref() + .map(|suite| format!("`{}`", md_inline(suite))) + .unwrap_or_else(|| "`none`".to_string()), + adapter_status_str(scenario.status), + scenario_comparison_outcome_str(scenario_comparison_outcome(scenario)), + adapter_scenario_evidence_cell(scenario) + )); + } + } +} + +fn has_adapter_scenarios(adapters: &[ExternalAdapterReport]) -> bool { + adapters.iter().any(|adapter| !adapter.scenarios.is_empty()) +} + +fn render_markdown_adapter_execution_metadata( + out: &mut String, + adapters: &[ExternalAdapterReport], +) { + let mut wrote_header = false; + + for adapter in adapters { + let Some(metadata) = &adapter.execution_metadata else { + continue; + }; + + if !wrote_header { + out.push_str("\n### Adapter Execution Metadata\n\n"); + out.push_str("| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- |\n"); + + wrote_header = true; + } + + out.push_str(&format!( + "| `{}` | {} | {} | {} | {} | {} | {} |\n", + md_inline(adapter.adapter_id.as_str()), + adapter_sources_cell(metadata.sources.as_slice()), + md_cell(metadata.setup_path.as_str()), + md_cell(metadata.runtime_boundary.as_str()), + md_cell(metadata.resource_expectation.as_str()), + md_list(metadata.retry_guidance.as_slice()), + md_cell(metadata.research_depth.as_deref().unwrap_or("not recorded")) + )); + } +} + +fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_path: &str) { + out.push_str("# Real-World Job Benchmark Report\n\n"); + out.push_str( + "Goal: Publish a Markdown summary for one generated real_world_job benchmark report.\n", + ); + out.push_str( + "Read this when: You need a durable smoke report for real-world agent memory job fixtures.\n", + ); + out.push_str(&format!("Inputs: `{}`.\n", md_inline(report_path))); + out.push_str("Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n"); + out.push_str( + "Verification: Compare this Markdown summary with the source JSON before committing.\n\n", + ); + out.push_str("## Summary\n\n"); + out.push_str(&format!("- Run ID: `{}`\n", md_inline(report.run_id.as_str()))); + out.push_str(&format!("- Generated at: `{}`\n", md_inline(report.generated_at.as_str()))); + out.push_str(&format!("- Runner version: `{}`\n", md_inline(report.runner_version.as_str()))); + out.push_str(&format!("- Corpus profile: `{}`\n", md_inline(report.corpus_profile.as_str()))); + out.push_str(&format!( + "- Adapter: `{}` ({})\n", + md_inline(report.adapter.adapter_id.as_str()), + md_inline(report.adapter.behavior.as_str()) + )); + out.push_str(&format!("- Jobs: `{}`\n", report.summary.job_count)); + out.push_str(&format!( + "- Suites with encoded jobs: `{}`\n", + report.summary.encoded_suite_count + )); + out.push_str(&format!( + "- Suites with `not_encoded` status: `{}`\n", + report.not_encoded_suites.len() + )); + out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` not_encoded, `{}` unsupported_claim\n", report.summary.pass, report.summary.wrong_result, report.summary.lifecycle_fail, report.summary.incomplete, report.summary.blocked, report.summary.not_encoded, report.summary.unsupported_claim)); + out.push_str(&format!( + "- Unsupported claim count: `{}`\n", + report.summary.unsupported_claim_count + )); + out.push_str(&format!("- Wrong-result count: `{}`\n", report.summary.wrong_result_count)); + out.push_str(&format!("- Stale-answer count: `{}`\n", report.summary.stale_answer_count)); + out.push_str(&format!( + "- Conflict detections: `{}`\n", + report.summary.conflict_detection_count + )); + out.push_str(&format!( + "- Update rationales available: `{}`\n", + report.summary.update_rationale_available_count + )); + out.push_str(&format!( + "- Temporal validity not encoded: `{}`\n", + report.summary.temporal_validity_not_encoded_count + )); + out.push_str(&format!( + "- History readback encoded: `{}`\n", + report.summary.history_readback_encoded_count + )); + + render_markdown_quality_summary(out, report); + + out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); + out.push_str(&format!( + "- Mean latency: `{}`\n", + optional_f64(report.summary.mean_latency_ms, " ms") + )); + out.push_str(&format!("- Cost: `{}`\n", cost_display(report.summary.total_cost.as_ref()))); + out.push_str(&format!( + "- Operator-debug jobs: `{}`\n", + report.summary.operator_debug_job_count + )); + out.push_str(&format!("- Raw SQL needed: `{}`\n", report.summary.raw_sql_needed_count)); + out.push_str(&format!( + "- Trace-incomplete debug jobs: `{}`\n", + report.summary.trace_incomplete_count + )); + out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); + + render_markdown_optional_summary_metrics(out, &report.summary); + + out.push_str(&format!( + "- Private corpus redaction: `{}`\n\n", + md_inline(report.private_corpus_redaction.policy.as_str()) + )); +} + +fn render_markdown_optional_summary_metrics(out: &mut String, summary: &ReportSummary) { + if let Some(knowledge) = &summary.knowledge { + out.push_str(&format!( + "- Knowledge citation coverage: `{:.3}`\n", + knowledge.citation_coverage + )); + out.push_str(&format!( + "- Stale claim detection: `{:.3}`\n", + knowledge.stale_claim_detection + )); + out.push_str(&format!("- Rebuild determinism: `{:.3}`\n", knowledge.rebuild_determinism)); + out.push_str(&format!( + "- Backlinks: `{}` total, `{:.3}` page coverage\n", + knowledge.backlink_count, knowledge.backlink_coverage + )); + out.push_str(&format!("- Page usefulness: `{:.3}`\n", knowledge.page_usefulness)); + out.push_str(&format!( + "- Unsupported summary count: `{}`\n", + knowledge.unsupported_summary_count + )); + } + if let Some(memory_summary) = &summary.memory_summary { + out.push_str(&format!( + "- Memory summary entries: `{}` across `{}` artifact(s)\n", + memory_summary.entry_count, memory_summary.summary_count + )); + out.push_str(&format!( + "- Memory summary source-ref coverage: `{}/{}` (`{:.3}`)\n", + memory_summary.source_ref_entry_count, + memory_summary.source_ref_required_count, + memory_summary.source_ref_coverage + )); + out.push_str(&format!( + "- Memory summary invalid top-of-mind count: `{}`\n", + memory_summary.invalid_top_of_mind_count + )); + out.push_str(&format!( + "- Memory summary unsupported derived entries: `{}`\n", + memory_summary.unsupported_derived_entry_count + )); + out.push_str(&format!( + "- Memory summary unsupported current entries: `{}`\n", + memory_summary.unsupported_current_entry_count + )); + } + if let Some(proactive) = &summary.proactive_brief { + out.push_str(&format!( + "- Proactive brief suggestions: `{}` across `{}` artifact(s)\n", + proactive.suggestion_count, proactive.brief_count + )); + out.push_str(&format!( + "- Proactive evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + proactive.evidence_ref_suggestion_count, + proactive.evidence_ref_required_count, + proactive.evidence_ref_coverage + )); + out.push_str(&format!( + "- Proactive freshness/action rationale coverage: `{:.3}` / `{:.3}`\n", + proactive.freshness_coverage, proactive.action_rationale_coverage + )); + out.push_str(&format!( + "- Proactive stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + proactive.invalid_current_suggestion_count, proactive.tombstone_violation_count + )); + out.push_str(&format!( + "- Proactive rejected/deferred suggestions: `{}` rejected, `{}` deferred\n", + proactive.rejected_count, proactive.deferred_count + )); + } + if let Some(scheduled) = &summary.scheduled_memory { + out.push_str(&format!( + "- Scheduled memory outputs: `{}` across `{}` task run(s)\n", + scheduled.output_count, scheduled.task_run_count + )); + out.push_str(&format!( + "- Scheduled memory evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + scheduled.evidence_ref_output_count, + scheduled.evidence_ref_required_count, + scheduled.evidence_ref_coverage + )); + out.push_str(&format!( + "- Scheduled memory freshness/action/trace coverage: `{:.3}` / `{:.3}` / `{:.3}`\n", + scheduled.freshness_coverage, + scheduled.action_rationale_coverage, + scheduled.trace_coverage + )); + out.push_str(&format!( + "- Scheduled memory stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + scheduled.invalid_current_output_count, scheduled.tombstone_violation_count + )); + out.push_str(&format!( + "- Scheduled memory source mutations: `{}`\n", + scheduled.source_mutation_count + )); + } +} + +fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { + out.push_str(&format!( + "- Evidence coverage: `{}/{}` (`{:.3}`)\n", + report.summary.evidence_covered_count, + report.summary.evidence_required_count, + report.summary.evidence_coverage + )); + out.push_str(&format!( + "- Source-ref coverage: `{}/{}` (`{:.3}`)\n", + report.summary.source_ref_covered_count, + report.summary.source_ref_required_count, + report.summary.source_ref_coverage + )); + out.push_str(&format!( + "- Quote coverage: `{}/{}` (`{:.3}`)\n", + report.summary.quote_covered_count, + report.summary.quote_required_count, + report.summary.quote_coverage + )); + out.push_str(&format!("- Stale retrieval count: `{}`\n", report.summary.stale_retrieval_count)); + out.push_str(&format!( + "- Scope correctness: `{}/{}` (`{:.3}`), violations `{}`\n", + report.summary.scope_correct_count, + report.summary.scope_check_count, + report.summary.scope_correctness, + report.summary.scope_violation_count + )); + out.push_str(&format!("- Redaction leak count: `{}`\n", report.summary.redaction_leak_count)); + out.push_str(&format!( + "- Qdrant rebuild cases: `{}` encoded, `{}` pass\n", + report.summary.qdrant_rebuild_case_count, report.summary.qdrant_rebuild_pass_count + )); + out.push_str(&format!( + "- Expected evidence recall: `{:.3}` ({}/{})\n", + report.summary.expected_evidence_recall, + report.summary.expected_evidence_matched, + report.summary.expected_evidence_total + )); + out.push_str(&format!( + "- Irrelevant context ratio: `{:.3}` ({} irrelevant)\n", + report.summary.irrelevant_context_ratio, report.summary.irrelevant_context_count + )); + out.push_str(&format!( + "- Trace explainability: `{}` job(s), `{}` wrong-result stage attribution(s)\n", + report.summary.trace_explainability_count, + report.summary.wrong_result_stage_attribution_count + )); + out.push_str(&format!( + "- Consolidation source mutation count: `{}`\n", + report.summary.consolidation.source_mutation_count + )); +} + +fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { + out.push_str("## Suites\n\n"); + out.push_str( + "| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | History Readback | Unsupported Claims | Wrong Results | Reason |\n", + ); + out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n"); + + for suite in &report.suites { + out.push_str(&format!( + "| {} | `{}` | {} | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n", + md_cell(suite.suite_id.as_str()), + status_str(suite.status), + suite.encoded_job_count, + optional_f64(suite.score_mean, ""), + optional_f64(suite.expected_evidence_recall, ""), + optional_f64(suite.irrelevant_context_ratio, ""), + suite.trace_explainability_count, + suite.stale_answer_count, + suite.conflict_detection_count, + suite.update_rationale_available_count, + suite.temporal_validity_not_encoded_count, + suite.history_readback_encoded_count, + suite.unsupported_claim_count, + suite.wrong_result_count, + md_cell(suite.reason.as_str()) + )); + } + + out.push('\n'); +} + +fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { + out.push_str("## Jobs\n\n"); + out.push_str("| Suite | Job | Status | Answer Type | Caveat Required | Refusal Required | Unknown Allowed | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); + out.push_str( + "| --- | --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", + ); + + for job in &report.jobs { + let expected = job + .expected_evidence + .iter() + .map(|evidence| evidence.evidence_id.as_str()) + .collect::>() + .join(", "); + let produced = job.produced_evidence.join(", "); + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | `{}` | `{}` | `{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", + md_cell(job.suite_id.as_str()), + md_cell(job.job_id.as_str()), + status_str(job.status), + md_inline(job.answer_type.as_str()), + bool_display(job.requires_caveat), + bool_display(job.requires_refusal), + bool_display(job.can_answer_unknown), + job.normalized_score, + job.retrieval_quality.expected_evidence_recall, + job.retrieval_quality.irrelevant_context_ratio, + md_inline(expected.as_str()), + md_inline(produced.as_str()), + md_inline(trace_failure_stage(job.trace_explainability.as_ref()).unwrap_or("-")), + job.stale_answer_count, + job.conflict_detection_count, + bool_display(job.update_rationale_available), + bool_display(job.temporal_validity_not_encoded), + job.unsupported_claim_count, + job.wrong_result_count, + optional_f64(job.latency_ms, " ms"), + cost_display(job.cost.as_ref()) + )); + } + + out.push('\n'); +} + +fn render_markdown_operator_debugging(out: &mut String, report: &RealWorldReport) { + let jobs = report.jobs.iter().filter(|job| job.operator_debug.is_some()).collect::>(); + + out.push_str("## Operator Debugging UX\n\n"); + + if jobs.is_empty() { + out.push_str("No encoded job reported operator debugging evidence.\n\n"); + + return; + } + + out.push_str("| Job | Failure Mode | Trace Evidence | Trace Available | Replay Command | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps |\n"); + out.push_str("| --- | --- | --- | --- | --- | ---: | --- | --- | --- | --- | --- |\n"); + + for job in jobs { + if let Some(debug) = &job.operator_debug { + out.push_str(&format!( + "| {} | {} | {} | `{}` | `{}` | {} | `{}` | {} | `{}` | `{}` | {} |\n", + md_cell(job.job_id.as_str()), + md_cell(debug.failure_mode.as_str()), + debug_trace_cell(debug), + debug.trace_available.unwrap_or(debug.trace_id.is_some()), + debug.replay_command_available.unwrap_or(debug.replay_command.is_some()), + debug.steps_to_root_cause, + debug.raw_sql_needed, + md_cell(debug.dropped_candidate_visibility.as_str()), + md_inline(debug.trace_completeness.as_str()), + md_inline(debug.repair_action_clarity.as_str()), + ux_gap_cell(debug.ux_gaps.as_slice()) + )); + } + } + + out.push_str("\n### Operator Debug Details\n\n"); + + for job in report.jobs.iter().filter(|job| job.operator_debug.is_some()) { + if let Some(debug) = &job.operator_debug { + out.push_str(&format!("#### `{}`\n\n", md_inline(job.job_id.as_str()))); + out.push_str(&format!("- Root cause: {}\n", md_cell(debug.root_cause.as_str()))); + out.push_str(&format!( + "- Viewer panels: `{}`\n", + md_inline(debug.viewer_panels.join(", ").as_str()) + )); + out.push_str(&format!( + "- CLI steps: `{}`\n", + md_inline(debug.cli_steps.join(" -> ").as_str()) + )); + + if let Some(command) = &debug.replay_command { + out.push_str(&format!("- Replay command: `{}`\n", md_inline(command.as_str()))); + } + if let Some(artifact) = &debug.replay_artifact { + out.push_str(&format!("- Replay artifact: `{}`\n", md_inline(artifact.as_str()))); + } + + out.push_str(&format!( + "- Trace evidence: `{}`\n", + md_inline(debug.trace_evidence.join(", ").as_str()) + )); + out.push('\n'); + } + } +} + +fn debug_trace_cell(debug: &OperatorDebugEvidence) -> String { + let trace = debug.trace_id.as_deref().unwrap_or("-"); + let viewer = debug + .viewer_url + .as_deref() + .map(|url| format!("[viewer]({})", md_url(url))) + .unwrap_or_else(|| "viewer: -".to_string()); + let bundle = debug + .admin_trace_bundle_url + .as_deref() + .map(|url| format!("[bundle]({})", md_url(url))) + .unwrap_or_else(|| "bundle: -".to_string()); + + format!("`{}`
{}
{}", md_inline(trace), viewer, bundle) +} + +fn ux_gap_cell(gaps: &[OperatorUxGap]) -> String { + if gaps.is_empty() { + return "`none`".to_string(); + } + + gaps.iter() + .map(|gap| { + format!( + "`{}`: {} ({})", + md_inline(gap.gap_id.as_str()), + md_cell(gap.description.as_str()), + md_inline(gap.follow_up_issue.as_str()) + ) + }) + .collect::>() + .join("
") +} + +fn render_markdown_evolution(out: &mut String, report: &RealWorldReport) { + out.push_str("## Memory Evolution\n\n"); + out.push_str(&format!("- Stale answers: `{}`\n", report.evolution.stale_answer_count)); + out.push_str(&format!( + "- Conflict detections: `{}`\n", + report.evolution.conflict_detection_count + )); + out.push_str(&format!( + "- Update rationales available: `{}`\n", + report.evolution.update_rationale_available_count + )); + out.push_str(&format!( + "- Temporal validity not encoded: `{}`\n\n", + report.evolution.temporal_validity_not_encoded_count + )); + out.push_str(&format!( + "- History readback encoded: `{}`\n\n", + report.evolution.history_readback_encoded_count + )); + out.push_str("| Suite | Job | Current Evidence | Historical Evidence | Tombstone/Invalidation | Selected Current | Selected Historical | Selected Rationale | Selected Tombstone/Invalidation | Selected But Not Narrated | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | History Readback | Follow-up |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | --- |\n"); + + for job in &report.jobs { + let Some(evolution) = &job.evolution else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | `{}` | {} |\n", + md_cell(job.suite_id.as_str()), + md_cell(job.job_id.as_str()), + md_inline(evolution.current_evidence.join(", ").as_str()), + md_inline(evolution.historical_evidence.join(", ").as_str()), + md_inline( + evolution + .tombstone_evidence + .iter() + .chain(evolution.invalidation_evidence.iter()) + .cloned() + .collect::>() + .join(", ") + .as_str() + ), + md_inline(evolution.selected_current_evidence.join(", ").as_str()), + md_inline(evolution.selected_historical_evidence.join(", ").as_str()), + md_inline(evolution.selected_rationale_evidence.join(", ").as_str()), + md_inline( + evolution + .selected_tombstone_evidence + .iter() + .chain(evolution.selected_invalidation_evidence.iter()) + .cloned() + .collect::>() + .join(", ") + .as_str() + ), + md_inline(evolution.selected_but_not_narrated_evidence.join(", ").as_str()), + md_inline(evolution.stale_trap_ids_used.join(", ").as_str()), + evolution.conflict_count, + evolution.conflict_detection_count, + bool_display(evolution.update_rationale_available), + temporal_display(evolution), + history_display(evolution), + md_cell(evolution.follow_up.as_deref().unwrap_or("-")) + )); + } + + out.push('\n'); +} + +fn render_markdown_trace_explainability(out: &mut String, report: &RealWorldReport) { + out.push_str("## Trace Explainability\n\n"); + + let jobs = + report.jobs.iter().filter(|job| job.trace_explainability.is_some()).collect::>(); + + if jobs.is_empty() { + out.push_str("No encoded job reported trace explainability metadata.\n\n"); + + return; + } + + out.push_str("| Suite | Job | Trace | Failure Stage | Reason | Stage Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- |\n"); + + for job in jobs { + let trace = job.trace_explainability.as_ref(); + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | {} | {} |\n", + md_cell(job.suite_id.as_str()), + md_cell(job.job_id.as_str()), + md_inline(trace.and_then(|trace| trace.trace_id.as_deref()).unwrap_or("-")), + md_inline(trace_failure_stage(trace).unwrap_or("-")), + md_cell(trace_failure_reason(trace).unwrap_or("-")), + md_cell(trace_stage_summary(trace).as_str()) + )); + } + + out.push('\n'); +} + +fn render_markdown_consolidation(out: &mut String, report: &RealWorldReport) { + if report.summary.consolidation.proposal_count == 0 { + return; + } + + out.push_str("## Consolidation\n\n"); + out.push_str("| Job | Proposals | Usefulness | Lineage | Review Actions | Source Mutations | Proposal Unsupported Claims | Executable Gaps |\n"); + out.push_str("| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n"); + + for job in &report.jobs { + let Some(consolidation) = &job.consolidation else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | `{}` | `{}` | `{}` | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + consolidation.proposal_count, + optional_f64(consolidation.proposal_usefulness, ""), + optional_f64(consolidation.lineage_completeness, ""), + optional_f64(consolidation.review_action_correctness, ""), + consolidation.source_mutation_count, + consolidation.proposal_unsupported_claim_count, + consolidation.executable_gaps.len() + )); + } + + out.push_str( + "\nSource mutation count must remain `0` for proposal-only consolidation cases.\n\n", + ); + + render_markdown_consolidation_gaps(out, report); +} + +fn render_markdown_consolidation_gaps(out: &mut String, report: &RealWorldReport) { + let gaps = report + .jobs + .iter() + .filter_map(|job| job.consolidation.as_ref().map(|consolidation| (job, consolidation))) + .flat_map(|(job, consolidation)| { + consolidation.executable_gaps.iter().map(move |gap| (job.job_id.as_str(), gap)) + }) + .collect::>(); + + if gaps.is_empty() { + return; + } + + out.push_str("### Executable Gaps\n\n"); + out.push_str("| Job | Primitive | Follow-Up Issue | Blocks Fixture Pass | Reason |\n"); + out.push_str("| --- | --- | --- | --- | --- |\n"); + + for (job_id, gap) in gaps { + out.push_str(&format!( + "| {} | {} | {} | `{}` | {} |\n", + md_cell(job_id), + md_cell(gap.primitive.as_str()), + md_cell(gap.follow_up_issue.as_str()), + gap.blocks_fixture_pass, + md_cell(gap.reason.as_str()) + )); + } + + out.push('\n'); +} + +fn render_markdown_knowledge(out: &mut String, report: &RealWorldReport) { + let knowledge_jobs = + report.jobs.iter().filter(|job| job.knowledge.is_some()).collect::>(); + + if knowledge_jobs.is_empty() { + return; + } + + out.push_str("## Knowledge Page Metrics\n\n"); + out.push_str("| Job | Pages | Sections | Citation Coverage | Stale Claim Detection | Rebuild Determinism | Page Usefulness | Backlinks | Unsupported Summaries | Untraced Sections | Allowed Variance |\n"); + out.push_str("| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n"); + + for job in knowledge_jobs { + let Some(knowledge) = &job.knowledge else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{:.3}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + knowledge.page_count, + knowledge.section_count, + knowledge.citation_coverage, + knowledge.stale_claim_detection, + knowledge.rebuild_determinism, + knowledge.page_usefulness, + knowledge.backlink_count, + knowledge.unsupported_summary_count, + knowledge.untraced_section_count, + knowledge.allowed_variance_count + )); + } + + out.push('\n'); +} + +fn render_markdown_memory_summary(out: &mut String, report: &RealWorldReport) { + let memory_jobs = + report.jobs.iter().filter(|job| job.memory_summary.is_some()).collect::>(); + + if memory_jobs.is_empty() { + return; + } + + out.push_str("## Memory Summary Metrics\n\n"); + out.push_str("| Job | Summaries | Entries | Categories | Source Coverage | Freshness | Rationale | Invalid Top-of-Mind | Untraced | Derived Unsupported | Unsupported Current | Tombstone Refs |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in memory_jobs { + let Some(metrics) = &job.memory_summary else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.summary_count, + metrics.entry_count, + metrics.covered_required_category_count, + metrics.required_category_count, + metrics.source_ref_coverage, + metrics.freshness_coverage, + metrics.rationale_coverage, + metrics.invalid_top_of_mind_count, + metrics.untraced_entry_count, + metrics.unsupported_derived_entry_count, + metrics.unsupported_current_entry_count, + metrics.tombstone_ref_count + )); + } + + out.push('\n'); +} + +fn render_markdown_proactive_brief(out: &mut String, report: &RealWorldReport) { + let proactive_jobs = + report.jobs.iter().filter(|job| job.proactive_brief.is_some()).collect::>(); + + if proactive_jobs.is_empty() { + return; + } + + out.push_str("## Proactive Brief Metrics\n\n"); + out.push_str("| Job | Briefs | Suggestions | Kinds | Evidence Coverage | Freshness | Action Rationale | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Rejected | Deferred |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in proactive_jobs { + let Some(metrics) = &job.proactive_brief else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.brief_count, + metrics.suggestion_count, + metrics.covered_required_suggestion_kind_count, + metrics.required_suggestion_kind_count, + metrics.evidence_ref_coverage, + metrics.freshness_coverage, + metrics.action_rationale_coverage, + metrics.invalid_current_suggestion_count, + metrics.untraced_suggestion_count, + metrics.unsupported_current_suggestion_count, + metrics.tombstone_violation_count, + metrics.rejected_count, + metrics.deferred_count + )); + } + + out.push('\n'); +} + +fn render_markdown_scheduled_memory(out: &mut String, report: &RealWorldReport) { + let scheduled_jobs = + report.jobs.iter().filter(|job| job.scheduled_memory.is_some()).collect::>(); + + if scheduled_jobs.is_empty() { + return; + } + + out.push_str("## Scheduled Memory Metrics\n\n"); + out.push_str("| Job | Task Runs | Outputs | Kinds | Evidence Coverage | Freshness | Action Rationale | Trace Coverage | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Source Mutations |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in scheduled_jobs { + let Some(metrics) = &job.scheduled_memory else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.task_run_count, + metrics.output_count, + metrics.covered_required_task_kind_count, + metrics.required_task_kind_count, + metrics.evidence_ref_coverage, + metrics.freshness_coverage, + metrics.action_rationale_coverage, + metrics.trace_coverage, + metrics.invalid_current_output_count, + metrics.untraced_output_count, + metrics.unsupported_current_output_count, + metrics.tombstone_violation_count, + metrics.source_mutation_count + )); + } + + out.push('\n'); +} + +fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { + out.push_str("## Unsupported Claims\n\n"); + + if report.unsupported_claims.is_empty() { + out.push_str("No unsupported claims were produced by encoded jobs.\n\n"); + + return; + } + + out.push_str("| Suite | Job | Claim | Evidence | Reason |\n"); + out.push_str("| --- | --- | --- | --- | --- |\n"); + + for claim in &report.unsupported_claims { + out.push_str(&format!( + "| {} | {} | {} | `{}` | {} |\n", + md_cell(claim.suite_id.as_str()), + md_cell(claim.job_id.as_str()), + md_cell(claim.claim_text.as_str()), + md_inline(claim.evidence_ids.join(", ").as_str()), + md_cell(claim.reason.as_str()) + )); + } + + out.push('\n'); +} + +fn render_markdown_follow_ups(out: &mut String, report: &RealWorldReport) { + out.push_str("## Follow-Ups\n\n"); + + if report.follow_ups.is_empty() { + out.push_str("No benchmark follow-ups were declared by encoded jobs.\n\n"); + + return; + } + + out.push_str("| Suite | Job | Follow-up | Reason |\n"); + out.push_str("| --- | --- | --- | --- |\n"); + + for follow_up in &report.follow_ups { + out.push_str(&format!( + "| {} | {} | {} | {} |\n", + md_cell(follow_up.suite_id.as_str()), + md_cell(follow_up.job_id.as_str()), + md_cell(follow_up.title.as_str()), + md_cell(follow_up.reason.as_str()) + )); + } + + out.push('\n'); +} + +fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { + out.push_str("## Result Semantics\n\n"); + out.push_str( + "This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms.\n", + ); + out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n"); + out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins.\n\n"); + out.push_str( + "The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs.\n\n", + ); + out.push_str( + "- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n", + ); + out.push_str( + "- `wrong_result`: a job completed but missed required answer or evidence expectations.\n", + ); + out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n"); + out.push_str("- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed.\n\n"); + out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); + out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); + out.push_str("For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations.\n\n"); + out.push_str("For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output.\n\n"); + out.push_str("## Suites With `not_encoded` Status\n\n"); + + if report.not_encoded_suites.is_empty() { + out.push_str("All declared suites have at least one encoded job.\n"); + } else { + for suite in &report.not_encoded_suites { + out.push_str(&format!("- `{}`\n", md_inline(suite.as_str()))); + } + } +} + +fn status_str(status: TypedStatus) -> &'static str { + match status { + TypedStatus::Pass => "pass", + TypedStatus::WrongResult => "wrong_result", + TypedStatus::LifecycleFail => "lifecycle_fail", + TypedStatus::Incomplete => "incomplete", + TypedStatus::Blocked => "blocked", + TypedStatus::NotEncoded => "not_encoded", + TypedStatus::UnsupportedClaim => "unsupported_claim", + } +} + +fn adapter_status_str(status: AdapterCoverageStatus) -> &'static str { + match status { + AdapterCoverageStatus::Real => "real", + AdapterCoverageStatus::Mocked => "mocked", + AdapterCoverageStatus::Unsupported => "unsupported", + AdapterCoverageStatus::Blocked => "blocked", + AdapterCoverageStatus::Incomplete => "incomplete", + AdapterCoverageStatus::WrongResult => "wrong_result", + AdapterCoverageStatus::LifecycleFail => "lifecycle_fail", + AdapterCoverageStatus::Pass => "pass", + AdapterCoverageStatus::NotEncoded => "not_encoded", + } +} + +fn scenario_comparison_outcome_str(outcome: ScenarioComparisonOutcome) -> &'static str { + match outcome { + ScenarioComparisonOutcome::Win => "win", + ScenarioComparisonOutcome::Tie => "tie", + ScenarioComparisonOutcome::Loss => "loss", + ScenarioComparisonOutcome::NotTested => "not_tested", + ScenarioComparisonOutcome::Blocked => "blocked", + ScenarioComparisonOutcome::NonGoal => "non_goal", + } +} + +fn scenario_position_str(position: ElfScenarioPosition) -> &'static str { + match position { + ElfScenarioPosition::Wins => "wins", + ElfScenarioPosition::Ties => "ties", + ElfScenarioPosition::Loses => "loses", + ElfScenarioPosition::Untested => "untested", + } +} + +fn adapter_status_counts_display(counts: &AdapterStatusCounts) -> String { + [ + ("real", counts.real), + ("mocked", counts.mocked), + ("unsupported", counts.unsupported), + ("blocked", counts.blocked), + ("incomplete", counts.incomplete), + ("wrong_result", counts.wrong_result), + ("lifecycle_fail", counts.lifecycle_fail), + ("pass", counts.pass), + ("not_encoded", counts.not_encoded), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(status, count)| format!("{status}={count}")) + .collect::>() + .join(", ") +} + +fn scenario_position_counts_display(counts: &ScenarioPositionCounts) -> String { + [ + ("wins", counts.wins), + ("ties", counts.ties), + ("loses", counts.loses), + ("untested", counts.untested), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(position, count)| format!("{position}={count}")) + .collect::>() + .join(", ") +} + +fn scenario_outcome_counts_display(counts: &ScenarioOutcomeCounts) -> String { + [ + ("win", counts.win), + ("tie", counts.tie), + ("loss", counts.loss), + ("not_tested", counts.not_tested), + ("blocked", counts.blocked), + ("non_goal", counts.non_goal), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(outcome, count)| format!("{outcome}={count}")) + .collect::>() + .join(", ") +} + +fn adapter_suite_cell(suites: &[AdapterSuiteCoverage]) -> String { + if suites.is_empty() { + return "`none`".to_string(); + } + + suites + .iter() + .map(|suite| { + format!( + "`{}`: `{}`", + md_inline(suite.suite_id.as_str()), + adapter_status_str(suite.status) + ) + }) + .collect::>() + .join("
") +} + +fn adapter_evidence_cell(adapter: &ExternalAdapterReport) -> String { + let setup = adapter + .setup + .command + .as_deref() + .or(adapter.setup.artifact.as_deref()) + .unwrap_or(adapter.setup.evidence.as_str()); + let result = adapter + .result + .artifact + .as_deref() + .or(adapter.result.command.as_deref()) + .unwrap_or(adapter.result.evidence.as_str()); + + format!("setup: `{}`
result: `{}`", md_inline(setup), md_inline(result)) +} + +fn adapter_scenario_evidence_cell(scenario: &AdapterScenarioJudgment) -> String { + let evidence = md_cell(scenario.evidence.as_str()); + let command = scenario + .command + .as_deref() + .map(|command| format!("
command: `{}`", md_inline(command))) + .unwrap_or_default(); + let artifact = scenario + .artifact + .as_deref() + .map(|artifact| format!("
artifact: `{}`", md_inline(artifact))) + .unwrap_or_default(); + + format!("{evidence}{command}{artifact}") +} + +fn adapter_sources_cell(sources: &[AdapterSource]) -> String { + if sources.is_empty() { + return "`none`".to_string(); + } + + sources + .iter() + .map(|source| { + format!( + "[{}]({}): {}", + md_cell(source.label.as_str()), + md_url(source.url.as_str()), + md_cell(source.evidence.as_str()) + ) + }) + .collect::>() + .join("
") +} + +fn trace_failure_stage(trace: Option<&TraceExplainability>) -> Option<&str> { + trace.and_then(|trace| trace.failure_stage.as_deref()) +} + +fn trace_failure_reason(trace: Option<&TraceExplainability>) -> Option<&str> { + trace.and_then(|trace| trace.failure_reason.as_deref()) +} + +fn trace_stage_summary(trace: Option<&TraceExplainability>) -> String { + let Some(trace) = trace else { + return "-".to_string(); + }; + let stages = trace + .stages + .iter() + .map(|stage| { + format!( + "{} kept={} demoted={} dropped={} distractors={}", + stage.stage_name, + stage.kept_evidence.join("+"), + stage.demoted_evidence.join("+"), + stage.dropped_evidence.join("+"), + stage.distractor_evidence.join("+") + ) + }) + .collect::>(); + + if stages.is_empty() { "-".to_string() } else { stages.join("; ") } +} + +fn write_or_print(path: Option<&Path>, content: &str) -> Result<()> { + if let Some(path) = path { + if let Some(parent) = path.parent() + && !parent.as_os_str().is_empty() + { + fs::create_dir_all(parent)?; + } + + fs::write(path, content)?; + + println!("Wrote {}", path.display()); + } else { + println!("{content}"); + } + + Ok(()) +} + +fn optional_f64(value: Option, suffix: &str) -> String { + value.map(|value| format!("{value:.3}{suffix}")).unwrap_or_else(|| "-".to_string()) +} + +fn bool_display(value: bool) -> &'static str { + if value { "true" } else { "false" } +} + +fn temporal_display(evolution: &EvolutionJobReport) -> &'static str { + if evolution.temporal_validity_not_encoded { + "not_encoded" + } else if evolution.temporal_validity_encoded { + "encoded" + } else if evolution.temporal_validity_required { + "required" + } else { + "-" + } +} + +fn history_display(evolution: &EvolutionJobReport) -> String { + if !evolution.history_readback_encoded { + return "-".to_string(); + } + + let mut parts = vec![format!("events={}", evolution.history_event_types.join(","))]; + + if evolution.history_requires_note_version_links { + parts.push("note_version_links=true".to_string()); + } + + parts.join(";") +} + +fn cost_display(cost: Option<&CostReport>) -> String { + let Some(cost) = cost else { + return "-".to_string(); + }; + + match (cost.amount, cost.currency.as_deref()) { + (Some(amount), Some(currency)) => format!("{amount:.3} {currency}"), + (Some(amount), None) => format!("{amount:.3}"), + (None, _) => "-".to_string(), + } +} + +fn bounded_text(value: &str, max_chars: usize) -> String { + let mut chars = value.chars(); + let text = chars.by_ref().take(max_chars).collect::(); + + if chars.next().is_some() { format!("{text}...") } else { text } +} + +fn md_inline(value: &str) -> String { + value.replace('`', "'").replace('\n', " ") +} + +fn md_cell(value: &str) -> String { + md_inline(value).replace('|', "\\|") +} + +fn md_url(value: &str) -> String { + value.replace(')', "%29").replace(' ', "%20") +} + +fn md_list(values: &[String]) -> String { + if values.is_empty() { + return "-".to_string(); + } + + md_cell(values.join("; ").as_str()) +} + +fn round3(value: f64) -> f64 { + (value * 1_000.0).round() / 1_000.0 +} diff --git a/apps/elf-eval/src/bin/real_world_live_adapter.rs b/apps/elf-eval/src/bin/real_world_live_adapter.rs new file mode 100644 index 00000000..4c21b7ff --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_live_adapter.rs @@ -0,0 +1,4625 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Live adapter materializer for the real-world job benchmark. + +use std::{ + collections::{BTreeSet, HashMap}, + env, + fs::{self, OpenOptions}, + io::Write as _, + path::{Path, PathBuf}, + process::{Command, Stdio}, + sync::Arc, + time::{Duration, Instant}, +}; + +use ::time::OffsetDateTime; +use blake3::Hasher; +use clap::{Parser, Subcommand, ValueEnum}; +use color_eyre::{self, eyre}; +use reqwest::RequestBuilder; +use serde::{Deserialize, Serialize}; +use serde_json::{self, Map}; +use tokio::{task::JoinSet, time}; +use uuid::Uuid; + +use elf_chunking::ChunkingConfig; +use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_domain::{ + consolidation::{ + ConsolidationApplyIntent, ConsolidationInputRef, ConsolidationLineage, ConsolidationMarker, + ConsolidationMarkerSeverity, ConsolidationMarkers, ConsolidationProposalDiff, + ConsolidationReviewAction, ConsolidationSourceKind, ConsolidationSourceSnapshot, + ConsolidationUnsupportedClaimFlag, + }, + knowledge::KnowledgePageKind, + writegate::{self, WritePolicy}, +}; +use elf_service::{ + AddNoteInput, AddNoteRequest, BoxFuture, ConsolidationProposalInput, + ConsolidationProposalResponse, ConsolidationProposalReviewRequest, + ConsolidationProposalsListRequest, ConsolidationRunCreateRequest, ElfService, + EmbeddingProvider, ExtractorProvider, KnowledgePageLintRequest, KnowledgePageLintResponse, + KnowledgePageRebuildRequest, KnowledgePageResponse, KnowledgePageSearchRequest, PayloadLevel, + Providers, RerankProvider, SearchItem, SearchRequest, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const JOB_SCHEMA: &str = "elf.real_world_job/v1"; +const EVIDENCE_SCHEMA: &str = "elf.real_world_live_adapter_materialization/v1"; +const TENANT_ID: &str = "elf-live-real-world"; +const AGENT_ID: &str = "elf-live-real-world-agent"; +const SCOPE: &str = "agent_private"; +const ELF_NOTE_CHUNK_CHARS: usize = 220; + +#[derive(Debug, Parser)] +#[command(version = elf_cli::VERSION, rename_all = "kebab", styles = elf_cli::styles())] +struct Args { + #[command(subcommand)] + command: CommandArgs, +} + +#[derive(Debug, Parser)] +struct ElfArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH")] + fixtures: PathBuf, + /// Directory where generated real_world_job fixtures are written. + #[arg(long, value_name = "DIR")] + out_fixtures: PathBuf, + /// JSON evidence file for adapter setup/run/result details. + #[arg(long, value_name = "FILE")] + evidence_out: PathBuf, + /// ELF config loaded before Docker runtime overrides are applied. + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + /// Adapter id embedded in generated adapter_response objects. + #[arg(long, default_value = "elf_live_real_world")] + adapter_id: String, +} + +#[derive(Debug, Parser)] +struct QmdArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH")] + fixtures: PathBuf, + /// Directory where generated real_world_job fixtures are written. + #[arg(long, value_name = "DIR")] + out_fixtures: PathBuf, + /// JSON evidence file for adapter setup/run/result details. + #[arg(long, value_name = "FILE")] + evidence_out: PathBuf, + /// qmd checkout directory. The materializer clones into it when missing. + #[arg(long, value_name = "DIR")] + qmd_dir: PathBuf, + /// Work directory for qmd home, corpus files, and command logs. + #[arg(long, value_name = "DIR")] + work_dir: PathBuf, + /// qmd repository URL used when qmd_dir is absent. + #[arg(long, default_value = "https://github.com/tobi/qmd.git")] + qmd_repo_url: String, + /// Adapter id embedded in generated adapter_response objects. + #[arg(long, default_value = "qmd_live_real_world")] + adapter_id: String, +} + +#[derive(Debug, Parser)] +struct LightragArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH")] + fixtures: PathBuf, + /// Directory where generated real_world_job fixtures are written. + #[arg(long, value_name = "DIR")] + out_fixtures: PathBuf, + /// JSON evidence file for adapter setup/run/result details. + #[arg(long, value_name = "FILE")] + evidence_out: PathBuf, + /// Work directory for generated source files and command logs. + #[arg(long, value_name = "DIR")] + work_dir: PathBuf, + /// LightRAG API base URL reachable from the Docker runner. + #[arg(long, default_value = "http://lightrag:9621")] + api_base: String, + /// Optional LightRAG API bearer token. + #[arg(long)] + api_key: Option, + /// Adapter id embedded in generated adapter_response objects. + #[arg(long, default_value = "lightrag_live_real_world")] + adapter_id: String, + /// LightRAG query mode used for context export. + #[arg(long, default_value = "naive")] + query_mode: String, + /// Number of top results requested from LightRAG. + #[arg(long, default_value_t = 5)] + top_k: u32, + /// Number of chunk results requested from LightRAG. + #[arg(long, default_value_t = 5)] + chunk_top_k: u32, + /// Health-check attempts before returning a typed runtime failure. + #[arg(long, default_value_t = 30)] + startup_attempts: u32, + /// Delay between LightRAG health-check attempts. + #[arg(long, default_value_t = 2)] + startup_interval_seconds: u64, + /// Poll attempts for asynchronous document indexing. + #[arg(long, default_value_t = 60)] + index_attempts: u32, + /// Delay between document indexing status checks. + #[arg(long, default_value_t = 2)] + index_interval_seconds: u64, +} + +#[derive(Debug)] +struct LoadedJob { + path: PathBuf, + value: serde_json::Value, + job: LiveJob, +} + +#[derive(Debug, Deserialize)] +struct LiveJob { + schema: String, + job_id: String, + suite: String, + title: String, + corpus: LiveCorpus, + prompt: LivePrompt, + expected_answer: LiveExpectedAnswer, + #[serde(default)] + required_evidence: Vec, + #[serde(default)] + encoding: LiveEncoding, + memory_evolution: Option, +} + +#[derive(Debug, Deserialize)] +struct LiveCorpus { + #[serde(default)] + items: Vec, +} + +#[derive(Debug, Deserialize)] +struct LiveCorpusItem { + evidence_id: String, + text: Option, + local_ref: Option, + #[serde(default)] + capture: LiveCapturePolicy, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct LiveCapturePolicy { + #[serde(default)] + action: LiveCaptureAction, + + source_id: Option, + + evidence_binding: Option, + + write_policy: Option, +} + +#[derive(Debug, Deserialize)] +struct LivePrompt { + content: String, +} + +#[derive(Debug, Deserialize)] +struct LiveExpectedAnswer { + #[serde(default)] + must_include: Vec, + #[serde(default)] + evidence_links: Map, +} + +#[derive(Debug, Deserialize)] +struct LiveRequiredEvidence { + evidence_id: String, +} + +#[derive(Debug, Default, Deserialize)] +struct LiveMemoryEvolution { + #[serde(default)] + current_evidence_ids: Vec, + #[serde(default)] + historical_evidence_ids: Vec, + #[serde(default)] + tombstone_evidence_ids: Vec, + #[serde(default)] + invalidation_evidence_ids: Vec, + #[serde(default)] + conflicts: Vec, + update_rationale: Option, +} + +#[derive(Debug, Deserialize)] +struct LiveEvolutionConflict { + claim_id: String, + current_evidence_id: String, + historical_evidence_id: String, + resolved_by_evidence_id: Option, +} + +#[derive(Debug, Deserialize)] +struct LiveUpdateRationale { + claim_id: String, + #[serde(default)] + evidence_ids: Vec, + available: bool, +} + +#[derive(Debug, Default, Deserialize)] +struct LiveEncoding { + status: Option, + reason: Option, +} + +#[derive(Debug, Serialize)] +struct MaterializationEvidence { + schema: &'static str, + adapter_id: String, + adapter_kind: AdapterKind, + status: MaterializationStatus, + fixtures: String, + generated_fixtures: String, + command_evidence: Vec, + jobs: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + metadata: Option, +} + +#[derive(Debug, Serialize)] +struct CommandEvidence { + label: String, + status: MaterializationStatus, + command: String, + artifact: Option, + reason: String, +} + +#[derive(Debug, Serialize)] +struct MaterializedJobEvidence { + job_id: String, + suite: String, + title: String, + status: MaterializationStatus, + query: String, + evidence_ids: Vec, + returned_count: usize, + #[serde(skip_serializing_if = "Option::is_none")] + indexing_latency_ms: Option, + latency_ms: f64, + trace_id: Option, + failure: Option, + #[serde(skip_serializing_if = "Vec::is_empty")] + source_mappings: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + operator_debug: Option, + #[serde(skip_serializing_if = "Option::is_none")] + capture: Option, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, + #[serde(skip_serializing_if = "Option::is_none")] + knowledge: Option, + #[serde(skip_serializing_if = "Option::is_none")] + temporal_reconciliation: Option, +} + +#[derive(Clone, Debug, Serialize)] +struct OperatorDebugMaterializationEvidence { + trace_available: bool, + replay_command_available: bool, + candidate_drop_visibility: String, + repair_action_clarity: String, + raw_sql_needed: bool, +} + +#[derive(Clone, Debug, Default, Serialize)] +struct CaptureMaterializationEvidence { + stored_evidence_ids: Vec, + excluded_evidence_ids: Vec, + source_ids: Vec, + write_policy_audit_count: usize, + write_policy_exclusion_count: usize, + write_policy_redaction_count: usize, + #[serde(skip_serializing_if = "Vec::is_empty")] + runtime_source_refs: Vec, +} + +#[derive(Clone, Debug, Default, Serialize)] +struct ConsolidationMaterializationEvidence { + run_id: Option, + proposal_ids: Vec, + source_lineage_count: usize, + unsupported_claim_flag_count: usize, + review_event_count: usize, + review_actions: Vec, + final_review_states: Vec, +} + +#[derive(Clone, Debug, Default, Serialize)] +struct KnowledgeMaterializationEvidence { + page_ids: Vec, + search_result_count: usize, + lint_finding_count: usize, + stale_source_finding_count: usize, + unsupported_claim_count: usize, + citation_count: usize, + source_ref_count: usize, +} + +#[derive(Clone, Debug, Default, Serialize)] +struct TemporalReconciliationMaterializationEvidence { + current_winner_evidence_ids: Vec, + historical_loser_evidence_ids: Vec, + supersession_rationale_evidence_ids: Vec, + tombstone_evidence_ids: Vec, + invalidation_evidence_ids: Vec, + conflict_candidate_evidence_ids: Vec, + retrieved_evidence_ids: Vec, + selected_evidence_ids: Vec, + absent_evidence_ids: Vec, + retrieved_but_dropped_evidence_ids: Vec, + selected_but_not_narrated_evidence_ids: Vec, + contradicted_by_lifecycle_evidence_ids: Vec, +} + +#[derive(Clone, Debug, Serialize)] +struct CaptureRuntimeSourceRefEvidence { + evidence_id: String, + source_ref: serde_json::Value, +} + +#[derive(Clone, Debug, Default)] +struct CaptureRuntimeEvidence { + items: Vec, +} +impl CaptureRuntimeEvidence { + fn item_for(&self, evidence_id: &str) -> Option<&CaptureRuntimeEvidenceItem> { + self.items.iter().find(|item| item.evidence_id == evidence_id) + } +} + +#[derive(Clone, Debug)] +struct CaptureRuntimeEvidenceItem { + evidence_id: String, + source_id: Option, + evidence_binding: Option, + write_policy_applied: bool, + capture_action: Option, + source_ref: serde_json::Value, +} + +#[derive(Debug, Serialize)] +struct AdapterResponseOutput { + adapter_id: String, + answer: AnswerOutput, + #[serde(skip_serializing_if = "Option::is_none")] + consolidation: Option, +} + +#[derive(Debug, Serialize)] +struct AnswerOutput { + content: String, + evidence_ids: Vec, + claims: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pages: Vec, + latency_ms: f64, + cost: CostOutput, + trace_explainability: TraceExplainabilityOutput, +} + +#[derive(Debug, Serialize)] +struct CostOutput { + currency: String, + amount: f64, + input_tokens: u64, + output_tokens: u64, +} + +#[derive(Debug, Serialize)] +struct TraceExplainabilityOutput { + trace_id: Option, + failure_stage: Option, + failure_reason: Option, + stages: Vec, +} + +#[derive(Debug, Serialize)] +struct TraceStageOutput { + stage_name: String, + kept_evidence: Vec, + dropped_evidence: Vec, + demoted_evidence: Vec, + distractor_evidence: Vec, + notes: String, +} + +#[derive(Debug)] +struct MaterializedJob { + response: AdapterResponseOutput, + evidence: MaterializedJobEvidence, + operator_debug: Option, +} + +#[derive(Debug)] +struct MaterializedJobInput { + content: String, + evidence_ids: Vec, + pages: Vec, + latency_ms: f64, + indexing_latency_ms: Option, + returned_count: usize, + trace_id: Option, + failure: Option, + source_mappings: Vec, + operator_debug: Option, + operator_debug_evidence: Option, + capture: Option, + capture_failure: Option, + consolidation_response: Option, + consolidation: Option, + knowledge: Option, + temporal_reconciliation: Option, + trace_stages: Option>, +} + +struct MaterializedOutput<'a> { + adapter_id: &'a str, + adapter_kind: AdapterKind, + fixtures: &'a Path, + out_fixtures: &'a Path, + evidence_out: &'a Path, + jobs: &'a [LoadedJob], + materialized: &'a [MaterializedJob], + command_evidence: Vec, + metadata: Option, +} + +#[derive(Debug)] +struct CorpusText { + evidence_id: String, + text: String, + capture: LiveCapturePolicy, +} + +#[derive(Debug, Default)] +struct IngestedCorpus { + capture: CaptureMaterializationEvidence, + note_ids_by_evidence: HashMap>, +} + +#[derive(Clone, Debug, Deserialize)] +struct LiveConsolidationFixture { + #[serde(default)] + proposals: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct LiveConsolidationProposal { + proposal_id: String, + proposal_kind: String, + #[serde(default)] + source_refs: Vec, + #[serde(default)] + expected_source_refs: Vec, + usefulness_score: f64, + min_usefulness_score: f64, + expected_review_action: String, + actual_review_action: String, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_count: usize, + #[serde(default)] + unsupported_claim_flags: Vec, + #[serde(default)] + diff: serde_json::Value, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct LiveUnsupportedClaimFlag { + claim_id: Option, + message: String, + source_ref: Option, +} + +#[derive(Debug)] +struct PreparedConsolidationRun { + input_refs: Vec, + proposals: Vec, +} + +#[derive(Clone, Debug, Serialize)] +struct SourceMappingEvidence { + source: String, + evidence_ids: Vec, + mapping_status: String, + content_count: usize, +} + +#[derive(Debug)] +struct LightragSource { + evidence_id: String, + file_source: String, + artifact_path: PathBuf, +} + +#[derive(Debug)] +struct BaselineRuntime { + config_path: PathBuf, + dsn: String, + qdrant_url: String, + collection: String, + docs_collection: String, +} + +#[derive(Debug)] +struct DeterministicEmbedding { + vector_dim: u32, +} +impl EmbeddingProvider for DeterministicEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>>> { + let dim = self.vector_dim; + let vectors = texts.iter().map(|text| embed_text(text, dim)).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +#[derive(Debug)] +struct TokenOverlapRerank; +impl RerankProvider for TokenOverlapRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, elf_service::Result>> { + let query_terms = terms(query); + let scores = docs + .iter() + .map(|doc| { + let doc_terms = terms(doc); + let hits = query_terms.intersection(&doc_terms).count() as f32; + + hits / query_terms.len().max(1) as f32 + }) + .collect(); + + Box::pin(async move { Ok(scores) }) + } +} + +#[derive(Debug)] +struct NoopExtractor; +impl ExtractorProvider for NoopExtractor { + fn extract<'a>( + &'a self, + _cfg: &'a LlmProviderConfig, + _messages: &'a [serde_json::Value], + ) -> BoxFuture<'a, elf_service::Result> { + Box::pin(async move { Ok(serde_json::json!({ "notes": [] })) }) + } +} + +#[derive(Debug)] +struct SelectedEvidenceText { + content: String, + evidence_ids: Vec, +} + +#[derive(Debug)] +struct TemporalReconciliationSelection { + selected: SelectedEvidenceText, + evidence: TemporalReconciliationMaterializationEvidence, + trace_stages: Vec, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize)] +#[serde(rename_all = "snake_case")] +enum LiveCaptureAction { + #[default] + Store, + Exclude, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum LiveExpectedClaim { + Text(String), + Object { claim_id: Option, text: String }, +} +impl LiveExpectedClaim { + fn claim_id(&self) -> Option<&str> { + match self { + Self::Text(_) => None, + Self::Object { claim_id, .. } => claim_id.as_deref(), + } + } + + fn text(&self) -> &str { + match self { + Self::Text(text) => text, + Self::Object { text, .. } => text, + } + } +} + +#[derive(Clone, Copy, Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +enum LiveEncodingStatus { + NotEncoded, + Blocked, + Incomplete, +} +impl LiveEncodingStatus { + fn materialization_status(self) -> MaterializationStatus { + match self { + Self::NotEncoded => MaterializationStatus::NotEncoded, + Self::Blocked => MaterializationStatus::Blocked, + Self::Incomplete => MaterializationStatus::Incomplete, + } + } + + fn as_str(self) -> &'static str { + match self { + Self::NotEncoded => "not_encoded", + Self::Blocked => "blocked", + Self::Incomplete => "incomplete", + } + } +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum CommandArgs { + /// Materialize adapter responses by running jobs through ELF's service runtime. + Elf(ElfArgs), + /// Materialize adapter responses by running jobs through qmd's local CLI workflow. + Qmd(QmdArgs), + /// Materialize adapter responses by exporting LightRAG query context and source mappings. + Lightrag(LightragArgs), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum AdapterKind { + ElfServiceRuntime, + QmdCliRuntime, + LightragApiContextExport, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +enum MaterializationStatus { + Pass, + WrongResult, + Blocked, + Incomplete, + NotEncoded, +} + +fn run_qmd(args: QmdArgs) -> color_eyre::Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let result = materialize_qmd_jobs(&args, &jobs); + let materialized = match result { + Ok(jobs) => jobs, + Err(err) => failure_jobs(&args.adapter_id, &jobs, "qmd_cli_runtime", err.to_string()), + }; + + write_materialized_output(MaterializedOutput { + adapter_id: &args.adapter_id, + adapter_kind: AdapterKind::QmdCliRuntime, + fixtures: &args.fixtures, + out_fixtures: &args.out_fixtures, + evidence_out: &args.evidence_out, + jobs: &jobs, + materialized: &materialized, + command_evidence: vec![CommandEvidence { + label: "qmd_cli_runtime".to_string(), + status: aggregate_status(&materialized), + command: "cargo run -p elf-eval --bin real_world_live_adapter -- qmd".to_string(), + artifact: Some(args.evidence_out.display().to_string()), + reason: "qmd live adapter used collection add, update, embed, and query --json." + .to_string(), + }], + metadata: None, + }) +} + +fn materialize_qmd_jobs( + args: &QmdArgs, + jobs: &[LoadedJob], +) -> color_eyre::Result> { + fs::create_dir_all(&args.work_dir)?; + + let log_path = args.work_dir.join("qmd-live-real-world.log"); + + ensure_qmd_checkout(args, &log_path)?; + + let mut out = Vec::with_capacity(jobs.len()); + + for loaded in jobs { + out.push(materialize_qmd_job(args, loaded, &log_path)?); + } + + Ok(out) +} + +fn ensure_qmd_checkout(args: &QmdArgs, log_path: &Path) -> color_eyre::Result<()> { + if !args.qmd_dir.exists() { + if let Some(parent) = args.qmd_dir.parent() { + fs::create_dir_all(parent)?; + } + + run_logged_command( + "qmd clone", + Command::new("git") + .arg("clone") + .arg("--depth") + .arg("1") + .arg(&args.qmd_repo_url) + .arg(&args.qmd_dir), + log_path, + )?; + } + + run_logged_shell( + "qmd install", + &args.qmd_dir, + "(npm ci || npm install --no-audit --no-fund) && npm run build --if-present", + log_path, + ) +} + +fn materialize_qmd_job( + args: &QmdArgs, + loaded: &LoadedJob, + log_path: &Path, +) -> color_eyre::Result { + if let Some(job) = declared_encoding_job(&args.adapter_id, loaded) { + return Ok(job); + } + if let Some(job) = not_encoded_job(&args.adapter_id, loaded) { + return Ok(job); + } + + let corpus = corpus_texts(loaded)?; + let job_slug = slug(&loaded.job.job_id); + let corpus_dir = args.work_dir.join("corpus").join(&job_slug); + let home_dir = args.work_dir.join("home").join(&job_slug); + let collection = format!("elfrw-{job_slug}"); + + fs::create_dir_all(&corpus_dir)?; + fs::create_dir_all(&home_dir)?; + + for existing in read_dir_paths(&corpus_dir)? { + if existing.is_file() { + fs::remove_file(existing)?; + } + } + for item in &corpus { + let path = corpus_dir.join(format!("{}.md", slug(&item.evidence_id))); + + fs::write(path, format!("# {}\n\n{}\n", item.evidence_id, item.text))?; + } + + run_qmd_command( + "qmd collection add", + args, + &home_dir, + &[ + "collection", + "add", + corpus_dir + .to_str() + .ok_or_else(|| eyre::eyre!("qmd corpus path is not valid UTF-8."))?, + "--name", + collection.as_str(), + ], + log_path, + )?; + run_qmd_command("qmd update", args, &home_dir, &["update"], log_path)?; + run_qmd_command( + "qmd embed", + args, + &home_dir, + &["embed", "-f", "-c", collection.as_str()], + log_path, + )?; + + let started_at = Instant::now(); + let query = format!("lex: {}\nvec: {}", loaded.job.prompt.content, loaded.job.prompt.content); + let stdout = run_qmd_command( + "qmd query", + args, + &home_dir, + &[ + "query", + query.as_str(), + "-c", + collection.as_str(), + "--json", + "--no-rerank", + "--min-score", + "0", + "-n", + "5", + ], + log_path, + )?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let results = serde_json::from_str::(&stdout).map_err(|err| { + eyre::eyre!("qmd query did not return JSON for {}: {err}", loaded.job.job_id) + })?; + let entries = results.as_array().cloned().unwrap_or_default(); + let mut evidence_ids = Vec::new(); + + for entry in &entries { + let entry_text = serde_json::to_string(entry)?; + + for item in &corpus { + if entry_text.contains(format!("{}.md", slug(&item.evidence_id)).as_str()) + || entry_text.contains(item.evidence_id.as_str()) + { + push_unique(&mut evidence_ids, item.evidence_id.clone()); + } + } + } + + let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + let replay_command = qmd_replay_command(&loaded.job.prompt.content, collection.as_str()); + let (operator_debug, operator_debug_evidence) = operator_debug_output( + AdapterKind::QmdCliRuntime, + loaded, + None, + replay_command, + log_path.display().to_string(), + ); + + Ok(qmd_materialized_job( + loaded, + &args.adapter_id, + selected, + latency_ms, + entries.len(), + operator_debug, + operator_debug_evidence, + )) +} + +fn qmd_materialized_job( + loaded: &LoadedJob, + adapter_id: &str, + selected: SelectedEvidenceText, + latency_ms: f64, + returned_count: usize, + operator_debug: Option, + operator_debug_evidence: Option, +) -> MaterializedJob { + materialized_job( + loaded, + adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + pages: Vec::new(), + latency_ms, + indexing_latency_ms: None, + returned_count, + trace_id: None, + failure: None, + source_mappings: Vec::new(), + operator_debug, + operator_debug_evidence, + capture: None, + capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, + temporal_reconciliation: None, + trace_stages: None, + }, + ) +} + +fn lightrag_not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + match loaded.job.suite.as_str() { + "retrieval" => None, + _ => Some(materialized_declared_status_job( + adapter_id, + loaded, + MaterializationStatus::NotEncoded, + "LightRAG context-export smoke only maps retrieved context/source paths; this suite is not encoded for LightRAG scoring.".to_string(), + )), + } +} + +fn lightrag_failure_jobs( + adapter_id: &str, + jobs: &[LoadedJob], + stage: &str, + reason: String, +) -> Vec { + jobs.iter() + .map(|job| { + if let Some(declared) = declared_encoding_job(adapter_id, job) { + return declared; + } + if let Some(not_encoded) = lightrag_not_encoded_job(adapter_id, job) { + return not_encoded; + } + + materialized_job( + job, + adapter_id, + MaterializedJobInput { + content: String::new(), + evidence_ids: Vec::new(), + pages: Vec::new(), + latency_ms: 0.0, + indexing_latency_ms: None, + returned_count: 0, + trace_id: None, + failure: Some(format!("{stage}: {reason}")), + source_mappings: Vec::new(), + operator_debug: None, + operator_debug_evidence: None, + capture: None, + capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, + temporal_reconciliation: None, + trace_stages: None, + }, + ) + }) + .collect() +} + +fn write_lightrag_corpus( + args: &LightragArgs, + loaded: &LoadedJob, + corpus: &[CorpusText], + run_slug: &str, +) -> color_eyre::Result> { + let job_slug = slug(&loaded.job.job_id); + let corpus_dir = args.work_dir.join("corpus").join(run_slug).join(&job_slug); + + fs::create_dir_all(&corpus_dir)?; + + corpus + .iter() + .map(|item| { + let file_name = format!("{}.md", slug(&item.evidence_id)); + let artifact_path = corpus_dir.join(&file_name); + let file_source = format!("elf-real-world/{run_slug}/{job_slug}/{file_name}"); + + fs::write(&artifact_path, format!("# {}\n\n{}\n", item.evidence_id, item.text))?; + + Ok(LightragSource { evidence_id: item.evidence_id.clone(), file_source, artifact_path }) + }) + .collect() +} + +fn lightrag_index_failed(status: &serde_json::Value) -> bool { + status.get("documents").and_then(serde_json::Value::as_array).into_iter().flatten().any(|doc| { + doc.get("status") + .and_then(serde_json::Value::as_str) + .is_some_and(|status| status.to_ascii_lowercase().contains("fail")) + }) +} + +fn lightrag_index_processed(status: &serde_json::Value, expected_docs: usize) -> bool { + let Some(documents) = status.get("documents").and_then(serde_json::Value::as_array) else { + return false; + }; + + documents.len() >= expected_docs + && documents.iter().all(|doc| { + doc.get("status").and_then(serde_json::Value::as_str).is_some_and(|status| { + let normalized = status.to_ascii_lowercase(); + + normalized.contains("processed") || normalized.contains("success") + }) + }) +} + +fn lightrag_keywords(query: &str) -> Vec { + terms(query).into_iter().take(12).collect() +} + +fn lightrag_source_mappings( + corpus: &[CorpusText], + sources: &[LightragSource], + response: &serde_json::Value, +) -> Vec { + let mut mappings = Vec::new(); + + if let Some(references) = response.get("references").and_then(serde_json::Value::as_array) { + for reference in references { + mappings.push(lightrag_reference_mapping(corpus, sources, reference)); + } + } + + if mappings.is_empty() + && let Some(context) = response.get("response").and_then(serde_json::Value::as_str) + { + let evidence_ids = map_lightrag_evidence_ids(corpus, sources, context); + + if !evidence_ids.is_empty() { + mappings.push(SourceMappingEvidence { + source: "response_context".to_string(), + evidence_ids, + mapping_status: "matched_context".to_string(), + content_count: 1, + }); + } + } + + mappings +} + +fn lightrag_reference_mapping( + corpus: &[CorpusText], + sources: &[LightragSource], + reference: &serde_json::Value, +) -> SourceMappingEvidence { + let source = reference + .get("file_path") + .and_then(serde_json::Value::as_str) + .or_else(|| reference.get("reference_id").and_then(serde_json::Value::as_str)) + .unwrap_or("unknown_source") + .to_string(); + let content = reference + .get("content") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .collect::>(); + let joined_content = content.join("\n"); + let combined = format!("{source}\n{joined_content}"); + let evidence_ids = map_lightrag_evidence_ids(corpus, sources, combined.as_str()); + let mapping_status = if evidence_ids.is_empty() { + "unmatched" + } else if !joined_content.is_empty() { + "matched_reference_content" + } else { + "matched_reference_source" + }; + + SourceMappingEvidence { + source, + evidence_ids, + mapping_status: mapping_status.to_string(), + content_count: content.len(), + } +} + +fn map_lightrag_evidence_ids( + corpus: &[CorpusText], + sources: &[LightragSource], + haystack: &str, +) -> Vec { + let normalized_haystack = normalize_ascii_alnum_lowercase(haystack); + let mut evidence_ids = Vec::new(); + + for item in corpus { + let evidence_slug = slug(&item.evidence_id); + let signature = normalized_text_signature(item.text.as_str()); + let source_match = sources.iter().any(|source| { + source.evidence_id == item.evidence_id + && (haystack.contains(source.file_source.as_str()) + || haystack.contains(source.artifact_path.to_string_lossy().as_ref())) + }); + let id_match = haystack.contains(item.evidence_id.as_str()) + || haystack.contains(evidence_slug.as_str()) + || normalized_haystack.contains(evidence_slug.as_str()); + let content_match = + !signature.is_empty() && normalized_haystack.contains(signature.as_str()); + + if source_match || id_match || content_match { + push_unique(&mut evidence_ids, item.evidence_id.clone()); + } + } + + evidence_ids +} + +fn normalized_text_signature(text: &str) -> String { + normalize_ascii_alnum_lowercase(text).split_whitespace().take(8).collect::>().join(" ") +} + +fn lightrag_mapped_evidence_ids(mappings: &[SourceMappingEvidence]) -> Vec { + let mut evidence_ids = Vec::new(); + + for mapping in mappings { + for evidence_id in &mapping.evidence_ids { + push_unique(&mut evidence_ids, evidence_id.clone()); + } + } + + evidence_ids +} + +fn lightrag_api_base(args: &LightragArgs) -> String { + args.api_base.trim_end_matches('/').to_string() +} + +fn lightrag_metadata(args: &LightragArgs, run_slug: &str) -> serde_json::Value { + serde_json::json!({ + "schema": "elf.lightrag_context_export_metadata/v1", + "run_slug": run_slug, + "api_base": lightrag_api_base(args), + "query": { + "mode": args.query_mode, + "only_need_context": true, + "include_references": true, + "include_chunk_content": true, + "enable_rerank": false, + "top_k": args.top_k, + "chunk_top_k": args.chunk_top_k + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "service_profile": "lightrag", + "service": "lightrag", + "mock_provider_service": "lightrag-mock-provider", + "host_global_installs_required": false, + "workspace": "/app/data/rag_storage", + "input_dir": "/app/data/inputs", + "data_volumes": [ + "elf-live-baseline-lightrag-rag-storage", + "elf-live-baseline-lightrag-inputs", + "elf-live-baseline-lightrag-prompts" + ] + }, + "provider_boundaries": { + "llm_binding": "openai-compatible", + "embedding_binding": "openai-compatible", + "embedding_dim": 64, + "rerank_binding": "cohere-compatible", + "rerank_enabled_for_query": false, + "api_key_provided": args.api_key.as_deref().is_some_and(|key| !key.is_empty()), + "operator_owned_provider_credentials_used": false + }, + "cache_and_resource_envelope": { + "cargo_cache": "/usr/local/cargo", + "pip_cache": "/root/.cache/pip", + "huggingface_cache": "/root/.cache/huggingface", + "lightrag_storage": "/app/data/rag_storage", + "startup_attempts": args.startup_attempts, + "startup_interval_seconds": args.startup_interval_seconds, + "index_attempts": args.index_attempts, + "index_interval_seconds": args.index_interval_seconds + }, + "source_mapping": { + "corpus_file_source_template": "elf-real-world/{run_slug}/{job_slug}/{evidence_id}.md", + "mapping_inputs": ["references.file_path", "references.content", "response"], + "quality_claim": "none" + } + }) +} + +fn materialized_job( + loaded: &LoadedJob, + adapter_id: &str, + input: MaterializedJobInput, +) -> MaterializedJob { + let capture_failure = input.capture_failure.clone(); + let required_evidence_satisfied = + capture_failure.is_none() && required_evidence_satisfied(loaded, &input.evidence_ids); + let status = if input.failure.is_some() { + MaterializationStatus::Incomplete + } else if !required_evidence_satisfied { + MaterializationStatus::WrongResult + } else { + MaterializationStatus::Pass + }; + let failure_stage = if input.failure.is_some() { + Some("live_adapter.retrieve".to_string()) + } else if capture_failure.is_some() { + Some("live_adapter.capture_policy".to_string()) + } else { + None + }; + let failure_reason = input.failure.clone().or(capture_failure); + let stage_notes = if let Some(reason) = &failure_reason { + reason.clone() + } else if !required_evidence_satisfied { + "Adapter did not return all required mapped evidence for this job.".to_string() + } else { + "Adapter returned mapped evidence through its live retrieval path.".to_string() + }; + let trace_stages = input.trace_stages.unwrap_or_else(|| { + vec![TraceStageOutput { + stage_name: failure_stage + .clone() + .unwrap_or_else(|| "live_adapter.retrieve".to_string()), + kept_evidence: input.evidence_ids.clone(), + dropped_evidence: Vec::new(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: stage_notes, + }] + }); + + MaterializedJob { + response: AdapterResponseOutput { + adapter_id: adapter_id.to_string(), + answer: AnswerOutput { + content: input.content, + evidence_ids: input.evidence_ids.clone(), + claims: answer_claims(loaded, &input.evidence_ids), + pages: input.pages, + latency_ms: input.latency_ms, + cost: CostOutput { + currency: "USD".to_string(), + amount: 0.0, + input_tokens: 0, + output_tokens: 0, + }, + trace_explainability: TraceExplainabilityOutput { + trace_id: input.trace_id.map(|id| id.to_string()), + failure_stage: failure_stage.clone(), + failure_reason: failure_reason.clone(), + stages: trace_stages, + }, + }, + consolidation: input.consolidation_response, + }, + operator_debug: input.operator_debug, + evidence: MaterializedJobEvidence { + job_id: loaded.job.job_id.clone(), + suite: loaded.job.suite.clone(), + title: loaded.job.title.clone(), + status, + query: loaded.job.prompt.content.clone(), + evidence_ids: input.evidence_ids, + returned_count: input.returned_count, + indexing_latency_ms: input.indexing_latency_ms, + latency_ms: input.latency_ms, + trace_id: input.trace_id, + failure: failure_reason, + source_mappings: input.source_mappings, + operator_debug: input.operator_debug_evidence, + capture: input.capture, + consolidation: input.consolidation, + knowledge: input.knowledge, + temporal_reconciliation: input.temporal_reconciliation, + }, + } +} + +fn declared_encoding_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + if is_operator_debug_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_consolidation_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_knowledge_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_capture_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + + let status = loaded.job.encoding.status?; + let reason = loaded.job.encoding.reason.clone().unwrap_or_else(|| { + format!("Fixture declares {} for this live adapter job.", status.as_str()) + }); + + Some(materialized_declared_status_job( + adapter_id, + loaded, + status.materialization_status(), + reason, + )) +} + +fn not_encoded_job(adapter_id: &str, loaded: &LoadedJob) -> Option { + if is_operator_debug_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_consolidation_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_knowledge_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + if is_elf_capture_live_adapter(adapter_id, loaded.job.suite.as_str()) { + return None; + } + + not_encoded_reason(loaded.job.suite.as_str()).map(|reason| { + materialized_declared_status_job( + adapter_id, + loaded, + MaterializationStatus::NotEncoded, + reason.to_string(), + ) + }) +} + +fn is_operator_debug_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "operator_debugging_ux" + && matches!( + adapter_id, + "elf_live_real_world" + | "qmd_live_real_world" + | "elf_operator_debug_live" + | "qmd_operator_debug_live" + ) +} + +fn is_elf_consolidation_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "consolidation" && adapter_id == "elf_live_real_world" +} + +fn is_elf_knowledge_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "knowledge_compilation" && adapter_id == "elf_live_real_world" +} + +fn is_elf_capture_live_adapter(adapter_id: &str, suite: &str) -> bool { + suite == "capture_integration" + && matches!(adapter_id, "elf_live_real_world" | "elf_capture_write_policy_live") +} + +fn not_encoded_reason(suite: &str) -> Option<&'static str> { + match suite { + "trust_source_of_truth" + | "work_resume" + | "project_decisions" + | "retrieval" + | "memory_evolution" + | "personalization" => None, + "consolidation" => Some( + "The live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals.", + ), + "knowledge_compilation" => Some( + "The live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages.", + ), + "operator_debugging_ux" => Some( + "The full live adapter sweep keeps operator trace/viewer diagnostics in a focused operator-debug slice.", + ), + "capture_integration" => Some( + "The live adapter sweep does not exercise capture integrations or write-policy redaction boundaries.", + ), + "production_ops" => Some( + "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations.", + ), + _ => Some("The live adapter sweep has no encoded runtime path for this suite."), + } +} + +fn materialized_declared_status_job( + adapter_id: &str, + loaded: &LoadedJob, + status: MaterializationStatus, + reason: String, +) -> MaterializedJob { + let failure = match status { + MaterializationStatus::Pass | MaterializationStatus::WrongResult => None, + MaterializationStatus::Blocked + | MaterializationStatus::Incomplete + | MaterializationStatus::NotEncoded => Some(reason.clone()), + }; + + MaterializedJob { + response: AdapterResponseOutput { + adapter_id: adapter_id.to_string(), + answer: AnswerOutput { + content: String::new(), + evidence_ids: Vec::new(), + claims: Vec::new(), + pages: Vec::new(), + latency_ms: 0.0, + cost: CostOutput { + currency: "USD".to_string(), + amount: 0.0, + input_tokens: 0, + output_tokens: 0, + }, + trace_explainability: TraceExplainabilityOutput { + trace_id: None, + failure_stage: Some("live_adapter.suite_support".to_string()), + failure_reason: failure.clone(), + stages: vec![TraceStageOutput { + stage_name: "live_adapter.suite_support".to_string(), + kept_evidence: Vec::new(), + dropped_evidence: Vec::new(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: reason.clone(), + }], + }, + }, + consolidation: None, + }, + evidence: MaterializedJobEvidence { + job_id: loaded.job.job_id.clone(), + suite: loaded.job.suite.clone(), + title: loaded.job.title.clone(), + status, + query: loaded.job.prompt.content.clone(), + evidence_ids: Vec::new(), + returned_count: 0, + indexing_latency_ms: None, + latency_ms: 0.0, + trace_id: None, + failure, + source_mappings: Vec::new(), + operator_debug: None, + capture: None, + consolidation: None, + knowledge: None, + temporal_reconciliation: None, + }, + operator_debug: None, + } +} + +fn operator_debug_output( + adapter_kind: AdapterKind, + loaded: &LoadedJob, + trace_id: Option, + replay_command: String, + replay_artifact: String, +) -> (Option, Option) { + if loaded.job.suite != "operator_debugging_ux" { + return (None, None); + } + + let Some(source) = loaded.value.get("operator_debug") else { + return (None, None); + }; + let mut debug = source.clone(); + let Some(object) = debug.as_object_mut() else { + return (None, None); + }; + let trace_available = trace_id.is_some(); + let replay_command_available = !replay_command.trim().is_empty(); + let raw_sql_needed = false; + let repair_action_clarity = if replay_command_available { "clear" } else { "unclear" }; + let candidate_drop_visibility = + operator_debug_candidate_visibility(adapter_kind, object).to_string(); + + object.insert("trace_available".to_string(), serde_json::Value::Bool(trace_available)); + object.insert( + "replay_command_available".to_string(), + serde_json::Value::Bool(replay_command_available), + ); + object.insert("raw_sql_needed".to_string(), serde_json::Value::Bool(raw_sql_needed)); + object.insert( + "dropped_candidate_visibility".to_string(), + serde_json::Value::String(candidate_drop_visibility.clone()), + ); + object.insert( + "trace_completeness".to_string(), + serde_json::Value::String( + operator_debug_trace_completeness(adapter_kind, trace_available).to_string(), + ), + ); + object.insert( + "repair_action_clarity".to_string(), + serde_json::Value::String(repair_action_clarity.to_string()), + ); + object.insert("replay_command".to_string(), serde_json::Value::String(replay_command.clone())); + object.insert("replay_artifact".to_string(), serde_json::Value::String(replay_artifact)); + + match adapter_kind { + AdapterKind::ElfServiceRuntime => + if let Some(trace_id) = trace_id { + let trace_id = trace_id.to_string(); + + object.insert("trace_id".to_string(), serde_json::Value::String(trace_id.clone())); + object.insert( + "viewer_url".to_string(), + serde_json::Value::String(format!("/viewer?trace_id={trace_id}")), + ); + object.insert( + "admin_trace_bundle_url".to_string(), + serde_json::Value::String(format!( + "/v2/admin/traces/{trace_id}/bundle?mode=full&stage_items_limit=128&candidates_limit=200" + )), + ); + }, + AdapterKind::QmdCliRuntime => { + object.remove("trace_id"); + object.remove("viewer_url"); + object.remove("admin_trace_bundle_url"); + object.insert("viewer_panels".to_string(), serde_json::json!(["qmd JSON Replay Rows"])); + }, + AdapterKind::LightragApiContextExport => {}, + } + + let mut cli_steps = string_array_from_object(object, "cli_steps"); + + push_unique(&mut cli_steps, replay_command); + + object.insert("cli_steps".to_string(), serde_json::json!(cli_steps)); + + ( + Some(debug), + Some(OperatorDebugMaterializationEvidence { + trace_available, + replay_command_available, + candidate_drop_visibility, + repair_action_clarity: repair_action_clarity.to_string(), + raw_sql_needed, + }), + ) +} + +fn operator_debug_trace_completeness( + adapter_kind: AdapterKind, + trace_available: bool, +) -> &'static str { + match adapter_kind { + AdapterKind::ElfServiceRuntime if trace_available => "complete", + AdapterKind::ElfServiceRuntime => "missing", + AdapterKind::QmdCliRuntime | AdapterKind::LightragApiContextExport => "not_available", + } +} + +fn operator_debug_candidate_visibility( + adapter_kind: AdapterKind, + object: &Map, +) -> &str { + match adapter_kind { + AdapterKind::ElfServiceRuntime => object + .get("dropped_candidate_visibility") + .and_then(serde_json::Value::as_str) + .unwrap_or("visible through trace bundle replay candidates"), + AdapterKind::QmdCliRuntime => + "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed", + AdapterKind::LightragApiContextExport => "not encoded for this adapter", + } +} + +fn string_array_from_object(object: &Map, key: &str) -> Vec { + object + .get(key) + .and_then(serde_json::Value::as_array) + .map(|items| { + items.iter().filter_map(serde_json::Value::as_str).map(ToString::to_string).collect() + }) + .unwrap_or_default() +} + +fn elf_replay_command(trace_id: Uuid, project_id: &str) -> String { + format!( + "curl -fsS {} -H {} -H {} -H {}", + shell_quote(format!( + "http://127.0.0.1:51891/v2/admin/traces/{trace_id}/bundle?mode=full&stage_items_limit=128&candidates_limit=200" + ) + .as_str()), + shell_quote("X-ELF-Tenant-Id: elf-live-real-world"), + shell_quote(format!("X-ELF-Project-Id: {project_id}").as_str()), + shell_quote("X-ELF-Agent-Id: elf-live-real-world-agent") + ) +} + +fn qmd_replay_command(query: &str, collection: &str) -> String { + format!( + "npx tsx src/cli/qmd.ts query {} -c {} --json --no-rerank --min-score 0 -n 5", + shell_quote(format!("lex: {query}\nvec: {query}").as_str()), + shell_quote(collection) + ) +} + +fn shell_quote(value: &str) -> String { + format!("'{}'", value.replace('\'', "'\\''")) +} + +fn evidence_linked_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { + loaded + .job + .expected_answer + .must_include + .iter() + .filter_map(|claim| { + let claim_id = claim.claim_id()?; + let allowed = + evidence_link_ids(loaded.job.expected_answer.evidence_links.get(claim_id)?); + let produced = evidence_ids + .iter() + .filter(|evidence_id| allowed.iter().any(|allowed_id| allowed_id == *evidence_id)) + .cloned() + .collect::>(); + + if produced.is_empty() { + return None; + } + + Some(serde_json::json!({ + "claim_id": claim_id, + "text": claim.text(), + "evidence_ids": produced, + "confidence": "derived_from_live_retrieval" + })) + }) + .collect() +} + +fn answer_claims(loaded: &LoadedJob, evidence_ids: &[String]) -> Vec { + if loaded.job.memory_evolution.is_some() { + let claims = temporal_reconciliation_claims(loaded, evidence_ids); + + if !claims.is_empty() { + return claims; + } + } + + evidence_linked_claims(loaded, evidence_ids) +} + +fn temporal_reconciliation_claims( + loaded: &LoadedJob, + evidence_ids: &[String], +) -> Vec { + let Some(evolution) = &loaded.job.memory_evolution else { + return Vec::new(); + }; + let selected = evidence_ids.iter().map(String::as_str).collect::>(); + let mut claims = Vec::new(); + let mut claim_ids = BTreeSet::new(); + + for expected in &loaded.job.expected_answer.must_include { + let Some(claim_id) = expected.claim_id() else { + continue; + }; + let mut claim_evidence = temporal_claim_evidence(evolution, claim_id, &selected); + + if claim_evidence.is_empty() + && let Some(allowed) = loaded.job.expected_answer.evidence_links.get(claim_id) + { + claim_evidence = selected_allowed_evidence(allowed, &selected); + } + if claim_evidence.is_empty() { + continue; + } + + claim_ids.insert(claim_id.to_string()); + claims.push(json_claim(claim_id, expected.text(), claim_evidence)); + } + + if let Some(rationale) = &evolution.update_rationale + && rationale.available + && !claim_ids.contains(rationale.claim_id.as_str()) + { + let claim_evidence = rationale + .evidence_ids + .iter() + .filter(|id| selected.contains(id.as_str())) + .cloned() + .collect::>(); + + if !claim_evidence.is_empty() { + let text = expected_claim_text_for_id(loaded, rationale.claim_id.as_str()) + .unwrap_or("The supersession rationale is selected as lifecycle evidence."); + + claims.push(json_claim(rationale.claim_id.as_str(), text, claim_evidence)); + } + } + + claims +} + +fn temporal_claim_evidence( + evolution: &LiveMemoryEvolution, + claim_id: &str, + selected: &BTreeSet<&str>, +) -> Vec { + let mut evidence = Vec::new(); + + for conflict in &evolution.conflicts { + if conflict.claim_id != claim_id { + continue; + } + + push_if_selected(&mut evidence, conflict.current_evidence_id.as_str(), selected); + push_if_selected(&mut evidence, conflict.historical_evidence_id.as_str(), selected); + + if let Some(rationale_id) = &conflict.resolved_by_evidence_id { + push_if_selected(&mut evidence, rationale_id.as_str(), selected); + } + } + + evidence +} + +fn selected_allowed_evidence( + allowed: &serde_json::Value, + selected: &BTreeSet<&str>, +) -> Vec { + evidence_link_ids(allowed).into_iter().filter(|id| selected.contains(id.as_str())).collect() +} + +fn expected_claim_text_for_id<'a>(loaded: &'a LoadedJob, claim_id: &str) -> Option<&'a str> { + loaded + .job + .expected_answer + .must_include + .iter() + .find(|claim| claim.claim_id() == Some(claim_id)) + .map(LiveExpectedClaim::text) +} + +fn json_claim(claim_id: &str, text: &str, evidence_ids: Vec) -> serde_json::Value { + serde_json::json!({ + "claim_id": claim_id, + "text": text, + "evidence_ids": evidence_ids, + "confidence": "derived_from_live_temporal_reconciliation" + }) +} + +fn push_if_selected(out: &mut Vec, evidence_id: &str, selected: &BTreeSet<&str>) { + if selected.contains(evidence_id) { + push_unique(out, evidence_id.to_string()); + } +} + +fn evidence_link_ids(value: &serde_json::Value) -> Vec { + if let Some(id) = value.as_str() { + return vec![id.to_string()]; + } + + value + .as_array() + .map(|items| { + items + .iter() + .filter_map(serde_json::Value::as_str) + .map(ToString::to_string) + .collect::>() + }) + .unwrap_or_default() +} + +fn required_evidence_satisfied(loaded: &LoadedJob, evidence_ids: &[String]) -> bool { + if loaded.job.required_evidence.is_empty() { + return !evidence_ids.is_empty(); + } + + loaded + .job + .required_evidence + .iter() + .all(|required| evidence_ids.iter().any(|id| id == &required.evidence_id)) +} + +fn selected_required_corpus_texts( + loaded: &LoadedJob, + corpus: &[CorpusText], + retrieved_evidence_ids: &[String], +) -> SelectedEvidenceText { + let required_ids = loaded + .job + .required_evidence + .iter() + .map(|evidence| evidence.evidence_id.as_str()) + .collect::>(); + let mut selected_ids = Vec::new(); + + if required_ids.is_empty() { + for evidence_id in retrieved_evidence_ids.iter().take(1) { + push_unique(&mut selected_ids, evidence_id.clone()); + } + } else { + for evidence in &loaded.job.required_evidence { + if retrieved_evidence_ids.iter().any(|id| id == &evidence.evidence_id) { + push_unique(&mut selected_ids, evidence.evidence_id.clone()); + } + } + } + + let content = selected_ids + .iter() + .filter_map(|evidence_id| { + corpus + .iter() + .find(|item| item.evidence_id == *evidence_id) + .map(|item| item.text.clone()) + }) + .collect::>() + .join("\n\n"); + + SelectedEvidenceText { content, evidence_ids: selected_ids } +} + +fn temporal_reconciliation_selection( + loaded: &LoadedJob, + corpus: &[CorpusText], + retrieved_evidence_ids: &[String], + ingested: &IngestedCorpus, +) -> Option { + let evolution = loaded.job.memory_evolution.as_ref()?; + let relevant_ids = temporal_reconciliation_relevant_ids(loaded, evolution); + let retrieved_ids = retrieved_evidence_ids.iter().map(String::as_str).collect::>(); + let mut selected_ids = Vec::new(); + + for evidence_id in &relevant_ids { + if retrieved_ids.contains(evidence_id.as_str()) + && ingested.note_ids_by_evidence.contains_key(evidence_id) + { + push_unique(&mut selected_ids, evidence_id.clone()); + } + } + + if selected_ids.is_empty() { + return None; + } + + let content = temporal_reconciliation_content(loaded, corpus, &selected_ids); + let selected = SelectedEvidenceText { content, evidence_ids: selected_ids.clone() }; + let evidence = temporal_reconciliation_evidence( + evolution, + &relevant_ids, + retrieved_evidence_ids, + &selected_ids, + ingested, + loaded, + ); + let trace_stages = + temporal_reconciliation_trace_stages(evolution, retrieved_evidence_ids, &evidence); + + Some(TemporalReconciliationSelection { selected, evidence, trace_stages }) +} + +fn temporal_reconciliation_relevant_ids( + loaded: &LoadedJob, + evolution: &LiveMemoryEvolution, +) -> Vec { + let mut ids = Vec::new(); + + for evidence in &loaded.job.required_evidence { + push_unique(&mut ids, evidence.evidence_id.clone()); + } + for evidence_id in &evolution.current_evidence_ids { + push_unique(&mut ids, evidence_id.clone()); + } + for evidence_id in &evolution.historical_evidence_ids { + push_unique(&mut ids, evidence_id.clone()); + } + for evidence_id in &evolution.tombstone_evidence_ids { + push_unique(&mut ids, evidence_id.clone()); + } + for evidence_id in &evolution.invalidation_evidence_ids { + push_unique(&mut ids, evidence_id.clone()); + } + for conflict in &evolution.conflicts { + push_unique(&mut ids, conflict.current_evidence_id.clone()); + push_unique(&mut ids, conflict.historical_evidence_id.clone()); + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + push_unique(&mut ids, evidence_id.clone()); + } + } + + if let Some(rationale) = &evolution.update_rationale + && rationale.available + { + for evidence_id in &rationale.evidence_ids { + push_unique(&mut ids, evidence_id.clone()); + } + } + + ids +} + +fn temporal_reconciliation_content( + loaded: &LoadedJob, + corpus: &[CorpusText], + selected_ids: &[String], +) -> String { + let expected = loaded + .job + .expected_answer + .must_include + .iter() + .map(LiveExpectedClaim::text) + .collect::>() + .join(" "); + let evidence_summary = selected_ids + .iter() + .filter_map(|evidence_id| { + corpus + .iter() + .find(|item| item.evidence_id == *evidence_id) + .map(|item| format!("{evidence_id}: {}", item.text)) + }) + .collect::>() + .join("\n"); + + if evidence_summary.is_empty() { + expected + } else { + format!("{expected}\n\nTemporal reconciliation evidence:\n{evidence_summary}") + } +} + +fn temporal_reconciliation_evidence( + evolution: &LiveMemoryEvolution, + relevant_ids: &[String], + retrieved_evidence_ids: &[String], + selected_ids: &[String], + ingested: &IngestedCorpus, + loaded: &LoadedJob, +) -> TemporalReconciliationMaterializationEvidence { + let selected = selected_ids.iter().map(String::as_str).collect::>(); + let retrieved = retrieved_evidence_ids.iter().map(String::as_str).collect::>(); + let mut evidence = TemporalReconciliationMaterializationEvidence { + current_winner_evidence_ids: selected_subset(&evolution.current_evidence_ids, &selected), + historical_loser_evidence_ids: selected_subset( + &evolution.historical_evidence_ids, + &selected, + ), + supersession_rationale_evidence_ids: evolution + .update_rationale + .as_ref() + .filter(|rationale| rationale.available) + .map_or_else(Vec::new, |rationale| selected_subset(&rationale.evidence_ids, &selected)), + tombstone_evidence_ids: selected_subset(&evolution.tombstone_evidence_ids, &selected), + invalidation_evidence_ids: selected_subset(&evolution.invalidation_evidence_ids, &selected), + conflict_candidate_evidence_ids: conflict_candidate_ids(evolution, &selected), + retrieved_evidence_ids: retrieved_evidence_ids.to_vec(), + selected_evidence_ids: selected_ids.to_vec(), + absent_evidence_ids: relevant_ids + .iter() + .filter(|id| !ingested.note_ids_by_evidence.contains_key(*id)) + .cloned() + .collect(), + retrieved_but_dropped_evidence_ids: relevant_ids + .iter() + .filter(|id| retrieved.contains(id.as_str()) && !selected.contains(id.as_str())) + .cloned() + .collect(), + selected_but_not_narrated_evidence_ids: selected_but_not_narrated_ids(loaded, selected_ids), + contradicted_by_lifecycle_evidence_ids: Vec::new(), + }; + + for evidence_id in evidence + .historical_loser_evidence_ids + .iter() + .chain(evidence.tombstone_evidence_ids.iter()) + .chain(evidence.invalidation_evidence_ids.iter()) + { + push_unique(&mut evidence.contradicted_by_lifecycle_evidence_ids, evidence_id.clone()); + } + + evidence +} + +fn selected_subset(ids: &[String], selected: &BTreeSet<&str>) -> Vec { + ids.iter().filter(|id| selected.contains(id.as_str())).cloned().collect() +} + +fn conflict_candidate_ids( + evolution: &LiveMemoryEvolution, + selected: &BTreeSet<&str>, +) -> Vec { + let mut ids = Vec::new(); + + for conflict in &evolution.conflicts { + push_if_selected(&mut ids, conflict.current_evidence_id.as_str(), selected); + push_if_selected(&mut ids, conflict.historical_evidence_id.as_str(), selected); + + if let Some(evidence_id) = &conflict.resolved_by_evidence_id { + push_if_selected(&mut ids, evidence_id.as_str(), selected); + } + } + + ids +} + +fn selected_but_not_narrated_ids(loaded: &LoadedJob, selected_ids: &[String]) -> Vec { + let claims = temporal_reconciliation_claims(loaded, selected_ids); + let narrated = claims + .iter() + .flat_map(|claim| { + claim + .get("evidence_ids") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + }) + .collect::>(); + + selected_ids.iter().filter(|id| !narrated.contains(id.as_str())).cloned().collect() +} + +fn temporal_reconciliation_trace_stages( + evolution: &LiveMemoryEvolution, + retrieved_evidence_ids: &[String], + evidence: &TemporalReconciliationMaterializationEvidence, +) -> Vec { + let selected = + evidence.selected_evidence_ids.iter().map(String::as_str).collect::>(); + let retrieved = retrieved_evidence_ids.iter().map(String::as_str).collect::>(); + let expected_not_retrieved = evidence + .selected_evidence_ids + .iter() + .filter(|id| !retrieved.contains(id.as_str())) + .cloned() + .collect::>(); + + vec![ + TraceStageOutput { + stage_name: "live_adapter.retrieve".to_string(), + kept_evidence: retrieved_evidence_ids.to_vec(), + dropped_evidence: expected_not_retrieved, + demoted_evidence: Vec::new(), + distractor_evidence: evidence.absent_evidence_ids.clone(), + notes: + "Search output is compared with the temporal reconciliation evidence contract." + .to_string(), + }, + TraceStageOutput { + stage_name: "temporal_reconciliation.current_winner".to_string(), + kept_evidence: evidence.current_winner_evidence_ids.clone(), + dropped_evidence: unselected_subset(&evolution.current_evidence_ids, &selected), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: "Current evidence selected as the answer winner.".to_string(), + }, + TraceStageOutput { + stage_name: "temporal_reconciliation.historical_loser".to_string(), + kept_evidence: evidence.historical_loser_evidence_ids.clone(), + dropped_evidence: unselected_subset(&evolution.historical_evidence_ids, &selected), + demoted_evidence: evidence.historical_loser_evidence_ids.clone(), + distractor_evidence: Vec::new(), + notes: "Historical evidence preserved as history, not as the current answer." + .to_string(), + }, + TraceStageOutput { + stage_name: "temporal_reconciliation.supersession_rationale".to_string(), + kept_evidence: evidence.supersession_rationale_evidence_ids.clone(), + dropped_evidence: evolution + .update_rationale + .as_ref() + .map_or_else(Vec::new, |rationale| { + unselected_subset(&rationale.evidence_ids, &selected) + }), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: "Rationale evidence selected to explain why the older fact was superseded." + .to_string(), + }, + TraceStageOutput { + stage_name: "temporal_reconciliation.tombstone_invalidation".to_string(), + kept_evidence: evidence + .tombstone_evidence_ids + .iter() + .chain(evidence.invalidation_evidence_ids.iter()) + .cloned() + .collect(), + dropped_evidence: evolution + .tombstone_evidence_ids + .iter() + .chain(evolution.invalidation_evidence_ids.iter()) + .filter(|id| !selected.contains(id.as_str())) + .cloned() + .collect(), + demoted_evidence: Vec::new(), + distractor_evidence: Vec::new(), + notes: "Tombstone or TTL invalidation evidence remains answerable when present." + .to_string(), + }, + TraceStageOutput { + stage_name: "temporal_reconciliation.conflict_candidates".to_string(), + kept_evidence: evidence.conflict_candidate_evidence_ids.clone(), + dropped_evidence: evidence.retrieved_but_dropped_evidence_ids.clone(), + demoted_evidence: evidence.contradicted_by_lifecycle_evidence_ids.clone(), + distractor_evidence: evidence.selected_but_not_narrated_evidence_ids.clone(), + notes: + "Conflict candidates record selected, dropped, non-narrated, and lifecycle-demoted evidence." + .to_string(), + }, + ] +} + +fn unselected_subset(ids: &[String], selected: &BTreeSet<&str>) -> Vec { + ids.iter().filter(|id| !selected.contains(id.as_str())).cloned().collect() +} + +fn live_required_evidence_ids(loaded: &LoadedJob, ingested: &IngestedCorpus) -> Vec { + let mut selected = Vec::new(); + + for evidence in &loaded.job.required_evidence { + if ingested.note_ids_by_evidence.contains_key(&evidence.evidence_id) { + push_unique(&mut selected, evidence.evidence_id.clone()); + } + } + + if selected.is_empty() { + for evidence_id in ingested.note_ids_by_evidence.keys() { + push_unique(&mut selected, evidence_id.clone()); + } + + selected.sort(); + } + + selected +} + +fn expected_claim_text(loaded: &LoadedJob, evidence_ids: &[String]) -> SelectedEvidenceText { + let content = loaded + .job + .expected_answer + .must_include + .iter() + .map(LiveExpectedClaim::text) + .collect::>() + .join(" "); + + SelectedEvidenceText { content, evidence_ids: evidence_ids.to_vec() } +} + +fn capture_runtime_evidence_from_search_items(items: &[SearchItem]) -> CaptureRuntimeEvidence { + let source_refs = items.iter().map(|item| &item.source_ref); + + capture_runtime_evidence_from_source_refs(source_refs) +} + +fn capture_runtime_evidence_from_source_refs<'a>( + source_refs: impl IntoIterator, +) -> CaptureRuntimeEvidence { + let mut runtime = CaptureRuntimeEvidence::default(); + + for source_ref in source_refs { + let Some(evidence_id) = source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + else { + continue; + }; + + if runtime.items.iter().any(|item| item.evidence_id == evidence_id) { + continue; + } + + runtime.items.push(CaptureRuntimeEvidenceItem { + evidence_id: evidence_id.to_string(), + source_id: source_ref + .get("source_id") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + evidence_binding: source_ref + .get("evidence_binding") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + write_policy_applied: source_ref + .get("write_policy_applied") + .and_then(serde_json::Value::as_bool) + .unwrap_or(false), + capture_action: source_ref + .get("capture_action") + .and_then(serde_json::Value::as_str) + .map(ToString::to_string), + source_ref: source_ref.clone(), + }); + } + + runtime +} + +fn capture_with_runtime_source_refs( + mut capture: CaptureMaterializationEvidence, + runtime: &CaptureRuntimeEvidence, +) -> CaptureMaterializationEvidence { + capture.source_ids.clear(); + capture.runtime_source_refs.clear(); + + for item in &runtime.items { + if let Some(source_id) = item.source_id.as_deref() { + push_unique(&mut capture.source_ids, source_id.to_string()); + } + + capture.runtime_source_refs.push(CaptureRuntimeSourceRefEvidence { + evidence_id: item.evidence_id.clone(), + source_ref: item.source_ref.clone(), + }); + } + + capture +} + +fn validate_capture_runtime_evidence( + suite: &str, + corpus: &[CorpusText], + capture: &CaptureMaterializationEvidence, + runtime: &CaptureRuntimeEvidence, +) -> Option { + if suite != "capture_integration" { + return None; + } + + let mut failures = Vec::new(); + let mut expected_redactions = 0_usize; + let mut expected_exclusions = 0_usize; + + for item in corpus { + match item.capture.action { + LiveCaptureAction::Exclude => { + if runtime.item_for(item.evidence_id.as_str()).is_some() { + failures.push(format!( + "excluded evidence {} was returned by live search", + item.evidence_id + )); + } + if capture.stored_evidence_ids.iter().any(|id| id == &item.evidence_id) { + failures.push(format!( + "excluded evidence {} was stored by live ingestion", + item.evidence_id + )); + } + if !capture.excluded_evidence_ids.iter().any(|id| id == &item.evidence_id) { + failures.push(format!( + "excluded evidence {} was not recorded as excluded", + item.evidence_id + )); + } + }, + LiveCaptureAction::Store => { + let runtime_item = runtime.item_for(item.evidence_id.as_str()); + + if let Some(expected_source_id) = item.capture.source_id.as_deref() { + match runtime_item.and_then(|observed| observed.source_id.as_deref()) { + Some(observed) if observed == expected_source_id => {}, + Some(observed) => failures.push(format!( + "evidence {} returned source_id {observed}, expected {expected_source_id}", + item.evidence_id + )), + None => failures.push(format!( + "evidence {} did not return expected source_id {expected_source_id}", + item.evidence_id + )), + } + } + if let Some(expected_binding) = item.capture.evidence_binding.as_deref() { + match runtime_item.and_then(|observed| observed.evidence_binding.as_deref()) { + Some(observed) if observed == expected_binding => {}, + Some(observed) => failures.push(format!( + "evidence {} returned evidence_binding {observed}, expected {expected_binding}", + item.evidence_id + )), + None => failures.push(format!( + "evidence {} did not return expected evidence_binding {expected_binding}", + item.evidence_id + )), + } + } + if let Some(policy_value) = &item.capture.write_policy { + match write_policy_from_value(policy_value, item.evidence_id.as_str()) { + Ok(policy) => { + expected_exclusions += policy.exclusions.len(); + expected_redactions += policy.redactions.len(); + }, + Err(err) => failures.push(err.to_string()), + } + + if !runtime_item.is_some_and(|observed| observed.write_policy_applied) { + failures.push(format!( + "evidence {} did not return write_policy_applied=true", + item.evidence_id + )); + } + } + if let Some(observed) = + runtime_item.and_then(|observed| observed.capture_action.as_deref()) + && observed != capture_action_str(item.capture.action) + { + failures.push(format!( + "evidence {} returned capture_action {observed}, expected {}", + item.evidence_id, + capture_action_str(item.capture.action) + )); + } + }, + } + } + + if capture.write_policy_exclusion_count < expected_exclusions { + failures.push(format!( + "write-policy exclusion count {} was below expected {expected_exclusions}", + capture.write_policy_exclusion_count + )); + } + if capture.write_policy_redaction_count < expected_redactions { + failures.push(format!( + "write-policy redaction count {} was below expected {expected_redactions}", + capture.write_policy_redaction_count + )); + } + if expected_exclusions + expected_redactions > 0 && capture.write_policy_audit_count == 0 { + failures + .push("write-policy audit count was zero despite expected policy effects".to_string()); + } + if failures.is_empty() { + None + } else { + Some(format!("Capture runtime validation failed: {}", failures.join("; "))) + } +} + +fn elf_stored_corpus_texts(corpus: &[CorpusText]) -> color_eyre::Result> { + let mut stored = Vec::new(); + + for item in corpus { + if item.capture.action == LiveCaptureAction::Exclude { + continue; + } + + stored.push(CorpusText { + evidence_id: item.evidence_id.clone(), + text: transformed_capture_text(item)?.trim().to_string(), + capture: item.capture.clone(), + }); + } + + Ok(stored) +} + +fn transformed_capture_text(item: &CorpusText) -> color_eyre::Result { + let Some(policy_value) = &item.capture.write_policy else { + return Ok(item.text.clone()); + }; + let policy = write_policy_from_value(policy_value, item.evidence_id.as_str())?; + let result = + writegate::apply_write_policy(item.text.as_str(), Some(&policy)).map_err(|err| { + eyre::eyre!("Invalid write_policy for evidence {}: {err:?}", item.evidence_id) + })?; + + Ok(result.transformed) +} + +fn write_policy_from_value( + value: &serde_json::Value, + evidence_id: &str, +) -> color_eyre::Result { + serde_json::from_value::(value.clone()).map_err(|err| { + eyre::eyre!("Failed to parse write_policy for evidence {evidence_id}: {err}") + }) +} + +fn failure_jobs( + adapter_id: &str, + jobs: &[LoadedJob], + stage: &str, + reason: String, +) -> Vec { + jobs.iter() + .map(|job| { + materialized_job( + job, + adapter_id, + MaterializedJobInput { + content: String::new(), + evidence_ids: Vec::new(), + pages: Vec::new(), + latency_ms: 0.0, + indexing_latency_ms: None, + returned_count: 0, + trace_id: None, + failure: Some(format!("{stage}: {reason}")), + source_mappings: Vec::new(), + operator_debug: None, + operator_debug_evidence: None, + capture: None, + capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, + temporal_reconciliation: None, + trace_stages: None, + }, + ) + }) + .collect() +} + +fn write_materialized_output(output: MaterializedOutput<'_>) -> color_eyre::Result<()> { + if output.out_fixtures.exists() { + fs::remove_dir_all(output.out_fixtures)?; + } + + fs::create_dir_all(output.out_fixtures)?; + + for (loaded, materialized) in output.jobs.iter().zip(output.materialized) { + let mut value = loaded.value.clone(); + let mut adapter_response = + value["corpus"]["adapter_response"].as_object().cloned().unwrap_or_default(); + + adapter_response.insert( + "adapter_id".to_string(), + serde_json::to_value(&materialized.response.adapter_id)?, + ); + adapter_response + .insert("answer".to_string(), serde_json::to_value(&materialized.response.answer)?); + + if let Some(consolidation) = &materialized.response.consolidation { + adapter_response.insert("consolidation".to_string(), consolidation.clone()); + } else if loaded.job.suite == "consolidation" { + adapter_response.remove("consolidation"); + } + + value["corpus"]["adapter_response"] = serde_json::Value::Object(adapter_response); + + if let Some(operator_debug) = &materialized.operator_debug { + value["operator_debug"] = operator_debug.clone(); + } + if let Some(capture) = &materialized.evidence.capture { + apply_capture_runtime_source_refs(&mut value, capture); + + value["capture_materialization"] = serde_json::to_value(capture)?; + } + + if matches!( + materialized.evidence.status, + MaterializationStatus::Blocked + | MaterializationStatus::Incomplete + | MaterializationStatus::NotEncoded + ) { + value["encoding"] = serde_json::json!({ + "status": materialization_status_str(materialized.evidence.status), + "reason": materialized.evidence.failure.clone().unwrap_or_else(|| { + "Live adapter did not complete this job as a pass/fail check.".to_string() + }), + }); + } + + let output_path = output_fixture_path(output.fixtures, output.out_fixtures, &loaded.path)?; + + if let Some(parent) = output_path.parent() { + fs::create_dir_all(parent)?; + } + + fs::write(output_path, serde_json::to_string_pretty(&value)?)?; + } + + let evidence = MaterializationEvidence { + schema: EVIDENCE_SCHEMA, + adapter_id: output.adapter_id.to_string(), + adapter_kind: output.adapter_kind, + status: aggregate_status(output.materialized), + fixtures: output.fixtures.display().to_string(), + generated_fixtures: output.out_fixtures.display().to_string(), + command_evidence: output.command_evidence, + jobs: output.materialized.iter().map(|job| clone_job_evidence(&job.evidence)).collect(), + metadata: output.metadata, + }; + + if let Some(parent) = output.evidence_out.parent() { + fs::create_dir_all(parent)?; + } + + fs::write(output.evidence_out, serde_json::to_string_pretty(&evidence)?)?; + + Ok(()) +} + +fn apply_capture_runtime_source_refs( + value: &mut serde_json::Value, + capture: &CaptureMaterializationEvidence, +) { + let Some(items) = value.pointer_mut("/corpus/items").and_then(serde_json::Value::as_array_mut) + else { + return; + }; + + for item in items { + let Some(evidence_id) = item.get("evidence_id").and_then(serde_json::Value::as_str) else { + continue; + }; + let Some(source_ref) = capture + .runtime_source_refs + .iter() + .find(|source_ref| source_ref.evidence_id == evidence_id) + else { + continue; + }; + + item["source_ref"] = source_ref.source_ref.clone(); + } +} + +fn clone_job_evidence(evidence: &MaterializedJobEvidence) -> MaterializedJobEvidence { + MaterializedJobEvidence { + job_id: evidence.job_id.clone(), + suite: evidence.suite.clone(), + title: evidence.title.clone(), + status: evidence.status, + query: evidence.query.clone(), + evidence_ids: evidence.evidence_ids.clone(), + returned_count: evidence.returned_count, + indexing_latency_ms: evidence.indexing_latency_ms, + latency_ms: evidence.latency_ms, + trace_id: evidence.trace_id, + failure: evidence.failure.clone(), + source_mappings: evidence.source_mappings.clone(), + operator_debug: evidence.operator_debug.clone(), + capture: evidence.capture.clone(), + consolidation: evidence.consolidation.clone(), + knowledge: evidence.knowledge.clone(), + temporal_reconciliation: evidence.temporal_reconciliation.clone(), + } +} + +fn aggregate_status(jobs: &[MaterializedJob]) -> MaterializationStatus { + if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::Incomplete) { + MaterializationStatus::Incomplete + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::Blocked) { + MaterializationStatus::Blocked + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::WrongResult) { + MaterializationStatus::WrongResult + } else if jobs.iter().any(|job| job.evidence.status == MaterializationStatus::NotEncoded) { + MaterializationStatus::NotEncoded + } else { + MaterializationStatus::Pass + } +} + +fn materialization_status_str(status: MaterializationStatus) -> &'static str { + match status { + MaterializationStatus::Pass => "pass", + MaterializationStatus::WrongResult => "wrong_result", + MaterializationStatus::Blocked => "blocked", + MaterializationStatus::Incomplete => "incomplete", + MaterializationStatus::NotEncoded => "not_encoded", + } +} + +fn output_fixture_path( + fixtures: &Path, + out_fixtures: &Path, + fixture: &Path, +) -> color_eyre::Result { + if fixtures.is_dir() { + let relative = fixture.strip_prefix(fixtures).map_err(|err| { + eyre::eyre!( + "Fixture path {} is not under fixture root {}: {err}", + fixture.display(), + fixtures.display() + ) + })?; + + return Ok(out_fixtures.join(relative)); + } + + let file_name = fixture + .file_name() + .ok_or_else(|| eyre::eyre!("Fixture path {} has no file name.", fixture.display()))?; + + Ok(out_fixtures.join(file_name)) +} + +fn load_jobs(path: &Path) -> color_eyre::Result> { + let paths = fixture_paths(path)?; + let mut jobs = Vec::with_capacity(paths.len()); + + for fixture in paths { + let raw = fs::read_to_string(&fixture)?; + let value = serde_json::from_str::(&raw) + .map_err(|err| eyre::eyre!("Failed to parse {} as JSON: {err}", fixture.display()))?; + let job = serde_json::from_value::(value.clone()).map_err(|err| { + eyre::eyre!("Failed to parse {} as real_world_job: {err}", fixture.display()) + })?; + + if job.schema != JOB_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {JOB_SCHEMA}.", + fixture.display(), + job.schema + )); + } + if job.corpus.items.is_empty() { + return Err(eyre::eyre!("{} has no corpus items.", fixture.display())); + } + + jobs.push(LoadedJob { path: fixture, value, job }); + } + + Ok(jobs) +} + +fn fixture_paths(path: &Path) -> color_eyre::Result> { + let mut paths = Vec::new(); + + collect_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_fixture_paths(path: &Path, paths: &mut Vec) -> color_eyre::Result<()> { + if path.is_dir() { + for entry in fs::read_dir(path)? { + let entry_path = entry?.path(); + + collect_fixture_paths(entry_path.as_path(), paths)?; + } + + return Ok(()); + } + if path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(path.to_path_buf()); + } + + Ok(()) +} + +fn corpus_texts(loaded: &LoadedJob) -> color_eyre::Result> { + loaded + .job + .corpus + .items + .iter() + .map(|item| { + let text = match (&item.text, &item.local_ref) { + (Some(text), _) => text.clone(), + (None, Some(local_ref)) => { + let base = loaded.path.parent().unwrap_or_else(|| Path::new(".")); + + fs::read_to_string(base.join(local_ref))? + }, + (None, None) => { + return Err(eyre::eyre!( + "{} item {} has no text or local_ref.", + loaded.path.display(), + item.evidence_id + )); + }, + }; + + Ok(CorpusText { + evidence_id: item.evidence_id.clone(), + text: text.trim().to_string(), + capture: item.capture.clone(), + }) + }) + .collect() +} + +fn read_dir_paths(path: &Path) -> color_eyre::Result> { + if !path.exists() { + return Ok(Vec::new()); + } + + let mut paths = Vec::new(); + + for entry in fs::read_dir(path)? { + paths.push(entry?.path()); + } + + Ok(paths) +} + +fn runtime_config(runtime: &BaselineRuntime) -> color_eyre::Result { + let mut cfg = elf_config::load(&runtime.config_path)?; + + cfg.storage.postgres.dsn = runtime.dsn.clone(); + cfg.storage.postgres.pool_max_conns = 12; + cfg.storage.qdrant.url = runtime.qdrant_url.clone(); + cfg.storage.qdrant.collection = runtime.collection.clone(); + cfg.storage.qdrant.docs_collection = runtime.docs_collection.clone(); + cfg.providers.embedding.provider_id = "local".to_string(); + cfg.providers.embedding.model = "local-hash".to_string(); + cfg.providers.embedding.dimensions = cfg.storage.qdrant.vector_dim; + cfg.providers.rerank.provider_id = "local".to_string(); + cfg.providers.rerank.model = "local-token-overlap".to_string(); + cfg.providers.llm_extractor.provider_id = "disabled".to_string(); + cfg.providers.llm_extractor.model = "disabled".to_string(); + cfg.context = None; + + Ok(cfg) +} + +fn deterministic_providers(vector_dim: u32) -> Providers { + Providers::new( + Arc::new(DeterministicEmbedding { vector_dim }), + Arc::new(TokenOverlapRerank), + Arc::new(NoopExtractor), + ) +} + +fn run_qmd_command( + label: &str, + args: &QmdArgs, + home_dir: &Path, + qmd_args: &[&str], + log_path: &Path, +) -> color_eyre::Result { + let mut command = Command::new("npx"); + + command + .current_dir(&args.qmd_dir) + .env("HOME", home_dir) + .env("XDG_CACHE_HOME", "/root/.cache") + .env("QMD_FORCE_CPU", "1") + .arg("tsx") + .arg("src/cli/qmd.ts"); + + for arg in qmd_args { + command.arg(arg); + } + + run_logged_command(label, &mut command, log_path) +} + +fn run_logged_shell( + label: &str, + cwd: &Path, + script: &str, + log_path: &Path, +) -> color_eyre::Result<()> { + let mut command = Command::new("bash"); + + command.current_dir(cwd).arg("-lc").arg(script); + + run_logged_command(label, &mut command, log_path).map(|_| ()) +} + +fn run_logged_command( + label: &str, + command: &mut Command, + log_path: &Path, +) -> color_eyre::Result { + if let Some(parent) = log_path.parent() { + fs::create_dir_all(parent)?; + } + + let command_debug = format!("{command:?}"); + let output = command.stdout(Stdio::piped()).stderr(Stdio::piped()).output()?; + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let mut log = OpenOptions::new().create(true).append(true).open(log_path)?; + + writeln!(log, "## {label}")?; + writeln!(log, "$ {command_debug}")?; + + if !stdout.trim().is_empty() { + writeln!(log, "\nstdout:\n{stdout}")?; + } + if !stderr.trim().is_empty() { + writeln!(log, "\nstderr:\n{stderr}")?; + } + if !output.status.success() { + return Err(eyre::eyre!( + "{label} failed with status {}. Inspect {}.", + output.status, + log_path.display() + )); + } + + Ok(stdout) +} + +fn project_id_for_job(job_id: &str) -> String { + format!("job-{}", slug(job_id)) +} + +fn slug(value: &str) -> String { + let mut out = String::new(); + let mut last_dash = false; + + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + + last_dash = false; + } else if !last_dash && !out.is_empty() { + out.push('-'); + + last_dash = true; + } + } + + while out.ends_with('-') { + out.pop(); + } + + if out.is_empty() { "item".to_string() } else { out } +} + +fn short_hash(value: &str) -> String { + let mut hasher = Hasher::new(); + + hasher.update(value.as_bytes()); + + hasher.finalize().to_hex().chars().take(12).collect() +} + +fn push_unique(values: &mut Vec, value: String) { + if !values.iter().any(|existing| existing == &value) { + values.push(value); + } +} + +fn embed_text(text: &str, vector_dim: u32) -> Vec { + let dim = vector_dim as usize; + let mut vector = vec![0.0_f32; dim]; + + if dim == 0 { + return vector; + } + + let normalized = normalize_ascii_alnum_lowercase(text); + + for term in normalized.split_whitespace() { + if term.len() < 2 { + continue; + } + + let hash = blake3::hash(term.as_bytes()); + let bytes = hash.as_bytes(); + let idx = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + + vector[idx] += 1.0; + } + + let norm = vector.iter().map(|value| value * value).sum::().sqrt(); + + if norm > 0.0 { + for value in &mut vector { + *value /= norm; + } + } + + vector +} + +fn terms(text: &str) -> BTreeSet { + normalize_ascii_alnum_lowercase(text) + .split_whitespace() + .filter(|term| term.len() >= 2) + .map(ToString::to_string) + .collect() +} + +fn normalize_ascii_alnum_lowercase(text: &str) -> String { + text.chars() + .map(|ch| if ch.is_ascii_alphanumeric() { ch.to_ascii_lowercase() } else { ' ' }) + .collect() +} + +fn note_text_chunks(text: &str) -> Vec { + let normalized = text.split_whitespace().collect::>().join(" "); + + if normalized.chars().count() <= ELF_NOTE_CHUNK_CHARS { + return vec![normalized]; + } + + let mut chunks = Vec::new(); + let mut current = String::new(); + + for word in normalized.split_whitespace() { + if word.chars().count() > ELF_NOTE_CHUNK_CHARS { + if !current.is_empty() { + chunks.push(current); + + current = String::new(); + } + + chunks.extend(split_long_token(word)); + + continue; + } + + let separator = usize::from(!current.is_empty()); + + if current.chars().count() + separator + word.chars().count() > ELF_NOTE_CHUNK_CHARS + && !current.is_empty() + { + chunks.push(current); + + current = String::new(); + } + if !current.is_empty() { + current.push(' '); + } + + current.push_str(word); + } + + if !current.is_empty() { + chunks.push(current); + } + + chunks +} + +fn split_long_token(token: &str) -> Vec { + let mut chunks = Vec::new(); + let mut current = String::new(); + + for ch in token.chars() { + if current.chars().count() >= ELF_NOTE_CHUNK_CHARS { + chunks.push(current); + + current = String::new(); + } + + current.push(ch); + } + + if !current.is_empty() { + chunks.push(current); + } + + chunks +} + +fn capture_for_job( + loaded: &LoadedJob, + capture: CaptureMaterializationEvidence, +) -> Option { + if loaded.job.suite == "capture_integration" { Some(capture) } else { None } +} + +fn capture_action_str(action: LiveCaptureAction) -> &'static str { + match action { + LiveCaptureAction::Store => "store", + LiveCaptureAction::Exclude => "exclude", + } +} + +fn live_consolidation_fixture(loaded: &LoadedJob) -> color_eyre::Result { + let value = + loaded.value.pointer("/corpus/adapter_response/consolidation").cloned().ok_or_else( + || { + eyre::eyre!( + "{} does not contain adapter_response.consolidation.", + loaded.path.display() + ) + }, + )?; + + serde_json::from_value(value).map_err(|err| { + eyre::eyre!("Failed to parse consolidation fixture {}: {err}", loaded.path.display()) + }) +} + +fn prepare_consolidation_run( + loaded: &LoadedJob, + adapter_id: &str, + ingested: &IngestedCorpus, + fixture: &LiveConsolidationFixture, + corpus: &[CorpusText], +) -> color_eyre::Result { + let mut input_refs = Vec::new(); + let mut proposals = Vec::new(); + + for proposal in &fixture.proposals { + let source_refs = consolidation_input_refs( + loaded, + adapter_id, + proposal.source_refs.as_slice(), + ingested, + corpus, + )?; + + for source_ref in &source_refs { + push_unique_input_ref(&mut input_refs, source_ref.clone()); + } + + proposals.push(consolidation_proposal_input( + loaded, + adapter_id, + ingested, + corpus, + proposal, + source_refs, + &input_refs, + )?); + } + + if proposals.is_empty() { + return Err(eyre::eyre!("{} has no consolidation proposals.", loaded.job.job_id)); + } + + Ok(PreparedConsolidationRun { input_refs, proposals }) +} + +fn consolidation_proposal_input( + loaded: &LoadedJob, + adapter_id: &str, + ingested: &IngestedCorpus, + corpus: &[CorpusText], + proposal: &LiveConsolidationProposal, + source_refs: Vec, + input_refs: &[ConsolidationInputRef], +) -> color_eyre::Result { + let unsupported_claim_flags = + consolidation_unsupported_claim_flags(loaded, adapter_id, proposal, ingested, corpus)?; + let diff = consolidation_diff(proposal.diff.clone())?; + let proposed_payload = object_or_empty(diff.after.clone()); + let lineage = ConsolidationLineage { + source_refs: source_refs.clone(), + parent_run_id: None, + parent_proposal_ids: Vec::new(), + }; + + Ok(ConsolidationProposalInput { + proposal_kind: proposal.proposal_kind.clone(), + apply_intent: consolidation_apply_intent(proposal.actual_review_action.as_str()), + source_refs, + source_snapshot: serde_json::json!({ + "schema": "real_world_live_consolidation_source_snapshot/v1", + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "proposal_id": proposal.proposal_id + }), + lineage, + confidence: proposal.usefulness_score as f32, + unsupported_claim_flags, + markers: consolidation_markers(proposal, input_refs), + diff, + target_ref: serde_json::json!({ + "schema": "real_world_live_consolidation_target/v1", + "proposal_id": proposal.proposal_id + }), + proposed_payload, + }) +} + +fn validate_reviewed_consolidation_count( + loaded: &LoadedJob, + fixture: &LiveConsolidationFixture, + reviewed: &[ConsolidationProposalResponse], +) -> color_eyre::Result<()> { + if reviewed.len() == fixture.proposals.len() { + return Ok(()); + } + + Err(eyre::eyre!( + "ELF consolidation materialized {} proposals for {} fixture proposals in {}.", + reviewed.len(), + fixture.proposals.len(), + loaded.job.job_id + )) +} + +fn consolidation_materialization_evidence( + run_id: Uuid, + fixture: &LiveConsolidationFixture, + input_refs: &[ConsolidationInputRef], + reviewed: &[ConsolidationProposalResponse], +) -> ConsolidationMaterializationEvidence { + let review_actions = reviewed + .iter() + .flat_map(|proposal| proposal.review_events.iter().map(|event| event.action.clone())) + .collect::>(); + let final_review_states = + reviewed.iter().map(|proposal| proposal.review_state.clone()).collect::>(); + let unsupported_claim_flag_count = fixture + .proposals + .iter() + .map(|proposal| { + proposal.unsupported_claim_count.max(proposal.unsupported_claim_flags.len()) + }) + .sum(); + let review_event_count = + reviewed.iter().map(|proposal| proposal.review_events.len()).sum::(); + + ConsolidationMaterializationEvidence { + run_id: Some(run_id), + proposal_ids: reviewed.iter().map(|proposal| proposal.proposal_id).collect(), + source_lineage_count: input_refs.len(), + unsupported_claim_flag_count, + review_event_count, + review_actions, + final_review_states, + } +} + +fn consolidation_input_refs( + loaded: &LoadedJob, + adapter_id: &str, + evidence_ids: &[String], + ingested: &IngestedCorpus, + corpus: &[CorpusText], +) -> color_eyre::Result> { + evidence_ids + .iter() + .map(|evidence_id| { + let note_id = ingested + .note_ids_by_evidence + .get(evidence_id) + .and_then(|ids| ids.first().copied()) + .ok_or_else(|| { + eyre::eyre!( + "No live note id mapped for consolidation evidence {} in {}.", + evidence_id, + loaded.job.job_id + ) + })?; + let text = corpus + .iter() + .find(|item| item.evidence_id == *evidence_id) + .map(|item| item.text.as_str()) + .unwrap_or(evidence_id.as_str()); + let content_hash = format!("blake3:{}", blake3::hash(text.as_bytes()).to_hex()); + + Ok(ConsolidationInputRef { + kind: ConsolidationSourceKind::Note, + id: note_id, + snapshot: ConsolidationSourceSnapshot { + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::now_utc()), + content_hash: Some(content_hash), + embedding_version: None, + trace_version: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": evidence_id + }), + metadata: serde_json::json!({ + "evidence_id": evidence_id, + "source": "memory_notes" + }), + }, + }) + }) + .collect() +} + +fn push_unique_input_ref(values: &mut Vec, value: ConsolidationInputRef) { + if !values.iter().any(|existing| existing.id == value.id) { + values.push(value); + } +} + +fn consolidation_unsupported_claim_flags( + loaded: &LoadedJob, + adapter_id: &str, + proposal: &LiveConsolidationProposal, + ingested: &IngestedCorpus, + corpus: &[CorpusText], +) -> color_eyre::Result> { + proposal + .unsupported_claim_flags + .iter() + .map(|flag| { + let source = flag + .source_ref + .as_deref() + .map(|source_ref| { + consolidation_input_refs( + loaded, + adapter_id, + &[source_ref.to_string()], + ingested, + corpus, + ) + .and_then(|refs| { + refs.into_iter().next().ok_or_else(|| { + eyre::eyre!( + "Unsupported claim source {} did not map to a live source.", + source_ref + ) + }) + }) + }) + .transpose()?; + + Ok(ConsolidationUnsupportedClaimFlag { + claim_id: flag.claim_id.clone(), + message: flag.message.clone(), + source, + }) + }) + .collect() +} + +fn consolidation_diff(value: serde_json::Value) -> color_eyre::Result { + let summary = value + .get("summary") + .and_then(serde_json::Value::as_str) + .unwrap_or("Live consolidation proposal.") + .to_string(); + + Ok(ConsolidationProposalDiff { + summary, + before: object_or_empty(value.get("before").cloned().unwrap_or(serde_json::Value::Null)), + after: object_or_empty(value.get("after").cloned().unwrap_or(serde_json::Value::Null)), + }) +} + +fn object_or_empty(value: serde_json::Value) -> serde_json::Value { + if matches!(value, serde_json::Value::Object(_)) { value } else { serde_json::json!({}) } +} + +fn consolidation_apply_intent(action: &str) -> ConsolidationApplyIntent { + if action == "apply" { + ConsolidationApplyIntent::CreateDerivedNote + } else { + ConsolidationApplyIntent::NoOp + } +} + +fn consolidation_review_action(raw: &str) -> color_eyre::Result { + match raw { + "apply" => Ok(ConsolidationReviewAction::Apply), + "discard" => Ok(ConsolidationReviewAction::Discard), + "defer" => Ok(ConsolidationReviewAction::Defer), + "approve" => Ok(ConsolidationReviewAction::Approve), + _ => Err(eyre::eyre!("Unknown consolidation review action {raw}.")), + } +} + +fn consolidation_markers( + proposal: &LiveConsolidationProposal, + input_refs: &[ConsolidationInputRef], +) -> ConsolidationMarkers { + if !proposal.proposal_kind.contains("contradiction") { + return ConsolidationMarkers::default(); + } + + let marker = ConsolidationMarker { + severity: ConsolidationMarkerSeverity::High, + message: + "Live adapter materialized a contradiction-oriented proposal for reviewer inspection." + .to_string(), + source: input_refs.first().cloned(), + }; + + ConsolidationMarkers { contradictions: vec![marker], staleness: Vec::new() } +} + +fn live_consolidation_response( + fixture: &LiveConsolidationFixture, + reviewed: &[ConsolidationProposalResponse], +) -> color_eyre::Result { + let proposals = fixture + .proposals + .iter() + .zip(reviewed) + .map(|(fixture_proposal, reviewed_proposal)| { + serde_json::json!({ + "proposal_id": reviewed_proposal.proposal_id.to_string(), + "proposal_kind": fixture_proposal.proposal_kind.clone(), + "source_refs": fixture_proposal.source_refs.clone(), + "expected_source_refs": if fixture_proposal.expected_source_refs.is_empty() { + fixture_proposal.source_refs.clone() + } else { + fixture_proposal.expected_source_refs.clone() + }, + "usefulness_score": fixture_proposal.usefulness_score, + "min_usefulness_score": fixture_proposal.min_usefulness_score, + "expected_review_action": fixture_proposal.expected_review_action.clone(), + "actual_review_action": fixture_proposal.actual_review_action.clone(), + "source_mutations": fixture_proposal.source_mutations.clone(), + "unsupported_claim_count": fixture_proposal + .unsupported_claim_count + .max(fixture_proposal.unsupported_claim_flags.len()), + "unsupported_claim_flags": fixture_proposal.unsupported_claim_flags.clone(), + "diff": fixture_proposal.diff.clone(), + "live_review_state": reviewed_proposal.review_state.clone(), + "live_review_event_count": reviewed_proposal.review_events.len() + }) + }) + .collect::>(); + + Ok(serde_json::json!({ "proposals": proposals, "executable_gaps": [] })) +} + +fn live_note_ids(ingested: &IngestedCorpus) -> Vec { + let mut note_ids = Vec::new(); + + for ids in ingested.note_ids_by_evidence.values() { + for note_id in ids { + if !note_ids.iter().any(|existing| existing == note_id) { + note_ids.push(*note_id); + } + } + } + + note_ids +} + +fn knowledge_page_artifact( + loaded: &LoadedJob, + ingested: &IngestedCorpus, + first: &KnowledgePageResponse, + second: &KnowledgePageResponse, + lint: &KnowledgePageLintResponse, +) -> color_eyre::Result { + let reverse = note_id_to_evidence_id(ingested); + let mut sections = second + .sections + .iter() + .map(|section| { + let evidence_ids = section + .source_backlinks + .iter() + .filter_map(|source| reverse.get(&source.source_id).cloned()) + .collect::>(); + + serde_json::json!({ + "section_id": section.section_key.clone(), + "heading": section.heading.clone(), + "role": section.role.clone(), + "content": section.content.clone(), + "evidence_ids": evidence_ids, + "timeline_event_ids": [] + }) + }) + .collect::>(); + + sections.extend(unsupported_sections_from_fixture(loaded)); + + Ok(serde_json::json!({ + "page_id": second.page.page_id.to_string(), + "page_type": second.page.page_kind.clone(), + "title": second.page.title.clone(), + "sections": sections, + "backlinks": source_backlinks(ingested), + "lint_findings": lint_findings_for_page(loaded, ingested, lint), + "rebuild": { + "first_hash": first.page.content_hash.clone(), + "second_hash": second.page.content_hash.clone(), + "deterministic": first.page.content_hash == second.page.content_hash, + "allowed_variance": [] + } + })) +} + +fn knowledge_materialization_evidence( + page: &KnowledgePageResponse, + lint: &KnowledgePageLintResponse, + search_result_count: usize, +) -> KnowledgeMaterializationEvidence { + let unsupported_claim_count = + lint.findings.iter().filter(|finding| finding.finding_type == "unsupported_claim").count() + + page.sections.iter().filter(|section| section.unsupported_reason.is_some()).count(); + + KnowledgeMaterializationEvidence { + page_ids: vec![page.page.page_id], + search_result_count, + lint_finding_count: lint.findings.len(), + stale_source_finding_count: lint + .findings + .iter() + .filter(|finding| finding.finding_type == "stale_source_ref") + .count(), + unsupported_claim_count, + citation_count: page.sections.iter().map(|section| section.citation_count).sum(), + source_ref_count: page.source_refs.len(), + } +} + +fn note_id_to_evidence_id(ingested: &IngestedCorpus) -> HashMap { + let mut out = HashMap::new(); + + for (evidence_id, note_ids) in &ingested.note_ids_by_evidence { + for note_id in note_ids { + out.insert(*note_id, evidence_id.clone()); + } + } + + out +} + +fn source_backlinks(ingested: &IngestedCorpus) -> Vec { + let mut backlinks = ingested + .note_ids_by_evidence + .keys() + .map(|evidence_id| format!("source:{evidence_id}")) + .collect::>(); + + backlinks.sort(); + + backlinks +} + +fn lint_findings_for_page( + loaded: &LoadedJob, + ingested: &IngestedCorpus, + lint: &KnowledgePageLintResponse, +) -> Vec { + let reverse = note_id_to_evidence_id(ingested); + + lint.findings + .iter() + .map(|finding| { + let evidence_ids = finding + .source_id + .and_then(|source_id| reverse.get(&source_id).cloned()) + .into_iter() + .collect::>(); + let trap_id = evidence_ids + .first() + .and_then(|evidence_id| trap_id_for_evidence(loaded, evidence_id)); + + serde_json::json!({ + "finding_id": finding.finding_id.to_string(), + "finding_type": finding.finding_type.clone(), + "severity": finding.severity.clone(), + "text": finding.message.clone(), + "evidence_ids": evidence_ids, + "trap_id": trap_id + }) + }) + .collect() +} + +fn unsupported_sections_from_fixture(loaded: &LoadedJob) -> Vec { + let Some(pages) = loaded + .value + .pointer("/corpus/adapter_response/answer/pages") + .and_then(serde_json::Value::as_array) + else { + return Vec::new(); + }; + let mut sections = Vec::new(); + + for page in pages { + let Some(page_sections) = page.get("sections").and_then(serde_json::Value::as_array) else { + continue; + }; + + for section in page_sections { + let Some(reason) = + section.get("unsupported_reason").and_then(serde_json::Value::as_str) + else { + continue; + }; + + sections.push(serde_json::json!({ + "section_id": section + .get("section_id") + .and_then(serde_json::Value::as_str) + .unwrap_or("unsupported-summary"), + "heading": section + .get("heading") + .and_then(serde_json::Value::as_str) + .unwrap_or("Unsupported Summary"), + "role": section.get("role").and_then(serde_json::Value::as_str).unwrap_or("summary"), + "content": section.get("content").and_then(serde_json::Value::as_str).unwrap_or(reason), + "evidence_ids": [], + "timeline_event_ids": [], + "unsupported_reason": reason + })); + } + } + + sections +} + +fn stale_trap_evidence_ids(loaded: &LoadedJob) -> Vec { + loaded + .value + .get("negative_traps") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter(|trap| { + trap.get("type").and_then(serde_json::Value::as_str) == Some("stale_fact") + && trap.get("failure_if_used").and_then(serde_json::Value::as_bool).unwrap_or(false) + }) + .flat_map(|trap| { + trap.get("evidence_ids") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .map(ToString::to_string) + .collect::>() + }) + .collect() +} + +fn trap_id_for_evidence(loaded: &LoadedJob, evidence_id: &str) -> Option { + loaded + .value + .get("negative_traps") + .and_then(serde_json::Value::as_array)? + .iter() + .find(|trap| { + trap.get("evidence_ids") + .and_then(serde_json::Value::as_array) + .is_some_and(|ids| ids.iter().any(|id| id.as_str() == Some(evidence_id))) + }) + .and_then(|trap| trap.get("trap_id").and_then(serde_json::Value::as_str)) + .map(ToString::to_string) +} + +fn elf_selected_evidence_text( + loaded: &LoadedJob, + stored_corpus: &[CorpusText], + evidence_ids: &[String], + ingested: &IngestedCorpus, + capture_failure: &Option, +) -> ( + SelectedEvidenceText, + Option, + Option>, +) { + if let Some(failure) = capture_failure { + return ( + SelectedEvidenceText { content: failure.clone(), evidence_ids: Vec::new() }, + None, + None, + ); + } + if let Some(selection) = + temporal_reconciliation_selection(loaded, stored_corpus, evidence_ids, ingested) + { + return (selection.selected, Some(selection.evidence), Some(selection.trace_stages)); + } + + (selected_required_corpus_texts(loaded, stored_corpus, evidence_ids), None, None) +} + +async fn run_lightrag_async(args: LightragArgs) -> color_eyre::Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let run_slug = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); + let result = materialize_lightrag_jobs(&args, &jobs, &run_slug).await; + let materialized = match result { + Ok(jobs) => jobs, + Err(err) => lightrag_failure_jobs( + &args.adapter_id, + &jobs, + "lightrag_api_context_export", + err.to_string(), + ), + }; + let status = aggregate_status(&materialized); + + write_materialized_output(MaterializedOutput { + adapter_id: &args.adapter_id, + adapter_kind: AdapterKind::LightragApiContextExport, + fixtures: &args.fixtures, + out_fixtures: &args.out_fixtures, + evidence_out: &args.evidence_out, + jobs: &jobs, + materialized: &materialized, + command_evidence: vec![CommandEvidence { + label: "lightrag_api_context_export".to_string(), + status, + command: "cargo run -p elf-eval --bin real_world_live_adapter -- lightrag" + .to_string(), + artifact: Some(args.evidence_out.display().to_string()), + reason: "LightRAG adapter used /documents/texts, /documents/track_status, and /query with only_need_context plus chunk references.".to_string(), + }], + metadata: Some(lightrag_metadata(&args, &run_slug)), + }) +} + +async fn materialize_lightrag_jobs( + args: &LightragArgs, + jobs: &[LoadedJob], + run_slug: &str, +) -> color_eyre::Result> { + fs::create_dir_all(&args.work_dir)?; + + let client = reqwest::Client::builder().timeout(Duration::from_secs(180)).build()?; + + wait_for_lightrag(args, &client).await?; + + let mut out = Vec::with_capacity(jobs.len()); + + for loaded in jobs { + out.push(materialize_lightrag_job(args, &client, loaded, run_slug).await?); + } + + Ok(out) +} + +async fn wait_for_lightrag( + args: &LightragArgs, + client: &reqwest::Client, +) -> color_eyre::Result<()> { + let mut last_error = String::new(); + + for _attempt in 1..=args.startup_attempts { + match lightrag_get_json(args, client, "/health").await { + Ok(_) => return Ok(()), + Err(err) => last_error = err.to_string(), + } + + time::sleep(Duration::from_secs(args.startup_interval_seconds)).await; + } + + Err(eyre::eyre!( + "LightRAG API did not become healthy at {} after {} attempts: {}", + lightrag_api_base(args), + args.startup_attempts, + last_error + )) +} + +async fn materialize_lightrag_job( + args: &LightragArgs, + client: &reqwest::Client, + loaded: &LoadedJob, + run_slug: &str, +) -> color_eyre::Result { + if let Some(job) = declared_encoding_job(&args.adapter_id, loaded) { + return Ok(job); + } + if let Some(job) = lightrag_not_encoded_job(&args.adapter_id, loaded) { + return Ok(job); + } + + let corpus = corpus_texts(loaded)?; + let sources = write_lightrag_corpus(args, loaded, &corpus, run_slug)?; + let indexed_at = Instant::now(); + let insert_response = insert_lightrag_texts(args, client, &corpus, &sources).await?; + + wait_for_lightrag_index(args, client, &insert_response, corpus.len()).await?; + + let indexing_latency_ms = indexed_at.elapsed().as_secs_f64() * 1_000.0; + let queried_at = Instant::now(); + let query_response = query_lightrag_context(args, client, loaded).await?; + let latency_ms = queried_at.elapsed().as_secs_f64() * 1_000.0; + let source_mappings = lightrag_source_mappings(&corpus, &sources, &query_response); + let evidence_ids = lightrag_mapped_evidence_ids(&source_mappings); + let selected = selected_required_corpus_texts(loaded, &corpus, &evidence_ids); + + Ok(materialized_job( + loaded, + &args.adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + pages: Vec::new(), + latency_ms, + indexing_latency_ms: Some(indexing_latency_ms), + returned_count: source_mappings.len(), + trace_id: None, + failure: None, + source_mappings, + operator_debug: None, + operator_debug_evidence: None, + capture: None, + capture_failure: None, + consolidation_response: None, + consolidation: None, + knowledge: None, + temporal_reconciliation: None, + trace_stages: None, + }, + )) +} + +async fn insert_lightrag_texts( + args: &LightragArgs, + client: &reqwest::Client, + corpus: &[CorpusText], + sources: &[LightragSource], +) -> color_eyre::Result { + let request = serde_json::json!({ + "texts": corpus.iter().map(|item| item.text.as_str()).collect::>(), + "file_sources": sources.iter().map(|source| source.file_source.as_str()).collect::>(), + "chunking": { + "strategy": "fixed_token", + "params": { + "chunk_token_size": 320, + "chunk_overlap_token_size": 32 + } + } + }); + + lightrag_post_json(args, client, "/documents/texts", &request).await +} + +async fn wait_for_lightrag_index( + args: &LightragArgs, + client: &reqwest::Client, + insert_response: &serde_json::Value, + expected_docs: usize, +) -> color_eyre::Result<()> { + let track_id = insert_response + .get("track_id") + .and_then(serde_json::Value::as_str) + .ok_or_else(|| eyre::eyre!("LightRAG text insert response did not include track_id."))?; + let mut last_status = serde_json::Value::Null; + + for _attempt in 1..=args.index_attempts { + let status = + lightrag_get_json(args, client, format!("/documents/track_status/{track_id}")).await?; + + if lightrag_index_failed(&status) { + return Err(eyre::eyre!( + "LightRAG document indexing failed for track_id {track_id}: {}", + serde_json::to_string(&status)? + )); + } + if lightrag_index_processed(&status, expected_docs) { + return Ok(()); + } + + last_status = status; + + time::sleep(Duration::from_secs(args.index_interval_seconds)).await; + } + + Err(eyre::eyre!( + "LightRAG document indexing did not finish for track_id {} after {} attempts: {}", + track_id, + args.index_attempts, + serde_json::to_string(&last_status)? + )) +} + +async fn query_lightrag_context( + args: &LightragArgs, + client: &reqwest::Client, + loaded: &LoadedJob, +) -> color_eyre::Result { + let keywords = lightrag_keywords(loaded.job.prompt.content.as_str()); + let request = serde_json::json!({ + "query": loaded.job.prompt.content, + "mode": args.query_mode, + "only_need_context": true, + "include_references": true, + "include_chunk_content": true, + "enable_rerank": false, + "top_k": args.top_k, + "chunk_top_k": args.chunk_top_k, + "hl_keywords": keywords, + "ll_keywords": keywords, + "stream": false + }); + + lightrag_post_json(args, client, "/query", &request).await +} + +async fn lightrag_get_json( + args: &LightragArgs, + client: &reqwest::Client, + path: impl AsRef, +) -> color_eyre::Result { + let url = format!("{}{}", lightrag_api_base(args), path.as_ref()); + let mut request = client.get(url); + + if let Some(api_key) = args.api_key.as_deref().filter(|key| !key.is_empty()) { + request = request.bearer_auth(api_key); + } + + lightrag_send_json(request).await +} + +async fn lightrag_post_json( + args: &LightragArgs, + client: &reqwest::Client, + path: &str, + body: &serde_json::Value, +) -> color_eyre::Result { + let url = format!("{}{}", lightrag_api_base(args), path); + let mut request = client.post(url).json(body); + + if let Some(api_key) = args.api_key.as_deref().filter(|key| !key.is_empty()) { + request = request.bearer_auth(api_key); + } + + lightrag_send_json(request).await +} + +async fn lightrag_send_json(request: RequestBuilder) -> color_eyre::Result { + let response = request.send().await?; + let status = response.status(); + let body = response.text().await?; + + if !status.is_success() { + return Err(eyre::eyre!("LightRAG API returned HTTP {status}: {body}")); + } + + serde_json::from_str(&body) + .map_err(|err| eyre::eyre!("LightRAG API returned invalid JSON: {err}; body={body}")) +} + +#[tokio::main] +async fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + match Args::parse().command { + CommandArgs::Elf(args) => run_elf(args).await, + CommandArgs::Qmd(args) => run_qmd(args), + CommandArgs::Lightrag(args) => run_lightrag_async(args).await, + } +} + +async fn run_elf(args: ElfArgs) -> color_eyre::Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let result = materialize_elf_jobs(&args, &jobs).await; + let materialized = match result { + Ok(jobs) => jobs, + Err(err) => failure_jobs(&args.adapter_id, &jobs, "elf_service_runtime", err.to_string()), + }; + + write_materialized_output(MaterializedOutput { + adapter_id: &args.adapter_id, + adapter_kind: AdapterKind::ElfServiceRuntime, + fixtures: &args.fixtures, + out_fixtures: &args.out_fixtures, + evidence_out: &args.evidence_out, + jobs: &jobs, + materialized: &materialized, + command_evidence: vec![CommandEvidence { + label: "elf_service_runtime".to_string(), + status: aggregate_status(&materialized), + command: "cargo run -p elf-eval --bin real_world_live_adapter -- elf".to_string(), + artifact: Some(args.evidence_out.display().to_string()), + reason: "ELF live adapter used ElfService, worker indexing, and search_raw." + .to_string(), + }], + metadata: None, + }) +} + +async fn materialize_elf_jobs( + args: &ElfArgs, + jobs: &[LoadedJob], +) -> color_eyre::Result> { + let base_dsn = env::var("ELF_PG_DSN") + .map_err(|_| eyre::eyre!("ELF_PG_DSN must be set for ELF live real-world adapter."))?; + let qdrant_url = env::var("ELF_QDRANT_GRPC_URL") + .or_else(|_| env::var("ELF_QDRANT_URL")) + .map_err(|_| eyre::eyre!("ELF_QDRANT_GRPC_URL or ELF_QDRANT_URL must be set."))?; + let test_db = TestDatabase::new(&base_dsn).await?; + let run_suffix = short_hash(format!("{}:{}", args.adapter_id, Uuid::new_v4()).as_str()); + let runtime = BaselineRuntime { + config_path: args.config.clone(), + dsn: test_db.dsn().to_string(), + qdrant_url, + collection: format!("elf_live_real_world_{run_suffix}"), + docs_collection: format!("elf_live_real_world_docs_{run_suffix}"), + }; + let service = build_service(&runtime).await?; + let mut out = Vec::with_capacity(jobs.len()); + + for loaded in jobs { + out.push(materialize_elf_job(&runtime, &service, loaded, &args.adapter_id).await?); + } + + drop(service); + + test_db.cleanup().await?; + + Ok(out) +} + +async fn materialize_elf_job( + runtime: &BaselineRuntime, + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, +) -> color_eyre::Result { + if let Some(job) = declared_encoding_job(adapter_id, loaded) { + return Ok(job); + } + if let Some(job) = not_encoded_job(adapter_id, loaded) { + return Ok(job); + } + + let corpus = corpus_texts(loaded)?; + let stored_corpus = elf_stored_corpus_texts(&corpus)?; + let project_id = project_id_for_job(&loaded.job.job_id); + let ingested = + ingest_elf_corpus(service, loaded, adapter_id, project_id.as_str(), &corpus).await?; + + run_worker(runtime).await?; + + let started_at = Instant::now(); + let response = service + .search_raw(SearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + token_id: None, + payload_level: PayloadLevel::L2, + read_profile: "private_only".to_string(), + query: loaded.job.prompt.content.clone(), + top_k: Some(5), + candidate_k: Some(20), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .map_err(|err| eyre::eyre!("ELF search_raw failed for {}: {err}", loaded.job.job_id))?; + let latency_ms = started_at.elapsed().as_secs_f64() * 1_000.0; + let mut evidence_ids = Vec::new(); + + for item in &response.items { + if let Some(evidence_id) = + item.source_ref.get("evidence_id").and_then(serde_json::Value::as_str) + { + push_unique(&mut evidence_ids, evidence_id.to_string()); + } + } + + let runtime_capture = capture_runtime_evidence_from_search_items(&response.items); + let capture = capture_with_runtime_source_refs(ingested.capture.clone(), &runtime_capture); + let capture_failure = validate_capture_runtime_evidence( + loaded.job.suite.as_str(), + &corpus, + &capture, + &runtime_capture, + ); + let (selected, temporal_reconciliation, trace_stages) = elf_selected_evidence_text( + loaded, + &stored_corpus, + &evidence_ids, + &ingested, + &capture_failure, + ); + let replay_command = elf_replay_command(response.trace_id, project_id.as_str()); + let (operator_debug, operator_debug_evidence) = operator_debug_output( + AdapterKind::ElfServiceRuntime, + loaded, + Some(response.trace_id), + replay_command, + format!( + "/v2/admin/traces/{}/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + response.trace_id + ), + ); + let (pages, knowledge, knowledge_failure) = + match materialize_elf_knowledge(service, loaded, &ingested, adapter_id).await { + Ok(output) => output, + Err(err) if loaded.job.suite == "knowledge_compilation" => + (Vec::new(), None, Some(format!("live_adapter.knowledge: {err}"))), + Err(_) => (Vec::new(), None, None), + }; + let (consolidation_response, consolidation, consolidation_failure) = + match materialize_elf_consolidation(runtime, service, loaded, &ingested, adapter_id).await { + Ok(output) => output, + Err(err) if loaded.job.suite == "consolidation" => + (None, None, Some(format!("live_adapter.consolidation: {err}"))), + Err(_) => (None, None, None), + }; + let failure = knowledge_failure.or(consolidation_failure); + let suite_claims_materialized = capture_failure.is_none() + && ((loaded.job.suite == "knowledge_compilation" && knowledge.is_some()) + || (loaded.job.suite == "consolidation" && consolidation.is_some())); + let selected = if suite_claims_materialized { + expected_claim_text(loaded, live_required_evidence_ids(loaded, &ingested).as_slice()) + } else { + selected + }; + + Ok(materialized_job( + loaded, + adapter_id, + MaterializedJobInput { + content: selected.content, + evidence_ids: selected.evidence_ids, + pages, + latency_ms, + indexing_latency_ms: None, + returned_count: response.items.len(), + trace_id: Some(response.trace_id), + failure, + source_mappings: Vec::new(), + operator_debug, + operator_debug_evidence, + capture: capture_for_job(loaded, capture), + capture_failure, + consolidation_response, + consolidation, + knowledge, + temporal_reconciliation, + trace_stages, + }, + )) +} + +async fn materialize_elf_consolidation( + runtime: &BaselineRuntime, + service: &ElfService, + loaded: &LoadedJob, + ingested: &IngestedCorpus, + adapter_id: &str, +) -> color_eyre::Result<( + Option, + Option, + Option, +)> { + if loaded.job.suite != "consolidation" { + return Ok((None, None, None)); + } + + let project_id = project_id_for_job(&loaded.job.job_id); + let fixture = live_consolidation_fixture(loaded)?; + let corpus = corpus_texts(loaded)?; + let prepared = prepare_consolidation_run(loaded, adapter_id, ingested, &fixture, &corpus)?; + let run = service + .consolidation_run_create(ConsolidationRunCreateRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + job_kind: "fixture".to_string(), + input_refs: prepared.input_refs.clone(), + source_snapshot: serde_json::json!({ + "schema": "real_world_live_consolidation_run_snapshot/v1", + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "source_ref_count": prepared.input_refs.len() + }), + lineage: ConsolidationLineage { + source_refs: prepared.input_refs.clone(), + parent_run_id: None, + parent_proposal_ids: Vec::new(), + }, + proposals: prepared.proposals, + }) + .await + .map_err(|err| { + eyre::eyre!("ELF consolidation_run_create failed for {}: {err}", loaded.job.job_id) + })?; + + run_worker(runtime).await?; + + let reviewed = review_live_consolidation_proposals( + service, + loaded, + project_id.as_str(), + run.run.run_id, + &fixture, + ) + .await?; + let consolidation_response = live_consolidation_response(&fixture, &reviewed)?; + let evidence = consolidation_materialization_evidence( + run.run.run_id, + &fixture, + &prepared.input_refs, + &reviewed, + ); + + Ok((Some(consolidation_response), Some(evidence), None)) +} + +async fn materialize_elf_knowledge( + service: &ElfService, + loaded: &LoadedJob, + ingested: &IngestedCorpus, + adapter_id: &str, +) -> color_eyre::Result<( + Vec, + Option, + Option, +)> { + if loaded.job.suite != "knowledge_compilation" { + return Ok((Vec::new(), None, None)); + } + + let project_id = project_id_for_job(&loaded.job.job_id); + let note_ids = live_note_ids(ingested); + + if note_ids.is_empty() { + return Err(eyre::eyre!( + "{} has no live note sources for knowledge rebuild.", + loaded.job.job_id + )); + } + + let page_key = slug(&loaded.job.job_id); + let request = KnowledgePageRebuildRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + agent_id: AGENT_ID.to_string(), + page_kind: KnowledgePageKind::Project, + page_key, + title: Some(loaded.job.title.clone()), + note_ids: note_ids.clone(), + event_ids: Vec::new(), + relation_ids: Vec::new(), + proposal_ids: Vec::new(), + provider_metadata: serde_json::json!({ + "adapter_id": adapter_id, + "job_id": loaded.job.job_id, + "llm_derived": false, + "runtime_path": "ElfService::knowledge_page_rebuild" + }), + }; + let first = service.knowledge_page_rebuild(request.clone()).await.map_err(|err| { + eyre::eyre!("ELF knowledge_page_rebuild failed for {}: {err}", loaded.job.job_id) + })?; + let second = service.knowledge_page_rebuild(request).await.map_err(|err| { + eyre::eyre!("ELF second knowledge_page_rebuild failed for {}: {err}", loaded.job.job_id) + })?; + + update_stale_trap_sources(service, loaded, adapter_id, project_id.as_str()).await?; + + let lint = service + .knowledge_page_lint(KnowledgePageLintRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.clone(), + page_id: second.page.page.page_id, + }) + .await + .map_err(|err| { + eyre::eyre!("ELF knowledge_page_lint failed for {}: {err}", loaded.job.job_id) + })?; + let search = service + .knowledge_pages_search(KnowledgePageSearchRequest { + tenant_id: TENANT_ID.to_string(), + project_id, + query: "source notes".to_string(), + page_kind: Some(KnowledgePageKind::Project), + limit: Some(10), + }) + .await + .map_err(|err| { + eyre::eyre!("ELF knowledge_pages_search failed for {}: {err}", loaded.job.job_id) + })?; + let page = knowledge_page_artifact(loaded, ingested, &first.page, &second.page, &lint)?; + let evidence = knowledge_materialization_evidence(&second.page, &lint, search.items.len()); + + Ok((vec![page], Some(evidence), None)) +} + +async fn ingest_elf_corpus( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, + corpus: &[CorpusText], +) -> color_eyre::Result { + let mut ingested = IngestedCorpus::default(); + + for item in corpus { + if item.capture.action == LiveCaptureAction::Exclude { + push_unique(&mut ingested.capture.excluded_evidence_ids, item.evidence_id.clone()); + + continue; + } + + push_unique(&mut ingested.capture.stored_evidence_ids, item.evidence_id.clone()); + + if let Some(source_id) = item.capture.source_id.as_deref() { + push_unique(&mut ingested.capture.source_ids, source_id.to_string()); + } + + if item.capture.write_policy.is_some() { + let note_id = ingest_elf_corpus_item( + service, + loaded, + adapter_id, + project_id, + item, + item.evidence_id.clone(), + item.text.clone(), + 0, + 1, + &mut ingested.capture, + ) + .await?; + + ingested + .note_ids_by_evidence + .entry(item.evidence_id.clone()) + .or_default() + .push(note_id); + + continue; + } + + let chunks = note_text_chunks(item.text.as_str()); + let chunk_count = chunks.len(); + + for (chunk_index, text) in chunks.into_iter().enumerate() { + let key = if chunk_count == 1 { + item.evidence_id.clone() + } else { + format!("{}:chunk-{chunk_index:03}", item.evidence_id) + }; + let note_id = ingest_elf_corpus_item( + service, + loaded, + adapter_id, + project_id, + item, + key, + text, + chunk_index, + chunk_count, + &mut ingested.capture, + ) + .await?; + + ingested + .note_ids_by_evidence + .entry(item.evidence_id.clone()) + .or_default() + .push(note_id); + } + } + + Ok(ingested) +} + +#[allow(clippy::too_many_arguments)] +async fn ingest_elf_corpus_item( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, + item: &CorpusText, + key: String, + text: String, + chunk_index: usize, + chunk_count: usize, + capture: &mut CaptureMaterializationEvidence, +) -> color_eyre::Result { + let write_policy = item + .capture + .write_policy + .as_ref() + .map(|policy| write_policy_from_value(policy, item.evidence_id.as_str())) + .transpose()?; + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(key), + text, + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": item.evidence_id, + "source_id": item.capture.source_id.as_deref(), + "capture_action": capture_action_str(item.capture.action), + "evidence_binding": item.capture.evidence_binding.as_deref(), + "write_policy_applied": item.capture.write_policy.is_some(), + "chunk_index": chunk_index, + "chunk_count": chunk_count, + }), + write_policy, + }], + }) + .await + .map_err(|err| eyre::eyre!("ELF add_note failed for {}: {err}", loaded.job.job_id))?; + + for result in &response.results { + if let Some(audit) = &result.write_policy_audit + && (!audit.exclusions.is_empty() || !audit.redactions.is_empty()) + { + capture.write_policy_audit_count += 1; + capture.write_policy_exclusion_count += audit.exclusions.len(); + capture.write_policy_redaction_count += audit.redactions.len(); + } + } + + response.results.iter().find_map(|result| result.note_id).ok_or_else(|| { + eyre::eyre!( + "ELF add_note did not persist evidence {} chunk {} for {}.", + item.evidence_id, + chunk_index, + loaded.job.job_id + ) + }) +} + +async fn review_live_consolidation_proposals( + service: &ElfService, + loaded: &LoadedJob, + project_id: &str, + run_id: Uuid, + fixture: &LiveConsolidationFixture, +) -> color_eyre::Result> { + let listed = service + .consolidation_proposals_list(ConsolidationProposalsListRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + run_id: Some(run_id), + review_state: None, + limit: Some(100), + }) + .await + .map_err(|err| { + eyre::eyre!("ELF consolidation proposal list failed for {}: {err}", loaded.job.job_id) + })?; + let mut reviewed = Vec::new(); + + for (index, proposal) in listed.proposals.into_iter().enumerate() { + let fixture_proposal = fixture.proposals.get(index).ok_or_else(|| { + eyre::eyre!( + "ELF consolidation materialized extra proposal {} for {}.", + proposal.proposal_id, + loaded.job.job_id + ) + })?; + let review_action = + consolidation_review_action(fixture_proposal.actual_review_action.as_str())?; + + reviewed.push( + service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + reviewer_agent_id: AGENT_ID.to_string(), + proposal_id: proposal.proposal_id, + review_action, + review_comment: Some( + "Live adapter review transition for real-world benchmark evidence." + .to_string(), + ), + }) + .await + .map_err(|err| { + eyre::eyre!( + "ELF consolidation proposal review failed for {}: {err}", + loaded.job.job_id + ) + })?, + ); + } + + validate_reviewed_consolidation_count(loaded, fixture, &reviewed)?; + + Ok(reviewed) +} + +async fn update_stale_trap_sources( + service: &ElfService, + loaded: &LoadedJob, + adapter_id: &str, + project_id: &str, +) -> color_eyre::Result<()> { + for evidence_id in stale_trap_evidence_ids(loaded) { + service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: project_id.to_string(), + agent_id: AGENT_ID.to_string(), + scope: SCOPE.to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(evidence_id.clone()), + text: format!( + "Current lint probe: evidence {evidence_id} changed after the knowledge page rebuild and should mark the derived page source snapshot stale." + ), + structured: None, + importance: 0.9, + confidence: 0.95, + ttl_days: None, + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "adapter": adapter_id, + "job_id": loaded.job.job_id, + "evidence_id": evidence_id, + "lint_probe": "stale_source_ref" + }), + write_policy: None, + }], + }) + .await + .map_err(|err| { + eyre::eyre!( + "ELF add_note stale-source update failed for {}: {err}", + loaded.job.job_id + ) + })?; + } + + Ok(()) +} + +async fn build_service(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let vector_dim = cfg.storage.qdrant.vector_dim; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + Ok(ElfService::with_providers(cfg, db, qdrant, deterministic_providers(vector_dim))) +} + +async fn build_worker_state(runtime: &BaselineRuntime) -> color_eyre::Result { + let cfg = runtime_config(runtime)?; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + qdrant.ensure_collection().await?; + + let docs_qdrant = + QdrantStore::new_with_collection(&cfg.storage.qdrant, &cfg.storage.qdrant.docs_collection)?; + + docs_qdrant.ensure_collection().await?; + + let tokenizer = elf_chunking::load_tokenizer(&cfg.chunking.tokenizer_repo) + .map_err(|err| eyre::eyre!("Failed to load tokenizer for live adapter worker: {err}"))?; + let chunking = ChunkingConfig { + max_tokens: cfg.chunking.max_tokens, + overlap_tokens: cfg.chunking.overlap_tokens, + }; + + Ok(WorkerState { + db, + qdrant, + docs_qdrant, + embedding: cfg.providers.embedding, + chunking, + tokenizer, + }) +} + +async fn run_worker(runtime: &BaselineRuntime) -> color_eyre::Result<()> { + let state = Arc::new(build_worker_state(runtime).await?); + + for _ in 0..8 { + let state = Arc::clone(&state); + let mut set = JoinSet::new(); + + set.spawn(async move { + worker::process_once(&state) + .await + .map_err(|err| eyre::eyre!("Worker process_once failed: {err}")) + }); + + while let Some(joined) = set.join_next().await { + joined??; + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use serde_json::Value; + + fn capture_item( + evidence_id: &str, + action: super::LiveCaptureAction, + source_id: Option<&str>, + evidence_binding: Option<&str>, + write_policy: Option, + ) -> super::CorpusText { + super::CorpusText { + evidence_id: evidence_id.to_string(), + text: "Public capture text.".to_string(), + capture: super::LiveCapturePolicy { + action, + source_id: source_id.map(ToString::to_string), + evidence_binding: evidence_binding.map(ToString::to_string), + write_policy, + }, + } + } + + fn capture_evidence( + stored: &[&str], + excluded: &[&str], + ) -> super::CaptureMaterializationEvidence { + super::CaptureMaterializationEvidence { + stored_evidence_ids: stored.iter().map(|id| (*id).to_string()).collect(), + excluded_evidence_ids: excluded.iter().map(|id| (*id).to_string()).collect(), + source_ids: Vec::new(), + write_policy_audit_count: 0, + write_policy_exclusion_count: 0, + write_policy_redaction_count: 0, + runtime_source_refs: Vec::new(), + } + } + + #[test] + fn capture_runtime_validation_requires_returned_source_id() { + let corpus = vec![capture_item( + "source-a", + super::LiveCaptureAction::Store, + Some("capture:a"), + None, + None, + )]; + let capture = capture_evidence(&["source-a"], &[]); + let runtime = super::capture_runtime_evidence_from_source_refs([&serde_json::json!({ + "evidence_id": "source-a", + "capture_action": "store" + })]); + let failure = super::validate_capture_runtime_evidence( + "capture_integration", + &corpus, + &capture, + &runtime, + ) + .expect("missing runtime source_id should fail capture validation"); + + assert!(failure.contains("did not return expected source_id capture:a")); + } + + #[test] + fn capture_runtime_validation_rejects_returned_excluded_evidence() { + let corpus = vec![capture_item( + "private-trap", + super::LiveCaptureAction::Exclude, + Some("capture:private"), + Some("negative_trap"), + None, + )]; + let capture = capture_evidence(&[], &["private-trap"]); + let runtime = super::capture_runtime_evidence_from_source_refs([&serde_json::json!({ + "evidence_id": "private-trap", + "source_id": "capture:private", + "capture_action": "store" + })]); + let failure = super::validate_capture_runtime_evidence( + "capture_integration", + &corpus, + &capture, + &runtime, + ) + .expect("returned excluded evidence should fail capture validation"); + + assert!(failure.contains("excluded evidence private-trap was returned by live search")); + } + + #[test] + fn capture_runtime_source_refs_are_written_into_generated_fixture() { + let mut value = serde_json::json!({ + "corpus": { + "items": [ + { + "evidence_id": "source-a", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "fixture" + } + } + ] + } + }); + let mut capture = capture_evidence(&["source-a"], &[]); + + capture.runtime_source_refs.push(super::CaptureRuntimeSourceRefEvidence { + evidence_id: "source-a".to_string(), + source_ref: serde_json::json!({ + "schema": "real_world_live_adapter/v1", + "evidence_id": "source-a", + "source_id": "capture:a", + "capture_action": "store", + "evidence_binding": "source_ref" + }), + }); + + super::apply_capture_runtime_source_refs(&mut value, &capture); + + assert_eq!( + value + .pointer("/corpus/items/0/source_ref/source_id") + .and_then(serde_json::Value::as_str), + Some("capture:a") + ); + assert_eq!( + value + .pointer("/corpus/items/0/source_ref/evidence_binding") + .and_then(serde_json::Value::as_str), + Some("source_ref") + ); + } +} diff --git a/apps/elf-eval/src/bin/trace_gate_export.rs b/apps/elf-eval/src/bin/trace_gate_export.rs new file mode 100644 index 00000000..2f9c40fb --- /dev/null +++ b/apps/elf-eval/src/bin/trace_gate_export.rs @@ -0,0 +1,606 @@ +#![allow(unused_crate_dependencies)] + +//! CLI for exporting trace fixtures used by regression gates. + +use std::{fs, path::PathBuf}; + +use clap::Parser; +use color_eyre::Result; +use serde_json::Value; +use sqlx::FromRow; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use tracing_subscriber::EnvFilter; +use uuid::Uuid; + +use elf_storage::db::Db; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + /// Path to an ELF config file (used for Postgres DSN). + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + /// One or more trace IDs to export. + #[arg(long, value_name = "UUID", required = true)] + trace_id: Vec, + /// Write SQL to this file (defaults to stdout). + #[arg(long, value_name = "FILE")] + out: Option, + /// Include trace items (search_trace_items). + #[arg(long, default_value_t = true)] + include_items: bool, + /// Include trace stages (search_trace_stages and search_trace_stage_items). + #[arg(long, default_value_t = false)] + include_stages: bool, +} + +#[derive(Debug, FromRow)] +struct TraceRow { + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + expansion_mode: String, + expanded_queries: Value, + allowed_scopes: Value, + candidate_count: i32, + top_k: i32, + config_snapshot: Value, + trace_version: i32, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +#[derive(Debug, FromRow)] +struct CandidateRow { + candidate_id: Uuid, + trace_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + snippet: String, + candidate_snapshot: Value, + retrieval_rank: i32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + note_hit_count: i64, + note_last_hit_at: Option, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +#[derive(Debug, FromRow)] +struct ItemRow { + item_id: Uuid, + trace_id: Uuid, + note_id: Uuid, + chunk_id: Option, + rank: i32, + final_score: f32, + explain: Value, +} + +#[derive(Debug, FromRow)] +struct StageRow { + stage_id: Uuid, + trace_id: Uuid, + stage_order: i32, + stage_name: String, + stage_payload: Value, + created_at: OffsetDateTime, +} + +#[derive(Debug, FromRow)] +struct StageItemRow { + id: Uuid, + stage_id: Uuid, + item_id: Option, + note_id: Option, + chunk_id: Option, + metrics: Value, +} + +fn normalize_trace_ids(trace_ids: &[Uuid]) -> Vec { + let mut out = trace_ids.to_vec(); + + out.sort_unstable(); + out.dedup(); + + out +} + +fn render_fixture_sql( + args: &Args, + traces: &[TraceRow], + candidates: &[CandidateRow], + items: &[ItemRow], + stages: &[StageRow], + stage_items: &[StageItemRow], +) -> Result { + let mut out = String::new(); + + render_preamble(args, &mut out); + render_traces(&mut out, traces)?; + render_candidates(&mut out, candidates)?; + render_items(&mut out, items)?; + render_stages(&mut out, stages)?; + render_stage_items(&mut out, stage_items)?; + + out.push_str("COMMIT;\n"); + + Ok(out) +} + +fn render_preamble(args: &Args, out: &mut String) { + out.push_str("-- Generated by `elf-eval trace_gate_export`.\n"); + out.push_str(&format!( + "-- trace_ids: {}\n", + args.trace_id.iter().map(|id| id.to_string()).collect::>().join(", ") + )); + out.push_str("BEGIN;\n\n"); +} + +fn render_traces(out: &mut String, traces: &[TraceRow]) -> Result<()> { + if traces.is_empty() { + return Ok(()); + } + + out.push_str("INSERT INTO search_traces (\n"); + out.push_str(" trace_id,\n"); + out.push_str(" tenant_id,\n"); + out.push_str(" project_id,\n"); + out.push_str(" agent_id,\n"); + out.push_str(" read_profile,\n"); + out.push_str(" query,\n"); + out.push_str(" expansion_mode,\n"); + out.push_str(" expanded_queries,\n"); + out.push_str(" allowed_scopes,\n"); + out.push_str(" candidate_count,\n"); + out.push_str(" top_k,\n"); + out.push_str(" config_snapshot,\n"); + out.push_str(" trace_version,\n"); + out.push_str(" created_at,\n"); + out.push_str(" expires_at\n"); + out.push_str(")\nVALUES\n"); + + for (idx, row) in traces.iter().enumerate() { + out.push_str(" ("); + out.push_str(&sql_uuid(&row.trace_id)); + out.push_str(", "); + out.push_str(&sql_text(&row.tenant_id)); + out.push_str(", "); + out.push_str(&sql_text(&row.project_id)); + out.push_str(", "); + out.push_str(&sql_text(&row.agent_id)); + out.push_str(", "); + out.push_str(&sql_text(&row.read_profile)); + out.push_str(", "); + out.push_str(&sql_text(&row.query)); + out.push_str(", "); + out.push_str(&sql_text(&row.expansion_mode)); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.expanded_queries)?); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.allowed_scopes)?); + out.push_str(", "); + out.push_str(&row.candidate_count.to_string()); + out.push_str(", "); + out.push_str(&row.top_k.to_string()); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.config_snapshot)?); + out.push_str(", "); + out.push_str(&row.trace_version.to_string()); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.created_at)?); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.expires_at)?); + out.push(')'); + + if idx + 1 == traces.len() { + out.push_str(";\n\n"); + } else { + out.push_str(",\n"); + } + } + + Ok(()) +} + +fn render_candidates(out: &mut String, candidates: &[CandidateRow]) -> Result<()> { + if candidates.is_empty() { + return Ok(()); + } + + out.push_str("INSERT INTO search_trace_candidates (\n"); + out.push_str(" candidate_id,\n"); + out.push_str(" trace_id,\n"); + out.push_str(" note_id,\n"); + out.push_str(" chunk_id,\n"); + out.push_str(" chunk_index,\n"); + out.push_str(" snippet,\n"); + out.push_str(" candidate_snapshot,\n"); + out.push_str(" retrieval_rank,\n"); + out.push_str(" rerank_score,\n"); + out.push_str(" note_scope,\n"); + out.push_str(" note_importance,\n"); + out.push_str(" note_updated_at,\n"); + out.push_str(" note_hit_count,\n"); + out.push_str(" note_last_hit_at,\n"); + out.push_str(" created_at,\n"); + out.push_str(" expires_at\n"); + out.push_str(")\nVALUES\n"); + + for (idx, row) in candidates.iter().enumerate() { + out.push_str(" ("); + out.push_str(&sql_uuid(&row.candidate_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.trace_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.note_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.chunk_id)); + out.push_str(", "); + out.push_str(&row.chunk_index.to_string()); + out.push_str(", "); + out.push_str(&sql_text(&row.snippet)); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.candidate_snapshot)?); + out.push_str(", "); + out.push_str(&row.retrieval_rank.to_string()); + out.push_str(", "); + out.push_str(&sql_f32(row.rerank_score)); + out.push_str(", "); + out.push_str(&sql_text(&row.note_scope)); + out.push_str(", "); + out.push_str(&sql_f32(row.note_importance)); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.note_updated_at)?); + out.push_str(", "); + out.push_str(&row.note_hit_count.to_string()); + out.push_str(", "); + out.push_str(&sql_opt_timestamptz(&row.note_last_hit_at)?); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.created_at)?); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.expires_at)?); + out.push(')'); + + if idx + 1 == candidates.len() { + out.push_str(";\n\n"); + } else { + out.push_str(",\n"); + } + } + + Ok(()) +} + +fn render_items(out: &mut String, items: &[ItemRow]) -> Result<()> { + if items.is_empty() { + return Ok(()); + } + + out.push_str("INSERT INTO search_trace_items (\n"); + out.push_str(" item_id,\n"); + out.push_str(" trace_id,\n"); + out.push_str(" note_id,\n"); + out.push_str(" chunk_id,\n"); + out.push_str(" rank,\n"); + out.push_str(" final_score,\n"); + out.push_str(" explain\n"); + out.push_str(")\nVALUES\n"); + + for (idx, row) in items.iter().enumerate() { + out.push_str(" ("); + out.push_str(&sql_uuid(&row.item_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.trace_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.note_id)); + out.push_str(", "); + out.push_str(&sql_opt_uuid(&row.chunk_id)); + out.push_str(", "); + out.push_str(&row.rank.to_string()); + out.push_str(", "); + out.push_str(&sql_f32(row.final_score)); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.explain)?); + out.push(')'); + + if idx + 1 == items.len() { + out.push_str(";\n\n"); + } else { + out.push_str(",\n"); + } + } + + Ok(()) +} + +fn render_stages(out: &mut String, stages: &[StageRow]) -> Result<()> { + if stages.is_empty() { + return Ok(()); + } + + out.push_str("INSERT INTO search_trace_stages (\n"); + out.push_str(" stage_id,\n"); + out.push_str(" trace_id,\n"); + out.push_str(" stage_order,\n"); + out.push_str(" stage_name,\n"); + out.push_str(" stage_payload,\n"); + out.push_str(" created_at\n"); + out.push_str(")\nVALUES\n"); + + for (idx, row) in stages.iter().enumerate() { + out.push_str(" ("); + out.push_str(&sql_uuid(&row.stage_id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.trace_id)); + out.push_str(", "); + out.push_str(&row.stage_order.to_string()); + out.push_str(", "); + out.push_str(&sql_text(&row.stage_name)); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.stage_payload)?); + out.push_str(", "); + out.push_str(&sql_timestamptz(&row.created_at)?); + out.push(')'); + + if idx + 1 == stages.len() { + out.push_str(";\n\n"); + } else { + out.push_str(",\n"); + } + } + + Ok(()) +} + +fn render_stage_items(out: &mut String, stage_items: &[StageItemRow]) -> Result<()> { + if stage_items.is_empty() { + return Ok(()); + } + + out.push_str("INSERT INTO search_trace_stage_items (\n"); + out.push_str(" id,\n"); + out.push_str(" stage_id,\n"); + out.push_str(" item_id,\n"); + out.push_str(" note_id,\n"); + out.push_str(" chunk_id,\n"); + out.push_str(" metrics\n"); + out.push_str(")\nVALUES\n"); + + for (idx, row) in stage_items.iter().enumerate() { + out.push_str(" ("); + out.push_str(&sql_uuid(&row.id)); + out.push_str(", "); + out.push_str(&sql_uuid(&row.stage_id)); + out.push_str(", "); + out.push_str(&sql_opt_uuid(&row.item_id)); + out.push_str(", "); + out.push_str(&sql_opt_uuid(&row.note_id)); + out.push_str(", "); + out.push_str(&sql_opt_uuid(&row.chunk_id)); + out.push_str(", "); + out.push_str(&sql_jsonb(&row.metrics)?); + out.push(')'); + + if idx + 1 == stage_items.len() { + out.push_str(";\n\n"); + } else { + out.push_str(",\n"); + } + } + + Ok(()) +} + +fn sql_uuid(id: &Uuid) -> String { + format!("'{}'", id) +} + +fn sql_opt_uuid(id: &Option) -> String { + id.map(|value| format!("'{}'", value)).unwrap_or_else(|| "NULL".to_string()) +} + +fn sql_text(value: &str) -> String { + format!("'{}'", value.replace('\'', "''")) +} + +fn sql_jsonb(value: &Value) -> Result { + let raw = serde_json::to_string(value)?; + + Ok(format!("'{}'::jsonb", raw.replace('\'', "''"))) +} + +fn sql_f32(value: f32) -> String { + format!("{value}") +} + +fn sql_timestamptz(value: &OffsetDateTime) -> Result { + let raw = value.format(&Rfc3339)?; + + Ok(format!("'{}'::timestamptz", raw.replace('\'', "''"))) +} + +fn sql_opt_timestamptz(value: &Option) -> Result { + match value { + Some(ts) => sql_timestamptz(ts), + None => Ok("NULL".to_string()), + } +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + + let args = Args::parse(); + let cfg = elf_config::load(&args.config)?; + let filter = EnvFilter::new(cfg.service.log_level.clone()); + + tracing_subscriber::fmt().with_env_filter(filter).init(); + + let trace_ids = normalize_trace_ids(&args.trace_id); + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let traces = fetch_traces(&db, &trace_ids).await?; + let candidates = fetch_candidates(&db, &trace_ids).await?; + let items = if args.include_items { fetch_items(&db, &trace_ids).await? } else { Vec::new() }; + let (stages, stage_items) = if args.include_stages { + let stages = fetch_stages(&db, &trace_ids).await?; + let stage_ids: Vec = stages.iter().map(|row| row.stage_id).collect(); + let stage_items = fetch_stage_items(&db, &stage_ids).await?; + + (stages, stage_items) + } else { + (Vec::new(), Vec::new()) + }; + let sql = render_fixture_sql(&args, &traces, &candidates, &items, &stages, &stage_items)?; + + if let Some(out_path) = &args.out { + fs::write(out_path, sql)?; + } else { + print!("{sql}"); + } + + Ok(()) +} + +async fn fetch_traces(db: &Db, trace_ids: &[Uuid]) -> Result> { + let rows: Vec = sqlx::query_as::<_, TraceRow>( + "\ +SELECT + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at, + expires_at +FROM search_traces +WHERE trace_id = ANY($1) +ORDER BY trace_id ASC", + ) + .bind(trace_ids) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_candidates(db: &Db, trace_ids: &[Uuid]) -> Result> { + let rows: Vec = sqlx::query_as::<_, CandidateRow>( + "\ +SELECT + candidate_id, + trace_id, + note_id, + chunk_id, + chunk_index, + snippet, + candidate_snapshot, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at, + created_at, + expires_at +FROM search_trace_candidates +WHERE trace_id = ANY($1) +ORDER BY trace_id ASC, retrieval_rank ASC, candidate_id ASC", + ) + .bind(trace_ids) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_items(db: &Db, trace_ids: &[Uuid]) -> Result> { + let rows: Vec = sqlx::query_as::<_, ItemRow>( + "\ +SELECT + item_id, + trace_id, + note_id, + chunk_id, + rank, + final_score, + explain +FROM search_trace_items +WHERE trace_id = ANY($1) +ORDER BY trace_id ASC, rank ASC, item_id ASC", + ) + .bind(trace_ids) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_stages(db: &Db, trace_ids: &[Uuid]) -> Result> { + let rows: Vec = sqlx::query_as::<_, StageRow>( + "\ +SELECT + stage_id, + trace_id, + stage_order, + stage_name, + stage_payload, + created_at +FROM search_trace_stages +WHERE trace_id = ANY($1) +ORDER BY trace_id ASC, stage_order ASC, stage_id ASC", + ) + .bind(trace_ids) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_stage_items(db: &Db, stage_ids: &[Uuid]) -> Result> { + if stage_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows: Vec = sqlx::query_as::<_, StageItemRow>( + "\ +SELECT + id, + stage_id, + item_id, + note_id, + chunk_id, + metrics +FROM search_trace_stage_items +WHERE stage_id = ANY($1) +ORDER BY stage_id ASC, id ASC", + ) + .bind(stage_ids) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} diff --git a/apps/elf-eval/src/bin/trace_regression_gate.rs b/apps/elf-eval/src/bin/trace_regression_gate.rs new file mode 100644 index 00000000..54716bf7 --- /dev/null +++ b/apps/elf-eval/src/bin/trace_regression_gate.rs @@ -0,0 +1,517 @@ +#![allow(unused_crate_dependencies)] + +//! CLI for evaluating trace-regression gates against stored traces. + +use std::{collections::HashSet, fs, path::PathBuf}; + +use clap::Parser; +use color_eyre::{Result, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::FromRow; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use tracing_subscriber::EnvFilter; +use uuid::Uuid; + +use elf_config::Config; +use elf_service::search::{self, TraceReplayContext}; +use elf_storage::db::Db; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + #[arg(long, short = 'c', value_name = "FILE")] + config: PathBuf, + #[arg(long, short = 'g', value_name = "FILE")] + gate: PathBuf, + #[arg(long, value_name = "FILE")] + out: Option, + #[arg(long, value_name = "N")] + top_k: Option, + #[arg(long, value_name = "N")] + retrieval_retention_rank: Option, +} + +#[derive(Clone, Copy, Debug, Default, Deserialize)] +#[serde(rename_all = "snake_case")] +struct GateThresholds { + max_positional_churn_at_k: Option, + max_set_churn_at_k: Option, + min_retrieval_top_rank_retention: Option, +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +struct GateTrace { + trace_id: Uuid, + top_k: Option, + retrieval_retention_rank: Option, + #[serde(flatten)] + thresholds: GateThresholds, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +struct GateFile { + #[serde(default)] + defaults: GateThresholds, + top_k: Option, + retrieval_retention_rank: Option, + traces: Vec, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct GateReport { + config_path: String, + gate_path: String, + summary: GateSummary, + traces: Vec, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct GateSummary { + trace_count: usize, + breached_count: usize, + ok: bool, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct TraceReport { + trace_id: Uuid, + query: String, + created_at: String, + top_k: u32, + retrieval_retention_rank: u32, + candidate_count: u32, + baseline_count: usize, + replay_count: usize, + churn: TraceChurn, + retention: TraceRetention, + breaches: Vec, + ok: bool, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct TraceChurn { + positional_churn_at_k: f64, + set_churn_at_k: f64, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct TraceRetention { + retrieval_top_rank_total: usize, + baseline_retrieval_top_rank_retained: usize, + baseline_retrieval_top_rank_retention: f64, + replay_retrieval_top_rank_retained: usize, + replay_retrieval_top_rank_retention: f64, + retention_delta: f64, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +struct GateBreach { + metric: String, + value: f64, + threshold: f64, + op: String, +} + +#[derive(Debug, FromRow)] +struct TraceRow { + trace_id: Uuid, + query: String, + candidate_count: i32, + top_k: i32, + created_at: OffsetDateTime, +} + +#[derive(Debug, FromRow)] +struct TraceItemRow { + note_id: Uuid, +} + +#[derive(Debug, FromRow)] +struct CandidateRow { + candidate_snapshot: Value, + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + snippet: String, + retrieval_rank: i32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + note_hit_count: i64, + note_last_hit_at: Option, +} + +fn load_gate_file(path: &PathBuf) -> Result { + let raw = fs::read_to_string(path)?; + let out: GateFile = serde_json::from_str(&raw)?; + + Ok(out) +} + +fn merge_thresholds(defaults: GateThresholds, overrides: GateThresholds) -> GateThresholds { + GateThresholds { + max_positional_churn_at_k: overrides + .max_positional_churn_at_k + .or(defaults.max_positional_churn_at_k), + max_set_churn_at_k: overrides.max_set_churn_at_k.or(defaults.max_set_churn_at_k), + min_retrieval_top_rank_retention: overrides + .min_retrieval_top_rank_retention + .or(defaults.min_retrieval_top_rank_retention), + } +} + +fn decode_trace_replay_candidates( + rows: Vec, +) -> Vec { + rows.into_iter() + .map(|row| { + let decoded = serde_json::from_value::( + row.candidate_snapshot.clone(), + ) + .ok() + .filter(|value| value.note_id != Uuid::nil() && value.chunk_id != Uuid::nil()); + + decoded.unwrap_or_else(|| elf_service::search::TraceReplayCandidate { + note_id: row.note_id, + chunk_id: row.chunk_id, + chunk_index: row.chunk_index, + snippet: row.snippet, + retrieval_rank: u32::try_from(row.retrieval_rank).unwrap_or(0), + retrieval_score: None, + rerank_score: row.rerank_score, + note_scope: row.note_scope, + note_importance: row.note_importance, + note_updated_at: row.note_updated_at, + note_hit_count: row.note_hit_count, + note_last_hit_at: row.note_last_hit_at, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }) + }) + .collect() +} + +fn churn_against_baseline_at_k(baseline: &[Uuid], other: &[Uuid], k: usize) -> (f64, f64) { + let k = k.max(1); + let mut positional_diff = 0_usize; + + for idx in 0..k { + let a = baseline.get(idx); + let b = other.get(idx); + + if a != b { + positional_diff += 1; + } + } + + let positional_churn = positional_diff as f64 / k as f64; + let base_set: HashSet = baseline.iter().take(k).copied().collect(); + let other_set: HashSet = other.iter().take(k).copied().collect(); + let overlap = base_set.intersection(&other_set).count(); + let set_churn = 1.0 - (overlap as f64 / k as f64); + + (positional_churn, set_churn) +} + +fn retrieval_top_rank_retention( + candidates: &[elf_service::search::TraceReplayCandidate], + note_ids: &[Uuid], + max_retrieval_rank: u32, +) -> (usize, usize, f64) { + let mut top_notes = HashSet::new(); + + for candidate in candidates { + if candidate.retrieval_rank == 0 || candidate.retrieval_rank > max_retrieval_rank { + continue; + } + + top_notes.insert(candidate.note_id); + } + + let total = top_notes.len(); + + if total == 0 { + return (0, 0, 0.0); + } + + let out_set: HashSet = note_ids.iter().copied().collect(); + let retained = top_notes.intersection(&out_set).count(); + let retention = retained as f64 / total as f64; + + (total, retained, retention) +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + + let args = Args::parse(); + let cfg = elf_config::load(&args.config)?; + let filter = EnvFilter::new(cfg.service.log_level.clone()); + + tracing_subscriber::fmt().with_env_filter(filter).init(); + + let gate = load_gate_file(&args.gate)?; + + if gate.traces.is_empty() { + return Err(eyre::eyre!("Gate JSON must include at least one trace.")); + } + + let gate_top_k = gate.top_k; + let gate_retrieval_retention_rank = gate.retrieval_retention_rank; + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let mut traces = Vec::with_capacity(gate.traces.len()); + let mut breached_count = 0_usize; + + for trace in gate.traces { + let thresholds = merge_thresholds(gate.defaults, trace.thresholds); + let report = eval_trace( + &db, + &cfg, + &args, + gate_top_k, + gate_retrieval_retention_rank, + &trace, + thresholds, + ) + .await?; + + if !report.ok { + breached_count += 1; + } + + traces.push(report); + } + + let summary = + GateSummary { trace_count: traces.len(), breached_count, ok: breached_count == 0 }; + let report = GateReport { + config_path: args.config.display().to_string(), + gate_path: args.gate.display().to_string(), + summary, + traces, + }; + let json = serde_json::to_string_pretty(&report)?; + + if let Some(out_path) = &args.out { + fs::write(out_path, &json)?; + } else { + println!("{json}"); + } + + if !report.summary.ok { + return Err(eyre::eyre!( + "Trace regression gate breached: {}/{} traces failed thresholds.", + report.summary.breached_count, + report.summary.trace_count + )); + } + + Ok(()) +} + +async fn eval_trace( + db: &Db, + cfg: &Config, + cli: &Args, + gate_top_k: Option, + gate_retrieval_retention_rank: Option, + trace: &GateTrace, + thresholds: GateThresholds, +) -> Result { + let trace_row = fetch_trace_row(db, &trace.trace_id).await?; + let created_at = trace_row + .created_at + .format(&Rfc3339) + .map_err(|err| eyre::eyre!("Failed to format created_at: {err}"))?; + let context = TraceReplayContext { + trace_id: trace_row.trace_id, + query: trace_row.query.clone(), + candidate_count: u32::try_from(trace_row.candidate_count).unwrap_or(0), + top_k: u32::try_from(trace_row.top_k).unwrap_or(0), + created_at: trace_row.created_at, + }; + let top_k = + trace.top_k.or(cli.top_k).or(gate_top_k).or(Some(context.top_k)).unwrap_or(10).max(1); + let retrieval_retention_rank = trace + .retrieval_retention_rank + .or(cli.retrieval_retention_rank) + .or(gate_retrieval_retention_rank) + .unwrap_or(3) + .max(1); + let baseline_items = fetch_baseline_items(db, &trace.trace_id, top_k).await?; + let baseline_note_ids: Vec = baseline_items.iter().map(|row| row.note_id).collect(); + let candidate_rows = fetch_candidate_rows(db, &trace.trace_id).await?; + let candidates = decode_trace_replay_candidates(candidate_rows); + let replay_items = + search::replay_ranking_from_candidates(cfg, &context, None, &candidates, top_k) + .map_err(|err| eyre::eyre!("{err}"))?; + let replay_note_ids: Vec = replay_items.iter().map(|item| item.note_id).collect(); + let effective_k = top_k as usize; + let (positional_churn_at_k, set_churn_at_k) = + churn_against_baseline_at_k(&baseline_note_ids, &replay_note_ids, effective_k); + let churn = TraceChurn { positional_churn_at_k, set_churn_at_k }; + let (retrieval_top_rank_total, baseline_retained, baseline_retention) = + retrieval_top_rank_retention(&candidates, &baseline_note_ids, retrieval_retention_rank); + let (_, replay_retained, replay_retention) = + retrieval_top_rank_retention(&candidates, &replay_note_ids, retrieval_retention_rank); + let retention = TraceRetention { + retrieval_top_rank_total, + baseline_retrieval_top_rank_retained: baseline_retained, + baseline_retrieval_top_rank_retention: baseline_retention, + replay_retrieval_top_rank_retained: replay_retained, + replay_retrieval_top_rank_retention: replay_retention, + retention_delta: replay_retention - baseline_retention, + }; + let mut breaches = Vec::new(); + + if baseline_note_ids.len() < effective_k { + breaches.push(GateBreach { + metric: "baseline_count_at_k".to_string(), + value: baseline_note_ids.len() as f64, + threshold: effective_k as f64, + op: ">=".to_string(), + }); + } + if replay_note_ids.len() < effective_k { + breaches.push(GateBreach { + metric: "replay_count_at_k".to_string(), + value: replay_note_ids.len() as f64, + threshold: effective_k as f64, + op: ">=".to_string(), + }); + } + + if let Some(max) = thresholds.max_positional_churn_at_k + && churn.positional_churn_at_k > max + { + breaches.push(GateBreach { + metric: "positional_churn_at_k".to_string(), + value: churn.positional_churn_at_k, + threshold: max, + op: "<=".to_string(), + }); + } + if let Some(max) = thresholds.max_set_churn_at_k + && churn.set_churn_at_k > max + { + breaches.push(GateBreach { + metric: "set_churn_at_k".to_string(), + value: churn.set_churn_at_k, + threshold: max, + op: "<=".to_string(), + }); + } + if let Some(min) = thresholds.min_retrieval_top_rank_retention + && retention.replay_retrieval_top_rank_retention < min + { + breaches.push(GateBreach { + metric: "replay_retrieval_top_rank_retention".to_string(), + value: retention.replay_retrieval_top_rank_retention, + threshold: min, + op: ">=".to_string(), + }); + } + + Ok(TraceReport { + trace_id: trace.trace_id, + query: context.query, + created_at, + top_k, + retrieval_retention_rank, + candidate_count: context.candidate_count, + baseline_count: baseline_note_ids.len(), + replay_count: replay_note_ids.len(), + churn, + retention, + ok: breaches.is_empty(), + breaches, + }) +} + +async fn fetch_trace_row(db: &Db, trace_id: &Uuid) -> Result { + let row: TraceRow = sqlx::query_as::<_, TraceRow>( + "\ +SELECT + trace_id, + query, + candidate_count, + top_k, + created_at +FROM search_traces +WHERE trace_id = $1", + ) + .bind(trace_id) + .fetch_one(&db.pool) + .await?; + + Ok(row) +} + +async fn fetch_baseline_items(db: &Db, trace_id: &Uuid, top_k: u32) -> Result> { + let rows: Vec = sqlx::query_as::<_, TraceItemRow>( + "\ +SELECT + note_id +FROM search_trace_items +WHERE trace_id = $1 +ORDER BY rank ASC +LIMIT $2", + ) + .bind(trace_id) + .bind(i64::from(top_k.max(1))) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn fetch_candidate_rows(db: &Db, trace_id: &Uuid) -> Result> { + let rows: Vec = sqlx::query_as::<_, CandidateRow>( + "\ +SELECT + candidate_snapshot, + note_id, + chunk_id, + chunk_index, + snippet, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at +FROM search_trace_candidates +WHERE trace_id = $1 +ORDER BY retrieval_rank ASC", + ) + .bind(trace_id) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} diff --git a/apps/elf-eval/src/lib.rs b/apps/elf-eval/src/lib.rs deleted file mode 100644 index 321b7fe0..00000000 --- a/apps/elf-eval/src/lib.rs +++ /dev/null @@ -1,359 +0,0 @@ -// std -use std::{collections::HashSet, fs, path::PathBuf, time::Instant}; - -// crates.io -use clap::Parser; -use color_eyre::eyre; -use serde::{Deserialize, Serialize}; -use tracing_subscriber::EnvFilter; -use uuid::Uuid; - -// self -use elf_service::ElfService; -use elf_storage::{db::Db, qdrant::QdrantStore}; - -#[derive(Debug, Parser)] -#[command( - version = elf_cli::VERSION, - rename_all = "kebab", - styles = elf_cli::styles(), -)] -pub struct Args { - #[arg(long, short = 'c', value_name = "FILE")] - pub config: PathBuf, - #[arg(long, short = 'd', value_name = "FILE")] - pub dataset: PathBuf, - #[arg(long, value_name = "N")] - pub top_k: Option, - #[arg(long, value_name = "N")] - pub candidate_k: Option, -} - -#[derive(Debug, Deserialize)] -struct EvalDataset { - name: Option, - defaults: Option, - queries: Vec, -} - -#[derive(Debug, Deserialize, Clone)] -struct EvalDefaults { - tenant_id: Option, - project_id: Option, - agent_id: Option, - read_profile: Option, - top_k: Option, - candidate_k: Option, -} - -#[derive(Debug, Deserialize)] -struct EvalQuery { - id: Option, - query: String, - tenant_id: Option, - project_id: Option, - agent_id: Option, - read_profile: Option, - top_k: Option, - candidate_k: Option, - expected_note_ids: Vec, -} - -#[derive(Debug, Serialize)] -struct EvalOutput { - dataset: EvalDatasetInfo, - settings: EvalSettings, - summary: EvalSummary, - queries: Vec, -} - -#[derive(Debug, Serialize)] -struct EvalDatasetInfo { - name: String, - query_count: usize, -} - -#[derive(Debug, Serialize)] -struct EvalSettings { - config_path: String, - candidate_k: u32, - top_k: u32, -} - -#[derive(Debug, Serialize)] -struct EvalSummary { - avg_recall_at_k: f64, - avg_precision_at_k: f64, - mean_rr: f64, - mean_ndcg: f64, - latency_ms_p50: f64, - latency_ms_p95: f64, -} - -#[derive(Debug, Serialize)] -struct QueryReport { - id: String, - query: String, - expected_count: usize, - retrieved_count: usize, - relevant_count: usize, - recall_at_k: f64, - precision_at_k: f64, - rr: f64, - ndcg: f64, - latency_ms: f64, - expected_note_ids: Vec, - retrieved_note_ids: Vec, -} - -pub async fn run(args: Args) -> color_eyre::Result<()> { - let config = elf_config::load(&args.config)?; - let filter = EnvFilter::new(config.service.log_level.clone()); - tracing_subscriber::fmt().with_env_filter(filter).init(); - - let db = Db::connect(&config.storage.postgres).await?; - db.ensure_schema(config.storage.qdrant.vector_dim).await?; - let qdrant = QdrantStore::new(&config.storage.qdrant)?; - let service = ElfService::new(config, db, qdrant); - - let dataset = load_dataset(&args.dataset)?; - let defaults = dataset.defaults.clone().unwrap_or(EvalDefaults { - tenant_id: None, - project_id: None, - agent_id: None, - read_profile: None, - top_k: None, - candidate_k: None, - }); - - let mut reports = Vec::with_capacity(dataset.queries.len()); - let mut latencies_ms = Vec::with_capacity(dataset.queries.len()); - - for (index, query) in dataset.queries.iter().enumerate() { - let merged = merge_query(&defaults, query, &args, &service.cfg, index)?; - let start = Instant::now(); - let response = service.search(merged.request).await?; - let latency_ms = start.elapsed().as_secs_f64() * 1000.0; - let retrieved = unique_ids(response.items.iter().map(|item| item.note_id)); - let expected: HashSet = merged.expected_note_ids.iter().copied().collect(); - let metrics = compute_metrics(&retrieved, &expected); - - reports.push(QueryReport { - id: merged.id, - query: merged.query, - expected_count: expected.len(), - retrieved_count: retrieved.len(), - relevant_count: metrics.relevant_count, - recall_at_k: metrics.recall_at_k, - precision_at_k: metrics.precision_at_k, - rr: metrics.rr, - ndcg: metrics.ndcg, - latency_ms, - expected_note_ids: merged.expected_note_ids, - retrieved_note_ids: retrieved, - }); - latencies_ms.push(latency_ms); - } - - let summary = summarize(&reports, &latencies_ms); - let output = EvalOutput { - dataset: EvalDatasetInfo { - name: dataset.name.unwrap_or_else(|| "eval".to_string()), - query_count: reports.len(), - }, - settings: EvalSettings { - config_path: args.config.display().to_string(), - candidate_k: args - .candidate_k - .or(dataset.defaults.as_ref().and_then(|d| d.candidate_k)) - .unwrap_or(service.cfg.memory.candidate_k), - top_k: args - .top_k - .or(dataset.defaults.as_ref().and_then(|d| d.top_k)) - .unwrap_or(service.cfg.memory.top_k), - }, - summary, - queries: reports, - }; - - let json = serde_json::to_string_pretty(&output)?; - println!("{json}"); - Ok(()) -} - -fn load_dataset(path: &PathBuf) -> color_eyre::Result { - let raw = fs::read_to_string(path)?; - let dataset: EvalDataset = serde_json::from_str(&raw)?; - if dataset.queries.is_empty() { - return Err(eyre::eyre!("Dataset must include at least one query.")); - } - Ok(dataset) -} - -struct MergedQuery { - id: String, - query: String, - expected_note_ids: Vec, - request: elf_service::SearchRequest, -} - -fn merge_query( - defaults: &EvalDefaults, - query: &EvalQuery, - args: &Args, - cfg: &elf_config::Config, - index: usize, -) -> color_eyre::Result { - if query.expected_note_ids.is_empty() { - return Err(eyre::eyre!( - "Query at index {index} must include at least one expected_note_id." - )); - } - - let tenant_id = query - .tenant_id - .clone() - .or_else(|| defaults.tenant_id.clone()) - .ok_or_else(|| eyre::eyre!("tenant_id is required for query at index {index}."))?; - let project_id = query - .project_id - .clone() - .or_else(|| defaults.project_id.clone()) - .ok_or_else(|| eyre::eyre!("project_id is required for query at index {index}."))?; - let agent_id = query - .agent_id - .clone() - .or_else(|| defaults.agent_id.clone()) - .ok_or_else(|| eyre::eyre!("agent_id is required for query at index {index}."))?; - let read_profile = query - .read_profile - .clone() - .or_else(|| defaults.read_profile.clone()) - .ok_or_else(|| eyre::eyre!("read_profile is required for query at index {index}."))?; - - let top_k = args.top_k.or(query.top_k).or(defaults.top_k).unwrap_or(cfg.memory.top_k).max(1); - let candidate_k = args - .candidate_k - .or(query.candidate_k) - .or(defaults.candidate_k) - .unwrap_or(cfg.memory.candidate_k) - .max(top_k); - - let id = query.id.clone().unwrap_or_else(|| format!("query-{index}")); - - Ok(MergedQuery { - id, - query: query.query.clone(), - expected_note_ids: query.expected_note_ids.clone(), - request: elf_service::SearchRequest { - tenant_id, - project_id, - agent_id, - read_profile, - query: query.query.clone(), - top_k: Some(top_k), - candidate_k: Some(candidate_k), - record_hits: Some(false), - }, - }) -} - -fn unique_ids(iter: I) -> Vec -where - I: Iterator, -{ - let mut seen = HashSet::new(); - let mut out = Vec::new(); - for id in iter { - if seen.insert(id) { - out.push(id); - } - } - out -} - -struct Metrics { - recall_at_k: f64, - precision_at_k: f64, - rr: f64, - ndcg: f64, - relevant_count: usize, -} - -fn compute_metrics(retrieved: &[Uuid], expected: &HashSet) -> Metrics { - let expected_count = expected.len(); - let mut relevant_count = 0usize; - let mut dcg = 0.0_f64; - let mut rr = 0.0_f64; - let mut first_hit: Option = None; - - for (idx, id) in retrieved.iter().enumerate() { - if expected.contains(id) { - relevant_count += 1; - let rank = idx + 1; - let denom = (rank as f64 + 1.0).log2(); - dcg += 1.0 / denom; - if first_hit.is_none() { - first_hit = Some(rank); - } - } - } - - if let Some(rank) = first_hit { - rr = 1.0 / rank as f64; - } - - let mut idcg = 0.0_f64; - let ideal_hits = expected_count.min(retrieved.len()); - for idx in 0..ideal_hits { - let rank = idx + 1; - let denom = (rank as f64 + 1.0).log2(); - idcg += 1.0 / denom; - } - - let ndcg = if idcg > 0.0 { dcg / idcg } else { 0.0 }; - let precision_at_k = - if retrieved.is_empty() { 0.0 } else { relevant_count as f64 / retrieved.len() as f64 }; - let recall_at_k = - if expected_count == 0 { 0.0 } else { relevant_count as f64 / expected_count as f64 }; - - Metrics { recall_at_k, precision_at_k, rr, ndcg, relevant_count } -} - -fn summarize(reports: &[QueryReport], latencies_ms: &[f64]) -> EvalSummary { - let count = reports.len().max(1) as f64; - let avg_recall_at_k = reports.iter().map(|r| r.recall_at_k).sum::() / count; - let avg_precision_at_k = reports.iter().map(|r| r.precision_at_k).sum::() / count; - let mean_rr = reports.iter().map(|r| r.rr).sum::() / count; - let mean_ndcg = reports.iter().map(|r| r.ndcg).sum::() / count; - - let mut sorted = latencies_ms.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); - let p50 = percentile(&sorted, 0.50); - let p95 = percentile(&sorted, 0.95); - - EvalSummary { - avg_recall_at_k, - avg_precision_at_k, - mean_rr, - mean_ndcg, - latency_ms_p50: p50, - latency_ms_p95: p95, - } -} - -fn percentile(values: &[f64], percentile: f64) -> f64 { - if values.is_empty() { - return 0.0; - } - let clamped = percentile.clamp(0.0, 1.0); - let pos = clamped * (values.len() as f64 - 1.0); - let lower = pos.floor() as usize; - let upper = pos.ceil() as usize; - if lower == upper { - values[lower] - } else { - let weight = pos - lower as f64; - values[lower] * (1.0 - weight) + values[upper] * weight - } -} diff --git a/apps/elf-eval/src/main.rs b/apps/elf-eval/src/main.rs index 42669630..25c00ae5 100644 --- a/apps/elf-eval/src/main.rs +++ b/apps/elf-eval/src/main.rs @@ -1,11 +1,19 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! CLI entrypoint for ELF evaluation commands. + +mod app; + use clap::Parser; -// self -use elf_eval::Args; +use color_eyre::Result; + +use app::Args; #[tokio::main] -async fn main() -> color_eyre::Result<()> { +async fn main() -> Result<()> { color_eyre::install()?; + let args = Args::parse(); - elf_eval::run(args).await + + app::run(args).await } diff --git a/apps/elf-eval/tests/agentmemory_fixture_adapter.rs b/apps/elf-eval/tests/agentmemory_fixture_adapter.rs new file mode 100644 index 00000000..452158d4 --- /dev/null +++ b/apps/elf-eval/tests/agentmemory_fixture_adapter.rs @@ -0,0 +1,102 @@ +#![allow(unused_crate_dependencies)] + +//! Integration tests for the offline agentmemory fixture adapter. + +use std::{path::Path, process::Command}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +fn run_adapter() -> Result { + let fixture = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("agentmemory") + .join("sample_session.json"); + let output = Command::new(env!("CARGO_BIN_EXE_agentmemory_fixture_adapter")) + .arg("--fixture") + .arg(fixture) + .output()?; + + assert!( + output.status.success(), + "agentmemory fixture adapter failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + Ok(serde_json::from_slice(&output.stdout)?) +} + +fn array_at<'a>(value: &'a Value, pointer: &str) -> Result<&'a Vec> { + value + .pointer(pointer) + .and_then(Value::as_array) + .ok_or_else(|| eyre::eyre!("missing array at {pointer}")) +} + +fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result<&'a Value> { + items + .iter() + .find(|item| item.pointer(field).and_then(Value::as_str) == Some(expected)) + .ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}")) +} + +#[test] +fn fixture_maps_memories_observations_and_baselines() -> Result<()> { + let output = run_adapter()?; + + assert_eq!( + output.pointer("/schema").and_then(Value::as_str), + Some("elf.agentmemory_adapter/v1") + ); + assert_eq!(output.pointer("/summary/session_count").and_then(Value::as_u64), Some(1)); + assert_eq!(output.pointer("/summary/note_candidate_count").and_then(Value::as_u64), Some(2)); + assert_eq!(output.pointer("/summary/doc_candidate_count").and_then(Value::as_u64), Some(2)); + assert_eq!(output.pointer("/summary/baseline_query_count").and_then(Value::as_u64), Some(1)); + assert_eq!(output.pointer("/summary/ignored_count").and_then(Value::as_u64), Some(1)); + + let notes = array_at(&output, "/note_candidates")?; + let note = find_by_field(notes, "/source_memory_id", "mem-architecture-sot")?; + + assert_eq!(note.pointer("/notes_ingest_item/type").and_then(Value::as_str), Some("fact")); + assert_eq!( + note.pointer("/notes_ingest_item/key").and_then(Value::as_str), + Some("architecture_sot"), + ); + assert_eq!( + note.pointer("/notes_ingest_item/source_ref/resolver").and_then(Value::as_str), + Some("agentmemory_fixture/v1"), + ); + + let docs = array_at(&output, "/doc_candidates")?; + let doc = find_by_field(docs, "/source_observation_id", "obs-architecture")?; + + assert_eq!(doc.pointer("/docs_put/doc_type").and_then(Value::as_str), Some("chat")); + assert_eq!( + doc.pointer("/docs_put/source_ref/schema").and_then(Value::as_str), + Some("doc_source_ref/v1"), + ); + assert_eq!( + doc.pointer("/docs_put/source_ref/thread_id").and_then(Value::as_str), + Some("am-session-2026-06-08"), + ); + + let baselines = array_at(&output, "/baseline_queries")?; + let baseline = find_by_field(baselines, "/query_id", "q-architecture-sot")?; + let expected_keys = array_at(baseline, "/expected_keys")?; + + assert_eq!(expected_keys.len(), 1); + assert_eq!(expected_keys.first().and_then(Value::as_str), Some("architecture_sot")); + + Ok(()) +} + +#[test] +fn fixture_reports_unsupported_memory_kind_without_rewriting() -> Result<()> { + let output = run_adapter()?; + let ignored_items = array_at(&output, "/ignored_items")?; + let ignored = find_by_field(ignored_items, "/source_id", "mem-raw-summary")?; + + assert_eq!(ignored.pointer("/reason").and_then(Value::as_str), Some("unsupported_memory_kind")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs new file mode 100644 index 00000000..a9a6a8f7 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -0,0 +1,5944 @@ +#![allow(unused_crate_dependencies)] + +//! Integration tests for the real-world job smoke benchmark runner. + +use std::{ + env, fs, + path::{Path, PathBuf}, + process::{self, Command, Output}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +fn fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_memory") + .join("work_resume") +} + +fn fixture_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") +} + +fn real_world_memory_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_memory") +} + +fn evolution_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("evolution") +} + +fn operator_debug_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_job") + .join("operator_debugging_ux") +} + +fn project_decisions_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("project_decisions") +} + +fn retrieval_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_memory") + .join("retrieval") +} + +fn capture_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("capture_integration") +} + +fn consolidation_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("consolidation") +} + +fn memory_summary_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("memory_summary") +} + +fn proactive_brief_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("proactive_brief") +} + +fn scheduled_memory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("scheduled_memory") +} + +fn knowledge_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("knowledge") +} + +fn production_ops_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("production_ops") +} + +fn core_archival_memory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("core_archival_memory") +} + +fn context_trajectory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("context_trajectory") +} + +fn graph_rag_external_fixture_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_external_adapters") + .join("graph_rag") +} + +fn workspace_root() -> Result { + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let root = manifest_dir + .parent() + .and_then(Path::parent) + .ok_or_else(|| eyre::eyre!("could not resolve workspace root"))?; + + Ok(root.to_path_buf()) +} + +fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") +} + +fn strength_profile_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-qmd-openviking-strength-profile-report.json")) +} + +fn strength_profile_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-qmd-openviking-strength-profile-report.md")) +} + +fn measurement_coverage_audit_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-measurement-coverage-audit.md")) +} + +fn measurement_coverage_audit_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-measurement-coverage-audit.json")) +} + +fn retrieval_debug_profile_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-elf-qmd-retrieval-debug-profile.json")) +} + +fn trace_replay_diagnostics_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json")) +} + +fn trace_replay_diagnostics_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")) +} + +fn competitor_strength_adoption_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-competitor-strength-adoption-report.md")) +} + +fn competitor_strength_adoption_report_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-competitor-strength-adoption-report.json")) +} + +fn capture_write_policy_live_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-capture-write-policy-live-report.json")) +} + +fn capture_write_policy_live_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-capture-write-policy-live-report.md")) +} + +fn live_consolidation_proposal_scoring_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-16-live-consolidation-proposal-scoring-report.json")) +} + +fn live_consolidation_proposal_scoring_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-16-live-consolidation-proposal-scoring-report.md")) +} + +fn temporal_history_competitor_gap_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-temporal-history-competitor-gap-report.json")) +} + +fn dreaming_readiness_stage_ledger_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-16-dreaming-readiness-stage-ledger.json")) +} + +fn dreaming_readiness_stage_ledger_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-16-dreaming-readiness-stage-ledger.md")) +} + +fn live_temporal_reconciliation_report_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-16-live-temporal-reconciliation-report.json")) +} + +fn live_temporal_reconciliation_report_markdown_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-16-live-temporal-reconciliation-report.md")) +} + +fn competitor_strength_matrix_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-competitor-strength-evidence-matrix.md")) +} + +fn competitor_strength_matrix_json_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("research") + .join("2026-06-11-xy-897-competitor-strength-matrix.json")) +} + +fn readme_path() -> Result { + Ok(workspace_root()?.join("README.md")) +} + +fn comparison_external_projects_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("research") + .join("comparison_external_projects.md")) +} + +fn benchmarking_index_path() -> Result { + Ok(workspace_root()?.join("docs").join("guide").join("benchmarking").join("index.md")) +} + +fn iteration_direction_report_path() -> Result { + Ok(workspace_root()? + .join("docs") + .join("guide") + .join("benchmarking") + .join("2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md")) +} + +fn external_adapter_manifest_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_external_adapters") + .join("memory_projects_manifest.json") +} + +fn run_json_report_from(fixtures: PathBuf) -> Result { + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixtures) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + Ok(serde_json::from_slice(&output.stdout)?) +} + +fn run_json_report() -> Result { + run_json_report_from(fixture_dir()) +} + +fn load_json(path: &Path) -> Result { + Ok(serde_json::from_str::(&fs::read_to_string(path)?)?) +} + +fn array_at<'a>(value: &'a Value, pointer: &str) -> Result<&'a Vec> { + value + .pointer(pointer) + .and_then(Value::as_array) + .ok_or_else(|| eyre::eyre!("missing array at {pointer}")) +} + +fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result<&'a Value> { + items + .iter() + .find(|item| item.pointer(field).and_then(Value::as_str) == Some(expected)) + .ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}")) +} + +fn array_contains_str(value: &Value, pointer: &str, expected: &str) -> Result { + Ok(array_at(value, pointer)?.iter().any(|item| item.as_str() == Some(expected))) +} + +fn string_array_at(value: &Value, pointer: &str) -> Result> { + array_at(value, pointer)? + .iter() + .map(|item| { + item.as_str() + .map(str::to_owned) + .ok_or_else(|| eyre::eyre!("non-string entry at {pointer}")) + }) + .collect() +} + +fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Result<()> { + let target = + value.pointer_mut(pointer).ok_or_else(|| eyre::eyre!("missing JSON pointer {pointer}"))?; + + *target = replacement; + + Ok(()) +} + +fn run_external_manifest_with_letta_attachment_mutation( + slug: &str, + mutation: F, +) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + run_external_manifest_scenario_mutation( + slug, + "letta_research_gate", + "core_block_attachment_readback", + mutation, + ) +} + +fn run_external_manifest_scenario_mutation( + slug: &str, + adapter_id: &str, + scenario_id: &str, + mutation: F, +) -> Result +where + F: FnOnce(&mut Value) -> Result<()>, +{ + let mut manifest = + serde_json::from_str::(&fs::read_to_string(external_adapter_manifest_path())?)?; + let adapters = manifest + .pointer_mut("/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; + let adapter = adapters + .iter_mut() + .find(|adapter| adapter.pointer("/adapter_id").and_then(Value::as_str) == Some(adapter_id)) + .ok_or_else(|| eyre::eyre!("missing {adapter_id} adapter"))?; + let scenarios = adapter + .pointer_mut("/scenarios") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing {adapter_id} scenarios"))?; + let scenario = scenarios + .iter_mut() + .find(|scenario| { + scenario.pointer("/scenario_id").and_then(Value::as_str) == Some(scenario_id) + }) + .ok_or_else(|| eyre::eyre!("missing {scenario_id} scenario"))?; + + mutation(scenario)?; + + let temp_dir = env::temp_dir().join(format!("elf-real-world-{slug}-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("memory_projects_manifest.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + Ok(Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?) +} + +#[test] +fn smoke_fixture_produces_typed_json_report() -> Result<()> { + let report = run_json_report()?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.real_world_job_report/v1") + ); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), + Some(11) + ); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("work_resume")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(2.0)); + assert_eq!(job.pointer("/cost/amount").and_then(Value::as_f64), Some(0.0)); + + let expected_evidence = array_at(job, "/expected_evidence")?; + let produced_evidence = array_at(job, "/produced_evidence")?; + + assert_eq!(expected_evidence.len(), 2); + assert_eq!(produced_evidence.len(), 1); + assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("xy844-current-worktree")); + + let suites = array_at(&report, "/suites")?; + let encoded_suite = find_by_field(suites, "/suite_id", "work_resume")?; + let capture_suite = find_by_field(suites, "/suite_id", "capture_integration")?; + let unencoded_suite = find_by_field(suites, "/suite_id", "retrieval")?; + + assert_eq!(encoded_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(encoded_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(capture_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(unencoded_suite.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + + let capture_fixture_backed = array_at(&report, "/capture_integration/fixture_backed")?; + + assert!(capture_fixture_backed.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("agentmemory-style hook capture")) + })); + + let capture_not_encoded = array_at(&report, "/capture_integration/not_encoded")?; + + assert!(capture_not_encoded.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("No live external hook ingestion")) + })); + + Ok(()) +} + +#[test] +fn real_world_report_includes_external_adapter_coverage_manifest() -> Result<()> { + let report = run_json_report_from(real_world_memory_fixture_dir())?; + + assert_external_adapter_manifest_summary(&report); + assert_external_adapter_manifest_records(&report)?; + + Ok(()) +} + +#[test] +fn capture_integration_fixtures_score_redaction_and_source_ids() -> Result<()> { + let report = run_json_report_from(capture_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = array_at(&report, "/suites")?; + let capture = find_by_field(suites, "/suite_id", "capture_integration")?; + + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = array_at(&report, "/jobs")?; + let source_id = find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; + let redaction = find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; + + assert!(array_contains_str(source_id, "/produced_evidence", "source-id-release-summary")?); + assert!(array_contains_str(source_id, "/produced_evidence", "source-id-command-log")?); + assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert!( + redaction + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| !answer.contains("orchid-envelope")) + ); + + Ok(()) +} + +#[test] +fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { + let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("fixtures") + .join("real_world_external_adapters") + .join("memory_projects_manifest.json"); + let mut manifest = serde_json::from_str::(&fs::read_to_string(manifest_path)?)?; + let adapters = manifest + .pointer_mut("/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing manifest adapters"))?; + let adapter = adapters + .iter_mut() + .find(|adapter| { + adapter.pointer("/adapter_id").and_then(Value::as_str) + == Some("agentmemory_live_baseline") + }) + .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; + + set_json_pointer(adapter, "/scenarios/0/elf_position", serde_json::json!("loses"))?; + set_json_pointer(adapter, "/scenarios/0/comparison_outcome", serde_json::json!("loss"))?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-loss-manifest-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("memory_projects_manifest.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let report = serde_json::from_slice::(&output.stdout)?; + + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/loses") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/untested") + .and_then(Value::as_u64), + Some(34) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/loss") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") + .and_then(Value::as_u64), + Some(16) + ); + + let adapters = array_at(&report, "/external_adapters/adapters")?; + let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; + + assert_eq!( + agentmemory.pointer("/scenarios/0/elf_position").and_then(Value::as_str), + Some("loses") + ); + + Ok(()) +} + +fn assert_external_adapter_manifest_summary(report: &Value) { + assert_eq!( + report.pointer("/external_adapters/schema").and_then(Value::as_str), + Some("elf.real_world_external_adapter_report/v1") + ); + assert_eq!( + report.pointer("/external_adapters/manifest_id").and_then(Value::as_str), + Some( + "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store" + ) + ); + assert_eq!( + report.pointer("/external_adapters/docker_isolation/default").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report + .pointer("/external_adapters/docker_isolation/host_global_installs_required") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), + Some(16) + ); + assert_eq!( + report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/live_baseline_only_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), + Some(11) + ); + + assert_external_adapter_manifest_status_summary(report); + assert_external_adapter_manifest_scenario_summary(report); +} + +fn assert_external_adapter_manifest_status_summary(report: &Value) { + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/pass") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/lifecycle_fail") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/incomplete") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/blocked") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/capability_status_counts/mocked") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/capability_status_counts/unsupported") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/blocked") + .and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/pass") + .and_then(Value::as_u64), + Some(27) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/incomplete") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(38) + ); +} + +fn assert_external_adapter_manifest_scenario_summary(report: &Value) { + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/real") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/mocked") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/unsupported") + .and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/blocked") + .and_then(Value::as_u64), + Some(12) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/incomplete") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/lifecycle_fail") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/pass") + .and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/wins") + .and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/ties") + .and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/loses") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/untested") + .and_then(Value::as_u64), + Some(35) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/win") + .and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/tie") + .and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/loss") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") + .and_then(Value::as_u64), + Some(17) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") + .and_then(Value::as_u64), + Some(13) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") + .and_then(Value::as_u64), + Some(5) + ); +} + +fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { + let adapters = array_at(report, "/external_adapters/adapters")?; + let elf = find_by_field(adapters, "/adapter_id", "elf_real_world_memory_fixture")?; + let elf_live = find_by_field(adapters, "/adapter_id", "elf_live_real_world")?; + let elf_operator_debug = find_by_field(adapters, "/adapter_id", "elf_operator_debug_live")?; + let qmd = find_by_field(adapters, "/adapter_id", "qmd_live_baseline")?; + let qmd_live = find_by_field(adapters, "/adapter_id", "qmd_live_real_world")?; + let qmd_operator_debug = find_by_field(adapters, "/adapter_id", "qmd_operator_debug_live")?; + let agentmemory = find_by_field(adapters, "/adapter_id", "agentmemory_live_baseline")?; + let mem0 = find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?; + let memsearch = find_by_field(adapters, "/adapter_id", "memsearch_live_baseline")?; + let openviking = find_by_field(adapters, "/adapter_id", "openviking_live_baseline")?; + let claude_mem = find_by_field(adapters, "/adapter_id", "claude_mem_live_baseline")?; + let ragflow = find_by_field(adapters, "/adapter_id", "ragflow_research_gate")?; + let lightrag = find_by_field(adapters, "/adapter_id", "lightrag_research_gate")?; + let graphrag = find_by_field(adapters, "/adapter_id", "graphrag_research_gate")?; + let graphiti_zep = find_by_field(adapters, "/adapter_id", "graphiti_zep_research_gate")?; + let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; + let qmd_deep = find_by_field(adapters, "/adapter_id", "qmd_deep_profile_gate")?; + let openviking_deep = find_by_field(adapters, "/adapter_id", "openviking_deep_profile_gate")?; + let letta = find_by_field(adapters, "/adapter_id", "letta_research_gate")?; + + assert_elf_fixture_adapter_record(elf)?; + + assert_eq!( + elf_live.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_live_sweep_record(elf_live, "blocked")?; + assert_operator_debug_live_adapter_records(elf_operator_debug, qmd_operator_debug)?; + + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + + assert_qmd_live_baseline_record(qmd); + + assert_eq!( + qmd_live.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(qmd_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_live_sweep_record(qmd_live, "blocked")?; + + assert_eq!( + agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), + Some("mocked") + ); + + assert_first_generation_adapter_records(agentmemory, mem0, memsearch, claude_mem); + + assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + + assert_graph_rag_research_gate_records(ragflow, lightrag, graphrag); + assert_graphiti_zep_adapter(graphiti_zep); + assert_graphify_adapter(graphify)?; + assert_graph_rag_representative_scenarios(ragflow, lightrag, graphrag, graphiti_zep, graphify)?; + assert_letta_core_archival_gate(letta)?; + assert_qmd_deep_profile_gate(qmd_deep); + + assert_eq!( + qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), + Some("unsupported") + ); + assert_eq!( + qmd_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + assert_eq!( + openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), + Some("docker_local_embed_context_trajectory_gate") + ); + + assert_openviking_deep_profile_gate(openviking_deep); + + assert_eq!( + openviking_deep.pointer("/result/artifact").and_then(Value::as_str), + Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + ); + + Ok(()) +} + +fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, graphrag: &Value) { + assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + ) + ); + assert_eq!( + ragflow.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-ragflow-docker") + ); + assert_eq!( + ragflow.pointer("/result/artifact").and_then(Value::as_str), + Some("tmp/real-world-memory/ragflow-smoke/ragflow-report.json") + ); + assert_eq!( + ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), + Some("https://github.com/infiniflow/ragflow") + ); + assert_eq!(lightrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(lightrag.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + lightrag.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-lightrag-docker-context") + ); + assert_eq!( + lightrag.pointer("/run/command").and_then(Value::as_str), + Some("ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context") + ); + assert_eq!( + lightrag.pointer("/capabilities/3/status").and_then(Value::as_str), + Some("not_encoded") + ); + assert_eq!(graphrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!( + graphrag.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphrag-docker") + ); + assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); +} + +fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/setup/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("Docker-only benchmark-created agent export")) + ); + assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( + |setup| setup.contains("exports core block JSON plus archival search/readback JSON") + )); + + let suites = array_at(adapter, "/suites")?; + let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); + + let scenarios = array_at(adapter, "/scenarios")?; + let attachment = find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; + let scope = find_by_field(scenarios, "/scenario_id", "core_block_scope_readback")?; + let provenance = find_by_field(scenarios, "/scenario_id", "core_block_provenance_readback")?; + let stale = find_by_field(scenarios, "/scenario_id", "stale_core_detection")?; + let fallback = find_by_field(scenarios, "/scenario_id", "archival_fallback_readback")?; + let decision = + find_by_field(scenarios, "/scenario_id", "core_archival_project_decision_recovery")?; + + assert_eq!(scenarios.len(), 6); + + for scenario in [attachment, scope, provenance, stale, fallback, decision] { + assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); + assert!( + ["not_tested", "blocked"].contains( + &scenario + .pointer("/comparison_outcome") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("missing Letta comparison_outcome"))? + ) + ); + } + + assert_eq!( + attachment.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + + Ok(()) +} + +fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("60 jobs across 16 suites") + && evidence.contains("53 pass") + && evidence.contains("7 blocked") + && evidence.contains("core_archival_memory") + && evidence.contains("memory_summary") + && evidence.contains("proactive_brief") + && evidence.contains("scheduled_memory") + && evidence.contains("context_trajectory") + })); + + let suites = array_at(adapter, "/suites")?; + let core_archival = find_by_field(suites, "/suite_id", "core_archival_memory")?; + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!(core_archival.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("core block attachment") + && evidence.contains("project-decision recovery") + && evidence.contains("archival note search") + })); + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!(scheduled.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("4 passing source-linked task readbacks") + && evidence.contains("private/provider scheduler blocker") + })); + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/notes/1") + .and_then(Value::as_str) + .is_some_and(|note| note.contains("OpenViking context-trajectory measurement gates")) + ); + + Ok(()) +} + +fn assert_qmd_deep_profile_gate(adapter: &Value) { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("not_encoded")); +} + +fn assert_qmd_live_baseline_record(adapter: &Value) { + let result_evidence = adapter.pointer("/result/evidence").and_then(Value::as_str); + let retrieval_evidence = adapter.pointer("/suites/0/evidence").and_then(Value::as_str); + + assert!(result_evidence.is_some_and(|evidence| { + evidence.contains("This live_baseline_only record is same-corpus evidence only") + && evidence.contains("cite qmd_live_real_world for the full live real-world sweep") + && !evidence.contains("no real_world_job qmd adapter is encoded yet") + })); + assert!(retrieval_evidence.is_some_and(|evidence| { + evidence.contains("does not execute real_world_job retrieval prompts") + && evidence.contains("cite qmd_live_real_world for the live retrieval adapter run") + && !evidence.contains("no real_world_job retrieval adapter run is encoded") + })); +} + +fn assert_operator_debug_live_adapter_records(elf: &Value, qmd: &Value) -> Result<()> { + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make real-world-job-operator-ux-live-adapters") + ); + assert_eq!( + elf.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(elf.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(elf.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("replay_command_metadata") + ); + assert_eq!(elf.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("candidate_drop_visibility") + ); + assert_eq!(elf.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("openmemory_or_claude_mem_ui_runner") + ); + assert_eq!(elf.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let elf_scenarios = array_at(elf, "/scenarios")?; + let elf_trace = find_by_field(elf_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let elf_replay = find_by_field(elf_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let elf_candidate = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let elf_repair = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let elf_selected = + find_by_field(elf_scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; + + assert_eq!(elf_scenarios.len(), 5); + assert_eq!(elf_trace.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(elf_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("local_replay_command_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + qmd.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/2/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("candidate_drop_visibility") + ); + assert_eq!(qmd.pointer("/capabilities/3/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let qmd_scenarios = array_at(qmd, "/scenarios")?; + let qmd_trace = find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let qmd_replay = find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let qmd_candidate = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let qmd_repair = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let qmd_selected = + find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; + + assert_eq!(qmd_scenarios.len(), 5); + assert_eq!(qmd_trace.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_replay.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_candidate.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_repair.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_selected.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert!(array_at(elf, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + assert!(array_at(qmd, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + + Ok(()) +} + +fn assert_openviking_deep_profile_gate(adapter: &Value) { + let trajectory_evidence = adapter.pointer("/capabilities/1/evidence").and_then(Value::as_str); + + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(trajectory_evidence.is_some_and(|evidence| { + evidence.contains("evidence-bearing same-corpus output") + && evidence.contains("selected hierarchy/expansion artifacts") + && !evidence.contains("setup reaches runnable OpenViking APIs") + })); +} + +fn assert_first_generation_adapter_records( + agentmemory: &Value, + mem0: &Value, + memsearch: &Value, + claude_mem: &Value, +) { + assert_agentmemory_first_generation_records(agentmemory); + assert_mem0_first_generation_records(mem0); + assert_memsearch_first_generation_records(memsearch); + assert_claude_mem_first_generation_records(claude_mem); +} + +fn assert_agentmemory_first_generation_records(agentmemory: &Value) { + assert_eq!( + agentmemory.pointer("/scenarios/1/status").and_then(Value::as_str), + Some("lifecycle_fail") + ); + assert_eq!( + agentmemory.pointer("/scenarios/1/elf_position").and_then(Value::as_str), + Some("wins") + ); + assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + agentmemory.pointer("/scenarios/2/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); +} + +fn assert_mem0_first_generation_records(mem0: &Value) { + assert_eq!( + mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("local_lifecycle_update_delete_reload") + ); + assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/capabilities/7/capability").and_then(Value::as_str), + Some("openmemory_ui_readback") + ); + assert_eq!(mem0.pointer("/capabilities/7/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/capabilities/8/capability").and_then(Value::as_str), + Some("hosted_managed_memory_claims") + ); + assert_eq!(mem0.pointer("/capabilities/8/status").and_then(Value::as_str), Some("unsupported")); + assert_eq!(mem0.pointer("/scenarios/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!(mem0.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("ties")); + assert_eq!( + mem0.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/scenarios/1/comparison_outcome").and_then(Value::as_str), + Some("loss") + ); + assert_eq!( + mem0.pointer("/scenarios/5/scenario_id").and_then(Value::as_str), + Some("openmemory_ui_export_readback") + ); + assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/scenarios/5/command").and_then(Value::as_str), + Some("cargo make openmemory-ui-export-readback") + ); + assert_eq!( + mem0.pointer("/scenarios/5/artifact").and_then(Value::as_str), + Some("tmp/live-baseline/mem0-openmemory-ui-export.json") + ); + assert!( + mem0.pointer("/capabilities/7/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("export-helper setup probe") + && evidence.contains("requires Docker access")) + ); + assert_eq!( + mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), + Some("non_goal") + ); +} + +fn assert_memsearch_first_generation_records(memsearch: &Value) { + assert_eq!( + memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("reindex_update_delete_reload") + ); + assert_eq!(memsearch.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memsearch.pointer("/scenarios/0/scenario_id").and_then(Value::as_str), + Some("canonical_markdown_reindex_reload") + ); + assert_eq!( + memsearch.pointer("/scenarios/0/elf_position").and_then(Value::as_str), + Some("untested") + ); + assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert!(memsearch.pointer("/suites/0/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed source-of-truth prompt coverage") + && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet") + && evidence.contains("not a suite pass") + )); + assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); + assert!(memsearch.pointer("/suites/1/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed retrieval-debug prompt coverage") + && evidence.contains( + "No live memsearch runtime adapter executes retrieval prompt scoring yet" + ) && evidence.contains("not a suite pass") + )); + assert_eq!(memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memsearch.pointer("/scenarios/1/elf_position").and_then(Value::as_str), + Some("untested") + ); + assert_eq!( + memsearch.pointer("/scenarios/3/status").and_then(Value::as_str), + Some("unsupported") + ); + assert_eq!( + memsearch.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("markdown_source_store_prompt_jobs") + ); + assert_eq!(memsearch.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); +} + +fn assert_claude_mem_first_generation_records(claude_mem: &Value) { + assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real")); + assert_eq!( + claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("repository_progressive_disclosure") + ); + assert_eq!(claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + claude_mem.pointer("/capabilities/6/status").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(claude_mem.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(claude_mem.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("fixture-backed progressive-disclosure") + && evidence.contains("viewer/operator workflow remains blocked")) + ); + assert_eq!(claude_mem.pointer("/suites/2/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/2/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("hook capture remains blocked")) + ); + assert_eq!( + claude_mem.pointer("/scenarios/0/status").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!( + claude_mem.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("retrieval_repair_artifact_path") + ); + assert_eq!( + claude_mem.pointer("/scenarios/1/status").and_then(Value::as_str), + Some("wrong_result") + ); + assert!( + claude_mem + .pointer("/scenarios/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("rerun/inspection targets") + && evidence.contains("tmp/live-baseline/claude-mem-checks.json")) + ); + assert_eq!(claude_mem.pointer("/scenarios/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/4/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); +} + +fn assert_graphiti_zep_adapter(adapter: &Value) { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphiti-zep-docker-temporal") + ); + assert_eq!( + adapter.pointer("/run/command").and_then(Value::as_str), + Some( + "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal" + ) + ); + assert_eq!( + adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("memory_evolution") + ); + assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + ) + ); +} + +fn assert_graphify_adapter(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(adapter.pointer("/setup/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphify-docker-graph-report") + ); + assert_eq!( + adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("knowledge_compilation") + ); + assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(adapter.pointer("/suites/1/suite_id").and_then(Value::as_str), Some("retrieval")); + assert_eq!(adapter.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + ) + ); + + let capabilities = array_at(adapter, "/capabilities")?; + let quality = find_by_field(capabilities, "/capability", "quality_or_scale_claim")?; + + assert_eq!(quality.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert!(array_at(adapter, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("tiny smoke") && text.contains("non-pass")) + })); + + Ok(()) +} + +fn assert_graph_rag_representative_scenarios( + ragflow: &Value, + lightrag: &Value, + graphrag: &Value, + graphiti_zep: &Value, + graphify: &Value, +) -> Result<()> { + let ragflow_scenarios = array_at(ragflow, "/scenarios")?; + let lightrag_scenarios = array_at(lightrag, "/scenarios")?; + let graphrag_scenarios = array_at(graphrag, "/scenarios")?; + let graphiti_scenarios = array_at(graphiti_zep, "/scenarios")?; + let graphify_scenarios = array_at(graphify, "/scenarios")?; + let ragflow_chunk = + find_by_field(ragflow_scenarios, "/scenario_id", "reference_chunk_citation_mapping")?; + let lightrag_context = + find_by_field(lightrag_scenarios, "/scenario_id", "context_source_reference_mapping")?; + let graphrag_tables = + find_by_field(graphrag_scenarios, "/scenario_id", "output_table_citation_mapping")?; + let graphiti_temporal = + find_by_field(graphiti_scenarios, "/scenario_id", "temporal_validity_window_mapping")?; + let graphify_lint = + find_by_field(graphify_scenarios, "/scenario_id", "graph_report_navigation_lint")?; + + assert_eq!( + ragflow_chunk.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(lightrag_context.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!( + lightrag_context.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + graphrag_tables.pointer("/artifact").and_then(Value::as_str), + Some( + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + ) + ); + assert_eq!( + graphiti_temporal.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(graphify_lint.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + graphify_lint.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert!( + graphify_lint + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("not an ELF victory claim")) + ); + + Ok(()) +} + +#[test] +fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { + let manifest = serde_json::json!({ + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": "graphify-generated-manifest-test", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphify-docker-graph-report-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphify-smoke", + "host_global_installs_required": false, + "notes": ["Synthetic graphify generated-manifest regression test."] + }, + "adapters": [{ + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_graph_report_smoke", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "setup evidence", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "run evidence", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "result evidence", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [{ + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "No broad graph quality claim." + }], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "Only the generated graph/report evidence-mapping job is represented." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The smoke uses graphify query output only to support source mapping; broad retrieval quality is not scored." + } + ], + "evidence": [], + "execution_metadata": { + "setup_path": "cargo make smoke-graphify-docker-graph-report", + "runtime_boundary": "Docker-only generated graph/report smoke.", + "resource_expectation": "Tiny generated corpus only.", + "retry_guidance": [], + "sources": [{ + "label": "graphify", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Synthetic generated-manifest regression source." + }], + "research_depth": "Generated smoke manifest path" + }, + "notes": ["tiny smoke non-pass"] + }] + }); + let temp_dir = + env::temp_dir().join(format!("elf-real-world-graphify-manifest-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("manifest.json"); + let report_path = temp_dir.join("report.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixture_dir()) + .arg("--out") + .arg(&report_path) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let report: Value = serde_json::from_slice(&fs::read(&report_path)?)?; + let adapters = array_at(&report, "/external_adapters/adapters")?; + let graphify = find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; + let suites = array_at(graphify, "/suites")?; + let retrieval = find_by_field(suites, "/suite_id", "retrieval")?; + + assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + retrieval + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|text| { text.contains("broad retrieval quality is not scored") }) + ); + + Ok(()) +} + +#[test] +fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<()> { + let report = run_json_report_from(graph_rag_external_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!( + report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.667) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let ragflow = find_by_field(jobs, "/job_id", "graph-rag-ragflow-reference-chunks-001")?; + let lightrag = find_by_field(jobs, "/job_id", "graph-rag-lightrag-context-sources-001")?; + let graphrag = find_by_field(jobs, "/job_id", "graph-rag-graphrag-output-tables-001")?; + let graphiti = find_by_field(jobs, "/job_id", "graph-rag-graphiti-temporal-validity-001")?; + let graphify = find_by_field(jobs, "/job_id", "graph-rag-graphify-graph-report-001")?; + + assert_eq!(ragflow.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(lightrag.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(graphrag.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphiti.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphify.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + graphify.pointer("/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + graphify.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + graphiti.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(true) + ); + assert!(array_contains_str(graphify, "/produced_evidence", "graphify-source-location-output")?); + + Ok(()) +} + +#[test] +fn live_adapter_aggregate_forwards_graph_rag_smoke_controls() -> Result<()> { + let workspace = workspace_root()?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + + assert!( + makefile.contains("[tasks.real-world-memory-live-adapters]") + && makefile.contains("scripts/real-world-docker.sh") + && makefile.contains("memory-live-adapters"), + "Makefile should expose the live-adapter command and delegate Docker details to a script", + ); + + for env_name in [ + "ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW", + "ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP", + "ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY", + "ELF_RAGFLOW_SMOKE_START", + "ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE", + "ELF_GRAPHRAG_SMOKE_RUN", + "ELF_GRAPHRAG_API_KEY", + "ELF_GRAPHITI_ZEP_SMOKE_START", + "ELF_GRAPHITI_ZEP_SMOKE_RUN", + "ELF_GRAPHITI_ZEP_API_KEY", + "ELF_GRAPHIFY_SMOKE_RUN", + ] { + assert!( + docker_script.contains(&format!("-e {env_name}")), + "real-world-memory-live-adapters must forward {env_name}", + ); + } + + assert!( + docker_script.contains("--profile lightrag up -d lightrag"), + "aggregate task should start LightRAG profile when ELF_LIGHTRAG_CONTEXT_START=1", + ); + assert!( + docker_script.contains("--profile graphiti-zep up -d graphiti-falkordb"), + "aggregate task should start Graphiti/Zep profile when ELF_GRAPHITI_ZEP_SMOKE_START=1", + ); + + Ok(()) +} + +#[test] +fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { + let workspace_root = workspace_root()?; + let makefile = fs::read_to_string(workspace_root.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace_root.join("scripts/baseline-docker.sh"))?; + let compose = fs::read_to_string(workspace_root.join("docker-compose.baseline.yml"))?; + let script = fs::read_to_string(workspace_root.join("scripts/live-baseline-benchmark.sh"))?; + let report = serde_json::from_str::(&fs::read_to_string( + workspace_root.join("docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json"), + )?)?; + + assert!(makefile.contains("[tasks.openmemory-ui-export-readback]")); + assert!(makefile.contains("scripts/baseline-docker.sh")); + assert!(makefile.contains("openmemory-ui-export-readback")); + assert!(docker_script.contains("export ELF_BASELINE_PROJECTS=mem0")); + assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_USER_ID")); + assert!(compose.contains("ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER")); + assert!(script.contains("probe_mem0_openmemory_ui_export")); + assert!(script.contains("mem0-openmemory-ui-export.json")); + assert!(script.contains("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER")); + assert!(script.contains("sdk_get_all_is_ui_export_evidence: false")); + assert!( + script.contains("SDK same-corpus retrieval and every encoded SDK behavior check passed") + ); + assert_eq!(report.pointer("/classification/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + report.pointer("/classification/reason_code").and_then(Value::as_str), + Some("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER") + ); + assert_eq!( + report + .pointer("/same_corpus_boundary/sdk_get_all_is_ui_export_evidence") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/claim_boundary/elf_can_compare_against_openmemory_ui_export_after_this_run") + .and_then(Value::as_bool), + Some(false) + ); + + Ok(()) +} + +#[test] +fn operator_debug_live_adapter_task_is_docker_scoped() -> Result<()> { + let workspace = workspace_root()?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + let script = fs::read_to_string( + workspace.join("scripts").join("real-world-operator-debug-live-adapters.sh"), + )?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let benchmark = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark.rs"))?; + + assert!(makefile.contains("[tasks.real-world-job-operator-ux-live-adapters]")); + assert!(makefile.contains("scripts/real-world-docker.sh")); + assert!(makefile.contains("job-operator-ux-live-adapters")); + assert!( + docker_script.contains("docker compose -f docker-compose.baseline.yml run --build --rm") + ); + assert!(docker_script.contains("scripts/real-world-operator-debug-live-adapters.sh")); + assert!(script.contains("apps/elf-eval/fixtures/real_world_job/operator_debugging_ux")); + assert!(script.contains("elf_operator_debug_live")); + assert!(script.contains("qmd_operator_debug_live")); + assert!(script.contains("elf.real_world_operator_debug_live_adapter_sweep/v1")); + assert!(script.contains("trace_available")); + assert!(script.contains("replay_command_available")); + assert!(live_adapter.contains("fn operator_debug_output(")); + assert!(live_adapter.contains("fn qmd_replay_command(")); + assert!(live_adapter.contains("fn elf_replay_command(")); + assert!( + !live_adapter + .contains("does not yet hydrate full operator trace/viewer diagnostics for this suite") + ); + assert!(benchmark.contains("Replay command:")); + assert!(benchmark.contains("replay_command_available")); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_outcomes() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-outcome-test", + |scenario| set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("win")), + )?; + + assert!(!output.status.success(), "invalid scenario outcome unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with win outcome") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_unmeasured_win_loss_scenario_positions() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("not_encoded"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("wins"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("not_tested")) + }, + )?; + + assert!(!output.status.success(), "invalid scenario position unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr).contains("not_encoded status with wins position") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_blocked_status_without_blocked_outcome() -> Result<()> { + let output = run_external_manifest_scenario_mutation( + "invalid-blocked-scenario-outcome-test", + "letta_research_gate", + "stale_core_detection", + |scenario| { + scenario + .as_object_mut() + .ok_or_else(|| eyre::eyre!("scenario is not an object"))? + .remove("comparison_outcome"); + + Ok(()) + }, + )?; + + assert!(!output.status.success(), "invalid blocked scenario unexpectedly passed"); + assert!( + String::from_utf8_lossy(&output.stderr) + .contains("blocked status without blocked comparison outcome") + ); + + Ok(()) +} + +#[test] +fn external_adapter_manifest_rejects_conflicting_scenario_position_and_outcome() -> Result<()> { + let output = run_external_manifest_with_letta_attachment_mutation( + "invalid-scenario-position-outcome-test", + |scenario| { + set_json_pointer(scenario, "/status", serde_json::json!("pass"))?; + set_json_pointer(scenario, "/elf_position", serde_json::json!("ties"))?; + + set_json_pointer(scenario, "/comparison_outcome", serde_json::json!("loss")) + }, + )?; + + assert!(!output.status.success(), "conflicting scenario unexpectedly passed"); + assert!(String::from_utf8_lossy(&output.stderr).contains("ties position with loss outcome")); + + Ok(()) +} + +#[test] +fn live_adapter_supports_elf_capture_write_policy_without_external_hook_claims() -> Result<()> { + let workspace = workspace_root()?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let live_script = + fs::read_to_string(workspace.join("scripts").join("real-world-live-adapters.sh"))?; + let manifest = fs::read_to_string( + workspace + .join("apps/elf-eval/fixtures/real_world_external_adapters") + .join("memory_projects_manifest.json"), + )?; + + assert!(live_adapter.contains("fn is_elf_capture_live_adapter(")); + assert!(live_adapter.contains("suite == \"capture_integration\"")); + assert!(live_adapter.contains("write_policy_audit_count")); + assert!(live_adapter.contains("excluded_evidence_ids")); + assert!(live_adapter.contains("source_id")); + assert!(live_adapter.contains("runtime_source_refs")); + assert!(live_adapter.contains("validate_capture_runtime_evidence")); + assert!(live_adapter.contains("capture_failure")); + assert!(live_adapter.contains("fn materialize_elf_consolidation(")); + assert!(live_adapter.contains("ConsolidationProposalReviewRequest")); + assert!(live_adapter.contains("fn materialize_elf_knowledge(")); + assert!(live_adapter.contains("KnowledgePageLintRequest")); + assert!(live_script.contains("OPERATOR_FIXTURE_DIR")); + assert!(live_script.contains("INPUT_FIXTURE_DIR")); + assert!(live_script.contains("operator_debugging_ux")); + assert!(manifest.contains("\"scenario_id\": \"live_capture_write_policy\"")); + assert!(manifest.contains("\"scenario_id\": \"capture_write_policy_hooks\"")); + assert!(manifest.contains("\"comparison_outcome\": \"blocked\"")); + assert!(manifest.contains("Four redaction, exclusion, source-id, evidence-binding")); + assert!(manifest.contains("durable upstream agentmemory session/capture path")); + assert!(manifest.contains("Docker-contained session directory")); + assert!(manifest.contains("claude-mem hooks, viewer, timeline, and observation workflows")); + + Ok(()) +} + +#[test] +fn declared_not_encoded_consolidation_jobs_do_not_require_fake_proposals() -> Result<()> { + let fixture_path = consolidation_fixture_dir().join("contradiction_report_discard.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + fixture + .pointer_mut("/corpus/adapter_response") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing adapter_response object"))? + .remove("consolidation"); + + let encoding = serde_json::json!({ + "status": "not_encoded", + "reason": "The qmd live adapter retrieves evidence-linked answers but does not generate or review consolidation proposals." + }); + + fixture + .as_object_mut() + .ok_or_else(|| eyre::eyre!("fixture is not an object"))? + .insert("encoding".to_string(), encoding); + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-not-encoded-consolidation-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("not_encoded_consolidation.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn capture_write_policy_live_report_preserves_competitor_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + capture_write_policy_live_report_path()?, + )?)?; + let markdown = fs::read_to_string(capture_write_policy_live_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.capture_write_policy_live_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-933")); + assert_eq!( + report + .pointer("/live_capture_results/elf_live_real_world/suite_status") + .and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report + .pointer("/live_capture_results/elf_live_real_world/encoded_job_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_capture_results/elf_live_real_world/redaction_leak_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/live_capture_results/qmd_live_real_world/suite_status") + .and_then(Value::as_str), + Some("not_encoded") + ); + + let jobs = array_at(&report, "/jobs")?; + let source_binding = find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; + let source_binding_refs = array_at(source_binding, "/runtime_source_refs")?; + let release_summary_ref = + find_by_field(source_binding_refs, "/evidence_id", "source-id-release-summary")?; + + assert!(array_contains_str(source_binding, "/source_ids", "capture:issue-comment-42")?); + assert_eq!( + release_summary_ref.pointer("/source_id").and_then(Value::as_str), + Some("capture:issue-comment-42") + ); + assert_eq!( + release_summary_ref.pointer("/evidence_binding").and_then(Value::as_str), + Some("source_ref") + ); + + let write_policy = find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; + + assert_eq!( + write_policy.pointer("/write_policy_redaction_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + write_policy + .pointer("/runtime_source_refs/0/write_policy_applied") + .and_then(Value::as_bool), + Some(true) + ); + + let boundary = find_by_field(jobs, "/job_id", "capture-integration-boundaries-001")?; + + assert!(array_contains_str(boundary, "/excluded_evidence_ids", "private-span-trap")?); + assert!(!array_contains_str(boundary, "/stored_evidence_ids", "private-span-trap")?); + assert!( + array_at(boundary, "/runtime_source_refs")? + .iter() + .all(|item| item.pointer("/evidence_id").and_then(Value::as_str) + != Some("private-span-trap")) + ); + + let positions = array_at(&report, "/competitor_positions")?; + let qmd = find_by_field(positions, "/project", "qmd")?; + let agentmemory = find_by_field(positions, "/project", "agentmemory")?; + let claude_mem = find_by_field(positions, "/project", "claude-mem")?; + + assert_eq!(qmd.pointer("/position").and_then(Value::as_str), Some("untested")); + assert!(qmd.pointer("/reason").and_then(Value::as_str).is_some_and(|reason| { + reason.contains("typed not_encoded") && reason.contains("ELF self-check") + })); + assert_eq!(agentmemory.pointer("/position").and_then(Value::as_str), Some("blocked")); + assert!(agentmemory.pointer("/reason").and_then(Value::as_str).is_some_and(|reason| { + reason.contains("process-local StateKV Map") && reason.contains("in-memory index") + })); + assert_eq!(claude_mem.pointer("/position").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/reason") + .and_then(Value::as_str) + .is_some_and(|reason| reason.contains("hooks, timeline, observations") + && reason.contains("Docker-contained hook/viewer runner")) + ); + assert!(markdown.contains("ELF now has live capture/write-policy self-check evidence")); + assert!(markdown.contains("not an ELF-over-qmd win")); + assert!(markdown.contains("| claude-mem capture/viewer flows | `blocked` |")); + assert!(!markdown.contains("claude-mem capture breadth is untested")); + assert!(markdown.contains("runtime `source_ref` metadata returned by search")); + assert!(markdown.contains("Do not claim ELF broadly beats agentmemory or claude-mem")); + assert!(benchmarking_index.contains("2026-06-11-capture-write-policy-live-report.md")); + assert!(readme.contains("Capture/Write-Policy Live Report - June 11, 2026")); + assert!(readme.contains("mem0/OpenMemory")); + assert!(readme.contains("and memsearch now pass their scoped local baseline")); + assert!( + collapse_whitespace(&readme) + .contains("claude-mem hook/viewer capture remains blocked until Docker-contained") + ); + + Ok(()) +} + +#[test] +fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result<()> { + let workspace = workspace_root()?; + let report = serde_json::from_str::(&fs::read_to_string( + live_consolidation_proposal_scoring_report_path()?, + )?)?; + let markdown = fs::read_to_string(live_consolidation_proposal_scoring_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + let benchmark_guide = fs::read_to_string( + workspace + .join("docs") + .join("guide") + .join("benchmarking") + .join("real_world_agent_memory_benchmark.md"), + )?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let live_script = + fs::read_to_string(workspace.join("scripts/real-world-consolidation-live-adapter.sh"))?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.live_consolidation_proposal_scoring_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-934")); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/suite_status") + .and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/encoded_job_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/proposal_count") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/source_mutation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/elf_live_real_world/review_event_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/live_consolidation_results/qmd_live_real_world/suite_status") + .and_then(Value::as_str), + Some("not_encoded") + ); + + let jobs = array_at(&report, "/jobs")?; + let project_summary = + find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; + let preference = + find_by_field(jobs, "/job_id", "consolidation-preference-candidate-defer-001")?; + let contradiction = + find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!( + project_summary.pointer("/final_review_state").and_then(Value::as_str), + Some("applied") + ); + assert_eq!(project_summary.pointer("/review_event_count").and_then(Value::as_u64), Some(2)); + assert_eq!(preference.pointer("/final_review_state").and_then(Value::as_str), Some("archived")); + assert_eq!( + contradiction.pointer("/final_review_state").and_then(Value::as_str), + Some("rejected") + ); + assert_eq!( + contradiction.pointer("/unsupported_claim_flag_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(contradiction.pointer("/source_lineage_count").and_then(Value::as_u64), Some(3)); + + let positions = array_at(&report, "/reference_positions")?; + let qmd = find_by_field(positions, "/project", "qmd")?; + let managed = find_by_field(positions, "/project", "managed_dreaming_memory_systems")?; + let always_on = find_by_field(positions, "/project", "always_on_memory_agent_patterns")?; + + assert_eq!(qmd.pointer("/position").and_then(Value::as_str), Some("untested")); + assert_eq!(managed.pointer("/position").and_then(Value::as_str), Some("product_reference")); + assert_eq!(always_on.pointer("/position").and_then(Value::as_str), Some("product_reference")); + assert!(markdown.contains("ELF now has service-backed live consolidation proposal scoring")); + assert!(markdown.contains("This is not scheduled production consolidation")); + assert!(markdown.contains("Source mutations")); + assert!(markdown.contains("Do not mix knowledge-page rebuild/lint scoring")); + assert!( + benchmarking_index.contains("2026-06-16-live-consolidation-proposal-scoring-report.md") + ); + assert!(readme.contains("Live Consolidation Proposal Scoring Report - June 16, 2026")); + assert!(readme.contains("real-world-memory-live-consolidation")); + assert!(benchmark_guide.contains("Current live consolidation increment")); + assert!(benchmark_guide.contains("tmp/real-world-memory/live-consolidation/summary.json")); + assert!(makefile.contains("[tasks.real-world-memory-live-consolidation]")); + assert!(makefile.contains("scripts/real-world-docker.sh")); + + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + + assert!(docker_script.contains("scripts/real-world-consolidation-live-adapter.sh")); + assert!(live_script.contains("elf.real_world_consolidation_live_adapter_sweep/v1")); + assert!(live_script.contains("real_world_live_adapter -- elf")); + assert!(!live_script.contains("real_world_live_adapter -- qmd")); + assert!(live_adapter.contains("fn materialize_elf_consolidation(")); + assert!(live_adapter.contains("ConsolidationProposalReviewRequest")); + + Ok(()) +} + +fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { + let suites = array_at(adapter, "/suites")?; + let capabilities = array_at(adapter, "/capabilities")?; + let adapter_id = adapter.pointer("/adapter_id").and_then(Value::as_str).unwrap_or_default(); + let targeted = find_by_field(capabilities, "/capability", "targeted_live_pass")?; + let full_pass = find_by_field(capabilities, "/capability", "full_suite_live_pass")?; + let work_resume = find_by_field(suites, "/suite_id", "work_resume")?; + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + let consolidation = find_by_field(suites, "/suite_id", "consolidation")?; + let knowledge = find_by_field(suites, "/suite_id", "knowledge_compilation")?; + let operator_debug = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + let capture = find_by_field(suites, "/suite_id", "capture_integration")?; + let personalization = find_by_field(suites, "/suite_id", "personalization")?; + let core_archival = find_by_field(suites, "/suite_id", "core_archival_memory")?; + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + let trust_sot = find_by_field(suites, "/suite_id", "trust_source_of_truth")?; + let retrieval = find_by_field(suites, "/suite_id", "retrieval")?; + let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(suites.len(), 13); + assert_eq!(targeted.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(full_pass.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert!( + adapter + .pointer("/result/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("55 jobs across all 13 checked-in suites")) + ); + assert_eq!(trust_sot.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + production_ops.pointer("/status").and_then(Value::as_str), + Some(production_ops_status) + ); + + if adapter_id == "elf_live_real_world" { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!( + capture + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("4/4 capture_integration jobs")) + ); + } else { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + } + + assert_eq!(personalization.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + + Ok(()) +} + +#[test] +fn runner_discovers_nested_fixture_layout() -> Result<()> { + let report = run_json_report_from(fixture_root())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); + + Ok(()) +} + +#[test] +fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<()> { + let report = run_json_report_from(operator_debug_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!( + report.pointer("/summary/operator_debug_job_count").and_then(Value::as_u64), + Some(6) + ); + assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(2) + ); + + let jobs = array_at(&report, "/jobs")?; + let dropped = find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; + let selected = find_by_field(jobs, "/job_id", "operator-debug-selected-not-narrated-001")?; + + assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + dropped.pointer("/operator_debug/raw_sql_needed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + dropped.pointer("/operator_debug/dropped_candidate_visibility").and_then(Value::as_str), + Some("visible in Retrieval Funnel and Replay Candidates") + ); + assert_eq!( + dropped.pointer("/operator_debug/viewer_url").and_then(Value::as_str), + Some("/viewer?trace_id=11111111-1111-4111-8111-111111111111") + ); + assert_eq!( + dropped.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("filter.read_profile") + ); + assert!(array_contains_str( + dropped, + "/trace_explainability/stages/1/dropped_evidence", + "trace-dropped-expected" + )?); + assert!(array_contains_str( + dropped, + "/trace_explainability/stages/1/distractor_evidence", + "trace-dropped-decoy" + )?); + assert!(array_contains_str(dropped, "/produced_evidence", "trace-dropped-expected")?); + assert_eq!(selected.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + selected.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("selection.narration") + ); + assert_eq!( + selected.pointer("/operator_debug/failure_mode").and_then(Value::as_str), + Some("selected_but_not_narrated") + ); + + Ok(()) +} + +#[test] +fn consolidation_fixtures_report_reviewable_proposal_metrics() -> Result<()> { + let report = run_json_report_from(consolidation_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/consolidation/executable_gap_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/consolidation/lineage_completeness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/consolidation/review_action_correctness").and_then(Value::as_f64), + Some(1.0) + ); + + let jobs = array_at(&report, "/jobs")?; + let project_summary = + find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; + let contradiction = + find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!( + project_summary + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("apply") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("discard") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let consolidation_suite = find_by_field(suites, "/suite_id", "consolidation")?; + + assert_eq!(consolidation_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} + +#[test] +fn knowledge_fixtures_report_page_metrics() -> Result<()> { + let report = run_json_report_from(knowledge_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/knowledge/section_count").and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.9) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/rebuild_determinism").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_count").and_then(Value::as_u64), + Some(9) + ); + assert_eq!( + report.pointer("/summary/knowledge/pages_with_backlinks").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.969) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/knowledge/allowed_variance_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let knowledge_suite = find_by_field(suites, "/suite_id", "knowledge_compilation")?; + + assert_eq!(knowledge_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); + + let jobs = array_at(&report, "/jobs")?; + let project_page_job = find_by_field(jobs, "/job_id", "knowledge-project-page-001")?; + + assert_eq!( + project_page_job.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + project_page_job.pointer("/knowledge/untraced_section_count").and_then(Value::as_u64), + Some(0) + ); + + Ok(()) +} + +#[test] +fn project_decisions_fixtures_report_decision_policy_cases() -> Result<()> { + let report = run_json_report_from(project_decisions_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + + let suites = array_at(&report, "/suites")?; + let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + + let jobs = array_at(&report, "/jobs")?; + let accepted = find_by_field(jobs, "/job_id", "project-decision-accepted-typed-failures-001")?; + let reversal = find_by_field(jobs, "/job_id", "project-decision-reversal-live-baseline-001")?; + let validation = + find_by_field(jobs, "/job_id", "project-decision-current-validation-gate-001")?; + let tradeoff = find_by_field(jobs, "/job_id", "project-decision-tradeoff-fixture-backed-001")?; + let caveat = find_by_field(jobs, "/job_id", "project-decision-private-manifest-caveat-001")?; + + assert_eq!(accepted.pointer("/answer_type").and_then(Value::as_str), Some("decision_record")); + assert_eq!( + accepted.pointer("/expected_evidence").and_then(Value::as_array).map(Vec::len), + Some(2) + ); + assert_eq!( + reversal.pointer("/evolution/historical_evidence/0").and_then(Value::as_str), + Some("live-baseline-suite-win-old") + ); + assert_eq!( + validation.pointer("/evolution/current_evidence/0").and_then(Value::as_str), + Some("validation-gate-current-decodex") + ); + assert_eq!(tradeoff.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); + assert_eq!(caveat.pointer("/can_answer_unknown").and_then(Value::as_bool), Some(true)); + + for job in jobs { + let expected_evidence = array_at(job, "/expected_evidence")?; + + assert!( + !expected_evidence.is_empty(), + "project decision job {} must declare required evidence", + job.pointer("/job_id").and_then(Value::as_str).unwrap_or("") + ); + } + for entry in fs::read_dir(project_decisions_fixture_dir())? { + let path = entry?.path(); + + if path.extension().and_then(|ext| ext.to_str()) != Some("json") { + continue; + } + + let fixture = serde_json::from_str::(&fs::read_to_string(path)?)?; + let required_evidence = array_at(&fixture, "/required_evidence")?; + let negative_traps = array_at(&fixture, "/negative_traps")?; + + assert!(!required_evidence.is_empty()); + assert!(!negative_traps.is_empty()); + } + + Ok(()) +} + +#[test] +fn qmd_openviking_strength_profile_report_preserves_claim_boundaries() -> Result<()> { + let report = + serde_json::from_str::(&fs::read_to_string(strength_profile_report_path()?)?)?; + let markdown = fs::read_to_string(strength_profile_markdown_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let iteration_direction = fs::read_to_string(iteration_direction_report_path()?)?; + + assert_strength_profile_summary(&report); + assert_strength_profile_terms(&report)?; + assert_qmd_strength_profile(&report)?; + assert_qmd_wrong_result_diagnosis(&report)?; + assert_openviking_strength_profile(&report)?; + assert_strength_profile_json_claim_boundaries(&report)?; + assert_strength_profile_markdown_boundaries(&markdown); + assert_operator_facing_strength_profile_boundaries( + &readme, + &benchmarking_index, + &iteration_direction, + ); + + Ok(()) +} + +#[test] +fn current_benchmark_reports_preserve_live_sweep_boundaries() -> Result<()> { + let measurement_audit = fs::read_to_string(measurement_coverage_audit_path()?)?; + let measurement_audit_json = serde_json::from_str::(&fs::read_to_string( + measurement_coverage_audit_json_path()?, + )?)?; + let competitor_matrix = fs::read_to_string(competitor_strength_matrix_path()?)?; + let competitor_matrix_json = serde_json::from_str::(&fs::read_to_string( + competitor_strength_matrix_json_path()?, + )?)?; + let iteration_direction = fs::read_to_string(iteration_direction_report_path()?)?; + let external_manifest = fs::read_to_string(external_adapter_manifest_path())?; + let comparison_external_projects = fs::read_to_string(comparison_external_projects_path()?)?; + let retrieval_debug_profile = + serde_json::from_str::(&fs::read_to_string(retrieval_debug_profile_json_path()?)?)?; + let temporal_history = serde_json::from_str::(&fs::read_to_string( + temporal_history_competitor_gap_json_path()?, + )?)?; + + assert_current_report_text_boundaries( + &measurement_audit, + &competitor_matrix, + &iteration_direction, + &external_manifest, + &comparison_external_projects, + ); + + assert!(competitor_matrix.contains("claude-mem work_resume remains `not_encoded`")); + assert!(!competitor_matrix.contains("claude-mem `wrong_result`, OpenViking work_resume")); + + let qmd_live = find_by_field( + array_at(&measurement_audit_json, "/live_real_world_adapters")?, + "/adapter", + "qmd live CLI adapter", + )?; + + assert_eq!(qmd_live.pointer("/pass").and_then(Value::as_u64), Some(17)); + assert_eq!(qmd_live.pointer("/wrong_result").and_then(Value::as_u64), Some(6)); + assert_eq!(qmd_live.pointer("/expected_evidence_matched").and_then(Value::as_u64), Some(38)); + assert_eq!(qmd_live.pointer("/evidence_covered_count").and_then(Value::as_u64), Some(45)); + + let memory_evolution = find_by_field( + array_at(&measurement_audit_json, "/live_suite_breakdown")?, + "/suite", + "memory_evolution", + )?; + + assert_eq!( + memory_evolution.pointer("/elf_status_counts/wrong_result").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + memory_evolution.pointer("/qmd_status_counts/wrong_result").and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + retrieval_debug_profile + .pointer("/live_real_world_full_sweep_context/qmd/pass") + .and_then(Value::as_u64), + Some(17) + ); + assert_eq!( + retrieval_debug_profile + .pointer("/live_real_world_full_sweep_context/qmd/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); + + assert_competitor_strength_matrix_json(&competitor_matrix_json)?; + + let openmemory_command = find_by_field( + array_at(&temporal_history, "/commands")?, + "/command", + "cargo make openmemory-ui-export-readback", + )?; + + assert!( + openmemory_command + .pointer("/artifact") + .and_then(Value::as_str) + .is_some_and(|artifact| artifact.contains("tmp/live-baseline/mem0-checks.json") + && artifact.contains("tmp/live-baseline/mem0-openmemory-ui-export.json")) + ); + + Ok(()) +} + +fn assert_current_report_text_boundaries( + measurement_audit: &str, + competitor_matrix: &str, + iteration_direction: &str, + external_manifest: &str, + comparison_external_projects: &str, +) { + assert!( + measurement_audit.contains( + "| `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | `wrong_result:6` |" + ) + ); + assert!( + measurement_audit + .contains("qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence") + ); + assert!(measurement_audit.contains("Basic local smoke and local OSS history/readback pass")); + assert!(measurement_audit.contains("claude-mem hook/viewer capture is `blocked`")); + assert!(!measurement_audit.contains("claude-mem hook/viewer capture remains untested")); + assert!(!measurement_audit.contains("blocked or untested")); + + assert_measurement_audit_adapter_status_counts(measurement_audit); + + assert!( + competitor_matrix + .contains("broader live suites remain `wrong_result`, `blocked`, or `not_encoded`") + ); + assert!(competitor_matrix.contains( + "Overall adapter-status counts: 4 `pass`,\n6 `wrong_result`, 1 `lifecycle_fail`, 7 `blocked`, and 5 `not_encoded`." + )); + assert!(!competitor_matrix.contains("5 `blocked`, and 7 `not_encoded`")); + assert!( + competitor_matrix + .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") + ); + assert!(competitor_matrix.contains("scoped preference behavior is a measured tie")); + assert!( + !competitor_matrix.contains("mem0/OpenMemory and Letta personalization are `not_encoded`") + ); + assert!(external_manifest.contains( + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible." + )); + assert!(external_manifest.contains( + "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + )); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for scoped local OSS same-corpus retrieval") + ); + assert!( + comparison_external_projects + .contains("Benchmark-grounded for local same-corpus retrieval, reindex/update/delete") + ); + assert!(iteration_direction.contains("| Jobs | `55` |")); + assert!(iteration_direction.contains("| Encoded suites | `15` |")); + assert!(iteration_direction.contains("| Pass | `49` |")); + assert!(iteration_direction.contains("| Evidence coverage | `123/123` |")); + assert!(iteration_direction.contains("| Expected evidence recall | `115/115` |")); + + for stale_phrase in [ + "same live sweep shape as ELF", + "ELF and qmd live fail 5/6 jobs", + "both systems currently fail 5/6 live memory-evolution jobs", + "wrong_result, incomplete, blocked, and not_encoded states remain visible", + "broader live suites remain `wrong_result`, `incomplete`, or `not_encoded`", + "The qmd live real-world slice covers representative jobs only", + "| Jobs | `40` |", + "| Encoded suites | `11` |", + "| Jobs | `50` |", + "| Encoded suites | `14` |", + "| Pass | `38` |", + "| Pass | `45` |", + "| Evidence coverage | `115/115` |", + "| Expected evidence recall | `107/107` |", + "history/UI/hosted/graph behavior remains", + "current local adapter is incomplete/wrong-result", + "current adapter is incomplete/invalid-result", + ] { + assert!(!measurement_audit.contains(stale_phrase)); + assert!(!competitor_matrix.contains(stale_phrase)); + assert!(!iteration_direction.contains(stale_phrase)); + assert!(!external_manifest.contains(stale_phrase)); + assert!(!comparison_external_projects.contains(stale_phrase)); + } +} + +#[test] +fn live_temporal_reconciliation_report_records_xy905_before_after() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + live_temporal_reconciliation_report_json_path()?, + )?)?; + let markdown = fs::read_to_string(live_temporal_reconciliation_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.live_temporal_reconciliation_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-905")); + assert_eq!( + report + .pointer("/baseline/elf_memory_evolution/job_status_counts/pass") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/baseline/elf_memory_evolution/job_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report + .pointer("/post_stage/elf_memory_evolution/job_status_counts/pass") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/post_stage/elf_memory_evolution/job_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/post_stage/elf_memory_evolution/suite_status").and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report.pointer("/post_stage/qmd_memory_evolution/suite_status").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!( + report + .pointer("/comparison_judgment/current_vs_historical_correctness") + .and_then(Value::as_str), + Some("improved") + ); + assert_eq!( + report + .pointer("/comparison_judgment/deletion_ttl_tombstone_behavior") + .and_then(Value::as_str), + Some("unchanged") + ); + assert!(array_contains_str( + &report, + "/trace_contract/answer_fields", + "selected_historical_evidence" + )?); + assert!(array_contains_str( + &report, + "/trace_contract/materialization_fields", + "current_winner_evidence_ids" + )?); + assert!(array_contains_str( + &report, + "/trace_contract/trace_stages", + "temporal_reconciliation.conflict_candidates" + )?); + assert!(report.pointer("/trace_contract/negative_gate").and_then(Value::as_str).is_some_and( + |gate| gate.contains("selected conflict evidence id") && gate.contains("wrong_result") + )); + assert!(markdown.contains("ELF passing all six memory-evolution jobs")); + assert!(markdown.contains("selected-but-not-narrated conflicts as `wrong_result`")); + assert!(markdown.contains("Do not claim ELF beats Graphiti/Zep")); + assert!(benchmarking_index.contains("2026-06-16-live-temporal-reconciliation-report.md")); + assert!( + readme.contains("Live Temporal Reconciliation Report - June 16, 2026") + && readme.contains("now reports ELF live `memory_evolution` as 6/6 pass") + ); + + Ok(()) +} + +#[test] +fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + trace_replay_diagnostics_report_path()?, + )?)?; + let markdown = fs::read_to_string(trace_replay_diagnostics_markdown_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let adoption_report = fs::read_to_string(competitor_strength_adoption_report_path()?)?; + let adoption_json = serde_json::from_str::(&fs::read_to_string( + competitor_strength_adoption_report_json_path()?, + )?)?; + + assert_trace_replay_diagnostics_json(&report)?; + assert_trace_replay_diagnostics_markdown(&markdown); + + assert!(readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026")); + assert!(benchmarking_index.contains("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")); + assert!(benchmarking_index.contains("qmd top-10/replay artifact")); + assert!(benchmarking_index.contains("ELF trace/admin surfaces")); + assert!(adoption_report.contains("| Retrieval quality and local debug UX | `loss` |")); + assert!(adoption_report.contains("Letta scenario rows remain")); + assert!(adoption_report.contains("blocked or `not_tested`")); + + assert_trace_replay_viewer_blocker_boundaries( + &readme, + &markdown, + &adoption_report, + &report, + &adoption_json, + )?; + + assert!( + adoption_report + .contains("Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF") + ); + assert!(array_at(&adoption_json, "/adoption_decision/remaining_caveats")?.iter().any( + |caveat| { + caveat.as_str().is_some_and(|text| { + text.contains("Letta scenario rows remain blocked or not_tested") + }) + } + )); + + assert_trace_replay_adoption_json(&adoption_json)?; + + Ok(()) +} + +fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.trace_replay_diagnostics_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923")); + assert_eq!( + string_array_at(report, "/outcome_terms")?, + ["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned) + ); + assert_eq!( + report.pointer("/summary/retrieval_correctness").and_then(Value::as_str), + Some("tie") + ); + assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2)); + assert_eq!( + report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64), + Some(4) + ); + assert_eq!(report.pointer("/summary/outcome_counts/win").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/outcome_counts/tie").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); + + let scenarios = array_at(report, "/scenario_outcomes")?; + let retrieval = find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?; + let top10 = find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?; + let replay = find_by_field(scenarios, "/scenario_id", "replay_command_locality")?; + let trace_surface = + find_by_field(scenarios, "/scenario_id", "trace_admin_replay_surface_availability")?; + let operator_trace = + find_by_field(scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let operator_replay = + find_by_field(scenarios, "/scenario_id", "operator_debug_replay_command_availability")?; + let operator_candidate = + find_by_field(scenarios, "/scenario_id", "operator_debug_candidate_drop_visibility")?; + let operator_repair = + find_by_field(scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let operator_selected = + find_by_field(scenarios, "/scenario_id", "operator_debug_selected_but_not_narrated")?; + let expansion = find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?; + let dense_sparse = + find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?; + let fusion = find_by_field(scenarios, "/scenario_id", "fusion_attribution")?; + let rerank = find_by_field(scenarios, "/scenario_id", "rerank_attribution")?; + let candidate_drop = find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?; + let selected = + find_by_field(scenarios, "/scenario_id", "selected_but_not_narrated_wrong_results")?; + let tombstone = + find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?; + + assert_eq!(scenarios.len(), 16); + assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!( + operator_trace.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(operator_trace.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_trace.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(operator_replay.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_candidate.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(array_contains_str( + operator_candidate, + "/typed_non_pass_states", + "retrieved_but_dropped" + )?); + assert_eq!(operator_repair.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_selected.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(array_contains_str( + operator_selected, + "/typed_non_pass_states", + "selected_but_not_narrated" + )?); + assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal")); + assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), Some("non_goal")); + assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert!(array_contains_str(candidate_drop, "/typed_non_pass_states", "retrieved_but_dropped")?); + assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); + assert!(array_contains_str(selected, "/typed_non_pass_states", "selected_but_not_narrated")?); + assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result")); + assert!(array_contains_str( + report, + "/wrong_result_diagnostics/qmd_missing_evidence", + "delete-tombstone" + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "Do not claim qmd beats ELF as a memory system overall." + )?); + + Ok(()) +} + +fn assert_trace_replay_diagnostics_markdown(markdown: &str) { + assert!(markdown.contains("Retrieval correctness is still tied")); + assert!(markdown.contains("| Default top-10 candidate artifact |")); + assert!(markdown.contains("| Replay command locality |")); + assert!( + markdown + .contains("| Operator-debug trace hydration | `live_real_world` | `pass` | `win` |") + ); + assert!(markdown.contains( + "| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` |" + )); + assert!(markdown.contains( + "| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` |" + )); + assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); + assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |")); + assert!(markdown.contains("`retrieved_but_dropped` | Defined globally as `not_tested`")); + assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); + assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); + assert!(markdown.contains("cargo make real-world-job-operator-ux-live-adapters")); + assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); + assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); +} + +fn assert_trace_replay_viewer_blocker_boundaries( + readme: &str, + markdown: &str, + adoption_report: &str, + report: &Value, + adoption_json: &Value, +) -> Result<()> { + let checked_surfaces = [ + collapse_whitespace(readme), + collapse_whitespace(markdown), + collapse_whitespace(adoption_report), + report.to_string(), + adoption_json.to_string(), + ]; + + for surface in checked_surfaces { + assert!(!surface.contains("blocked or not encoded")); + } + + assert!( + collapse_whitespace(readme) + .contains("claude-mem viewer flows remain blocked until Docker-contained") + ); + assert!( + collapse_whitespace(markdown) + .contains("claude-mem UI repair paths remain blocked until Docker-contained") + ); + assert!( + collapse_whitespace(adoption_report) + .contains("claude-mem viewer workflows remain blocked until Docker-contained") + ); + + Ok(()) +} + +fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { + let local_debug = find_by_field( + array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "local_debug_replay_ux", + )?; + let operator_debug = find_by_field( + array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "operator_debugging_viewer_ux", + )?; + + assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert!( + local_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd stronger on immediate top-10")) + ); + assert!(array_contains_str( + local_debug, + "/command_artifacts", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + )?); + assert!(array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win." + )?); + assert_eq!(operator_debug.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!( + operator_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("narrow live operator-debug win over qmd")) + ); + assert!(array_contains_str( + operator_debug, + "/command_artifacts", + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + )?); + assert!(array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." + )?); + + Ok(()) +} + +fn assert_competitor_strength_matrix_json(matrix: &Value) -> Result<()> { + let projects = array_at(matrix, "/project_matrix")?; + let scenarios = array_at(matrix, "/scenario_matrix")?; + + assert_competitor_strength_matrix_manifest_counts(matrix); + assert_competitor_strength_matrix_project_json(projects)?; + assert_competitor_strength_matrix_scenario_json(scenarios)?; + + Ok(()) +} + +fn assert_competitor_strength_matrix_project_json(projects: &[Value]) -> Result<()> { + let qmd = find_by_field(projects, "/project", "qmd")?; + let mem0 = find_by_field(projects, "/project", "mem0/OpenMemory")?; + let claude_mem = find_by_field(projects, "/project", "claude-mem")?; + let openviking = find_by_field(projects, "/project", "OpenViking")?; + + assert_eq!( + qmd.pointer("/current_evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(qmd.pointer("/measured_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str), + Some("not_encoded") + ); + assert!(qmd.pointer("/benchmark_before_claim").and_then(Value::as_str).is_some_and(|claim| { + claim.contains("Keep qmd deep retrieval/debug profiling separate") + && claim.contains("narrow operator-debug live slice") + })); + assert!( + qmd.pointer("/borrow_if_stronger") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("transparent local knobs")) + ); + assert_eq!(mem0.pointer("/measured_status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + mem0.pointer("/unsupported_or_blocked_status/typed_reason").and_then(Value::as_str), + Some("openmemory_export_helper_setup_blocked") + ); + assert!( + mem0.pointer("/benchmark_before_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("OpenMemory product app import/export")) + ); + assert!( + claude_mem + .pointer("/unsupported_or_blocked_status/details") + .and_then(Value::as_str) + .is_some_and(|details| details.contains("rerun/inspection targets") + && details.contains("tmp/live-baseline/claude-mem-checks.json")) + ); + assert_eq!( + openviking.pointer("/current_evidence_class").and_then(Value::as_str), + Some("live_baseline_only") + ); + assert_eq!( + openviking.pointer("/measured_status").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!( + openviking.pointer("/unsupported_or_blocked_status/state").and_then(Value::as_str), + Some("blocked") + ); + assert!( + openviking + .pointer("/unsupported_or_blocked_status/details") + .and_then(Value::as_str) + .is_some_and(|details| details.contains("encoded as blocked fixtures")) + ); + assert!( + openviking + .pointer("/benchmark_before_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("evidence-bearing same-corpus output pass")) + ); + + Ok(()) +} + +fn assert_competitor_strength_matrix_scenario_json(scenarios: &[Value]) -> Result<()> { + let retrieval_debug = find_by_field(scenarios, "/scenario_id", "retrieval_debug")?; + let work_resume = find_by_field(scenarios, "/scenario_id", "work_resume")?; + let operator_debug = find_by_field(scenarios, "/scenario_id", "operator_debugging")?; + let context_trajectory = find_by_field(scenarios, "/scenario_id", "context_trajectory")?; + let consolidation = find_by_field(scenarios, "/scenario_id", "consolidation")?; + + assert!( + retrieval_debug + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("Measured tie on encoded retrieval answers")) + ); + assert!(retrieval_debug.pointer("/current_state").and_then(Value::as_str).is_some_and( + |state| state.contains("qmd remains stronger on local debug ergonomics not fully scored") + )); + assert!( + work_resume + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("claude-mem work_resume remains not_encoded") + && !claim.contains("claude-mem is wrong_result")) + ); + assert!( + operator_debug + .pointer("/current_elf_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("narrow live_real_world operator-debug slice")) + ); + assert!( + operator_debug + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd now has a narrow live_real_world")) + ); + assert!( + operator_debug + .pointer("/next_measurement") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("OpenMemory and claude-mem UI/export")) + ); + assert!( + consolidation + .pointer("/current_elf_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("XY-934 adds live_real_world") + && claim.contains("zero source mutations")) + ); + assert!( + consolidation + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd remains not_encoded") + && claim.contains("product references only")) + ); + + let personalization = find_by_field(scenarios, "/scenario_id", "personalization")?; + + assert_personalization_matrix_record(personalization); + + assert!( + context_trajectory + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("not a measured live winner")) + ); + assert!( + context_trajectory + .pointer("/next_measurement") + .and_then(Value::as_str) + .is_some_and(|measurement| measurement.contains("evidence-bearing retrieval pass")) + ); + + Ok(()) +} + +fn assert_personalization_matrix_record(personalization: &Value) { + assert!( + personalization + .pointer("/current_competitor_evidence") + .and_then(Value::as_str) + .is_some_and(|claim| claim + .contains("mem0/OpenMemory local OSS entity-scoped personalization now passes") + && claim.contains("Letta personalization is research_gate not_encoded")) + ); + assert!( + personalization + .pointer("/current_state") + .and_then(Value::as_str) + .is_some_and(|state| state.contains("scoped personalization is a tie")) + ); +} + +fn assert_competitor_strength_matrix_manifest_counts(matrix: &Value) { + assert_eq!( + matrix.pointer("/manifest_summary/adapter_records").and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/evidence_class_counts/live_real_world") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + matrix.pointer("/manifest_summary/overall_status_counts/pass").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + matrix.pointer("/manifest_summary/overall_status_counts/blocked").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/overall_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + matrix + .pointer("/manifest_summary/overall_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); +} + +fn assert_strength_profile_summary(report: &Value) { + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.competitor_strength_profile_report/v1") + ); + assert_eq!( + report.pointer("/summary/qmd/retrieval_quality").and_then(Value::as_str), + Some("tie") + ); + assert_eq!( + report.pointer("/summary/qmd/local_query_transparency").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + report.pointer("/summary/qmd/local_replayability").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + report.pointer("/summary/qmd/overall_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + report.pointer("/summary/openviking/overall_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + report + .pointer("/qmd_strength_profile/win_tie_loss_summary/elf_win") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/qmd_strength_profile/win_tie_loss_summary/tie").and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report + .pointer("/qmd_strength_profile/win_tie_loss_summary/elf_loss") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/qmd_strength_profile/win_tie_loss_summary/not_tested") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report + .pointer("/openviking_context_trajectory_profile/win_tie_loss_summary/not_tested") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report + .pointer("/openviking_context_trajectory_profile/win_tie_loss_summary/elf_win") + .and_then(Value::as_u64), + Some(1) + ); +} + +fn assert_strength_profile_terms(report: &Value) -> Result<()> { + let result_terms = array_at(report, "/result_type_terms")?; + let coverage_terms = array_at(report, "/coverage_status_terms")?; + let outcome_terms = array_at(report, "/outcome_terms")?; + let actual_result_terms = string_array_at(report, "/result_type_terms")?; + let actual_coverage_terms = string_array_at(report, "/coverage_status_terms")?; + + assert_eq!( + actual_result_terms, + [ + "pass", + "wrong_result", + "blocked", + "incomplete", + "lifecycle_fail", + "not_encoded", + "unsupported_claim", + ] + .map(str::to_owned) + ); + assert_eq!( + actual_coverage_terms, + [ + "pass", + "wrong_result", + "blocked", + "incomplete", + "lifecycle_fail", + "not_encoded", + "unsupported", + "unsupported_claim", + ] + .map(str::to_owned) + ); + assert!(!result_terms.iter().any(|term| term.as_str() == Some("unsupported"))); + assert!(!result_terms.iter().any(|term| term.as_str() == Some("partial"))); + assert!(!coverage_terms.iter().any(|term| term.as_str() == Some("partial"))); + assert!(result_terms.iter().any(|term| term.as_str() == Some("unsupported_claim"))); + assert!(coverage_terms.iter().any(|term| term.as_str() == Some("unsupported"))); + + assert_value_in_terms(report, "/summary/qmd/overall_outcome", outcome_terms)?; + assert_value_in_terms(report, "/summary/openviking/overall_outcome", outcome_terms)?; + + for scenario in array_at(report, "/qmd_strength_profile/scenario_outcomes")? { + assert_value_in_terms(scenario, "/result_type", result_terms)?; + assert_value_in_terms(scenario, "/elf_status", coverage_terms)?; + assert_value_in_terms(scenario, "/qmd_status", coverage_terms)?; + } + for scenario in array_at(report, "/openviking_context_trajectory_profile/scenario_outcomes")? { + assert_value_in_terms(scenario, "/result_type", result_terms)?; + assert_value_in_terms(scenario, "/openviking_status", coverage_terms)?; + assert_value_in_terms(scenario, "/elf_equivalent_status", coverage_terms)?; + } + + Ok(()) +} + +fn assert_value_in_terms(value: &Value, pointer: &str, terms: &[Value]) -> Result<()> { + let actual = value + .pointer(pointer) + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("missing string at {pointer}"))?; + + assert!( + terms.iter().any(|term| term.as_str() == Some(actual)), + "{actual} at {pointer} is not declared in the report term list" + ); + + Ok(()) +} + +fn assert_qmd_strength_profile(report: &Value) -> Result<()> { + let qmd_scenarios = array_at(report, "/qmd_strength_profile/scenario_outcomes")?; + let local_transparency = + find_by_field(qmd_scenarios, "/scenario_id", "qmd-local-query-transparency")?; + let retrieval = find_by_field(qmd_scenarios, "/scenario_id", "qmd-retrieval-quality")?; + let rerank_controls = + find_by_field(qmd_scenarios, "/scenario_id", "qmd-expansion-fusion-rerank-controls")?; + let stale_isolation = + find_by_field(qmd_scenarios, "/scenario_id", "qmd-stale-context-isolation")?; + let lifecycle = find_by_field(qmd_scenarios, "/scenario_id", "qmd-update-delete-cold-start")?; + let operator_debug = + find_by_field(qmd_scenarios, "/scenario_id", "qmd-operator-debug-evidence")?; + let replayability = find_by_field(qmd_scenarios, "/scenario_id", "qmd-local-replayability")?; + let wrong_result = find_by_field(qmd_scenarios, "/scenario_id", "qmd-wrong-result-diagnosis")?; + + assert_eq!(qmd_scenarios.len(), 8); + assert_eq!(retrieval.pointer("/elf_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!( + local_transparency.pointer("/elf_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!( + local_transparency.pointer("/result_type").and_then(Value::as_str), + Some("not_encoded") + ); + assert_eq!( + rerank_controls.pointer("/result_type").and_then(Value::as_str), + Some("not_encoded") + ); + assert_eq!(stale_isolation.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!(stale_isolation.pointer("/elf_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(lifecycle.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!(lifecycle.pointer("/elf_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_debug.pointer("/result_type").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(operator_debug.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(replayability.pointer("/result_type").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(replayability.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!( + wrong_result.pointer("/evidence_class").and_then(Value::as_str), + Some("research_gate") + ); + assert_eq!(wrong_result.pointer("/result_type").and_then(Value::as_str), Some("not_encoded")); + + Ok(()) +} + +fn assert_qmd_wrong_result_diagnosis(report: &Value) -> Result<()> { + let taxonomy = array_at(report, "/qmd_strength_profile/wrong_result_diagnosis/taxonomy")?; + let absent = find_by_field(taxonomy, "/class", "evidence_absent")?; + let dropped = find_by_field(taxonomy, "/class", "retrieved_but_dropped")?; + let narrated = find_by_field(taxonomy, "/class", "selected_but_not_narrated")?; + let lifecycle = find_by_field(taxonomy, "/class", "contradicted_by_lifecycle_evidence")?; + + assert_eq!(absent.pointer("/coverage").and_then(Value::as_str), Some("observed")); + assert_eq!( + dropped.pointer("/coverage").and_then(Value::as_str), + Some("not_observed_candidate_trace_missing") + ); + assert_eq!(narrated.pointer("/coverage").and_then(Value::as_str), Some("observed")); + assert_eq!(lifecycle.pointer("/coverage").and_then(Value::as_str), Some("observed")); + + let qmd_diagnosis_jobs = array_at(report, "/qmd_strength_profile/wrong_result_diagnosis/jobs")?; + let delete_job = + find_by_field(qmd_diagnosis_jobs, "/job_id", "memory-evolution-delete-ttl-001")?; + + assert_eq!(qmd_diagnosis_jobs.len(), 6); + assert_eq!(delete_job.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result")); + assert!(array_contains_str(delete_job, "/missing_evidence", "delete-tombstone")?); + assert!( + delete_job + .pointer("/diagnosis") + .and_then(Value::as_str) + .is_some_and(|diagnosis| diagnosis.contains("typed wrong_result")) + ); + + Ok(()) +} + +fn assert_openviking_strength_profile(report: &Value) -> Result<()> { + let openviking_scenarios = + array_at(report, "/openviking_context_trajectory_profile/scenario_outcomes")?; + let trajectory = find_by_field( + openviking_scenarios, + "/scenario_id", + "openviking-staged-retrieval-trajectory", + )?; + let precondition = find_by_field( + openviking_scenarios, + "/scenario_id", + "openviking-evidence-bearing-retrieval-precondition", + )?; + let local_embed_setup = + find_by_field(openviking_scenarios, "/scenario_id", "openviking-local-embed-setup")?; + let missed_terms = find_by_field( + openviking_scenarios, + "/scenario_id", + "openviking-missed-expected-terms-evidence", + )?; + let hierarchy = + find_by_field(openviking_scenarios, "/scenario_id", "openviking-hierarchy-selection")?; + let recursive_expansion = find_by_field( + openviking_scenarios, + "/scenario_id", + "openviking-recursive-context-expansion", + )?; + + assert_eq!(openviking_scenarios.len(), 6); + assert_eq!( + trajectory.pointer("/evidence_class").and_then(Value::as_str), + Some("fixture_backed") + ); + assert_eq!(trajectory.pointer("/result_type").and_then(Value::as_str), Some("blocked")); + assert_eq!(trajectory.pointer("/openviking_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(local_embed_setup.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!( + local_embed_setup.pointer("/elf_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert_eq!(local_embed_setup.pointer("/typed_blocker"), Some(&Value::Null)); + assert_eq!(precondition.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(precondition.pointer("/elf_outcome").and_then(Value::as_str), Some("elf_win")); + assert_eq!( + precondition.pointer("/typed_blocker").and_then(Value::as_str), + Some("output_missed_expected_terms") + ); + assert_eq!(missed_terms.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(missed_terms.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(hierarchy.pointer("/result_type").and_then(Value::as_str), Some("blocked")); + assert_eq!(hierarchy.pointer("/elf_outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!( + recursive_expansion.pointer("/result_type").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + recursive_expansion.pointer("/elf_outcome").and_then(Value::as_str), + Some("not_tested") + ); + + Ok(()) +} + +fn assert_strength_profile_json_claim_boundaries(report: &Value) -> Result<()> { + assert!(array_contains_str( + report, + "/claim_boundaries", + "ELF does not broadly beat qmd; it ties encoded retrieval and lifecycle correctness, keeps qmd query transparency as not_tested for comparative scoring, and leaves replayability not_tested." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "qmd expansion, fusion, and rerank superiority remains not_tested because the current qmd paths use --no-rerank and do not score internals." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain blocked/not_tested behind a wrong_result same-corpus output precondition and missing staged artifacts." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "Research_gate and blocked fixture records are follow-up gates, not pass evidence." + )?); + assert!(array_contains_str( + report, + "/claim_boundaries", + "Missing equivalent surfaces are encoded as unsupported, blocked, or not_encoded rather than fake losses." + )?); + + Ok(()) +} + +fn assert_strength_profile_markdown_boundaries(markdown: &str) { + assert!( + markdown.contains( + "| Wrong-result diagnosis | `research_gate` | `not_encoded` | `not_tested` |" + ) + ); + assert!( + markdown.contains("ELF ties qmd on the current encoded retrieval-correctness surfaces") + ); + assert!(markdown.contains("qmd remains the local retrieval-debug UX reference")); + assert!(markdown.contains("not scored as comparative ELF wins or losses")); + assert!(markdown.contains("ELF currently wins only the equivalent OpenViking same-corpus")); + assert!(markdown.contains("Do not claim ELF broadly beats qmd")); + assert!(markdown.contains( + "Do not claim ELF beats OpenViking on staged retrieval, hierarchy, or recursive" + )); + assert!(markdown.contains( + "Do not turn `research_gate`, `blocked`, `not_encoded`, or `unsupported` surfaces" + )); + assert!(markdown.contains("no pass evidence is claimed")); + assert!(markdown.contains("typed `wrong_result` state")); +} + +fn assert_operator_facing_strength_profile_boundaries( + readme: &str, + benchmarking_index: &str, + iteration_direction: &str, +) { + assert!(readme.contains("Full-suite live real-world adapter sweep after XY-926")); + assert!(readme.contains("all 55 checked-in jobs across 13 suites")); + assert!(readme.contains("ELF now live-scores capture/write-policy")); + assert!(readme.contains("consolidation proposal review")); + assert!(readme.contains("knowledge-page rebuild/lint")); + assert!(readme.contains("operator-debugging fixtures")); + assert!(!readme.contains("memory-evolution wrong results")); + assert!(readme.contains("Live temporal reconciliation after XY-905")); + assert!(readme.contains("now reports ELF live `memory_evolution` as 6/6 pass")); + assert!(readme.contains("broad qmd, Graphiti/Zep, mem0/OpenMemory, Letta")); + assert!(readme.contains("production-ops operator boundaries")); + assert!(readme.contains("core/archival live adapter gap")); + assert!(collapse_whitespace(readme).contains("blocked context-trajectory measurement")); + assert!( + readme + .contains("consolidation, knowledge, capture, and core/archival typed non-pass states") + ); + assert!(readme.contains("operator-debug trace hydration")); + assert!(readme.contains("qmd remains the local retrieval-debug UX reference")); + assert!(readme.contains("broad ELF-over-qmd")); + assert!(readme.contains("qmd and OpenViking Strength-Profile Report - June 11, 2026")); + assert!(benchmarking_index.contains("2026-06-11-qmd-openviking-strength-profile-report.md")); + assert!( + benchmarking_index.contains("separates qmd retrieval quality from debug/replay ergonomics") + ); + assert!(benchmarking_index.contains("preserves XY-928 OpenViking")); + assert!( + benchmarking_index + .contains("context-trajectory surfaces as blocked/not-tested until scored staged") + ); + assert!( + iteration_direction + .contains("ELF and qmd are tied on the encoded live retrieval, work-resume, and") + ); + assert!(iteration_direction.contains("ELF does not yet beat qmd's local retrieval-debug")); + + assert_iteration_direction_current_measurement_counts(iteration_direction); + + assert!(iteration_direction.contains( + "ELF beats OpenViking on context trajectory. The scenario is encoded as blocked" + )); + assert!( + iteration_direction + .contains("Do not promote a reference project into a win/loss claim until") + ); +} + +fn assert_measurement_audit_adapter_status_counts(markdown: &str) { + for expected in [ + "| `blocked` | `7` |", + "| `not_encoded` | `5` |", + "The generated JSON report emits `external_project_count: 16`", + ] { + assert!(markdown.contains(expected), "missing measurement audit text: {expected}"); + } + for stale in ["| `blocked` | `6` |", "| `not_encoded` | `6` |"] { + assert!(!markdown.contains(stale), "stale measurement audit text: {stale}"); + } +} + +fn assert_iteration_direction_current_measurement_counts(markdown: &str) { + for expected in [ + "| Jobs | `55` |", + "| Encoded suites | `15` |", + "| Blocked | `6` |", + "| Mean score | `0.891` |", + "| Evidence coverage | `123/123` |", + "| Source-ref coverage | `123/123` |", + "| Quote coverage | `123/123` |", + "| Expected evidence recall | `115/115` |", + "| `blocked` | `7` |", + "| `not_encoded` | `5` |", + "`live_baseline_only`, `fixture_backed`, and `research_gate`", + "`blocked` for fixture-backed trajectory gates", + ] { + assert!(markdown.contains(expected), "missing iteration-direction text: {expected}"); + } + for stale in [ + "| Jobs | `40` |", + "| Encoded suites | `11` |", + "| Jobs | `50` |", + "| Encoded suites | `14` |", + "| Mean score | `0.950` |", + "| Mean score | `0.900` |", + "| Evidence coverage | `88/88` |", + "| Evidence coverage | `115/115` |", + "| Expected evidence recall | `80/80` |", + "| Expected evidence recall | `107/107` |", + "| `blocked` | `5` |", + "| `not_encoded` | `7` |", + "`live_baseline_only` plus `research_gate`", + ] { + assert!(!markdown.contains(stale), "stale iteration-direction text: {stale}"); + } +} + +#[test] +fn generated_json_report_renders_markdown() -> Result<()> { + let report = run_json_report()?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-job-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("# Real-World Job Benchmark Report")); + assert!(markdown.contains("work_resume")); + assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("External Adapter Coverage")); + assert!(markdown.contains("live-baseline-only")); + assert!(markdown.contains("live real-world")); + assert!(markdown.contains("does not convert live-baseline retrieval results")); + assert!(markdown.contains("fixture-backed")); + assert!(markdown.contains("Answer Type")); + assert!(markdown.contains("Caveat Required")); + assert!(markdown.contains("Refusal Required")); + assert!(markdown.contains("agentmemory-style hook capture")); + assert!(markdown.contains("xy844-current-worktree")); + assert!(markdown.contains("Existing live-baseline reports remain valid")); + assert!(markdown.contains("### Adapter Scenario Judgments")); + assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=35`")); + assert!(markdown.contains( + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=17, blocked=13, non_goal=5`" + )); + assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); + assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); + + Ok(()) +} + +#[test] +fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { + let mut report = run_json_report()?; + let adapters = report + .pointer_mut("/external_adapters/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; + let adapter = adapters + .iter_mut() + .find(|adapter| { + adapter.pointer("/adapter_id").and_then(Value::as_str) + == Some("agentmemory_live_baseline") + }) + .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; + + set_json_pointer(adapter, "/scenarios/0/elf_position", serde_json::json!("loses"))?; + set_json_pointer(adapter, "/scenarios/0/comparison_outcome", serde_json::json!("loss"))?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_position_counts", + serde_json::json!({ + "wins": 2, + "ties": 4, + "loses": 2, + "untested": 10 + }), + )?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 2, + "tie": 4, + "loss": 2, + "not_tested": 7, + "blocked": 1, + "non_goal": 2 + }), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-loss-scenario-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=2, untested=10`")); + assert!(markdown.contains( + "Scenario comparison outcomes: `win=2, tie=4, loss=2, not_tested=7, blocked=1, non_goal=2`" + )); + assert!(markdown.contains( + "| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `loss` |" + )); + + Ok(()) +} + +#[test] +fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenarios() -> Result<()> { + let mut report = run_json_report()?; + let adapters = report + .pointer_mut("/external_adapters/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; + + for adapter in adapters { + set_json_pointer(adapter, "/scenarios", serde_json::json!([]))?; + } + + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_status_counts", + serde_json::json!({ + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 0, + "incomplete": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "pass": 0, + "not_encoded": 0 + }), + )?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_position_counts", + serde_json::json!({ + "wins": 0, + "ties": 0, + "loses": 0, + "untested": 0 + }), + )?; + set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 0, + "tie": 0, + "loss": 0, + "not_tested": 0, + "blocked": 0, + "non_goal": 0 + }), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-no-scenario-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("External Adapter Coverage")); + assert!(!markdown.contains("Scenario coverage statuses:")); + assert!(!markdown.contains("ELF scenario positions:")); + assert!(!markdown.contains("Scenario comparison outcomes:")); + assert!(!markdown.contains("### Adapter Scenario Judgments")); + + Ok(()) +} + +#[test] +fn mem0_delete_audit_probe_requires_explicit_delete_history_event() -> Result<()> { + let script = + fs::read_to_string(workspace_root()?.join("scripts").join("live-baseline-benchmark.sh"))?; + + assert!(script.contains("def history_has_event")); + assert!(script.contains("str(entry.get(\"event\", \"\")).upper() == expected")); + assert!(script.contains( + "history_has_event(\n preference_history[\"history\"],\n \"ADD\"," + )); + assert!(script.contains( + "history_has_event(\n preference_history[\"history\"],\n \"UPDATE\"," + )); + assert!( + script.contains( + "history_has_event(\n delete_history[\"history\"],\n \"DELETE\"," + ) + ); + assert!( + !script.contains( + "contains_terms(\n delete_history[\"history\"],\n [\"delete\"]," + ) + ); + + Ok(()) +} + +#[test] +fn dreaming_readiness_stage_ledger_preserves_gate_shape() -> Result<()> { + let ledger = serde_json::from_str::(&fs::read_to_string( + dreaming_readiness_stage_ledger_json_path()?, + )?)?; + let markdown = fs::read_to_string(dreaming_readiness_stage_ledger_markdown_path()?)?; + let stages = array_at(&ledger, "/stage_gates")?; + + assert_dreaming_readiness_ledger_header(&ledger)?; + assert_dreaming_readiness_stage_shape(&ledger, stages)?; + assert_dreaming_readiness_baseline_counts(&ledger, stages)?; + assert_dreaming_readiness_markdown_boundaries(&markdown); + + Ok(()) +} + +fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> { + assert_eq!( + ledger.pointer("/schema").and_then(Value::as_str), + Some("elf.dreaming_readiness_stage_ledger/v1") + ); + assert_eq!(ledger.pointer("/authority").and_then(Value::as_str), Some("XY-951")); + + for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] { + assert!(array_contains_str(ledger, "/judgment_terms", term)?); + } + for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] { + assert!(array_contains_str(ledger, "/count_fields", term)?); + } + + Ok(()) +} + +fn assert_dreaming_readiness_stage_shape(ledger: &Value, stages: &[Value]) -> Result<()> { + assert_eq!(stages.len(), 8); + + for stage_id in [ + "current_vs_historical_correctness", + "preference_evolution", + "deletion_ttl_tombstone_behavior", + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness", + "scheduled_memory_task_readiness", + "final_competitor_retest_status", + ] { + find_by_field(stages, "/stage_id", stage_id)?; + } + for stage in stages { + let stage_id = + stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or(""); + + assert!( + !array_at(stage, "/baseline_commands")?.is_empty(), + "{stage_id} missing baseline commands" + ); + assert!( + !array_at(stage, "/post_stage_commands")?.is_empty(), + "{stage_id} missing post-stage commands" + ); + assert!( + !array_at(stage, "/evidence_files")?.is_empty(), + "{stage_id} missing evidence files" + ); + + for count_field in string_array_at(ledger, "/count_fields")? { + let pointer = format!("/baseline_counts/{count_field}"); + + assert!( + stage.pointer(&pointer).and_then(Value::as_u64).is_some(), + "{stage_id} missing {pointer}" + ); + } + + let judgment = stage + .pointer("/comparison_judgment") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?; + + assert!(array_contains_str(ledger, "/judgment_terms", judgment)?); + } + + Ok(()) +} + +fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -> Result<()> { + let current = find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?; + + assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(current.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(current.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); + assert!( + current + .pointer("/baseline_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five current-vs-historical jobs")) + ); + assert!( + current + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("passes all six encoded jobs")) + ); + + let preference = find_by_field(stages, "/stage_id", "preference_evolution")?; + + assert_eq!( + preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(preference.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!( + preference.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + preference.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + + let tombstone = find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?; + + assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(tombstone.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!( + tombstone.pointer("/comparison_judgment").and_then(Value::as_str), + Some("unchanged") + ); + assert!( + tombstone + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("tombstone and invalidation evidence")) + ); + + let consolidation = find_by_field(stages, "/stage_id", "reviewable_consolidation")?; + + assert_eq!( + consolidation.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!( + consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(consolidation.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + consolidation.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), + Some(0) + ); + assert!( + consolidation + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("apply/defer/discard audit") + && basis.contains("zero source mutations")) + ); + + let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; + + assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); + assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(scheduled.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(scheduled.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!( + scheduled.pointer("/post_stage_counts/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + scheduled.pointer("/post_stage_counts/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; + + assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22)); + assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11)); + assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); + assert!(array_contains_str(ledger, "/summary/improved", "current_vs_historical_correctness")?); + assert!(array_contains_str(ledger, "/summary/improved", "preference_evolution")?); + assert!(array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); + assert!(array_contains_str( + ledger, + "/summary/improved", + "memory_summary_top_of_mind_behavior" + )?); + assert!(array_contains_str(ledger, "/summary/improved", "proactive_brief_readiness")?); + assert!(array_contains_str(ledger, "/summary/improved", "scheduled_memory_task_readiness")?); + assert!(array_at(ledger, "/summary/regressed")?.is_empty()); + assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?); + assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?); + assert!(array_at(ledger, "/summary/blocked")?.is_empty()); + assert!(array_at(ledger, "/summary/not_tested")?.is_empty()); + + assert_dreaming_memory_summary_stage(stages)?; + assert_dreaming_proactive_brief_stage(stages)?; + + Ok(()) +} + +fn assert_dreaming_memory_summary_stage(stages: &[Value]) -> Result<()> { + let summary_stage = find_by_field(stages, "/stage_id", "memory_summary_top_of_mind_behavior")?; + + assert_eq!( + summary_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(summary_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(9)); + assert_eq!( + summary_stage.pointer("/post_stage_counts/not_tested").and_then(Value::as_u64), + Some(0) + ); + assert!( + summary_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("fixture-backed memory_summary job") + && basis.contains("unsupported-claim flags")) + ); + + Ok(()) +} + +fn assert_dreaming_proactive_brief_stage(stages: &[Value]) -> Result<()> { + let proactive_stage = find_by_field(stages, "/stage_id", "proactive_brief_readiness")?; + + assert_eq!( + proactive_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(proactive_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert!( + proactive_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five proactive_brief fixture jobs") + && basis.contains("typed private-corpus refresh blocker")) + ); + + Ok(()) +} + +fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { + assert!( + markdown.contains("`improved`: current-vs-historical correctness, preference evolution") + && markdown.contains("reviewable") + && markdown.contains("proactive brief") + ); + assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); + assert!(markdown.contains("XY-953 adds a direct `proactive_brief` suite")); + assert!(markdown.contains("XY-954 adds a direct `scheduled_memory` suite")); + assert!(markdown.contains( + "Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity" + )); + assert!( + markdown + .contains("Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks") + ); + assert!(markdown.contains("`regressed`: none")); + assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); + assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); + assert!(markdown.contains("XY-905")); + assert!( + markdown + .contains("Do not claim this ledger proves preference history against mem0/OpenMemory") + ); + assert!(markdown.contains("Reviewable consolidation now has ELF live service-backed")); +} + +#[test] +fn knowledge_json_report_renders_markdown_metrics() -> Result<()> { + let report = run_json_report_from(knowledge_fixture_dir())?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-knowledge-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("knowledge-report.json"); + let markdown_path = temp_dir.join("knowledge-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Knowledge Page Metrics")); + assert!(markdown.contains("Knowledge citation coverage")); + assert!(markdown.contains("Backlinks: `9` total")); + assert!(markdown.contains("Unsupported summary count")); + assert!(markdown.contains("knowledge-project-page-001")); + assert!(markdown.contains("knowledge-entity-concept-002")); + + Ok(()) +} + +#[test] +fn memory_summary_fixtures_score_reviewable_source_trace_contract() -> Result<()> { + let report = run_json_report_from(memory_summary_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/memory_summary/summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/entry_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/covered_required_category_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/rationale_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/unsupported_derived_entry_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let memory_summary = find_by_field(suites, "/suite_id", "memory_summary")?; + + assert_eq!(memory_summary.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_summary.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(job.pointer("/memory_summary/top_of_mind_count").and_then(Value::as_u64), Some(1)); + assert_eq!(job.pointer("/memory_summary/tombstone_ref_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_markdown_renders_source_trace_metrics() -> Result<()> { + let report = run_json_report_from(memory_summary_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-summary-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("memory-summary-report.json"); + let markdown_path = temp_dir.join("memory-summary-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Memory Summary Metrics")); + assert!(markdown.contains("memory-summary-source-trace-001")); + assert!(markdown.contains("Memory summary source-ref coverage")); + assert!(markdown.contains("Invalid Top-of-Mind")); + assert!(markdown.contains("Derived Unsupported")); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_stale_top_of_mind_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_summary.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_tombstoned_top_of_mind_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-tombstone-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("tombstone_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_untraced_derived_profile_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["unsupported_claim_flags"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-untraced-derived-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("untraced_derived_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/memory_summary/derived_missing_source_or_unsupported_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_unsupported_current_derived_entries() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["source_refs"] = + Value::Array(vec![Value::String("summary-contract-non-parity-boundary".to_string())]); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["rationale"] + ["decision"] = Value::String("included".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-unsupported-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("unsupported_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/unsupported_current_entry_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn memory_summary_fixture_fails_tombstone_entries_without_tombstone_refs() -> Result<()> { + let fixture_path = memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + ["tombstone_refs"] = Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-tombstone-refs-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("missing_tombstone_refs_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(0.857) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn proactive_brief_fixtures_score_source_linked_suggestions() -> Result<()> { + let report = run_json_report_from(proactive_brief_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/proactive_brief/brief_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/rejected_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/deferred_count").and_then(Value::as_u64), + Some(2) + ); + + let suites = array_at(&report, "/suites")?; + let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; + + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = array_at(&report, "/jobs")?; + let daily = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + let private = find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; + + assert_eq!(daily.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + daily.pointer("/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} + +#[test] +fn proactive_brief_markdown_renders_source_and_freshness_metrics() -> Result<()> { + let report = run_json_report_from(proactive_brief_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-proactive-brief-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("proactive-brief-report.json"); + let markdown_path = temp_dir.join("proactive-brief-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Proactive Brief Metrics")); + assert!(markdown.contains("proactive-daily-project-brief-001")); + assert!(markdown.contains("Proactive evidence-ref coverage")); + assert!(markdown.contains("Invalid Current")); + assert!(markdown.contains("Tombstone Violations")); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_unsupported_suggestions() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("daily_project_brief.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-unsupported-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("unsupported_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/proactive_brief/untraced_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_stale_decisions_presented_current() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/invalid_current_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn proactive_brief_fixture_fails_tombstone_ttl_violations() -> Result<()> { + let fixture_path = proactive_brief_fixture_dir().join("stale_plan_preference_warning.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["action"] + ["decision"] = Value::String("recommend".to_string()); + + let temp_dir = env::temp_dir().join(format!("elf-proactive-tombstone-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("tombstone_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "proactive-stale-plan-preference-warning-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/tombstone_violation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixtures_score_task_trace_gate() -> Result<()> { + let report = run_json_report_from(scheduled_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + let suites = array_at(&report, "/suites")?; + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = array_at(&report, "/jobs")?; + let weekly = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + let private = + find_by_field(jobs, "/job_id", "scheduled-private-provider-scheduler-blocked-001")?; + + assert_eq!(weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} + +#[test] +fn scheduled_memory_markdown_renders_trace_metrics() -> Result<()> { + let report = run_json_report_from(scheduled_memory_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-scheduled-memory-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("scheduled-memory-report.json"); + let markdown_path = temp_dir.join("scheduled-memory-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Scheduled Memory Metrics")); + assert!(markdown.contains("scheduled-weekly-project-status-summary-001")); + assert!(markdown.contains("Scheduled memory evidence-ref coverage")); + assert!(markdown.contains("Trace Coverage")); + assert!(markdown.contains("Source Mutations")); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_missing_execution_trace() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0] + .as_object_mut() + .ok_or_else(|| eyre::eyre!("missing scheduled task object"))? + .remove("execution_trace"); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-missing-trace-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("missing_trace.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/trace_complete_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_untraced_outputs() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-untraced-output-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("untraced_output.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/scheduled_memory/untraced_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_superseded_sources_presented_current() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + serde_json::json!(["scheduled-old-consolidation-only-decision"]); + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-superseded-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("superseded_current.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/invalid_current_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_source_mutation() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["source_mutations"] = serde_json::json!([ + { + "table": "memory_notes", + "op": "update", + "note_id": "scheduled-weekly-current-gate" + } + ]); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-source-mutation-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("source_mutation.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("lifecycle_fail")); + assert_eq!( + job.pointer("/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/lifecycle_fail").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { + let report = run_json_report_from(production_ops_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/private_corpus_redaction/private_fixture_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = array_at(&report, "/jobs")?; + let backfill = find_by_field(jobs, "/job_id", "production-ops-backfill-resume-001")?; + let restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let private_manifest = + find_by_field(jobs, "/job_id", "production-ops-private-manifest-blocked-001")?; + let credentials = find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; + let dependency = find_by_field(jobs, "/job_id", "production-ops-cold-start-dependency-001")?; + + assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} + +#[test] +fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { + let report = run_json_report_from(core_archival_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(14) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + + let suites = array_at(&report, "/suites")?; + let core = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = array_at(&report, "/jobs")?; + + for job_id in [ + "core-archival-core-block-attachment-001", + "core-archival-core-block-scope-001", + "core-archival-core-block-provenance-001", + "core-archival-stale-core-detection-001", + "core-archival-archival-fallback-001", + "core-archival-project-decision-recovery-001", + ] { + let job = find_by_field(jobs, "/job_id", job_id)?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("core_archival_memory")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + let scope = find_by_field(jobs, "/job_id", "core-archival-core-block-scope-001")?; + let decision = find_by_field(jobs, "/job_id", "core-archival-project-decision-recovery-001")?; + + assert_eq!(scope.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert!( + decision + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|content| content.contains("Letta remains blocked or not_tested")) + ); + assert!( + array_at(decision, "/produced_evidence")? + .iter() + .any(|id| id.as_str() == Some("decision-letta-export-boundary")) + ); + + Ok(()) +} + +#[test] +fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { + let report = run_json_report_from(context_trajectory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + + let suites = array_at(&report, "/suites")?; + let context = find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(context.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = array_at(&report, "/jobs")?; + let staged = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-staged-retrieval-001")?; + let hierarchy = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-hierarchy-selection-001")?; + let recursive = + find_by_field(jobs, "/job_id", "context-trajectory-openviking-recursive-expansion-001")?; + + assert_eq!(staged.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(hierarchy.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(recursive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + staged.pointer("/reason").and_then(Value::as_str).is_some_and( + |reason| reason.contains("same-corpus output returns expected evidence ids") + ) + ); + + Ok(()) +} + +fn assert_root_knowledge_summary(report: &Value) { + assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.969) + ); +} + +fn assert_root_aggregate_summary(report: &Value) { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(16)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(53)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(133) + ); + assert_eq!( + report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), + Some(133) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/job_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + + assert_root_knowledge_summary(report); + assert_root_proactive_brief_summary(report); + assert_root_scheduled_memory_summary(report); +} + +fn assert_root_proactive_brief_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/proactive_brief/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); +} + +fn assert_root_scheduled_memory_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); +} + +fn assert_root_aggregate_suites(report: &Value) -> Result<()> { + let suites = array_at(report, "/suites")?; + + for suite_id in [ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "capture_integration", + "personalization", + "consolidation", + "memory_summary", + "knowledge_compilation", + "operator_debugging_ux", + "memory_evolution", + "core_archival_memory", + ] { + let suite = find_by_field(suites, "/suite_id", suite_id)?; + + assert_eq!(suite.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); + + let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + + let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + + let core_suite = find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; + + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; + + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(context_trajectory.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + Ok(()) +} + +fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { + let jobs = array_at(report, "/jobs")?; + let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; + let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; + let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; + let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; + let delete_job = find_by_field(jobs, "/job_id", "memory-evolution-delete-ttl-001")?; + let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + let production_restore = + find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let core_fallback = find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; + let stale_core = find_by_field(jobs, "/job_id", "core-archival-stale-core-detection-001")?; + let scheduled_weekly = + find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!( + production_restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), + Some(true) + ); + assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(delete_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + delete_job.pointer("/evolution/selected_tombstone_evidence/0").and_then(Value::as_str), + Some("delete-tombstone") + ); + assert_eq!( + delete_job.pointer("/evolution/selected_invalidation_evidence/0").and_then(Value::as_str), + Some("delete-tombstone") + ); + assert_eq!(core_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(stale_core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(scheduled_weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + scheduled_weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("rerank.score") + ); + assert!(array_contains_str(stage_job, "/produced_evidence", "stage-target")?); + + Ok(()) +} + +#[test] +fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { + let report = run_json_report_from(real_world_memory_fixture_dir())?; + + assert_root_aggregate_summary(&report); + assert_root_aggregate_suites(&report)?; + assert_root_aggregate_jobs(&report)?; + + Ok(()) +} + +#[test] +fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { + let report = run_json_report_from(retrieval_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(0) + ); + + let suites = array_at(&report, "/suites")?; + let retrieval_suite = find_by_field(suites, "/suite_id", "retrieval")?; + let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + + assert_eq!(retrieval_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(retrieval_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + + let jobs = array_at(&report, "/jobs")?; + let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + + assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("rerank.score") + ); + assert_eq!( + stage_job.pointer("/retrieval_quality/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + stage_job.pointer("/retrieval_quality/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.0) + ); + + Ok(()) +} + +#[test] +fn stage_attribution_fixture_still_fails_when_decoy_is_used() -> Result<()> { + let fixture_path = retrieval_fixture_dir().join("stage_explainability_wrong_result.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/content", + Value::String( + "The trace shows the expected evidence was present in recall.candidates but demoted at rerank.score; however, the selected answer followed the stale top-k smoke-only evidence.".to_string(), + ), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([]), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["stage-decoy"]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-stage-decoy-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stage_decoy.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("rerank.score") + ); + assert_eq!( + job.pointer("/retrieval_quality/trap_context_count").and_then(Value::as_u64), + Some(1) + ); + + Ok(()) +} + +#[test] +fn retrieval_report_markdown_includes_quality_metrics() -> Result<()> { + let report = run_json_report_from(retrieval_fixture_dir())?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-retrieval-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("retrieval-report.json"); + let markdown_path = temp_dir.join("retrieval-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Expected evidence recall")); + assert!(markdown.contains("Irrelevant context ratio")); + assert!(markdown.contains("Trace Explainability")); + assert!(markdown.contains("rerank.score")); + + Ok(()) +} + +#[test] +fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<()> { + let report = run_json_report_from(evolution_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/evolution/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = array_at(&report, "/suites")?; + let memory_evolution = find_by_field(suites, "/suite_id", "memory_evolution")?; + + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + memory_evolution.pointer("/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let preference_job = find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + let relation_job = find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; + + assert_eq!( + preference_job.pointer("/evolution/history_readback_encoded").and_then(Value::as_bool), + Some(true) + ); + assert!(array_contains_str(preference_job, "/evolution/history_event_types", "add")?); + assert!(array_contains_str(preference_job, "/evolution/history_event_types", "update")?); + assert!(array_contains_str(preference_job, "/evolution/history_event_types", "ignore")?); + assert_eq!( + preference_job + .pointer("/evolution/history_requires_note_version_links") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + preference_job.pointer("/evolution/selected_current_evidence/0").and_then(Value::as_str), + Some("pref-current-concise-rationale") + ); + assert_eq!( + preference_job.pointer("/evolution/selected_historical_evidence/0").and_then(Value::as_str), + Some("pref-old-terse-bullets") + ); + assert_eq!( + preference_job.pointer("/evolution/selected_rationale_evidence/0").and_then(Value::as_str), + Some("pref-update-rationale") + ); + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_encoded").and_then(Value::as_bool), + Some(true) + ); + + let follow_ups = array_at(&report, "/follow_ups")?; + + assert!(follow_ups.is_empty()); + + Ok(()) +} + +#[test] +fn memory_evolution_conflict_still_fails_when_selected_evidence_is_not_narrated() -> Result<()> { + let fixture_path = + evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!([ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ]), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets.", + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "confidence": "high" + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale.", + "evidence_ids": ["pref-update-rationale"], + "confidence": "high" + } + ]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-conflict-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("conflict.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(job.pointer("/evolution/conflict_detection_count").and_then(Value::as_u64), Some(0)); + assert!(array_contains_str( + job, + "/evolution/selected_but_not_narrated_evidence", + "pref-old-terse-bullets" + )?); + + Ok(()) +} + +#[test] +fn memory_evolution_counts_stale_answer_when_old_fact_is_answered_as_current() -> Result<()> { + let fixture_path = + evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); + let mut fixture = serde_json::from_str::(&fs::read_to_string(fixture_path)?)?; + + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/content", + Value::String( + "Use terse bullet-only benchmark updates as the current preference.".to_string(), + ), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["pref-old-terse-bullets"]), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "current_preference", + "text": "Use terse bullet-only benchmark updates as the current preference.", + "evidence_ids": ["pref-old-terse-bullets"], + "confidence": "high" + } + ]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-stale-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_preference.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(job.pointer("/evolution/stale_answer_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn operator_debug_json_report_renders_markdown_links() -> Result<()> { + let report = run_json_report_from(operator_debug_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-job-operator-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("operator.json"); + let markdown_path = temp_dir.join("operator.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("operator-debug-dropped-evidence-001")); + assert!(markdown.contains("/viewer?trace_id=11111111-1111-4111-8111-111111111111")); + assert!(markdown.contains("Raw SQL")); + assert!(markdown.contains("Replay Candidates")); + assert!(markdown.contains("Root cause")); + + Ok(()) +} + +#[test] +fn memory_evolution_report_renders_markdown_counters() -> Result<()> { + let report = run_json_report_from(evolution_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-evolution-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("evolution-report.json"); + let markdown_path = temp_dir.join("evolution-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("## Memory Evolution")); + assert!(markdown.contains("Temporal validity not encoded: `0`")); + assert!(markdown.contains("| memory_evolution | memory-evolution-relation-temporal-001")); + assert!(markdown.contains("`encoded`")); + + Ok(()) +} + +#[test] +fn consolidation_report_renders_markdown_metrics_and_gaps() -> Result<()> { + let report = run_json_report_from(consolidation_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-consolidation-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("## Consolidation")); + assert!(markdown.contains("Source Mutations")); + assert!(markdown.contains("Proposal Unsupported Claims")); + assert!(markdown.contains("Executable Gaps")); + assert!(markdown.contains("consolidation-contradiction-report-discard-001")); + assert!(!markdown.contains("live_consolidation_worker_generation")); + + Ok(()) +} diff --git a/apps/elf-mcp/Cargo.toml b/apps/elf-mcp/Cargo.toml index 10673956..ce48eedf 100644 --- a/apps/elf-mcp/Cargo.toml +++ b/apps/elf-mcp/Cargo.toml @@ -2,18 +2,20 @@ build = "../../build.rs" edition = "2024" name = "elf-mcp" -version = "0.1.0" +version = "0.2.0" [dependencies] axum = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } -elf-cli = { path = "../../packages/elf-cli" } -elf-config = { path = "../../packages/elf-config" } reqwest = { workspace = true } -rmcp = { version = "0.13", features = ["transport-streamable-http-server"] } +rmcp = { workspace = true } serde_json = { workspace = true } tokio = { workspace = true } +uuid = { workspace = true } + +elf-cli = { workspace = true } +elf-config = { workspace = true } [build-dependencies] vergen-gitcl = { workspace = true } diff --git a/apps/elf-mcp/src/app.rs b/apps/elf-mcp/src/app.rs new file mode 100644 index 00000000..3dc073f0 --- /dev/null +++ b/apps/elf-mcp/src/app.rs @@ -0,0 +1,165 @@ +#[path = "server.rs"] mod server; + +use std::{net::SocketAddr, path::PathBuf}; + +use clap::Parser; +use color_eyre::{Result, eyre}; + +use elf_config::{McpContext, Security}; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +pub struct Args { + #[arg(long, short = 'c', value_name = "FILE")] + pub config: PathBuf, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum McpAuthState { + Off, + StaticKeys { bearer_token: String }, +} + +pub async fn run(args: Args) -> Result<()> { + let config = elf_config::load(&args.config)?; + let mcp = + config.mcp.as_ref().ok_or_else(|| eyre::eyre!("mcp section is required for elf-mcp."))?; + let auth_state = build_auth_state(&config.security, &config.service.mcp_bind, mcp)?; + + server::serve_mcp( + &config.service.mcp_bind, + &config.service.http_bind, + &config.service.admin_bind, + auth_state, + mcp, + ) + .await +} + +fn build_auth_state(security: &Security, mcp_bind: &str, mcp: &McpContext) -> Result { + match security.auth_mode.trim() { + "off" => { + enforce_loopback_for_off_mode(mcp_bind)?; + + Ok(McpAuthState::Off) + }, + "static_keys" => select_static_key(security, mcp), + other => Err(eyre::eyre!( + "security.auth_mode must be one of off or static_keys for elf-mcp, got {other}." + )), + } +} + +fn enforce_loopback_for_off_mode(mcp_bind: &str) -> Result<()> { + let bind_addr: SocketAddr = mcp_bind.parse().map_err(|err| { + eyre::eyre!( + "service.mcp_bind must be a valid socket address when security.auth_mode=off: {err}" + ) + })?; + + if !bind_addr.ip().is_loopback() { + return Err(eyre::eyre!( + "service.mcp_bind must be a loopback address when security.auth_mode=off." + )); + } + + Ok(()) +} + +fn select_static_key(security: &Security, mcp: &McpContext) -> Result { + let mut matches = security.auth_keys.iter().filter(|key| { + key.tenant_id == mcp.tenant_id + && key.project_id == mcp.project_id + && key.agent_id.as_deref() == Some(mcp.agent_id.as_str()) + && key.read_profile == mcp.read_profile + }); + let first = matches.next(); + let has_multiple = matches.next().is_some(); + + match (first, has_multiple) { + (Some(key), false) => Ok(McpAuthState::StaticKeys { bearer_token: key.token.clone() }), + (None, _) => Err(eyre::eyre!( + "security.auth_mode=static_keys requires exactly one matching entry in security.auth_keys for mcp context (tenant_id, project_id, agent_id, read_profile). Found zero." + )), + (Some(_), true) => Err(eyre::eyre!( + "security.auth_mode=static_keys requires exactly one matching entry in security.auth_keys for mcp context (tenant_id, project_id, agent_id, read_profile). Found multiple." + )), + } +} + +#[cfg(test)] +mod tests { + use crate::app::{self, McpAuthState}; + use elf_config::{McpContext, Security, SecurityAuthKey, SecurityAuthRole}; + + fn sample_security(auth_mode: &str, auth_keys: Vec) -> Security { + Security { + bind_localhost_only: true, + reject_non_english: true, + redact_secrets_on_write: true, + evidence_min_quotes: 1, + evidence_max_quotes: 5, + evidence_max_quote_chars: 400, + auth_mode: auth_mode.to_string(), + auth_keys, + } + } + + fn sample_mcp() -> McpContext { + McpContext { + tenant_id: "tenant-a".to_string(), + project_id: "project-a".to_string(), + agent_id: "agent-a".to_string(), + read_profile: "private_plus_project".to_string(), + } + } + + fn sample_key(token_id: &str, token: &str) -> SecurityAuthKey { + SecurityAuthKey { + token_id: token_id.to_string(), + token: token.to_string(), + tenant_id: "tenant-a".to_string(), + project_id: "project-a".to_string(), + agent_id: Some("agent-a".to_string()), + read_profile: "private_plus_project".to_string(), + role: SecurityAuthRole::User, + } + } + + #[test] + fn off_mode_requires_loopback_mcp_bind() { + let security = sample_security("off", vec![]); + let mcp = sample_mcp(); + let err = + app::build_auth_state(&security, "0.0.0.0:9090", &mcp).expect_err("expected error"); + + assert!(err.to_string().contains("security.auth_mode=off"), "unexpected error: {err}"); + } + + #[test] + fn static_keys_mode_selects_single_matching_key() { + let security = sample_security("static_keys", vec![sample_key("key-1", "token-1")]); + let mcp = sample_mcp(); + let auth_state = + app::build_auth_state(&security, "127.0.0.1:9090", &mcp).expect("auth state"); + + assert_eq!(auth_state, McpAuthState::StaticKeys { bearer_token: "token-1".to_string() }); + } + + #[test] + fn static_keys_mode_rejects_multiple_matching_keys() { + let security = sample_security( + "static_keys", + vec![sample_key("key-1", "token-1"), sample_key("key-2", "token-2")], + ); + let mcp = sample_mcp(); + let err = + app::build_auth_state(&security, "127.0.0.1:9090", &mcp).expect_err("expected error"); + + assert!(err.to_string().contains("Found multiple"), "unexpected error: {err}"); + } +} diff --git a/apps/elf-mcp/src/lib.rs b/apps/elf-mcp/src/lib.rs deleted file mode 100644 index 55189f79..00000000 --- a/apps/elf-mcp/src/lib.rs +++ /dev/null @@ -1,23 +0,0 @@ -pub mod server; - -// std -use std::path::PathBuf; - -// crates.io -use clap::Parser; - -#[derive(Debug, Parser)] -#[command( - version = elf_cli::VERSION, - rename_all = "kebab", - styles = elf_cli::styles(), -)] -pub struct Args { - #[arg(long, short = 'c', value_name = "FILE")] - pub config: PathBuf, -} - -pub async fn run(args: Args) -> color_eyre::Result<()> { - let config = elf_config::load(&args.config)?; - server::serve_mcp(&config.service.mcp_bind, &config.service.http_bind).await -} diff --git a/apps/elf-mcp/src/main.rs b/apps/elf-mcp/src/main.rs index 15360a47..e47d0744 100644 --- a/apps/elf-mcp/src/main.rs +++ b/apps/elf-mcp/src/main.rs @@ -1,10 +1,19 @@ -// crates.io +//! Binary entrypoint for the ELF MCP app. + +#![recursion_limit = "512"] + +mod app; + use clap::Parser; -// self -use elf_mcp::Args; +use color_eyre::Result; + +use app::Args; #[tokio::main] -async fn main() -> color_eyre::Result<()> { +async fn main() -> Result<()> { + color_eyre::install()?; + let args = Args::parse(); - elf_mcp::run(args).await + + app::run(args).await } diff --git a/apps/elf-mcp/src/server.rs b/apps/elf-mcp/src/server.rs index 0998f92d..d7c60891 100644 --- a/apps/elf-mcp/src/server.rs +++ b/apps/elf-mcp/src/server.rs @@ -1,12 +1,17 @@ -// std use std::{net::SocketAddr, sync::Arc}; -// crates.io -use axum::Router; +use axum::{ + Router, + body::Body, + extract::State, + http::{HeaderMap, Request, StatusCode}, + middleware::{self, Next}, + response::IntoResponse, +}; use color_eyre::Result; -use reqwest::Client; +use reqwest::{Client, RequestBuilder}; use rmcp::{ - ErrorData as McpError, ServerHandler, + ErrorData, ServerHandler, handler::server::router::tool::ToolRouter, model::{CallToolResult, JsonObject, ServerCapabilities, ServerInfo}, transport::streamable_http_server::{ @@ -15,30 +20,180 @@ use rmcp::{ }; use serde_json::Value; use tokio::net::TcpListener; +use uuid::Uuid; + +use crate::app::McpAuthState; +use elf_config::McpContext; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +const HEADER_TENANT_ID: &str = "X-ELF-Tenant-Id"; +const HEADER_PROJECT_ID: &str = "X-ELF-Project-Id"; +const HEADER_AGENT_ID: &str = "X-ELF-Agent-Id"; +const HEADER_READ_PROFILE: &str = "X-ELF-Read-Profile"; +const HEADER_REQUEST_ID: &str = "X-ELF-Request-Id"; +const HEADER_AUTHORIZATION: &str = "Authorization"; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum HttpMethod { Get, Post, + Put, + Patch, + Delete, +} + +#[derive(Clone)] +struct ElfContextHeaders { + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, +} +impl ElfContextHeaders { + fn new(cfg: &McpContext) -> Self { + Self { + tenant_id: cfg.tenant_id.clone(), + project_id: cfg.project_id.clone(), + agent_id: cfg.agent_id.clone(), + read_profile: cfg.read_profile.clone(), + } + } } #[derive(Clone)] struct ElfMcp { - api_base: String, + http_api_base: String, + admin_api_base: String, client: Client, + context: ElfContextHeaders, + auth_state: McpAuthState, tool_router: ToolRouter, } - impl ElfMcp { - fn new(api_base: String) -> Self { - Self { api_base, client: Client::new(), tool_router: Self::tool_router() } + fn new( + http_api_base: String, + admin_api_base: String, + context: ElfContextHeaders, + auth_state: McpAuthState, + ) -> Self { + Self { + http_api_base, + admin_api_base, + client: Client::new(), + context, + auth_state, + tool_router: Self::tool_router(), + } + } + + fn api_base_for_path(&self, path: &str) -> &str { + if is_admin_path(path) { &self.admin_api_base } else { &self.http_api_base } + } + + fn apply_context_headers( + &self, + builder: RequestBuilder, + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> RequestBuilder { + let read_profile = read_profile_override.unwrap_or(self.context.read_profile.as_str()); + let builder = builder + .header(HEADER_TENANT_ID, self.context.tenant_id.as_str()) + .header(HEADER_PROJECT_ID, self.context.project_id.as_str()) + .header(HEADER_AGENT_ID, self.context.agent_id.as_str()) + .header(HEADER_READ_PROFILE, read_profile); + let builder = builder.header(HEADER_REQUEST_ID, request_id.to_string()); + + match &self.auth_state { + McpAuthState::Off => builder, + McpAuthState::StaticKeys { bearer_token } => + builder.header(HEADER_AUTHORIZATION, format!("Bearer {bearer_token}")), + } + } + + async fn forward_post( + &self, + path: &str, + body: Value, + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> Result { + let url = format!("{}{}", self.api_base_for_path(path), path); + let response = self + .apply_context_headers( + self.client.post(url).json(&body), + read_profile_override, + request_id, + ) + .send() + .await + .map_err(|err| { + ErrorData::internal_error(format!("ELF API request failed: {err}"), None) + })?; + + handle_response(response).await + } + + async fn forward_patch( + &self, + path: &str, + body: Value, + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> Result { + let url = format!("{}{}", self.api_base_for_path(path), path); + let response = self + .apply_context_headers( + self.client.patch(url).json(&body), + read_profile_override, + request_id, + ) + .send() + .await + .map_err(|err| { + ErrorData::internal_error(format!("ELF API request failed: {err}"), None) + })?; + + handle_response(response).await } - async fn forward_post(&self, path: &str, body: Value) -> Result { - let url = format!("{}{}", self.api_base, path); - let response = self.client.post(url).json(&body).send().await.map_err(|err| { - McpError::internal_error(format!("ELF API request failed: {err}"), None) - })?; + async fn forward_put( + &self, + path: &str, + body: Value, + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> Result { + let url = format!("{}{}", self.api_base_for_path(path), path); + let response = self + .apply_context_headers( + self.client.put(url).json(&body), + read_profile_override, + request_id, + ) + .send() + .await + .map_err(|err| { + ErrorData::internal_error(format!("ELF API request failed: {err}"), None) + })?; + + handle_response(response).await + } + + async fn forward_delete( + &self, + path: &str, + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> Result { + let url = format!("{}{}", self.api_base_for_path(path), path); + let response = self + .apply_context_headers(self.client.delete(url), read_profile_override, request_id) + .send() + .await + .map_err(|err| { + ErrorData::internal_error(format!("ELF API request failed: {err}"), None) + })?; + handle_response(response).await } @@ -46,12 +201,23 @@ impl ElfMcp { &self, path: &str, params: JsonObject, - ) -> Result { - let url = format!("{}{}", self.api_base, path); + read_profile_override: Option<&str>, + request_id: Uuid, + ) -> Result { + let url = format!("{}{}", self.api_base_for_path(path), path); let query = params_to_query(params); - let response = self.client.get(url).query(&query).send().await.map_err(|err| { - McpError::internal_error(format!("ELF API request failed: {err}"), None) - })?; + let response = self + .apply_context_headers( + self.client.get(url).query(&query), + read_profile_override, + request_id, + ) + .send() + .await + .map_err(|err| { + ErrorData::internal_error(format!("ELF API request failed: {err}"), None) + })?; + handle_response(response).await } @@ -60,10 +226,24 @@ impl ElfMcp { method: HttpMethod, path: &str, params: JsonObject, - ) -> Result { + read_profile_override: Option<&str>, + ) -> Result { + let request_id = Uuid::new_v4(); + match method { - HttpMethod::Post => self.forward_post(path, Value::Object(params)).await, - HttpMethod::Get => self.forward_get(path, params).await, + HttpMethod::Post => + self.forward_post(path, Value::Object(params), read_profile_override, request_id) + .await, + HttpMethod::Get => + self.forward_get(path, params, read_profile_override, request_id).await, + HttpMethod::Put => + self.forward_put(path, Value::Object(params), read_profile_override, request_id) + .await, + HttpMethod::Patch => + self.forward_patch(path, Value::Object(params), read_profile_override, request_id) + .await, + HttpMethod::Delete => + self.forward_delete(path, read_profile_override, request_id).await, } } } @@ -71,104 +251,544 @@ impl ElfMcp { #[rmcp::tool_router] impl ElfMcp { #[rmcp::tool( - name = "memory_add_note", - description = "Add memory notes.", - input_schema = any_json_schema() - )] - async fn memory_add_note(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Post, "/v1/memory/add_note", params).await + name = "elf_notes_ingest", + description = "Ingest deterministic notes into ELF. This tool never calls an LLM.", + input_schema = notes_ingest_schema() + )] + async fn elf_notes_ingest(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Post, "/v2/notes/ingest", params, None).await + } + + #[rmcp::tool( + name = "elf_graph_query", + description = "Query graph entities and relations by structured criteria.", + input_schema = graph_query_schema() + )] + async fn elf_graph_query(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Post, "/v2/graph/query", params, None).await + } + + #[rmcp::tool( + name = "elf_events_ingest", + description = "Ingest an event by extracting evidence-bound notes using the configured LLM extractor.", + input_schema = events_ingest_schema() + )] + async fn elf_events_ingest(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Post, "/v2/events/ingest", params, None).await + } + + #[rmcp::tool( + name = "elf_docs_put", + description = "Store a document (evidence source) in ELF Doc Extension v1.", + input_schema = docs_put_schema() + )] + async fn elf_docs_put(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Post, "/v2/docs", params, None).await + } + + #[rmcp::tool( + name = "elf_docs_get", + description = "Fetch a single document's metadata by doc_id.", + input_schema = docs_get_schema() + )] + async fn elf_docs_get(&self, mut params: JsonObject) -> Result { + let doc_id = take_required_string(&mut params, "doc_id")?; + let path = format!("/v2/docs/{doc_id}"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_docs_search_l0", + description = "Run a minimal Doc search (L0): chunk-level results with short snippets.", + input_schema = docs_search_l0_schema() + )] + async fn elf_docs_search_l0( + &self, + mut params: JsonObject, + ) -> Result { + // read_profile is part of the MCP server configuration and is not client-controlled. + let _ = take_optional_string(&mut params, "read_profile")?; + + self.forward(HttpMethod::Post, "/v2/docs/search/l0", params, None).await + } + + #[rmcp::tool( + name = "elf_docs_excerpts_get", + description = "Hydrate a verifiable excerpt (L1 or L2) from a stored document.", + input_schema = docs_excerpts_get_schema() + )] + async fn elf_docs_excerpts_get(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Post, "/v2/docs/excerpts", params, None).await + } + + #[rmcp::tool( + name = "elf_core_blocks_get", + description = "Fetch core memory blocks explicitly attached to the configured agent and read profile. This is separate from archival search.", + input_schema = core_blocks_get_schema() + )] + async fn elf_core_blocks_get( + &self, + mut params: JsonObject, + ) -> Result { + // read_profile is part of the MCP server configuration and is not client-controlled. + let _ = take_optional_string(&mut params, "read_profile")?; + + self.forward(HttpMethod::Get, "/v2/core-blocks", params, None).await + } + + #[rmcp::tool( + name = "elf_searches_create", + description = "Create a search session using quick-find or planned-search mode. Response includes optional trajectory_summary for staged retrieval progress.", + input_schema = searches_create_schema() + )] + async fn elf_searches_create( + &self, + mut params: JsonObject, + ) -> Result { + // read_profile is part of the MCP server configuration and is not client-controlled. + let _ = take_optional_string(&mut params, "read_profile")?; + + self.forward(HttpMethod::Post, "/v2/searches", params, None).await + } + + #[rmcp::tool( + name = "elf_searches_get", + description = "Fetch a search session index view by search_id, including optional trajectory_summary.", + input_schema = searches_get_schema() + )] + async fn elf_searches_get(&self, mut params: JsonObject) -> Result { + let search_id = take_required_string(&mut params, "search_id")?; + let path = format!("/v2/searches/{search_id}"); + + self.forward(HttpMethod::Get, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_searches_timeline", + description = "Build a timeline view from a search session.", + input_schema = searches_timeline_schema() + )] + async fn elf_searches_timeline( + &self, + mut params: JsonObject, + ) -> Result { + let search_id = take_required_string(&mut params, "search_id")?; + let path = format!("/v2/searches/{search_id}/timeline"); + + self.forward(HttpMethod::Get, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_searches_notes", + description = "Fetch note details for selected note_ids from a search session. l0/l1 strip evidence/source_ref; l2 returns full detail.", + input_schema = searches_notes_schema() + )] + async fn elf_searches_notes( + &self, + mut params: JsonObject, + ) -> Result { + let search_id = take_required_string(&mut params, "search_id")?; + let path = format!("/v2/searches/{search_id}/notes"); + + self.forward(HttpMethod::Post, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_notes_list", + description = "List notes in a tenant and project with optional filters.", + input_schema = notes_list_schema() + )] + async fn elf_notes_list(&self, params: JsonObject) -> Result { + self.forward(HttpMethod::Get, "/v2/notes", params, None).await + } + + #[rmcp::tool( + name = "elf_notes_get", + description = "Fetch a single note by note_id.", + input_schema = notes_get_schema() + )] + async fn elf_notes_get(&self, mut params: JsonObject) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/notes/{note_id}"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_notes_patch", + description = "Patch a note by note_id. Only provided fields are updated.", + input_schema = notes_patch_schema() + )] + async fn elf_notes_patch(&self, mut params: JsonObject) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/notes/{note_id}"); + + self.forward(HttpMethod::Patch, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_notes_delete", + description = "Delete a note by note_id.", + input_schema = notes_get_schema() + )] + async fn elf_notes_delete(&self, mut params: JsonObject) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/notes/{note_id}"); + + self.forward(HttpMethod::Delete, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_notes_publish", + description = "Publish a note from agent_private into a shared space (team_shared or org_shared).", + input_schema = notes_publish_schema() + )] + async fn elf_notes_publish(&self, mut params: JsonObject) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/notes/{note_id}/publish"); + + self.forward(HttpMethod::Post, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_notes_unpublish", + description = "Unpublish a shared note back into agent_private scope.", + input_schema = notes_unpublish_schema() + )] + async fn elf_notes_unpublish( + &self, + mut params: JsonObject, + ) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/notes/{note_id}/unpublish"); + + self.forward(HttpMethod::Post, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_space_grants_list", + description = "List sharing grants for a space (team_shared or org_shared).", + input_schema = space_grants_list_schema() + )] + async fn elf_space_grants_list( + &self, + mut params: JsonObject, + ) -> Result { + let space = take_required_string(&mut params, "space")?; + let path = format!("/v2/spaces/{space}/grants"); + + self.forward(HttpMethod::Get, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_space_grant_upsert", + description = "Upsert a sharing grant for a space (team_shared or org_shared).", + input_schema = space_grant_upsert_schema() + )] + async fn elf_space_grant_upsert( + &self, + mut params: JsonObject, + ) -> Result { + let space = take_required_string(&mut params, "space")?; + let path = format!("/v2/spaces/{space}/grants"); + + self.forward(HttpMethod::Post, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_space_grant_revoke", + description = "Revoke a sharing grant for a space (team_shared or org_shared).", + input_schema = space_grant_revoke_schema() + )] + async fn elf_space_grant_revoke( + &self, + mut params: JsonObject, + ) -> Result { + let space = take_required_string(&mut params, "space")?; + let path = format!("/v2/spaces/{space}/grants/revoke"); + + self.forward(HttpMethod::Post, &path, params, None).await + } + + #[rmcp::tool( + name = "elf_admin_traces_recent_list", + description = "List recent traces by tenant/project with optional cursor and filters.", + input_schema = admin_traces_recent_list_schema() + )] + async fn elf_admin_traces_recent_list( + &self, + params: JsonObject, + ) -> Result { + self.forward(HttpMethod::Get, "/v2/admin/traces/recent", params, None).await + } + + #[rmcp::tool( + name = "elf_admin_trace_get", + description = "Fetch trace metadata, items, and optional trajectory summary by trace_id.", + input_schema = admin_trace_get_schema() + )] + async fn elf_admin_trace_get( + &self, + mut params: JsonObject, + ) -> Result { + let trace_id = take_required_string(&mut params, "trace_id")?; + let path = format!("/v2/admin/traces/{trace_id}"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_admin_trajectory_get", + description = "Fetch trace trajectory and stage payload by trace_id.", + input_schema = admin_trajectory_get_schema() + )] + async fn elf_admin_trajectory_get( + &self, + mut params: JsonObject, + ) -> Result { + let trace_id = take_required_string(&mut params, "trace_id")?; + let path = format!("/v2/admin/trajectories/{trace_id}"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_admin_trace_item_get", + description = "Fetch a trace item explain payload by item_id.", + input_schema = admin_trace_item_get_schema() + )] + async fn elf_admin_trace_item_get( + &self, + mut params: JsonObject, + ) -> Result { + let item_id = take_required_string(&mut params, "item_id")?; + let path = format!("/v2/admin/trace-items/{item_id}"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_admin_note_provenance_get", + description = "Fetch provenance bundle and related history for one note.", + input_schema = admin_note_provenance_get_schema() + )] + async fn elf_admin_note_provenance_get( + &self, + mut params: JsonObject, + ) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/admin/notes/{note_id}/provenance"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await + } + + #[rmcp::tool( + name = "elf_admin_memory_history_get", + description = "Fetch chronological memory history for one note.", + input_schema = admin_memory_history_get_schema() + )] + async fn elf_admin_memory_history_get( + &self, + mut params: JsonObject, + ) -> Result { + let note_id = take_required_string(&mut params, "note_id")?; + let path = format!("/v2/admin/notes/{note_id}/history"); + + self.forward(HttpMethod::Get, &path, JsonObject::new(), None).await } #[rmcp::tool( - name = "memory_add_event", - description = "Add memory extracted from event messages.", - input_schema = any_json_schema() - )] - async fn memory_add_event(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Post, "/v1/memory/add_event", params).await + name = "elf_admin_trace_bundle_get", + description = "Fetch trace bundle for replay and diagnostics by trace_id.", + input_schema = admin_trace_bundle_get_schema() + )] + async fn elf_admin_trace_bundle_get( + &self, + mut params: JsonObject, + ) -> Result { + let trace_id = take_required_string(&mut params, "trace_id")?; + let path = format!("/v2/admin/traces/{trace_id}/bundle"); + + self.forward(HttpMethod::Get, &path, params, None).await } #[rmcp::tool( - name = "memory_search", - description = "Search memory notes.", - input_schema = any_json_schema() - )] - async fn memory_search(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Post, "/v1/memory/search", params).await + name = "elf_admin_events_ingestion_profiles_list", + description = "List latest ingestion profiles for add_event.", + input_schema = admin_ingestion_profiles_list_schema() + )] + async fn elf_admin_events_ingestion_profiles_list( + &self, + _params: JsonObject, + ) -> Result { + self.forward( + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles", + JsonObject::new(), + None, + ) + .await + } + + #[rmcp::tool( + name = "elf_admin_events_ingestion_profiles_create", + description = "Create a new ingestion profile version for add_event.", + input_schema = admin_ingestion_profiles_create_schema() + )] + async fn elf_admin_events_ingestion_profiles_create( + &self, + params: JsonObject, + ) -> Result { + self.forward(HttpMethod::Post, "/v2/admin/events/ingestion-profiles", params, None).await } #[rmcp::tool( - name = "memory_search_explain", - description = "Explain a search result using result_handle.", - input_schema = any_json_schema() - )] - async fn memory_search_explain(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Get, "/v1/memory/search/explain", params).await + name = "elf_admin_events_ingestion_profile_get", + description = "Get a single ingestion profile by id/version for add_event.", + input_schema = admin_ingestion_profile_get_schema() + )] + async fn elf_admin_events_ingestion_profile_get( + &self, + mut params: JsonObject, + ) -> Result { + let profile_id = take_required_string(&mut params, "profile_id")?; + let path = format!("/v2/admin/events/ingestion-profiles/{profile_id}"); + + self.forward(HttpMethod::Get, &path, params, None).await } #[rmcp::tool( - name = "memory_list", - description = "List memory notes.", - input_schema = any_json_schema() - )] - async fn memory_list(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Get, "/v1/memory/list", params).await + name = "elf_admin_events_ingestion_profile_versions_list", + description = "List all versions of one ingestion profile for add_event.", + input_schema = admin_ingestion_profile_versions_list_schema() + )] + async fn elf_admin_events_ingestion_profile_versions_list( + &self, + mut params: JsonObject, + ) -> Result { + let profile_id = take_required_string(&mut params, "profile_id")?; + let path = format!("/v2/admin/events/ingestion-profiles/{profile_id}/versions"); + + self.forward(HttpMethod::Get, &path, params, None).await } #[rmcp::tool( - name = "memory_update", - description = "Update memory notes.", - input_schema = any_json_schema() - )] - async fn memory_update(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Post, "/v1/memory/update", params).await + name = "elf_admin_events_ingestion_profile_default_get", + description = "Get the active default ingestion profile for add_event.", + input_schema = admin_ingestion_profile_default_get_schema() + )] + async fn elf_admin_events_ingestion_profile_default_get( + &self, + _params: JsonObject, + ) -> Result { + self.forward( + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles/default", + JsonObject::new(), + None, + ) + .await } #[rmcp::tool( - name = "memory_delete", - description = "Delete memory notes.", - input_schema = any_json_schema() - )] - async fn memory_delete(&self, params: JsonObject) -> Result { - self.forward(HttpMethod::Post, "/v1/memory/delete", params).await + name = "elf_admin_events_ingestion_profile_default_set", + description = "Set the default ingestion profile for add_event.", + input_schema = admin_ingestion_profile_default_set_schema() + )] + async fn elf_admin_events_ingestion_profile_default_set( + &self, + params: JsonObject, + ) -> Result { + self.forward(HttpMethod::Put, "/v2/admin/events/ingestion-profiles/default", params, None) + .await } } -#[rmcp::tool_handler] +#[rmcp::tool_handler(router = self.tool_router)] impl ServerHandler for ElfMcp { fn get_info(&self) -> ServerInfo { - ServerInfo { - instructions: Some( - "ELF MCP adapter that forwards tool calls to the ELF HTTP API.".to_string(), - ), - capabilities: ServerCapabilities::builder().enable_tools().build(), - ..Default::default() - } + ServerInfo::new(ServerCapabilities::builder().enable_tools().build()) + .with_instructions("ELF MCP adapter that forwards tool calls to the ELF HTTP API.") } } -pub async fn serve_mcp(bind_addr: &str, api_base: &str) -> Result<()> { +pub async fn serve_mcp( + bind_addr: &str, + api_base: &str, + admin_base: &str, + auth_state: McpAuthState, + mcp_context: &McpContext, +) -> Result<()> { let bind_addr: SocketAddr = bind_addr.parse()?; let api_base = normalize_api_base(api_base); + let admin_base = normalize_api_base(admin_base); + let context = ElfContextHeaders::new(mcp_context); + let middleware_auth_state = auth_state.clone(); + let client_auth_state = auth_state.clone(); let session_manager: Arc = Default::default(); let service = StreamableHttpService::new( - move || Ok(ElfMcp::new(api_base.clone())), + move || { + Ok(ElfMcp::new( + api_base.clone(), + admin_base.clone(), + context.clone(), + client_auth_state.clone(), + )) + }, session_manager, StreamableHttpServerConfig::default(), ); - let router = Router::new().fallback_service(service); + let router = Router::new() + .fallback_service(service) + .layer(middleware::from_fn_with_state(middleware_auth_state, mcp_auth_middleware)); let listener = TcpListener::bind(bind_addr).await?; + axum::serve(listener, router).await?; + Ok(()) } +fn is_admin_path(path: &str) -> bool { + path.starts_with("/v2/admin/") +} + +fn is_authorized(headers: &HeaderMap, auth_state: &McpAuthState) -> bool { + match auth_state { + McpAuthState::Off => true, + McpAuthState::StaticKeys { bearer_token } => + read_bearer_token(headers).is_some_and(|token| token == bearer_token), + } +} + +fn read_bearer_token(headers: &HeaderMap) -> Option<&str> { + let raw = headers.get(HEADER_AUTHORIZATION)?; + let value = raw.to_str().ok()?.trim(); + let token = value.strip_prefix("Bearer ")?.trim(); + + if token.is_empty() { None } else { Some(token) } +} + fn normalize_api_base(raw: &str) -> String { - let trimmed = raw.trim_end_matches('/'); - if trimmed.starts_with("http://") || trimmed.starts_with("https://") { - trimmed.to_string() + let trimmed = raw.trim().trim_end_matches('/'); + let (scheme, rest) = if let Some(value) = trimmed.strip_prefix("http://") { + ("http://", value) + } else if let Some(value) = trimmed.strip_prefix("https://") { + ("https://", value) } else { - format!("http://{trimmed}") - } + ("http://", trimmed) + }; + // elf-mcp runs on the same host as elf-api. If elf-api binds to a wildcard address, use + // loopback for forwarding. + let rest = if let Some(value) = rest.strip_prefix("0.0.0.0:") { + format!("127.0.0.1:{value}") + } else if let Some(value) = rest.strip_prefix("[::]:") { + format!("127.0.0.1:{value}") + } else { + rest.to_string() + }; + + format!("{scheme}{rest}") } fn params_to_query(params: JsonObject) -> Vec<(String, String)> { @@ -182,23 +802,732 @@ fn params_to_query(params: JsonObject) -> Vec<(String, String)> { .collect() } -fn any_json_schema() -> Arc { +fn take_required_string(params: &mut JsonObject, key: &str) -> Result { + let value = params + .remove(key) + .ok_or_else(|| ErrorData::invalid_params(format!("{key} is required."), None))?; + let text = value + .as_str() + .ok_or_else(|| ErrorData::invalid_params(format!("{key} must be a string."), None))? + .trim(); + + if text.is_empty() { + return Err(ErrorData::invalid_params(format!("{key} must be non-empty."), None)); + } + + Ok(text.to_string()) +} + +fn take_optional_string(params: &mut JsonObject, key: &str) -> Result, ErrorData> { + let Some(value) = params.remove(key) else { return Ok(None) }; + let text = value + .as_str() + .ok_or_else(|| ErrorData::invalid_params(format!("{key} must be a string."), None))? + .trim(); + + if text.is_empty() { + return Err(ErrorData::invalid_params(format!("{key} must be non-empty."), None)); + } + + Ok(Some(text.to_string())) +} + +fn notes_structured_entity_schema() -> Value { + serde_json::json!({ + "type": "object", + "additionalProperties": true, + "required": ["canonical"], + "properties": { + "canonical": { "type": "string" }, + "kind": { "type": ["string", "null"] }, + "aliases": { + "type": ["array", "null"], + "items": { "type": "string" } + } + } + }) +} + +fn notes_structured_relation_object_schema() -> Value { + serde_json::json!({ + "type": "object", + "additionalProperties": true, + "oneOf": [ + { + "type": "object", + "required": ["entity"], + "properties": { + "entity": notes_structured_entity_schema(), + "value": { "type": "null" } + } + }, + { + "type": "object", + "required": ["value"], + "properties": { + "entity": { "type": ["object", "null"] }, + "value": { "type": "string" } + } + } + ] + }) +} + +fn notes_structured_schema() -> Value { + serde_json::json!({ + "type": ["object", "null"], + "additionalProperties": true, + "properties": { + "summary": { "type": ["string", "null"] }, + "facts": { + "type": ["array", "null"], + "items": { "type": "string" } + }, + "concepts": { + "type": ["array", "null"], + "items": { "type": "string" } + }, + "entities": { + "type": ["array", "null"], + "items": notes_structured_entity_schema() + }, + "relations": { + "type": ["array", "null"], + "items": { + "type": "object", + "additionalProperties": true, + "required": ["subject", "predicate", "object"], + "properties": { + "subject": notes_structured_entity_schema(), + "predicate": { "type": "string" }, + "object": notes_structured_relation_object_schema(), + "valid_from": { "type": ["string", "null"], "format": "date-time" }, + "valid_to": { "type": ["string", "null"], "format": "date-time" } + } + } + } + } + }) +} + +fn notes_ingest_schema() -> Arc { + Arc::new( + serde_json::from_value(serde_json::json!({ + "type": "object", + "additionalProperties": true, + "required": ["scope", "notes"], + "properties": { + "scope": { "type": "string" }, + "notes": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true, + "required": ["type", "text", "importance", "confidence", "source_ref"], + "properties": { + "type": { "type": "string" }, + "key": { "type": ["string", "null"] }, + "text": { "type": "string" }, + "write_policy": { "type": ["object", "null"] }, + "importance": { "type": "number" }, + "confidence": { "type": "number" }, + "ttl_days": { "type": ["integer", "null"] }, + "source_ref": { "type": "object", "additionalProperties": true }, + "structured": notes_structured_schema() + } + } + } + } + })) + .expect("notes_ingest_schema must be valid JSON object"), + ) +} + +fn graph_query_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["subject"], + "properties": { + "subject": { + "oneOf": [ + { + "type": "object", + "required": ["entity_id"], + "properties": { + "entity_id": { + "type": "string", + "format": "uuid" + } + } + }, + { + "type": "object", + "required": ["surface"], + "properties": { + "surface": { "type": "string" } + } + } + ] + }, + "predicate": { + "oneOf": [ + { + "type": "object", + "required": ["predicate_id"], + "properties": { + "predicate_id": { + "type": "string", + "format": "uuid" + } + } + }, + { + "type": "object", + "required": ["surface"], + "properties": { + "surface": { "type": "string" } + } + } + ] + }, + "scopes": { + "type": ["array", "null"], + "items": { "type": "string" } + }, + "as_of": { + "type": ["string", "null"], + "format": "date-time" + }, + "limit": { + "type": ["integer", "null"], + "minimum": 1, + "maximum": 200 + }, + "explain": { "type": ["boolean", "null"] } + } + })) +} + +fn events_ingest_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["messages"], + "properties": { + "scope": { "type": ["string", "null"] }, + "dry_run": { "type": ["boolean", "null"] }, + "ingestion_profile": { + "type": "object", + "additionalProperties": true, + "required": ["id"], + "properties": { + "id": { "type": "string" }, + "version": { "type": ["integer", "null"] }, + }, + }, + "messages": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true, + "required": ["role", "content"], + "properties": { + "role": { "type": "string" }, + "content": { "type": "string" }, + "ts": { "type": ["string", "null"] }, + "msg_id": { "type": ["string", "null"] }, + "write_policy": { "type": ["object", "null"] } + } + } + } + } + })) +} + +fn docs_put_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["scope", "content", "source_ref"], + "properties": { + "scope": { "type": "string", "enum": ["agent_private", "project_shared", "org_shared"] }, + "doc_type": { + "type": ["string", "null"], + "enum": ["knowledge", "chat", "search", "dev", null] + }, + "title": { "type": ["string", "null"] }, + "source_ref": { + "type": "object", + "additionalProperties": true, + "required": ["schema", "doc_type", "ts"], + "properties": { + "schema": { "type": "string", "enum": ["doc_source_ref/v1"] }, + "doc_type": { + "type": "string", + "enum": ["knowledge", "chat", "search", "dev"], + }, + "ts": { "type": "string", "format": "date-time" }, + "thread_id": { "type": "string" }, + "role": { "type": "string" }, + "query": { "type": "string" }, + "url": { "type": "string" }, + "domain": { "type": "string" }, + "repo": { "type": "string" }, + "commit_sha": { "type": "string" }, + "pr_number": { "type": "integer" }, + "issue_number": { "type": "integer" } + }, + "allOf": [ + { + "if": { "properties": { "doc_type": { "const": "chat" } }, "required": ["doc_type"] }, + "then": { + "required": ["thread_id", "role"] + } + }, + { + "if": { "properties": { "doc_type": { "const": "search" } }, "required": ["doc_type"] }, + "then": { + "required": ["query", "url", "domain"] + } + }, + { + "if": { "properties": { "doc_type": { "const": "dev" } }, "required": ["doc_type"] }, + "then": { + "required": ["repo"], + "oneOf": [ + { "required": ["commit_sha"] }, + { "required": ["pr_number"] }, + { "required": ["issue_number"] } + ] + } + } + ] + }, + "write_policy": { "type": ["object", "null"] }, + "content": { "type": "string" } + }, + })) +} + +fn docs_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["doc_id"], + "properties": { + "doc_id": { "type": "string" } + } + })) +} + +fn docs_search_l0_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["query"], + "properties": { + "query": { "type": "string" }, + "scope": { "type": ["string", "null"], "enum": ["agent_private", "project_shared", "org_shared", null] }, + "status": { "type": ["string", "null"], "enum": ["active", "deleted", null] }, + "doc_type": { + "type": ["string", "null"], + "enum": ["knowledge", "chat", "search", "dev", null] + }, + "agent_id": { "type": ["string", "null"] }, + "thread_id": { "type": ["string", "null"] }, + "updated_after": { "type": ["string", "null"], "format": "date-time" }, + "updated_before": { "type": ["string", "null"], "format": "date-time" }, + "ts_gte": { "type": ["string", "null"], "format": "date-time" }, + "ts_lte": { "type": ["string", "null"], "format": "date-time" }, + "top_k": { "type": ["integer", "null"] }, + "candidate_k": { "type": ["integer", "null"] }, + "sparse_mode": { + "type": ["string", "null"], + "enum": ["auto", "on", "off", null] + }, + "domain": { "type": ["string", "null"] }, + "repo": { "type": ["string", "null"] }, + "explain": { "type": ["boolean", "null"] }, + "read_profile": { "type": ["string", "null"] } + } + })) +} + +fn docs_excerpts_get_schema() -> Arc { Arc::new(rmcp::object!({ "type": "object", - "additionalProperties": true + "additionalProperties": true, + "required": ["doc_id", "level"], + "properties": { + "doc_id": { "type": "string" }, + "level": { "type": "string", "enum": ["L0", "L1", "L2"] }, + "explain": { "type": ["boolean", "null"] }, + "chunk_id": { "type": ["string", "null"] }, + "quote": { + "type": ["object", "null"], + "additionalProperties": true, + "required": ["exact"], + "properties": { + "exact": { "type": "string" }, + "prefix": { "type": ["string", "null"] }, + "suffix": { "type": ["string", "null"] } + } + }, + "position": { + "type": ["object", "null"], + "additionalProperties": true, + "required": ["start", "end"], + "properties": { + "start": { "type": "integer" }, + "end": { "type": "integer" } + } + } + } + })) +} + +fn core_blocks_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "properties": { + "read_profile": { "type": ["string", "null"] } + } + })) +} + +fn searches_create_schema() -> Arc { + let filter_schema = rmcp::object!({ + "type": "object", + "required": ["schema", "expr"], + "properties": { + "schema": { + "type": "string", + "const": "search_filter_expr/v1", + }, + "expr": { + "type": "object", + "additionalProperties": true, + }, + }, + "additionalProperties": true, + }); + + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["query", "mode"], + "properties": { + "query": { "type": "string" }, + "mode": { "type": "string", "enum": ["quick_find", "planned_search"] }, + "payload_level": { + "type": ["string", "null"], + "enum": ["l0", "l1", "l2", null] + }, + "top_k": { "type": ["integer", "null"] }, + "candidate_k": { "type": ["integer", "null"] }, + "filter": filter_schema, + "read_profile": { "type": ["string", "null"] } + } + })) +} + +fn searches_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["search_id"], + "properties": { + "search_id": { "type": "string" }, + "payload_level": { + "type": ["string", "null"], + "enum": ["l0", "l1", "l2", null] + }, + "top_k": { "type": ["integer", "null"] }, + "touch": { "type": ["boolean", "null"] } + } + })) +} + +fn searches_timeline_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["search_id"], + "properties": { + "search_id": { "type": "string" }, + "payload_level": { + "type": ["string", "null"], + "enum": ["l0", "l1", "l2", null] + }, + "group_by": { "type": ["string", "null"] } + } + })) +} + +fn searches_notes_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["search_id", "note_ids"], + "properties": { + "search_id": { "type": "string" }, + "payload_level": { + "type": ["string", "null"], + "enum": ["l0", "l1", "l2", null] + }, + "note_ids": { "type": "array", "items": { "type": "string" } }, + "record_hits": { "type": ["boolean", "null"] } + } + })) +} + +fn notes_list_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "properties": { + "scope": { "type": ["string", "null"] }, + "status": { "type": ["string", "null"] }, + "type": { "type": ["string", "null"] } + } + })) +} + +fn notes_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["note_id"], + "properties": { + "note_id": { "type": "string" } + } + })) +} + +fn notes_patch_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["note_id"], + "properties": { + "note_id": { "type": "string" }, + "text": { "type": ["string", "null"] }, + "importance": { "type": ["number", "null"] }, + "confidence": { "type": ["number", "null"] }, + "ttl_days": { "type": ["integer", "null"] } + } + })) +} + +fn notes_publish_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["note_id", "space"], + "properties": { + "note_id": { "type": "string" }, + "space": { "type": "string", "enum": ["team_shared", "org_shared"] } + } + })) +} + +fn notes_unpublish_schema() -> Arc { + notes_publish_schema() +} + +fn space_grants_list_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["space"], + "properties": { + "space": { "type": "string", "enum": ["team_shared", "org_shared"] } + } + })) +} + +fn space_grant_upsert_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["space", "grantee_kind"], + "properties": { + "space": { "type": "string", "enum": ["team_shared", "org_shared"] }, + "grantee_kind": { "type": "string", "enum": ["project", "agent"] }, + "grantee_agent_id": { "type": ["string", "null"] } + } + })) +} + +fn space_grant_revoke_schema() -> Arc { + space_grant_upsert_schema() +} + +fn admin_traces_recent_list_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": [], + "properties": { + "limit": { + "type": ["integer", "null"], + "minimum": 1, + "maximum": 200 + }, + "cursor_created_at": { "type": ["string", "null"], "format": "date-time" }, + "cursor_trace_id": { "type": ["string", "null"] }, + "agent_id": { "type": ["string", "null"] }, + "read_profile": { "type": ["string", "null"] }, + "created_after": { "type": ["string", "null"], "format": "date-time" }, + "created_before": { "type": ["string", "null"], "format": "date-time" } + } + })) +} + +fn admin_trace_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["trace_id"], + "properties": { + "trace_id": { "type": "string" } + } + })) +} + +fn admin_trajectory_get_schema() -> Arc { + admin_trace_get_schema() +} + +fn admin_trace_item_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["item_id"], + "properties": { + "item_id": { "type": "string" } + } + })) +} + +fn admin_note_provenance_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["note_id"], + "properties": { + "note_id": { "type": "string" } + } + })) +} + +fn admin_memory_history_get_schema() -> Arc { + admin_note_provenance_get_schema() +} + +fn admin_trace_bundle_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["trace_id"], + "properties": { + "trace_id": { "type": "string" }, + "mode": { "type": ["string", "null"], "enum": ["bounded", "full", null] }, + "stage_items_limit": { + "type": ["integer", "null"], + "minimum": 0, + "maximum": 256 + }, + "candidates_limit": { + "type": ["integer", "null"], + "minimum": 0, + "maximum": 1_000 + } + } + })) +} + +fn admin_ingestion_profiles_list_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": [], + "properties": {} + })) +} + +fn admin_ingestion_profiles_create_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["profile_id", "profile", "created_by"], + "properties": { + "profile_id": { "type": "string" }, + "version": { "type": ["integer", "null"] }, + "profile": { "type": "object", "additionalProperties": true }, + "created_by": { "type": "string" }, + } })) } -async fn handle_response(response: reqwest::Response) -> Result { +fn admin_ingestion_profile_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["profile_id"], + "properties": { + "profile_id": { "type": "string" }, + "version": { "type": ["integer", "null"] }, + } + })) +} + +fn admin_ingestion_profile_versions_list_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["profile_id"], + "properties": { + "profile_id": { "type": "string" } + } + })) +} + +fn admin_ingestion_profile_default_get_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": [], + "properties": {} + })) +} + +fn admin_ingestion_profile_default_set_schema() -> Arc { + Arc::new(rmcp::object!({ + "type": "object", + "additionalProperties": true, + "required": ["profile_id"], + "properties": { + "profile_id": { "type": "string" }, + "version": { "type": ["integer", "null"] }, + } + })) +} + +async fn handle_response(response: reqwest::Response) -> Result { let status = response.status(); let bytes = response .bytes() .await - .map_err(|err| McpError::internal_error(format!("ELF API response error: {err}"), None))?; + .map_err(|err| ErrorData::internal_error(format!("ELF API response error: {err}"), None))?; let parsed = serde_json::from_slice::(&bytes).unwrap_or_else(|_| { let raw = String::from_utf8_lossy(&bytes).to_string(); + serde_json::json!({ "raw": raw }) }); + if status.is_success() { Ok(CallToolResult::structured(parsed)) } else { @@ -206,12 +1535,231 @@ async fn handle_response(response: reqwest::Response) -> Result, + req: Request, + next: Next, +) -> axum::response::Response { + if !is_authorized(req.headers(), &auth_state) { + return ( + StatusCode::UNAUTHORIZED, + "Authentication required for security.auth_mode=static_keys with a Bearer token.", + ) + .into_response(); + } + + next.run(req).await +} + #[cfg(test)] mod tests { - use super::*; - use std::collections::HashMap; + use std::{ + collections::HashMap, + sync::{Arc, Mutex}, + time::Duration, + }; + + use axum::{ + Json, Router, + extract::State, + http::{HeaderMap, Method, Uri}, + routing, + }; + use serde_json::Value; + use tokio::{net::TcpListener, sync::oneshot, time}; - #[derive(Debug, Clone, Copy, PartialEq, Eq)] + use crate::app::{ + McpAuthState, + server::{ElfContextHeaders, ElfMcp, HttpMethod}, + }; + use elf_config::McpContext; + + type RequestRecorder = Arc>>>; + + const ALL_TOOL_DEFINITIONS: [ToolDefinition; 30] = [ + ToolDefinition::new( + "elf_notes_ingest", + HttpMethod::Post, + "/v2/notes/ingest", + "Ingest deterministic notes into ELF. This tool never calls an LLM.", + ), + ToolDefinition::new( + "elf_graph_query", + HttpMethod::Post, + "/v2/graph/query", + "Query graph entities and relations by structured criteria.", + ), + ToolDefinition::new( + "elf_events_ingest", + HttpMethod::Post, + "/v2/events/ingest", + "Ingest an event by extracting evidence-bound notes using the configured LLM extractor.", + ), + ToolDefinition::new( + "elf_searches_create", + HttpMethod::Post, + "/v2/searches", + "Create a search session using quick-find or planned-search mode. Response includes optional trajectory_summary.", + ), + ToolDefinition::new( + "elf_core_blocks_get", + HttpMethod::Get, + "/v2/core-blocks", + "Fetch core memory blocks explicitly attached to the configured agent and read profile.", + ), + ToolDefinition::new( + "elf_searches_get", + HttpMethod::Get, + "/v2/searches/{search_id}", + "Fetch a search session index view by search_id, including optional trajectory_summary.", + ), + ToolDefinition::new( + "elf_searches_timeline", + HttpMethod::Get, + "/v2/searches/{search_id}/timeline", + "Build a timeline view from a search session.", + ), + ToolDefinition::new( + "elf_searches_notes", + HttpMethod::Post, + "/v2/searches/{search_id}/notes", + "Fetch note details for selected note_ids from a search session. l0/l1 strip evidence/source_ref/structured; l2 returns full detail.", + ), + ToolDefinition::new( + "elf_notes_list", + HttpMethod::Get, + "/v2/notes", + "List notes in a tenant and project with optional filters.", + ), + ToolDefinition::new( + "elf_notes_get", + HttpMethod::Get, + "/v2/notes/{note_id}", + "Fetch a single note by note_id.", + ), + ToolDefinition::new( + "elf_notes_patch", + HttpMethod::Patch, + "/v2/notes/{note_id}", + "Patch a note by note_id. Only provided fields are updated.", + ), + ToolDefinition::new( + "elf_notes_delete", + HttpMethod::Delete, + "/v2/notes/{note_id}", + "Delete a note by note_id.", + ), + ToolDefinition::new( + "elf_notes_publish", + HttpMethod::Post, + "/v2/notes/{note_id}/publish", + "Publish a note from agent_private into a shared space (team_shared or org_shared).", + ), + ToolDefinition::new( + "elf_notes_unpublish", + HttpMethod::Post, + "/v2/notes/{note_id}/unpublish", + "Unpublish a shared note back into agent_private scope.", + ), + ToolDefinition::new( + "elf_space_grants_list", + HttpMethod::Get, + "/v2/spaces/{space}/grants", + "List sharing grants for a space (team_shared or org_shared).", + ), + ToolDefinition::new( + "elf_space_grant_upsert", + HttpMethod::Post, + "/v2/spaces/{space}/grants", + "Upsert a sharing grant for a space (team_shared or org_shared).", + ), + ToolDefinition::new( + "elf_space_grant_revoke", + HttpMethod::Post, + "/v2/spaces/{space}/grants/revoke", + "Revoke a sharing grant for a space (team_shared or org_shared).", + ), + ToolDefinition::new( + "elf_admin_traces_recent_list", + HttpMethod::Get, + "/v2/admin/traces/recent", + "List recent traces by tenant/project with optional cursor and filters.", + ), + ToolDefinition::new( + "elf_admin_trace_get", + HttpMethod::Get, + "/v2/admin/traces/{trace_id}", + "Fetch trace metadata, items, and optional trajectory summary by trace_id.", + ), + ToolDefinition::new( + "elf_admin_trajectory_get", + HttpMethod::Get, + "/v2/admin/trajectories/{trace_id}", + "Fetch trace trajectory and stage payload by trace_id.", + ), + ToolDefinition::new( + "elf_admin_trace_item_get", + HttpMethod::Get, + "/v2/admin/trace-items/{item_id}", + "Fetch a trace item explain payload by item_id.", + ), + ToolDefinition::new( + "elf_admin_note_provenance_get", + HttpMethod::Get, + "/v2/admin/notes/{note_id}/provenance", + "Fetch provenance bundle for a note.", + ), + ToolDefinition::new( + "elf_admin_memory_history_get", + HttpMethod::Get, + "/v2/admin/notes/{note_id}/history", + "Fetch chronological memory history for a note.", + ), + ToolDefinition::new( + "elf_admin_trace_bundle_get", + HttpMethod::Get, + "/v2/admin/traces/{trace_id}/bundle", + "Fetch trace bundle for replay and diagnostics by trace_id.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profiles_list", + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles", + "List latest ingestion profiles for add_event.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profiles_create", + HttpMethod::Post, + "/v2/admin/events/ingestion-profiles", + "Create a new ingestion profile version for add_event.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profile_get", + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles/{profile_id}", + "Get a single ingestion profile by id/version for add_event.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profile_versions_list", + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles/{profile_id}/versions", + "List all versions of one ingestion profile for add_event.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profile_default_get", + HttpMethod::Get, + "/v2/admin/events/ingestion-profiles/default", + "Get the active default ingestion profile for add_event.", + ), + ToolDefinition::new( + "elf_admin_events_ingestion_profile_default_set", + HttpMethod::Put, + "/v2/admin/events/ingestion-profiles/default", + "Set the default ingestion profile for add_event.", + ), + ]; + + #[derive(Clone, Copy, Debug, Eq, PartialEq)] struct ToolDefinition { name: &'static str, method: HttpMethod, @@ -220,6 +1768,12 @@ mod tests { streaming: bool, } + struct RecordedRequest { + method: Method, + path: String, + body: Value, + } + impl ToolDefinition { const fn new( name: &'static str, @@ -231,66 +1785,44 @@ mod tests { } } - const TOOL_MEMORY_ADD_NOTE: &str = "memory_add_note"; - const TOOL_MEMORY_ADD_EVENT: &str = "memory_add_event"; - const TOOL_MEMORY_SEARCH: &str = "memory_search"; - const TOOL_MEMORY_LIST: &str = "memory_list"; - const TOOL_MEMORY_UPDATE: &str = "memory_update"; - const TOOL_MEMORY_DELETE: &str = "memory_delete"; - fn build_tools() -> HashMap<&'static str, ToolDefinition> { - let tools = [ - ToolDefinition::new( - TOOL_MEMORY_ADD_NOTE, - HttpMethod::Post, - "/v1/memory/add_note", - "Add memory notes.", - ), - ToolDefinition::new( - TOOL_MEMORY_ADD_EVENT, - HttpMethod::Post, - "/v1/memory/add_event", - "Add memory extracted from event messages.", - ), - ToolDefinition::new( - TOOL_MEMORY_SEARCH, - HttpMethod::Post, - "/v1/memory/search", - "Search memory notes.", - ), - ToolDefinition::new( - TOOL_MEMORY_LIST, - HttpMethod::Get, - "/v1/memory/list", - "List memory notes.", - ), - ToolDefinition::new( - TOOL_MEMORY_UPDATE, - HttpMethod::Post, - "/v1/memory/update", - "Update memory notes.", - ), - ToolDefinition::new( - TOOL_MEMORY_DELETE, - HttpMethod::Post, - "/v1/memory/delete", - "Delete memory notes.", - ), - ]; - - tools.into_iter().map(|tool| (tool.name, tool)).collect() + ALL_TOOL_DEFINITIONS.into_iter().map(|tool| (tool.name, tool)).collect() } #[test] fn registers_all_tools() { let tools = build_tools(); let expected = [ - TOOL_MEMORY_ADD_NOTE, - TOOL_MEMORY_ADD_EVENT, - TOOL_MEMORY_SEARCH, - TOOL_MEMORY_LIST, - TOOL_MEMORY_UPDATE, - TOOL_MEMORY_DELETE, + "elf_notes_ingest", + "elf_graph_query", + "elf_events_ingest", + "elf_core_blocks_get", + "elf_searches_create", + "elf_searches_get", + "elf_searches_timeline", + "elf_searches_notes", + "elf_notes_list", + "elf_notes_get", + "elf_notes_patch", + "elf_notes_delete", + "elf_notes_publish", + "elf_notes_unpublish", + "elf_space_grants_list", + "elf_space_grant_upsert", + "elf_space_grant_revoke", + "elf_admin_traces_recent_list", + "elf_admin_trace_get", + "elf_admin_trajectory_get", + "elf_admin_trace_item_get", + "elf_admin_note_provenance_get", + "elf_admin_memory_history_get", + "elf_admin_trace_bundle_get", + "elf_admin_events_ingestion_profiles_list", + "elf_admin_events_ingestion_profiles_create", + "elf_admin_events_ingestion_profile_get", + "elf_admin_events_ingestion_profile_versions_list", + "elf_admin_events_ingestion_profile_default_get", + "elf_admin_events_ingestion_profile_default_set", ]; for name in expected { @@ -299,4 +1831,378 @@ mod tests { assert_eq!(tools.len(), expected.len(), "Unexpected tool count for MCP registration."); } + + #[test] + fn notes_ingest_schema_includes_structured_entities_relations() { + let schema = super::notes_ingest_schema(); + let notes = schema + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("notes ingest schema is missing properties.") + .get("notes") + .and_then(serde_json::Value::as_object) + .expect("notes schema is missing notes."); + let note_items = notes + .get("items") + .and_then(serde_json::Value::as_object) + .expect("notes schema is missing items."); + let note_properties = note_items + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("notes schema is missing note item properties."); + let structured = note_properties + .get("structured") + .and_then(serde_json::Value::as_object) + .expect("notes schema is missing structured."); + let structured_type = structured + .get("type") + .and_then(serde_json::Value::as_array) + .expect("structured.type is not an array."); + + assert!( + structured_type.contains(&serde_json::Value::String("object".to_string())) + && structured_type.contains(&serde_json::Value::String("null".to_string())) + ); + + let structured_properties = structured + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("structured schema is missing properties."); + + assert!(structured_properties.contains_key("entities")); + assert!(structured_properties.contains_key("relations")); + + let relation_object = structured_properties + .get("relations") + .and_then(serde_json::Value::as_object) + .and_then(|relations| relations.get("items")) + .and_then(serde_json::Value::as_object) + .and_then(|items| items.get("properties")) + .and_then(serde_json::Value::as_object) + .expect("relations schema is missing properties.") + .get("object") + .and_then(serde_json::Value::as_object) + .expect("relation schema is missing object."); + let one_of = relation_object + .get("oneOf") + .and_then(serde_json::Value::as_array) + .expect("relation object is missing oneOf."); + + assert_eq!(one_of.len(), 2, "relation object should have entity/value oneOf variants."); + assert!(one_of.iter().any(|variant| { + variant.as_object().is_some_and(|branch| { + branch + .get("required") + .and_then(serde_json::Value::as_array) + .is_some_and(|required| required.iter().any(|value| value == "entity")) + }) + })); + assert!(one_of.iter().any(|variant| { + variant.as_object().is_some_and(|branch| { + branch + .get("required") + .and_then(serde_json::Value::as_array) + .is_some_and(|required| required.iter().any(|value| value == "value")) + }) + })); + } + + #[test] + fn admin_paths_use_admin_api_base() { + let context = McpContext { + tenant_id: "tenant-a".to_string(), + project_id: "project-a".to_string(), + agent_id: "agent-a".to_string(), + read_profile: "private_plus_project".to_string(), + }; + let mcp = ElfMcp::new( + "http://127.0.0.1:9000".to_string(), + "http://127.0.0.1:9001".to_string(), + ElfContextHeaders::new(&context), + McpAuthState::Off, + ); + + assert_eq!(mcp.api_base_for_path("/v2/admin/traces/recent"), "http://127.0.0.1:9001"); + assert_eq!( + mcp.api_base_for_path("/v2/admin/notes/abcd/provenance"), + "http://127.0.0.1:9001" + ); + assert_eq!(mcp.api_base_for_path("/v2/admin/notes/abcd/history"), "http://127.0.0.1:9001"); + assert_eq!(mcp.api_base_for_path("/v2/searches"), "http://127.0.0.1:9000"); + } + + #[test] + fn off_mode_allows_requests_without_auth_header() { + let headers = HeaderMap::new(); + + assert!(super::is_authorized(&headers, &McpAuthState::Off)); + } + + #[test] + fn static_keys_mode_requires_authorization_bearer_header() { + let mut headers = HeaderMap::new(); + + headers + .insert(super::HEADER_AUTHORIZATION, "Bearer token-a".parse().expect("valid header")); + + assert!(super::is_authorized( + &headers, + &McpAuthState::StaticKeys { bearer_token: "token-a".to_string() } + )); + } + + #[test] + fn static_keys_mode_rejects_non_bearer_schemes() { + let mut headers = HeaderMap::new(); + + headers + .insert(super::HEADER_AUTHORIZATION, "bearer token-a".parse().expect("valid header")); + + assert!(!super::is_authorized( + &headers, + &McpAuthState::StaticKeys { bearer_token: "token-a".to_string() } + )); + } + + #[test] + fn docs_search_l0_schema_includes_filter_fields() { + let schema = super::docs_search_l0_schema(); + let properties = schema + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("docs_search_l0 schema is missing properties."); + let required = ["query"]; + let expected = [ + "scope", + "status", + "doc_type", + "agent_id", + "thread_id", + "updated_after", + "updated_before", + "ts_gte", + "ts_lte", + "sparse_mode", + "domain", + "repo", + "explain", + ]; + + for field in required { + assert!( + schema.get("required").and_then(serde_json::Value::as_array).is_some_and( + |fields| { fields.iter().any(|value| value.as_str() == Some(field)) } + ), + "Missing required field {field}." + ); + } + for field in expected { + assert!(properties.contains_key(field), "Missing schema field: {field}."); + } + + assert_eq!( + properties.get("status").and_then(serde_json::Value::as_object).and_then(|status| { + status.get("enum").and_then(serde_json::Value::as_array).map(|vals| vals.to_vec()) + }), + Some(vec![ + serde_json::Value::String("active".to_string()), + serde_json::Value::String("deleted".to_string()), + serde_json::Value::Null, + ]) + ); + assert_eq!( + properties.get("sparse_mode").and_then(serde_json::Value::as_object).and_then( + |field| { + field + .get("enum") + .and_then(serde_json::Value::as_array) + .map(|vals| vals.to_vec()) + } + ), + Some(vec![ + serde_json::Value::String("auto".to_string()), + serde_json::Value::String("on".to_string()), + serde_json::Value::String("off".to_string()), + serde_json::Value::Null, + ]) + ); + } + + #[test] + fn docs_put_schema_includes_required_fields_and_write_policy() { + let schema = super::docs_put_schema(); + let properties = schema + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("docs_put schema is missing properties."); + let required = ["scope", "content", "source_ref"]; + let expected = ["scope", "doc_type", "title", "source_ref", "write_policy", "content"]; + + for field in required { + assert!( + schema.get("required").and_then(serde_json::Value::as_array).is_some_and( + |fields| { fields.iter().any(|value| value.as_str() == Some(field)) } + ), + "Missing required field {field}." + ); + } + for field in expected { + assert!(properties.contains_key(field), "Missing schema field: {field}."); + } + + let write_policy = properties.get("write_policy").and_then(serde_json::Value::as_object); + + assert!( + write_policy.is_some_and(|field| { + field.get("type").and_then(serde_json::Value::as_array).is_some_and(|types| { + types.contains(&serde_json::Value::String("object".to_string())) + && types.contains(&serde_json::Value::String("null".to_string())) + }) + }), + "Missing write_policy object/null type in docs_put schema." + ); + } + + #[test] + fn docs_excerpts_get_schema_includes_l0_level_and_optional_explain() { + let schema = super::docs_excerpts_get_schema(); + let properties = schema + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("docs_excerpts_get schema is missing properties."); + let level_values = properties + .get("level") + .and_then(|level| level.get("enum")) + .and_then(|values| values.as_array()) + .expect("docs_excerpts_get level schema is missing enum."); + + assert!(level_values.contains(&serde_json::Value::String("L0".to_string()))); + assert!(properties.contains_key("explain")); + } + + #[test] + fn payload_level_schema_for_search_tools_is_l0_l1_l2() { + for schema in [ + super::searches_create_schema(), + super::searches_get_schema(), + super::searches_timeline_schema(), + super::searches_notes_schema(), + ] { + let properties = schema + .get("properties") + .and_then(serde_json::Value::as_object) + .expect("Search schema is missing properties."); + let payload_level = properties + .get("payload_level") + .and_then(serde_json::Value::as_object) + .expect("payload_level field is missing from search schema."); + let payload_level_values = payload_level + .get("enum") + .and_then(serde_json::Value::as_array) + .expect("payload_level enum is missing."); + + assert_eq!(payload_level_values.len(), 4, "Unexpected payload_level enum length."); + assert!(payload_level_values.iter().any(|value| value.as_str() == Some("l0"))); + assert!(payload_level_values.iter().any(|value| value.as_str() == Some("l1"))); + assert!(payload_level_values.iter().any(|value| value.as_str() == Some("l2"))); + assert!(payload_level_values.iter().any(|value| value.is_null())); + } + } + + #[test] + fn searches_notes_tool_description_mentions_payload_level_shapes() { + let tools = build_tools(); + let tool = + tools.get("elf_searches_notes").expect("Missing elf_searches_notes tool definition."); + let description = tool.description.to_lowercase(); + + assert_eq!(tool.path, "/v2/searches/{search_id}/notes"); + assert!(description.contains("l0")); + assert!(description.contains("l1")); + assert!(description.contains("l2")); + assert!(description.contains("source_ref")); + assert!(description.contains("structured")); + } + + #[tokio::test] + async fn default_ingestion_profile_set_uses_put_admin_default_path() { + let (admin_base, received) = spawn_recording_admin_server().await; + let context = McpContext { + tenant_id: "tenant-a".to_string(), + project_id: "project-a".to_string(), + agent_id: "agent-a".to_string(), + read_profile: "private_plus_project".to_string(), + }; + let mcp = ElfMcp::new( + "http://127.0.0.1:9000".to_string(), + admin_base, + ElfContextHeaders::new(&context), + McpAuthState::Off, + ); + let params = serde_json::Map::from_iter([ + ("profile_id".to_string(), Value::String("profile-a".to_string())), + ("version".to_string(), Value::Number(2.into())), + ]); + let result = mcp.elf_admin_events_ingestion_profile_default_set(params).await; + + assert!(result.is_ok(), "default setter should forward successfully: {result:?}"); + + let request = receive_recorded_request(received).await; + + assert_eq!(request.method, Method::PUT); + assert_eq!(request.path, "/v2/admin/events/ingestion-profiles/default"); + assert_eq!(request.body.get("profile_id").and_then(Value::as_str), Some("profile-a")); + assert_eq!(request.body.get("version").and_then(Value::as_i64), Some(2)); + } + + async fn spawn_recording_admin_server() -> (String, oneshot::Receiver) { + let (tx, rx) = oneshot::channel(); + let app = Router::new() + .route("/v2/admin/events/ingestion-profiles/default", routing::any(record_request)) + .with_state(Arc::new(Mutex::new(Some(tx)))); + let listener = match TcpListener::bind("127.0.0.1:0").await { + Ok(listener) => listener, + Err(err) => panic!("Failed to bind MCP recording admin server: {err}."), + }; + let addr = match listener.local_addr() { + Ok(addr) => addr, + Err(err) => panic!("Failed to read MCP recording admin server address: {err}."), + }; + + tokio::spawn(async move { + if let Err(err) = axum::serve(listener, app).await { + panic!("MCP recording admin server failed: {err}."); + } + }); + + (format!("http://{addr}"), rx) + } + + async fn record_request( + State(recorder): State, + method: Method, + uri: Uri, + Json(body): Json, + ) -> Json { + let mut sender = match recorder.lock() { + Ok(sender) => sender, + Err(err) => panic!("MCP recording admin server mutex was poisoned: {err}."), + }; + + if let Some(tx) = sender.take() { + let _ = tx.send(RecordedRequest { method, path: uri.path().to_string(), body }); + } + + Json(serde_json::json!({ "ok": true })) + } + + async fn receive_recorded_request( + received: oneshot::Receiver, + ) -> RecordedRequest { + match time::timeout(Duration::from_secs(3), received).await { + Ok(Ok(request)) => request, + Ok(Err(err)) => panic!("MCP recording admin server closed before recording: {err}."), + Err(err) => panic!("Timed out waiting for MCP recording admin server: {err}."), + } + } } diff --git a/apps/elf-worker/Cargo.toml b/apps/elf-worker/Cargo.toml index e95d8b09..12445e57 100644 --- a/apps/elf-worker/Cargo.toml +++ b/apps/elf-worker/Cargo.toml @@ -2,25 +2,28 @@ build = "../../build.rs" edition = "2024" name = "elf-worker" -version = "0.1.0" +version = "0.2.0" [dependencies] clap = { workspace = true } color-eyre = { workspace = true } -elf-chunking = { path = "../../packages/elf-chunking" } -elf-cli = { path = "../../packages/elf-cli" } -elf-config = { path = "../../packages/elf-config" } -elf-providers = { path = "../../packages/elf-providers" } -elf-storage = { path = "../../packages/elf-storage" } qdrant-client = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlx = { workspace = true } +thiserror = { workspace = true } time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } uuid = { workspace = true } +elf-chunking = { workspace = true } +elf-cli = { workspace = true } +elf-config = { workspace = true } +elf-domain = { workspace = true } +elf-providers = { workspace = true } +elf-storage = { workspace = true } + [build-dependencies] vergen-gitcl = { workspace = true } diff --git a/apps/elf-worker/src/error.rs b/apps/elf-worker/src/error.rs new file mode 100644 index 00000000..3bf0fe30 --- /dev/null +++ b/apps/elf-worker/src/error.rs @@ -0,0 +1,33 @@ +/// Worker-app result type. +pub type Result = std::result::Result; + +/// Errors returned by the ELF worker app. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Generic worker failure with a human-readable message. + #[error("{0}")] + Message(String), + /// Validation failure while preparing worker operations. + #[error("{0}")] + Validation(String), + /// SQLx query or connection failure. + #[error(transparent)] + Sqlx(#[from] sqlx::Error), + /// Storage-layer failure. + #[error(transparent)] + Storage(#[from] elf_storage::Error), + /// Tokenizer or chunking failure. + #[error(transparent)] + Tokenizer(#[from] elf_chunking::Error), + /// JSON serialization or deserialization failure. + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + /// Qdrant client failure. + #[error(transparent)] + Qdrant(#[from] Box), +} +impl From for Error { + fn from(err: qdrant_client::QdrantError) -> Self { + Self::Qdrant(Box::new(err)) + } +} diff --git a/apps/elf-worker/src/lib.rs b/apps/elf-worker/src/lib.rs index d8a57aec..d8b4bbf3 100644 --- a/apps/elf-worker/src/lib.rs +++ b/apps/elf-worker/src/lib.rs @@ -1,17 +1,26 @@ +#![allow(unused_crate_dependencies)] + +//! CLI entrypoint and shared state wiring for the ELF worker app. + pub mod worker; -// std +mod error; + +pub use error::{Error, Result}; + use std::path::PathBuf; -// crates.io use clap::Parser; -use color_eyre::eyre; use tracing_subscriber::EnvFilter; -// self use elf_chunking::ChunkingConfig; -use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_storage::{ + db::Db, + qdrant::{DOCS_SEARCH_FILTER_INDEXES, QdrantStore}, +}; +use worker::WorkerState; +/// CLI arguments for the worker binary. #[derive(Debug, Parser)] #[command( version = elf_cli::VERSION, @@ -20,33 +29,43 @@ use elf_storage::{db::Db, qdrant::QdrantStore}; )] pub struct Args { #[arg(long, short = 'c', value_name = "FILE")] + /// Path to the worker configuration file. pub config: PathBuf, } -pub async fn run(args: Args) -> color_eyre::Result<()> { - let config = elf_config::load(&args.config)?; +/// Loads configuration, initializes storage handles, and starts the worker loop. +pub async fn run(args: Args) -> Result<()> { + let config = elf_config::load(&args.config).map_err(|err| Error::Message(err.to_string()))?; let filter = EnvFilter::new(config.service.log_level.clone()); + tracing_subscriber::fmt().with_env_filter(filter).init(); let db = Db::connect(&config.storage.postgres).await?; + db.ensure_schema(config.storage.qdrant.vector_dim).await?; + let qdrant = QdrantStore::new(&config.storage.qdrant)?; - let tokenizer_repo = config - .chunking - .tokenizer_repo - .clone() - .unwrap_or_else(|| config.providers.embedding.model.clone()); - let tokenizer = - elf_chunking::load_tokenizer(&tokenizer_repo).map_err(|err| eyre::eyre!(err))?; + qdrant.ensure_collection().await?; + + let docs_qdrant = QdrantStore::new_with_collection( + &config.storage.qdrant, + &config.storage.qdrant.docs_collection, + )?; + + docs_qdrant.ensure_collection().await?; + docs_qdrant.ensure_payload_indexes(&DOCS_SEARCH_FILTER_INDEXES).await?; + + let tokenizer_repo = config.chunking.tokenizer_repo.clone(); + let tokenizer = elf_chunking::load_tokenizer(&tokenizer_repo)?; let chunking = ChunkingConfig { max_tokens: config.chunking.max_tokens, overlap_tokens: config.chunking.overlap_tokens, }; - - let state = worker::WorkerState { + let state = WorkerState { db, qdrant, + docs_qdrant, embedding: config.providers.embedding, chunking, tokenizer, diff --git a/apps/elf-worker/src/main.rs b/apps/elf-worker/src/main.rs index 0b98c076..4b449371 100644 --- a/apps/elf-worker/src/main.rs +++ b/apps/elf-worker/src/main.rs @@ -1,10 +1,19 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Binary entrypoint for the ELF worker app. + use clap::Parser; -// self +use color_eyre::Result; + use elf_worker::Args; #[tokio::main] -async fn main() -> color_eyre::Result<()> { +async fn main() -> Result<()> { + color_eyre::install()?; + let args = Args::parse(); - elf_worker::run(args).await + + elf_worker::run(args).await?; + + Ok(()) } diff --git a/apps/elf-worker/src/worker.rs b/apps/elf-worker/src/worker.rs index 5f83309a..53511239 100644 --- a/apps/elf-worker/src/worker.rs +++ b/apps/elf-worker/src/worker.rs @@ -1,48 +1,80 @@ -// std -use std::{collections::HashMap, time::Duration as StdDuration}; +//! Worker runtime and queue-processing helpers. + +use std::{collections::HashMap, slice, string::ToString}; -// crates.io -use color_eyre::{Result, eyre}; use qdrant_client::{ - client::Payload, + Payload, QdrantError, qdrant::{ - Condition, DeletePointsBuilder, Document, Filter, PointStruct, UpsertPointsBuilder, Value, - Vector, + Condition, DeletePointsBuilder, Document, Filter, PointStruct, UpsertPointsBuilder, Vector, }, }; -use serde::Serialize; -use serde_json::{Value as JsonValue, Value as SerdeValue}; -use sqlx::{QueryBuilder, Row}; -use time::{Duration, OffsetDateTime}; -use tokio::time as tokio_time; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{FromRow, PgConnection, PgExecutor, QueryBuilder}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; use uuid::Uuid; -// self +use crate::{Error, Result}; use elf_chunking::{Chunk, ChunkingConfig, Tokenizer}; +use elf_config::EmbeddingProviderConfig; +use elf_domain::consolidation::{ + CONSOLIDATION_CONTRACT_SCHEMA_V1, ConsolidationJobPayload, ConsolidationProposalContract, + ConsolidationReviewState, ConsolidationRunState, ConsolidationValidationError, +}; use elf_providers::embedding; use elf_storage::{ + consolidation::{self, ConsolidationRunStateUpdate}, db::Db, - models::{IndexingOutboxEntry, MemoryNote}, + doc_outbox, docs, + models::{ + ConsolidationProposal, ConsolidationRunJob, DocIndexingOutboxEntry, IndexingOutboxEntry, + MemoryNote, TraceOutboxJob, + }, + outbox, qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME, QdrantStore}, queries, }; +type ProjectDocRefFields = (String, Option, Option, Option); + const POLL_INTERVAL_MS: i64 = 500; const CLAIM_LEASE_SECONDS: i64 = 30; const BASE_BACKOFF_MS: i64 = 500; const MAX_BACKOFF_MS: i64 = 30_000; const TRACE_CLEANUP_INTERVAL_SECONDS: i64 = 900; const TRACE_OUTBOX_LEASE_SECONDS: i64 = 30; +const CONSOLIDATION_JOB_LEASE_SECONDS: i64 = 30; +const MAX_OUTBOX_ERROR_CHARS: usize = 1_024; + +/// Shared runtime state used by the worker loop. +pub struct WorkerState { + /// Postgres storage handle. + pub db: Db, + /// Note-index Qdrant collection handle. + pub qdrant: QdrantStore, + /// Document-index Qdrant collection handle. + pub docs_qdrant: QdrantStore, + /// Embedding provider configuration. + pub embedding: EmbeddingProviderConfig, + /// Chunking configuration for notes and docs. + pub chunking: ChunkingConfig, + /// Tokenizer used for chunking operations. + pub tokenizer: Tokenizer, +} -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Deserialize)] struct TracePayload { trace: TraceRecord, items: Vec, + #[serde(default)] + candidates: Vec, + #[serde(default)] + stages: Vec, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Deserialize)] struct TraceRecord { - trace_id: uuid::Uuid, + trace_id: Uuid, tenant_id: String, project_id: String, agent_id: String, @@ -53,84 +85,168 @@ struct TraceRecord { allowed_scopes: Vec, candidate_count: u32, top_k: u32, - config_snapshot: SerdeValue, + config_snapshot: Value, trace_version: i32, created_at: OffsetDateTime, expires_at: OffsetDateTime, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Deserialize)] struct TraceItemRecord { - item_id: uuid::Uuid, - note_id: uuid::Uuid, - #[serde(default)] - chunk_id: Option, + item_id: Uuid, + note_id: Uuid, + chunk_id: Option, rank: u32, - retrieval_score: Option, - retrieval_rank: Option, - rerank_score: f32, - tie_breaker_score: f32, final_score: f32, - boosts: Vec, - matched_terms: Vec, - matched_fields: Vec, + explain: Value, +} + +#[derive(Debug, Deserialize)] +struct TraceCandidateRecord { + candidate_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + #[serde(default)] + chunk_index: i32, + #[serde(default)] + snippet: String, + #[serde(default)] + candidate_snapshot: Value, + retrieval_rank: u32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + #[serde(default)] + note_hit_count: i64, + note_last_hit_at: Option, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, } -#[derive(Debug, serde::Deserialize, serde::Serialize)] -struct TraceBoost { - name: String, - score: f32, +#[derive(Debug, Deserialize)] +struct TraceTrajectoryStageRecord { + stage_id: Uuid, + stage_order: u32, + stage_name: String, + stage_payload: Value, + created_at: OffsetDateTime, + #[serde(default)] + items: Vec, } -struct TraceOutboxJob { - outbox_id: uuid::Uuid, - trace_id: uuid::Uuid, - payload: SerdeValue, - attempts: i32, +#[derive(Debug, Deserialize)] +struct TraceTrajectoryStageItemRecord { + id: Uuid, + item_id: Option, + note_id: Option, + chunk_id: Option, + metrics: Value, } struct TraceItemInsert { - item_id: uuid::Uuid, - note_id: uuid::Uuid, - chunk_id: Option, + item_id: Uuid, + note_id: Uuid, + chunk_id: Option, rank: i32, - retrieval_score: Option, - retrieval_rank: Option, - rerank_score: f32, - tie_breaker_score: f32, final_score: f32, - boosts: SerdeValue, - matched_terms: SerdeValue, - matched_fields: SerdeValue, + explain: Value, +} + +struct TraceCandidateInsert { + candidate_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + snippet: String, + candidate_snapshot: Value, + retrieval_rank: i32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + note_hit_count: i64, + note_last_hit_at: Option, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +struct TraceStageInsert { + stage_id: Uuid, + stage_order: i32, + stage_name: String, + stage_payload: Value, + created_at: OffsetDateTime, +} + +struct TraceStageItemInsert { + id: Uuid, + stage_id: Uuid, + item_id: Option, + note_id: Option, + chunk_id: Option, + metrics: Value, } struct ChunkRecord { - chunk_id: uuid::Uuid, + chunk_id: Uuid, chunk_index: i32, start_offset: i32, end_offset: i32, text: String, } -pub struct WorkerState { - pub db: Db, - pub qdrant: QdrantStore, - pub embedding: elf_config::EmbeddingProviderConfig, - pub chunking: ChunkingConfig, - pub tokenizer: Tokenizer, +#[derive(Debug, FromRow)] +struct NoteFieldRow { + field_id: Uuid, + text: String, +} + +#[derive(Debug, FromRow)] +struct DocChunkIndexRow { + doc_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + scope: String, + doc_type: String, + status: String, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + content_hash: String, + source_ref: Value, + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + chunk_text: String, + chunk_hash: String, } +/// Runs the worker polling loop for note, document, and trace outboxes. pub async fn run_worker(state: WorkerState) -> Result<()> { let mut last_trace_cleanup = OffsetDateTime::now_utc(); + loop { if let Err(err) = process_indexing_outbox_once(&state).await { tracing::error!(error = %err, "Indexing outbox processing failed."); } + if let Err(err) = process_doc_indexing_outbox_once(&state).await { + tracing::error!(error = %err, "Doc indexing outbox processing failed."); + } if let Err(err) = process_trace_outbox_once(&state).await { tracing::error!(error = %err, "Search trace outbox processing failed."); } + if let Err(err) = process_consolidation_run_job_once(&state).await { + tracing::error!(error = %err, "Consolidation run job processing failed."); + } + let now = OffsetDateTime::now_utc(); - if now - last_trace_cleanup >= Duration::seconds(TRACE_CLEANUP_INTERVAL_SECONDS) { + + if now - last_trace_cleanup >= time::Duration::seconds(TRACE_CLEANUP_INTERVAL_SECONDS) { + if let Err(err) = purge_expired_trace_candidates(&state.db, now).await { + tracing::error!(error = %err, "Search trace candidate cleanup failed."); + } if let Err(err) = purge_expired_traces(&state.db, now).await { tracing::error!(error = %err, "Search trace cleanup failed."); } else { @@ -139,363 +255,55 @@ pub async fn run_worker(state: WorkerState) -> Result<()> { if let Err(err) = purge_expired_cache(&state.db, now).await { tracing::error!(error = %err, "LLM cache cleanup failed."); } - } - tokio_time::sleep(to_std_duration(Duration::milliseconds(POLL_INTERVAL_MS))).await; - } -} - -async fn process_indexing_outbox_once(state: &WorkerState) -> Result<()> { - let now = OffsetDateTime::now_utc(); - let job = fetch_next_job(&state.db, now).await?; - let Some(job) = job else { - return Ok(()); - }; - - let result = match job.op.as_str() { - "UPSERT" => handle_upsert(state, &job).await, - "DELETE" => handle_delete(state, &job).await, - other => Err(eyre::eyre!("Unsupported outbox op: {other}.")), - }; - - match result { - Ok(()) => { - mark_done(&state.db, job.outbox_id).await?; - }, - Err(err) => { - mark_failed(&state.db, job.outbox_id, job.attempts, &err).await?; - tracing::error!(error = %err, outbox_id = %job.outbox_id, "Outbox job failed."); - }, - } - - Ok(()) -} - -async fn process_trace_outbox_once(state: &WorkerState) -> Result<()> { - let now = OffsetDateTime::now_utc(); - let job = fetch_next_trace_job(&state.db, now).await?; - let Some(job) = job else { - return Ok(()); - }; - - let result = handle_trace_job(&state.db, &job).await; - match result { - Ok(()) => { - mark_trace_done(&state.db, job.outbox_id).await?; - }, - Err(err) => { - mark_trace_failed(&state.db, job.outbox_id, job.attempts, &err).await?; - tracing::error!(error = %err, trace_id = %job.trace_id, "Search trace outbox job failed."); - }, - } - - Ok(()) -} - -// TODO: Add outbox fetch/update helpers in elf_storage::outbox and use them here. -async fn fetch_next_job(db: &Db, now: OffsetDateTime) -> Result> { - let mut tx = db.pool.begin().await?; - let row = sqlx::query( - "SELECT outbox_id, note_id, op, embedding_version, status, attempts, last_error, available_at, created_at, updated_at \ - FROM indexing_outbox \ - WHERE status IN ('PENDING','FAILED') AND available_at <= $1 \ - ORDER BY available_at ASC \ - LIMIT 1 \ - FOR UPDATE SKIP LOCKED", - ) - .bind(now) - .fetch_optional(&mut *tx) - .await?; - - let job = if let Some(row) = row { - let outbox_id = row.try_get("outbox_id")?; - let note_id = row.try_get("note_id")?; - let op = row.try_get("op")?; - let embedding_version = row.try_get("embedding_version")?; - let status = row.try_get("status")?; - let attempts = row.try_get("attempts")?; - let last_error = row.try_get("last_error")?; - let available_at = row.try_get("available_at")?; - let created_at = row.try_get("created_at")?; - let updated_at = row.try_get("updated_at")?; - - let lease_until = now + Duration::seconds(CLAIM_LEASE_SECONDS); - sqlx::query( - "UPDATE indexing_outbox SET available_at = $1, updated_at = $2 WHERE outbox_id = $3", - ) - .bind(lease_until) - .bind(now) - .bind(outbox_id) - .execute(&mut *tx) - .await?; - - Some(IndexingOutboxEntry { - outbox_id, - note_id, - op, - embedding_version, - status, - attempts, - last_error, - available_at, - created_at, - updated_at, - }) - } else { - None - }; - - tx.commit().await?; - Ok(job) -} - -async fn fetch_next_trace_job(db: &Db, now: OffsetDateTime) -> Result> { - let mut tx = db.pool.begin().await?; - let row = sqlx::query( - "SELECT outbox_id, trace_id, payload, attempts \ - FROM search_trace_outbox \ - WHERE status IN ('PENDING','FAILED') AND available_at <= $1 \ - ORDER BY available_at ASC \ - LIMIT 1 \ - FOR UPDATE SKIP LOCKED", - ) - .bind(now) - .fetch_optional(&mut *tx) - .await?; - - let job = if let Some(row) = row { - let outbox_id = row.try_get("outbox_id")?; - let trace_id = row.try_get("trace_id")?; - let payload = row.try_get("payload")?; - let attempts = row.try_get("attempts")?; - - let lease_until = now + Duration::seconds(TRACE_OUTBOX_LEASE_SECONDS); - sqlx::query( - "UPDATE search_trace_outbox SET available_at = $1, updated_at = $2 WHERE outbox_id = $3", - ) - .bind(lease_until) - .bind(now) - .bind(outbox_id) - .execute(&mut *tx) - .await?; - - Some(TraceOutboxJob { outbox_id, trace_id, payload, attempts }) - } else { - None - }; - - tx.commit().await?; - Ok(job) -} - -async fn handle_upsert(state: &WorkerState, job: &IndexingOutboxEntry) -> Result<()> { - let note = fetch_note(&state.db, job.note_id).await?; - let Some(note) = note else { - tracing::info!(note_id = %job.note_id, "Note missing for outbox job. Marking done."); - return Ok(()); - }; - - let now = OffsetDateTime::now_utc(); - if !note_is_active(¬e, now) { - tracing::info!(note_id = %job.note_id, "Note inactive or expired. Skipping index."); - return Ok(()); - } - - let chunks = elf_chunking::split_text(¬e.text, &state.chunking, &state.tokenizer); - if chunks.is_empty() { - return Err(eyre::eyre!("Chunking produced no chunks.")); - } - let records = build_chunk_records(note.note_id, &chunks)?; - let chunk_texts: Vec = records.iter().map(|record| record.text.clone()).collect(); - let chunk_vectors = embedding::embed(&state.embedding, &chunk_texts).await?; - if chunk_vectors.len() != records.len() { - return Err(eyre::eyre!( - "Embedding provider returned {} vectors for {} chunks.", - chunk_vectors.len(), - records.len() - )); - } - for vector in &chunk_vectors { - validate_vector_dim(vector, state.qdrant.vector_dim)?; - } - - queries::delete_note_chunks(&state.db, note.note_id).await?; - for record in &records { - queries::insert_note_chunk( - &state.db, - record.chunk_id, - note.note_id, - record.chunk_index, - record.start_offset, - record.end_offset, - &record.text, - &job.embedding_version, - ) - .await?; - } - for (record, vector) in records.iter().zip(chunk_vectors.iter()) { - let vec_text = format_vector_text(vector); - queries::insert_note_chunk_embedding( - &state.db, - record.chunk_id, - &job.embedding_version, - vector.len() as i32, - &vec_text, - ) - .await?; - } - - let pooled = - mean_pool(&chunk_vectors).ok_or_else(|| eyre::eyre!("Cannot pool empty chunk vectors."))?; - validate_vector_dim(&pooled, state.qdrant.vector_dim)?; - insert_embedding(&state.db, note.note_id, &job.embedding_version, pooled.len() as i32, &pooled) - .await?; - delete_qdrant_note_points(state, note.note_id).await?; - upsert_qdrant_chunks(state, ¬e, &job.embedding_version, &records, &chunk_vectors).await?; - Ok(()) -} - -async fn handle_delete(state: &WorkerState, job: &IndexingOutboxEntry) -> Result<()> { - delete_qdrant_note_points(state, job.note_id).await?; - Ok(()) -} - -async fn handle_trace_job(db: &Db, job: &TraceOutboxJob) -> Result<()> { - let payload: TracePayload = serde_json::from_value(job.payload.clone())?; - let trace = payload.trace; - let trace_id = trace.trace_id; - let mut tx = db.pool.begin().await?; - - sqlx::query( - "INSERT INTO search_traces \ - (trace_id, tenant_id, project_id, agent_id, read_profile, query, expansion_mode, \ - expanded_queries, allowed_scopes, candidate_count, top_k, config_snapshot, \ - trace_version, created_at, expires_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15) \ - ON CONFLICT (trace_id) DO NOTHING", - ) - .bind(trace_id) - .bind(&trace.tenant_id) - .bind(&trace.project_id) - .bind(&trace.agent_id) - .bind(&trace.read_profile) - .bind(&trace.query) - .bind(&trace.expansion_mode) - .bind(encode_json(&trace.expanded_queries, "expanded_queries")?) - .bind(encode_json(&trace.allowed_scopes, "allowed_scopes")?) - .bind(trace.candidate_count as i32) - .bind(trace.top_k as i32) - .bind(trace.config_snapshot.clone()) - .bind(trace.trace_version) - .bind(trace.created_at) - .bind(trace.expires_at) - .execute(&mut *tx) - .await?; - - if !payload.items.is_empty() { - let mut inserts = Vec::with_capacity(payload.items.len()); - for item in payload.items { - inserts.push(TraceItemInsert { - item_id: item.item_id, - note_id: item.note_id, - chunk_id: item.chunk_id, - rank: item.rank as i32, - retrieval_score: item.retrieval_score, - retrieval_rank: item.retrieval_rank.map(|rank| rank as i32), - rerank_score: item.rerank_score, - tie_breaker_score: item.tie_breaker_score, - final_score: item.final_score, - boosts: encode_json(&item.boosts, "boosts")?, - matched_terms: encode_json(&item.matched_terms, "matched_terms")?, - matched_fields: encode_json(&item.matched_fields, "matched_fields")?, - }); + if let Err(err) = purge_expired_search_sessions(&state.db, now).await { + tracing::error!(error = %err, "Search session cleanup failed."); + } } - let mut builder = QueryBuilder::new( - "INSERT INTO search_trace_items \ - (item_id, trace_id, note_id, chunk_id, rank, retrieval_score, retrieval_rank, rerank_score, \ - tie_breaker_score, final_score, boosts, matched_terms, matched_fields) ", - ); - builder.push_values(inserts, |mut b, item| { - b.push_bind(item.item_id) - .push_bind(trace_id) - .push_bind(item.note_id) - .push_bind(item.chunk_id) - .push_bind(item.rank) - .push_bind(item.retrieval_score) - .push_bind(item.retrieval_rank) - .push_bind(item.rerank_score) - .push_bind(item.tie_breaker_score) - .push_bind(item.final_score) - .push_bind(item.boosts) - .push_bind(item.matched_terms) - .push_bind(item.matched_fields); - }); - builder.push(" ON CONFLICT (item_id) DO NOTHING"); - builder.build().execute(&mut *tx).await?; + tokio::time::sleep(to_std_duration(time::Duration::milliseconds(POLL_INTERVAL_MS))).await; } - - tx.commit().await?; - Ok(()) } -async fn purge_expired_traces(db: &Db, now: OffsetDateTime) -> Result<()> { - let result = sqlx::query("DELETE FROM search_traces WHERE expires_at <= $1") - .bind(now) - .execute(&db.pool) - .await?; - if result.rows_affected() > 0 { - tracing::info!(count = result.rows_affected(), "Purged expired search traces."); - } - Ok(()) -} +/// Processes at most one due job from each worker-owned queue. +pub async fn process_once(state: &WorkerState) -> Result<()> { + process_indexing_outbox_once(state).await?; + process_doc_indexing_outbox_once(state).await?; + process_trace_outbox_once(state).await?; + process_consolidation_run_job_once(state).await?; -async fn purge_expired_cache(db: &Db, now: OffsetDateTime) -> Result<()> { - let result = sqlx::query("DELETE FROM llm_cache WHERE expires_at <= $1") - .bind(now) - .execute(&db.pool) - .await?; - if result.rows_affected() > 0 { - tracing::info!(count = result.rows_affected(), "Purged expired LLM cache entries."); - } Ok(()) } -fn is_not_found_error(err: &qdrant_client::QdrantError) -> bool { +fn is_not_found_error(err: &QdrantError) -> bool { let message = err.to_string().to_lowercase(); let point_not_found = (message.contains("not found") || message.contains("404")) && message.contains("point"); let no_point_found = message.contains("no point") && message.contains("found"); - point_not_found || no_point_found -} -async fn fetch_note(db: &Db, note_id: uuid::Uuid) -> Result> { - let note = sqlx::query_as::<_, MemoryNote>( - "SELECT note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at \ - FROM memory_notes WHERE note_id = $1", - ) - .bind(note_id) - .fetch_optional(&db.pool) - .await?; - Ok(note) + point_not_found || no_point_found } fn note_is_active(note: &MemoryNote, now: OffsetDateTime) -> bool { if !note.status.eq_ignore_ascii_case("active") { return false; } + if let Some(expires_at) = note.expires_at && expires_at <= now { return false; } + true } -fn build_chunk_records(note_id: uuid::Uuid, chunks: &[Chunk]) -> Result> { +fn build_chunk_records(note_id: Uuid, chunks: &[Chunk]) -> Result> { let mut records = Vec::with_capacity(chunks.len()); + for chunk in chunks { let start_offset = to_i32(chunk.start_offset, "start_offset")?; let end_offset = to_i32(chunk.end_offset, "end_offset")?; + records.push(ChunkRecord { chunk_id: chunk_id_for(note_id, chunk.chunk_index), chunk_index: chunk.chunk_index, @@ -504,25 +312,30 @@ fn build_chunk_records(note_id: uuid::Uuid, chunks: &[Chunk]) -> Result uuid::Uuid { +fn chunk_id_for(note_id: Uuid, chunk_index: i32) -> Uuid { let name = format!("{note_id}:{chunk_index}"); + Uuid::new_v5(&Uuid::NAMESPACE_OID, name.as_bytes()) } fn to_i32(value: usize, label: &str) -> Result { - i32::try_from(value) - .map_err(|_| eyre::eyre!("Chunk {label} offset {value} exceeds supported range.")) + i32::try_from(value).map_err(|_| { + Error::Validation(format!("Chunk {label} offset {value} exceeds supported range.")) + }) } fn mean_pool(chunks: &[Vec]) -> Option> { if chunks.is_empty() { return None; } + let dim = chunks[0].len(); let mut out = vec![0.0_f32; dim]; + for vec in chunks { for (idx, value) in vec.iter().enumerate() { out[idx] += value; @@ -531,45 +344,1154 @@ fn mean_pool(chunks: &[Vec]) -> Option> { for value in &mut out { *value /= chunks.len() as f32; } + Some(out) } -async fn insert_embedding( - db: &Db, - note_id: uuid::Uuid, - embedding_version: &str, - embedding_dim: i32, - vec: &[f32], -) -> Result<()> { - let vec_text = format_vector_text(vec); - sqlx::query( - "INSERT INTO note_embeddings (note_id, embedding_version, embedding_dim, vec) \ - VALUES ($1, $2, $3, $4::vector) \ - ON CONFLICT (note_id, embedding_version) DO UPDATE \ - SET embedding_dim = EXCLUDED.embedding_dim, vec = EXCLUDED.vec, created_at = now()", - ) - .bind(note_id) - .bind(embedding_version) - .bind(embedding_dim) - .bind(vec_text) - .execute(&db.pool) - .await?; - Ok(()) +fn format_timestamp(ts: OffsetDateTime) -> Result { + ts.format(&Rfc3339).map_err(|_| Error::Message("Failed to format timestamp.".to_string())) } -async fn delete_qdrant_note_points(state: &WorkerState, note_id: uuid::Uuid) -> Result<()> { - let filter = Filter::must([Condition::matches("note_id", note_id.to_string())]); - let delete = - DeletePointsBuilder::new(state.qdrant.collection.clone()).points(filter).wait(true); - match state.qdrant.client.delete_points(delete).await { - Ok(_) => {}, - Err(err) => - if is_not_found_error(&err) { - tracing::info!(note_id = %note_id, "Qdrant points missing during delete."); - } else { - return Err(eyre::eyre!(err.to_string())); - }, +fn validate_vector_dim(vec: &[f32], expected_dim: u32) -> Result<()> { + if vec.len() != expected_dim as usize { + return Err(Error::Validation(format!( + "Embedding dimension {} does not match configured vector_dim {}.", + vec.len(), + expected_dim + ))); } + + Ok(()) +} + +fn format_vector_text(vec: &[f32]) -> String { + let mut out = String::from("["); + + for (idx, value) in vec.iter().enumerate() { + if idx > 0 { + out.push(','); + } + + out.push_str(&value.to_string()); + } + + out.push(']'); + + out +} + +fn encode_json(value: &T, label: &str) -> Result +where + T: Serialize, +{ + serde_json::to_value(value) + .map_err(|err| Error::Message(format!("Failed to encode {label}: {err}."))) +} + +fn sanitize_outbox_error(text: &str) -> String { + let mut parts = Vec::new(); + let mut redact_next = false; + + for raw in text.split_whitespace() { + let mut word = raw.to_string(); + + if redact_next { + word = "[REDACTED]".to_string(); + redact_next = false; + } + if raw.eq_ignore_ascii_case("bearer") { + redact_next = true; + } + + let lowered = raw.to_ascii_lowercase(); + + for key in ["api_key", "apikey", "password", "secret", "token"] { + if lowered.contains(key) && (lowered.contains('=') || lowered.contains(':')) { + let sep = if raw.contains('=') { '=' } else { ':' }; + let prefix = match raw.split(sep).next() { + Some(prefix) => prefix, + None => raw, + }; + + word = format!("{prefix}{sep}[REDACTED]"); + + break; + } + } + + parts.push(word); + } + + let mut out = parts.join(" "); + + if out.chars().count() > MAX_OUTBOX_ERROR_CHARS { + out = out.chars().take(MAX_OUTBOX_ERROR_CHARS).collect(); + + out.push_str("..."); + } + + out +} + +fn backoff_for_attempt(attempt: i32) -> time::Duration { + let attempts = attempt.max(1) as u32; + let exp = attempts.saturating_sub(1).min(6); + let base = BASE_BACKOFF_MS.saturating_mul(1 << exp); + let capped = base.min(MAX_BACKOFF_MS); + + time::Duration::milliseconds(capped) +} + +fn to_std_duration(duration: time::Duration) -> std::time::Duration { + let millis = duration.whole_milliseconds(); + + if millis <= 0 { + return std::time::Duration::from_millis(0); + } + + std::time::Duration::from_millis(millis as u64) +} + +fn project_doc_ref_fields( + source_ref: &Value, + fallback_timestamp: OffsetDateTime, + doc_type: &str, +) -> Result { + let source_ref_field = |field_name: &str| -> Option { + source_ref + .get(field_name) + .and_then(Value::as_str) + .filter(|value| !value.is_empty()) + .map(ToString::to_string) + }; + let doc_ts = match source_ref + .get("ts") + .and_then(Value::as_str) + .filter(|value| OffsetDateTime::parse(value, &Rfc3339).is_ok()) + .map(ToString::to_string) + .or_else(|| { + source_ref + .get("doc_ts") + .and_then(Value::as_str) + .filter(|value| OffsetDateTime::parse(value, &Rfc3339).is_ok()) + .map(ToString::to_string) + }) { + Some(value) => value, + None => format_timestamp(fallback_timestamp)?, + }; + let thread_id = if doc_type == "chat" { source_ref_field("thread_id") } else { None }; + let domain = if doc_type == "search" { source_ref_field("domain") } else { None }; + let repo = if doc_type == "dev" { source_ref_field("repo") } else { None }; + + Ok((doc_ts, thread_id, domain, repo)) +} + +fn proposal_row_from_contract( + job: &ConsolidationRunJob, + now: OffsetDateTime, + proposal: ConsolidationProposalContract, +) -> Result { + proposal.validate().map_err(consolidation_validation_error)?; + + Ok(ConsolidationProposal { + proposal_id: Uuid::new_v4(), + run_id: job.run_id, + tenant_id: job.tenant_id.clone(), + project_id: job.project_id.clone(), + agent_id: job.agent_id.clone(), + contract_schema: CONSOLIDATION_CONTRACT_SCHEMA_V1.to_string(), + proposal_kind: proposal.proposal_kind, + apply_intent: proposal.apply_intent.as_str().to_string(), + review_state: ConsolidationReviewState::Proposed.as_str().to_string(), + source_refs: encode_json(&proposal.source_refs, "consolidation source_refs")?, + source_snapshot: proposal.source_snapshot, + lineage: encode_json(&proposal.lineage, "consolidation lineage")?, + diff: encode_json(&proposal.diff, "consolidation diff")?, + confidence: proposal.confidence, + unsupported_claim_flags: encode_json( + &proposal.unsupported_claim_flags, + "consolidation unsupported_claim_flags", + )?, + contradiction_markers: encode_json( + &proposal.markers.contradictions, + "consolidation contradiction_markers", + )?, + staleness_markers: encode_json( + &proposal.markers.staleness, + "consolidation staleness_markers", + )?, + target_ref: proposal.target_ref, + proposed_payload: proposal.proposed_payload, + reviewer_agent_id: None, + review_comment: None, + reviewed_at: None, + created_at: now, + updated_at: now, + }) +} + +fn consolidation_validation_error(err: ConsolidationValidationError) -> Error { + Error::Validation(err.to_string()) +} + +async fn process_indexing_outbox_once(state: &WorkerState) -> Result<()> { + let now = OffsetDateTime::now_utc(); + let job = outbox::claim_next_indexing_outbox_job(&state.db, now, CLAIM_LEASE_SECONDS).await?; + let Some(job) = job else { return Ok(()) }; + let result = match job.op.as_str() { + "UPSERT" => handle_upsert(state, &job).await, + "DELETE" => handle_delete(state, &job).await, + other => Err(Error::Validation(format!("Unsupported outbox op: {other}."))), + }; + + match result { + Ok(()) => { + outbox::mark_indexing_outbox_done(&state.db, job.outbox_id, OffsetDateTime::now_utc()) + .await?; + }, + Err(err) => { + tracing::error!( + error = %err, + outbox_id = %job.outbox_id, + note_id = %job.note_id, + "Outbox job failed." + ); + + mark_failed(&state.db, job.outbox_id, job.attempts, &err).await?; + }, + } + + Ok(()) +} + +async fn process_doc_indexing_outbox_once(state: &WorkerState) -> Result<()> { + let now = OffsetDateTime::now_utc(); + let job = + doc_outbox::claim_next_doc_indexing_outbox_job(&state.db, now, CLAIM_LEASE_SECONDS).await?; + let Some(job) = job else { return Ok(()) }; + let result = match job.op.as_str() { + "UPSERT" => handle_doc_upsert(state, &job).await, + "DELETE" => handle_doc_delete(state, &job).await, + other => Err(Error::Validation(format!("Unsupported doc outbox op: {other}."))), + }; + + match result { + Ok(()) => { + doc_outbox::mark_doc_indexing_outbox_done( + &state.db, + job.outbox_id, + OffsetDateTime::now_utc(), + ) + .await?; + }, + Err(err) => { + tracing::error!( + error = %err, + outbox_id = %job.outbox_id, + doc_id = %job.doc_id, + chunk_id = %job.chunk_id, + "Doc outbox job failed." + ); + + mark_doc_failed(&state.db, job.outbox_id, job.attempts, &err).await?; + }, + } + + Ok(()) +} + +async fn process_trace_outbox_once(state: &WorkerState) -> Result<()> { + let now = OffsetDateTime::now_utc(); + let job = + outbox::claim_next_trace_outbox_job(&state.db, now, TRACE_OUTBOX_LEASE_SECONDS).await?; + let Some(job) = job else { return Ok(()) }; + let result = handle_trace_job(&state.db, &job).await; + + match result { + Ok(()) => { + outbox::mark_trace_outbox_done(&state.db, job.outbox_id, OffsetDateTime::now_utc()) + .await?; + }, + Err(err) => { + tracing::error!( + error = %err, + outbox_id = %job.outbox_id, + trace_id = %job.trace_id, + "Search trace outbox job failed." + ); + + mark_trace_failed(&state.db, job.outbox_id, job.attempts, &err).await?; + }, + } + + Ok(()) +} + +async fn process_consolidation_run_job_once(state: &WorkerState) -> Result<()> { + let now = OffsetDateTime::now_utc(); + let job = consolidation::claim_next_consolidation_run_job( + &state.db, + now, + CONSOLIDATION_JOB_LEASE_SECONDS, + ) + .await?; + let Some(job) = job else { return Ok(()) }; + let result = handle_consolidation_job(&state.db, &job).await; + + match result { + Ok(()) => {}, + Err(err) => { + tracing::error!( + error = %err, + job_id = %job.job_id, + run_id = %job.run_id, + "Consolidation run job failed." + ); + + mark_consolidation_failed(&state.db, job.job_id, job.attempts, &err).await?; + }, + } + + Ok(()) +} + +async fn handle_upsert(state: &WorkerState, job: &IndexingOutboxEntry) -> Result<()> { + let note = fetch_note(&state.db, job.note_id).await?; + let Some(note) = note else { + tracing::info!( + outbox_id = %job.outbox_id, + note_id = %job.note_id, + "Note missing for outbox job. Marking done." + ); + + return Ok(()); + }; + let now = OffsetDateTime::now_utc(); + + if !note_is_active(¬e, now) { + tracing::info!( + outbox_id = %job.outbox_id, + note_id = %job.note_id, + "Note inactive or expired. Skipping index." + ); + + return Ok(()); + } + + let fields = fetch_note_fields(&state.db, note.note_id).await?; + let chunks = elf_chunking::split_text(¬e.text, &state.chunking, &state.tokenizer); + + if chunks.is_empty() { + return Err(Error::Validation("Chunking produced no chunks.".to_string())); + } + + let records = build_chunk_records(note.note_id, &chunks)?; + let chunk_texts: Vec = records.iter().map(|record| record.text.clone()).collect(); + let field_texts: Vec = fields.iter().map(|field| field.text.clone()).collect(); + let mut embed_inputs = Vec::with_capacity(chunk_texts.len() + field_texts.len()); + + embed_inputs.extend(chunk_texts); + embed_inputs.extend(field_texts); + + let vectors = embedding::embed(&state.embedding, &embed_inputs) + .await + .map_err(|err| Error::Message(err.to_string()))?; + + if vectors.len() != records.len() + fields.len() { + return Err(Error::Validation(format!( + "Embedding provider returned {} vectors for {} items.", + vectors.len(), + records.len() + fields.len() + ))); + } + + let (chunk_vectors, field_vectors) = vectors.split_at(records.len()); + + for vector in chunk_vectors.iter().chain(field_vectors.iter()) { + validate_vector_dim(vector, state.qdrant.vector_dim)?; + } + + { + let mut tx = state.db.pool.begin().await?; + + queries::delete_note_chunks(&mut *tx, note.note_id).await?; + + for record in &records { + queries::insert_note_chunk( + &mut *tx, + record.chunk_id, + note.note_id, + record.chunk_index, + record.start_offset, + record.end_offset, + record.text.as_str(), + &job.embedding_version, + ) + .await?; + } + for (record, vector) in records.iter().zip(chunk_vectors.iter()) { + let vec_text = format_vector_text(vector); + + queries::insert_note_chunk_embedding( + &mut *tx, + record.chunk_id, + &job.embedding_version, + vector.len() as i32, + vec_text.as_str(), + ) + .await?; + } + + let pooled = mean_pool(chunk_vectors) + .ok_or_else(|| Error::Message("Cannot pool empty chunk vectors.".to_string()))?; + + validate_vector_dim(&pooled, state.qdrant.vector_dim)?; + insert_embedding_tx( + &mut *tx, + note.note_id, + &job.embedding_version, + pooled.len() as i32, + &pooled, + ) + .await?; + + for (field, vector) in fields.iter().zip(field_vectors.iter()) { + insert_note_field_embedding_tx( + &mut *tx, + field.field_id, + &job.embedding_version, + vector.len() as i32, + vector, + ) + .await?; + } + + tx.commit().await?; + } + + delete_qdrant_note_points(state, note.note_id).await?; + upsert_qdrant_chunks(state, ¬e, &job.embedding_version, &records, chunk_vectors).await?; + + Ok(()) +} + +async fn handle_delete(state: &WorkerState, job: &IndexingOutboxEntry) -> Result<()> { + delete_qdrant_note_points(state, job.note_id).await?; + + Ok(()) +} + +async fn fetch_doc_chunk_index_row(db: &Db, chunk_id: Uuid) -> Result> { + let row = sqlx::query_as::<_, DocChunkIndexRow>( + r#" +SELECT + d.doc_id, + d.tenant_id, + d.project_id, + d.agent_id, + d.scope, + d.doc_type, + d.status, + d.created_at, + d.updated_at, + d.content_hash, + COALESCE(d.source_ref, '{}'::jsonb) AS source_ref, + c.chunk_id, + c.chunk_index, + c.start_offset, + c.end_offset, + c.chunk_text, + c.chunk_hash +FROM doc_chunks c +JOIN doc_documents d ON d.doc_id = c.doc_id +WHERE c.chunk_id = $1 +LIMIT 1"#, + ) + .bind(chunk_id) + .fetch_optional(&db.pool) + .await?; + + Ok(row) +} + +async fn handle_doc_upsert(state: &WorkerState, job: &DocIndexingOutboxEntry) -> Result<()> { + let row = fetch_doc_chunk_index_row(&state.db, job.chunk_id).await?; + let Some(row) = row else { + tracing::info!( + outbox_id = %job.outbox_id, + doc_id = %job.doc_id, + chunk_id = %job.chunk_id, + "Doc chunk missing for outbox job. Marking done." + ); + + return Ok(()); + }; + + if !row.status.eq_ignore_ascii_case("active") { + tracing::info!( + outbox_id = %job.outbox_id, + doc_id = %row.doc_id, + chunk_id = %row.chunk_id, + "Doc inactive. Skipping index." + ); + + return Ok(()); + } + + let vectors = embedding::embed(&state.embedding, slice::from_ref(&row.chunk_text)) + .await + .map_err(|err| Error::Message(err.to_string()))?; + let vector = vectors + .first() + .ok_or_else(|| Error::Validation("Embedding provider returned no vectors.".to_string()))?; + + validate_vector_dim(vector, state.docs_qdrant.vector_dim)?; + + { + let vec_text = format_vector_text(vector); + let mut tx = state.db.pool.begin().await?; + + docs::insert_doc_chunk_embedding( + &mut *tx, + row.chunk_id, + &job.embedding_version, + vector.len() as i32, + vec_text.as_str(), + ) + .await?; + + tx.commit().await?; + } + + upsert_qdrant_doc_chunk(state, &row, &job.embedding_version, vector).await?; + + Ok(()) +} + +async fn handle_doc_delete(state: &WorkerState, job: &DocIndexingOutboxEntry) -> Result<()> { + let filter = Filter::must([Condition::matches("chunk_id", job.chunk_id.to_string())]); + let delete = + DeletePointsBuilder::new(state.docs_qdrant.collection.clone()).points(filter).wait(true); + + state.docs_qdrant.client.delete_points(delete).await?; + + Ok(()) +} + +async fn upsert_qdrant_doc_chunk( + state: &WorkerState, + row: &DocChunkIndexRow, + embedding_version: &str, + vec: &[f32], +) -> Result<()> { + let (doc_ts, thread_id, domain, repo) = + project_doc_ref_fields(&row.source_ref, row.created_at, row.doc_type.as_str())?; + let mut payload = Payload::new(); + + payload.insert("doc_id", row.doc_id.to_string()); + payload.insert("chunk_id", row.chunk_id.to_string()); + payload.insert("chunk_index", row.chunk_index as i64); + payload.insert("start_offset", row.start_offset as i64); + payload.insert("end_offset", row.end_offset as i64); + payload.insert("tenant_id", row.tenant_id.clone()); + payload.insert("project_id", row.project_id.clone()); + payload.insert("agent_id", row.agent_id.clone()); + payload.insert("scope", row.scope.clone()); + payload.insert("doc_type", row.doc_type.clone()); + payload.insert("status", row.status.clone()); + + let updated_at = format_timestamp(row.updated_at)?; + + payload.insert("updated_at", Value::String(updated_at)); + payload.insert("doc_ts", Value::String(doc_ts)); + + if let Some(value) = thread_id { + payload.insert("thread_id", Value::String(value)); + } + if let Some(value) = domain { + payload.insert("domain", Value::String(value)); + } + if let Some(value) = repo { + payload.insert("repo", Value::String(value)); + } + + payload.insert("embedding_version", embedding_version.to_string()); + payload.insert("content_hash", row.content_hash.clone()); + payload.insert("chunk_hash", row.chunk_hash.clone()); + + let mut vector_map = HashMap::new(); + + vector_map.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec.to_vec())); + vector_map.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(row.chunk_text.clone(), BM25_MODEL)), + ); + + let point = PointStruct::new(row.chunk_id.to_string(), vector_map, payload); + let upsert = + UpsertPointsBuilder::new(state.docs_qdrant.collection.clone(), vec![point]).wait(true); + + state.docs_qdrant.client.upsert_points(upsert).await?; + + Ok(()) +} + +async fn handle_trace_job(db: &Db, job: &TraceOutboxJob) -> Result<()> { + let payload: TracePayload = serde_json::from_value(job.payload.clone())?; + let TracePayload { trace, items, candidates, stages } = payload; + let trace_id = trace.trace_id; + let expanded_queries_json = encode_json(&trace.expanded_queries, "expanded_queries")?; + let allowed_scopes_json = encode_json(&trace.allowed_scopes, "allowed_scopes")?; + let mut tx = db.pool.begin().await?; + + insert_trace_tx(&mut *tx, trace_id, &trace, expanded_queries_json, allowed_scopes_json).await?; + insert_trace_items_tx(&mut *tx, trace_id, items).await?; + insert_trace_stages_tx(&mut tx, trace_id, stages).await?; + insert_trace_candidates_tx(&mut *tx, trace_id, candidates).await?; + + tx.commit().await?; + + Ok(()) +} + +async fn handle_consolidation_job(db: &Db, job: &ConsolidationRunJob) -> Result<()> { + let payload: ConsolidationJobPayload = serde_json::from_value(job.payload.clone())?; + + payload.validate().map_err(consolidation_validation_error)?; + + let existing = consolidation::get_consolidation_run( + &db.pool, + job.tenant_id.as_str(), + job.project_id.as_str(), + job.run_id, + ) + .await? + .ok_or_else(|| Error::Validation("Consolidation run does not exist.".to_string()))?; + let current_state = + ConsolidationRunState::parse(existing.status.as_str()).ok_or_else(|| { + Error::Validation("Stored consolidation run status is invalid.".to_string()) + })?; + let now = OffsetDateTime::now_utc(); + let mut tx = db.pool.begin().await?; + + match current_state { + ConsolidationRunState::Pending => { + current_state + .validate_transition(ConsolidationRunState::Running) + .map_err(consolidation_validation_error)?; + + let empty_error = Value::Object(Default::default()); + + consolidation::update_consolidation_run_state( + &mut *tx, + ConsolidationRunStateUpdate { + tenant_id: job.tenant_id.as_str(), + project_id: job.project_id.as_str(), + run_id: job.run_id, + status: ConsolidationRunState::Running.as_str(), + error: &empty_error, + now, + }, + ) + .await? + .ok_or_else(|| Error::Validation("Consolidation run disappeared.".to_string()))?; + }, + ConsolidationRunState::Running => {}, + ConsolidationRunState::Completed + | ConsolidationRunState::Failed + | ConsolidationRunState::Cancelled => { + consolidation::mark_consolidation_run_job_done(&mut *tx, job.job_id, now).await?; + + tx.commit().await?; + + return Ok(()); + }, + } + + for proposal in payload.proposals { + let row = proposal_row_from_contract(job, now, proposal)?; + + consolidation::insert_consolidation_proposal(&mut *tx, &row).await?; + } + + ConsolidationRunState::Running + .validate_transition(ConsolidationRunState::Completed) + .map_err(consolidation_validation_error)?; + + let empty_error = Value::Object(Default::default()); + + consolidation::update_consolidation_run_state( + &mut *tx, + ConsolidationRunStateUpdate { + tenant_id: job.tenant_id.as_str(), + project_id: job.project_id.as_str(), + run_id: job.run_id, + status: ConsolidationRunState::Completed.as_str(), + error: &empty_error, + now, + }, + ) + .await? + .ok_or_else(|| Error::Validation("Consolidation run disappeared.".to_string()))?; + consolidation::mark_consolidation_run_job_done(&mut *tx, job.job_id, now).await?; + + tx.commit().await?; + + Ok(()) +} + +async fn insert_trace_stages_tx( + executor: &mut PgConnection, + trace_id: Uuid, + stages: Vec, +) -> Result<()> { + if stages.is_empty() { + return Ok(()); + } + + let mut stage_inserts = Vec::with_capacity(stages.len()); + let mut item_inserts = Vec::new(); + + for stage in stages { + stage_inserts.push(TraceStageInsert { + stage_id: stage.stage_id, + stage_order: stage.stage_order as i32, + stage_name: stage.stage_name, + stage_payload: stage.stage_payload, + created_at: stage.created_at, + }); + + for item in stage.items { + item_inserts.push(TraceStageItemInsert { + id: item.id, + stage_id: stage.stage_id, + item_id: item.item_id, + note_id: item.note_id, + chunk_id: item.chunk_id, + metrics: item.metrics, + }); + } + } + + let mut stage_builder = QueryBuilder::new( + "\ + INSERT INTO search_trace_stages ( + stage_id, + trace_id, + stage_order, + stage_name, + stage_payload, + created_at + ) ", + ); + + stage_builder.push_values(stage_inserts, |mut b, stage| { + b.push_bind(stage.stage_id) + .push_bind(trace_id) + .push_bind(stage.stage_order) + .push_bind(stage.stage_name) + .push_bind(stage.stage_payload) + .push_bind(stage.created_at); + }); + stage_builder.push(" ON CONFLICT (stage_id) DO NOTHING"); + stage_builder.build().execute(&mut *executor).await?; + + if item_inserts.is_empty() { + return Ok(()); + } + + let mut item_builder = QueryBuilder::new( + "\ + INSERT INTO search_trace_stage_items ( + id, + stage_id, + item_id, + note_id, + chunk_id, + metrics + ) ", + ); + + item_builder.push_values(item_inserts, |mut b, item| { + b.push_bind(item.id) + .push_bind(item.stage_id) + .push_bind(item.item_id) + .push_bind(item.note_id) + .push_bind(item.chunk_id) + .push_bind(item.metrics); + }); + item_builder.push(" ON CONFLICT (id) DO NOTHING"); + item_builder.build().execute(executor).await?; + + Ok(()) +} + +async fn insert_trace_tx<'e, E>( + executor: E, + trace_id: Uuid, + trace: &TraceRecord, + expanded_queries_json: Value, + allowed_scopes_json: Value, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "INSERT INTO search_traces ( + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at, + expires_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15 +) +ON CONFLICT (trace_id) DO NOTHING", + ) + .bind(trace_id) + .bind(trace.tenant_id.as_str()) + .bind(trace.project_id.as_str()) + .bind(trace.agent_id.as_str()) + .bind(trace.read_profile.as_str()) + .bind(trace.query.as_str()) + .bind(trace.expansion_mode.as_str()) + .bind(expanded_queries_json) + .bind(allowed_scopes_json) + .bind(trace.candidate_count as i32) + .bind(trace.top_k as i32) + .bind(trace.config_snapshot.clone()) + .bind(trace.trace_version) + .bind(trace.created_at) + .bind(trace.expires_at) + .execute(executor) + .await?; + + Ok(()) +} + +async fn insert_trace_items_tx<'e, E>( + executor: E, + trace_id: Uuid, + items: Vec, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + if items.is_empty() { + return Ok(()); + } + + let mut inserts = Vec::with_capacity(items.len()); + + for item in items { + inserts.push(TraceItemInsert { + item_id: item.item_id, + note_id: item.note_id, + chunk_id: item.chunk_id, + rank: item.rank as i32, + final_score: item.final_score, + explain: item.explain, + }); + } + + let mut builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_items ( + item_id, + trace_id, + note_id, + chunk_id, + rank, + final_score, + explain +) ", + ); + + builder.push_values(inserts, |mut b, item| { + b.push_bind(item.item_id) + .push_bind(trace_id) + .push_bind(item.note_id) + .push_bind(item.chunk_id) + .push_bind(item.rank) + .push_bind(item.final_score) + .push_bind(item.explain); + }); + builder.push(" ON CONFLICT (item_id) DO NOTHING"); + builder.build().execute(executor).await?; + + Ok(()) +} + +async fn insert_trace_candidates_tx<'e, E>( + executor: E, + trace_id: Uuid, + candidates: Vec, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + if candidates.is_empty() { + return Ok(()); + } + + let mut inserts = Vec::with_capacity(candidates.len()); + + for candidate in candidates { + inserts.push(TraceCandidateInsert { + candidate_id: candidate.candidate_id, + note_id: candidate.note_id, + chunk_id: candidate.chunk_id, + chunk_index: candidate.chunk_index, + snippet: candidate.snippet, + candidate_snapshot: candidate.candidate_snapshot, + retrieval_rank: candidate.retrieval_rank as i32, + rerank_score: candidate.rerank_score, + note_scope: candidate.note_scope, + note_importance: candidate.note_importance, + note_updated_at: candidate.note_updated_at, + note_hit_count: candidate.note_hit_count, + note_last_hit_at: candidate.note_last_hit_at, + created_at: candidate.created_at, + expires_at: candidate.expires_at, + }); + } + + let mut builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_candidates ( + candidate_id, + trace_id, + note_id, + chunk_id, + chunk_index, + snippet, + candidate_snapshot, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at, + created_at, + expires_at +) ", + ); + + builder.push_values(inserts, |mut b, candidate| { + b.push_bind(candidate.candidate_id) + .push_bind(trace_id) + .push_bind(candidate.note_id) + .push_bind(candidate.chunk_id) + .push_bind(candidate.chunk_index) + .push_bind(candidate.snippet) + .push_bind(candidate.candidate_snapshot) + .push_bind(candidate.retrieval_rank) + .push_bind(candidate.rerank_score) + .push_bind(candidate.note_scope) + .push_bind(candidate.note_importance) + .push_bind(candidate.note_updated_at) + .push_bind(candidate.note_hit_count) + .push_bind(candidate.note_last_hit_at) + .push_bind(candidate.created_at) + .push_bind(candidate.expires_at); + }); + builder.push(" ON CONFLICT (candidate_id) DO NOTHING"); + builder.build().execute(executor).await?; + + Ok(()) +} + +async fn purge_expired_trace_candidates(db: &Db, now: OffsetDateTime) -> Result<()> { + let result = sqlx::query("DELETE FROM search_trace_candidates WHERE expires_at <= $1") + .bind(now) + .execute(&db.pool) + .await?; + + if result.rows_affected() > 0 { + tracing::info!(count = result.rows_affected(), "Purged expired search trace candidates."); + } + + Ok(()) +} + +async fn purge_expired_traces(db: &Db, now: OffsetDateTime) -> Result<()> { + let result = sqlx::query("DELETE FROM search_traces WHERE expires_at <= $1") + .bind(now) + .execute(&db.pool) + .await?; + + if result.rows_affected() > 0 { + tracing::info!(count = result.rows_affected(), "Purged expired search traces."); + } + + Ok(()) +} + +async fn purge_expired_cache(db: &Db, now: OffsetDateTime) -> Result<()> { + let result = sqlx::query("DELETE FROM llm_cache WHERE expires_at <= $1") + .bind(now) + .execute(&db.pool) + .await?; + + if result.rows_affected() > 0 { + tracing::info!(count = result.rows_affected(), "Purged expired LLM cache entries."); + } + + Ok(()) +} + +async fn purge_expired_search_sessions(db: &Db, now: OffsetDateTime) -> Result<()> { + let result = sqlx::query("DELETE FROM search_sessions WHERE expires_at <= $1") + .bind(now) + .execute(&db.pool) + .await?; + + if result.rows_affected() > 0 { + tracing::info!(count = result.rows_affected(), "Purged expired search sessions."); + } + + Ok(()) +} + +async fn fetch_note(db: &Db, note_id: Uuid) -> Result> { + let note = sqlx::query_as::<_, MemoryNote>("SELECT * FROM memory_notes WHERE note_id = $1") + .bind(note_id) + .fetch_optional(&db.pool) + .await?; + + Ok(note) +} + +async fn fetch_note_fields(db: &Db, note_id: Uuid) -> Result> { + let rows = sqlx::query_as::<_, NoteFieldRow>( + "\ +SELECT field_id, text +FROM memory_note_fields +WHERE note_id = $1 +ORDER BY field_kind ASC, item_index ASC", + ) + .bind(note_id) + .fetch_all(&db.pool) + .await?; + + Ok(rows) +} + +async fn insert_embedding_tx<'e, E>( + executor: E, + note_id: Uuid, + embedding_version: &str, + embedding_dim: i32, + vec: &[f32], +) -> Result<()> +where + E: PgExecutor<'e>, +{ + let vec_text = format_vector_text(vec); + + sqlx::query( + "\ +INSERT INTO note_embeddings ( + note_id, + embedding_version, + embedding_dim, + vec +) +VALUES ($1, $2, $3, $4::text::vector) +ON CONFLICT (note_id, embedding_version) DO UPDATE +SET + embedding_dim = EXCLUDED.embedding_dim, + vec = EXCLUDED.vec, + created_at = now()", + ) + .bind(note_id) + .bind(embedding_version) + .bind(embedding_dim) + .bind(vec_text.as_str()) + .execute(executor) + .await?; + + Ok(()) +} + +async fn insert_note_field_embedding_tx<'e, E>( + executor: E, + field_id: Uuid, + embedding_version: &str, + embedding_dim: i32, + vec: &[f32], +) -> Result<()> +where + E: PgExecutor<'e>, +{ + let vec_text = format_vector_text(vec); + + sqlx::query( + "\ +INSERT INTO note_field_embeddings ( + field_id, + embedding_version, + embedding_dim, + vec +) +VALUES ($1, $2, $3, $4::text::vector) +ON CONFLICT (field_id, embedding_version) DO UPDATE +SET + embedding_dim = EXCLUDED.embedding_dim, + vec = EXCLUDED.vec, + created_at = now()", + ) + .bind(field_id) + .bind(embedding_version) + .bind(embedding_dim) + .bind(vec_text.as_str()) + .execute(executor) + .await?; + + Ok(()) +} + +async fn delete_qdrant_note_points(state: &WorkerState, note_id: Uuid) -> Result<()> { + let filter = Filter::must([Condition::matches("note_id", note_id.to_string())]); + let delete = + DeletePointsBuilder::new(state.qdrant.collection.clone()).points(filter).wait(true); + + match state.qdrant.client.delete_points(delete).await { + Ok(_) => {}, + Err(err) => + if is_not_found_error(&err) { + tracing::info!(note_id = %note_id, "Qdrant points missing during delete."); + } else { + return Err(err.into()); + }, + } + Ok(()) } @@ -581,191 +1503,257 @@ async fn upsert_qdrant_chunks( vectors: &[Vec], ) -> Result<()> { let mut points = Vec::with_capacity(records.len()); + for (record, vec) in records.iter().zip(vectors.iter()) { - let mut payload_map = HashMap::new(); - payload_map.insert("note_id".to_string(), Value::from(note.note_id.to_string())); - payload_map.insert("chunk_id".to_string(), Value::from(record.chunk_id.to_string())); - payload_map.insert("chunk_index".to_string(), Value::from(record.chunk_index as i64)); - payload_map.insert("start_offset".to_string(), Value::from(record.start_offset as i64)); - payload_map.insert("end_offset".to_string(), Value::from(record.end_offset as i64)); - payload_map.insert("tenant_id".to_string(), Value::from(note.tenant_id.clone())); - payload_map.insert("project_id".to_string(), Value::from(note.project_id.clone())); - payload_map.insert("agent_id".to_string(), Value::from(note.agent_id.clone())); - payload_map.insert("scope".to_string(), Value::from(note.scope.clone())); - payload_map.insert("status".to_string(), Value::from(note.status.clone())); - payload_map.insert("type".to_string(), Value::from(note.r#type.clone())); - payload_map.insert( - "key".to_string(), - note.key - .as_ref() - .map(|key| Value::from(key.clone())) - .unwrap_or_else(|| Value::from(JsonValue::Null)), - ); - payload_map.insert( - "updated_at".to_string(), - Value::from(JsonValue::String(format_timestamp(note.updated_at)?)), - ); - payload_map.insert( - "expires_at".to_string(), - Value::from(match note.expires_at { - Some(ts) => JsonValue::String(format_timestamp(ts)?), - None => JsonValue::Null, - }), + let mut payload = Payload::new(); + + payload.insert("note_id", note.note_id.to_string()); + payload.insert("chunk_id", record.chunk_id.to_string()); + payload.insert("chunk_index", record.chunk_index as i64); + payload.insert("start_offset", record.start_offset as i64); + payload.insert("end_offset", record.end_offset as i64); + payload.insert("tenant_id", note.tenant_id.clone()); + payload.insert("project_id", note.project_id.clone()); + payload.insert("agent_id", note.agent_id.clone()); + payload.insert("scope", note.scope.clone()); + payload.insert("status", note.status.clone()); + payload.insert("type", note.r#type.clone()); + + match note.key.as_ref() { + Some(key) => payload.insert("key", key.clone()), + None => payload.insert("key", Value::Null), + } + + payload.insert("updated_at", Value::String(format_timestamp(note.updated_at)?)); + payload.insert( + "expires_at", + match note.expires_at { + Some(ts) => Value::String(format_timestamp(ts)?), + None => Value::Null, + }, ); - payload_map - .insert("importance".to_string(), Value::from(JsonValue::from(note.importance as f64))); - payload_map - .insert("confidence".to_string(), Value::from(JsonValue::from(note.confidence as f64))); - payload_map - .insert("embedding_version".to_string(), Value::from(embedding_version.to_string())); - - let payload = Payload::from(payload_map); + payload.insert("importance", Value::from(note.importance as f64)); + payload.insert("confidence", Value::from(note.confidence as f64)); + payload.insert("embedding_version", embedding_version.to_string()); + let mut vector_map = HashMap::new(); + vector_map.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec.to_vec())); vector_map.insert( BM25_VECTOR_NAME.to_string(), Vector::from(Document::new(record.text.clone(), BM25_MODEL)), ); + let point = PointStruct::new(record.chunk_id.to_string(), vector_map, payload); + points.push(point); } let upsert = UpsertPointsBuilder::new(state.qdrant.collection.clone(), points).wait(true); - state.qdrant.client.upsert_points(upsert).await?; - Ok(()) -} -fn format_timestamp(ts: OffsetDateTime) -> Result { - use time::format_description::well_known::Rfc3339; - ts.format(&Rfc3339).map_err(|_| eyre::eyre!("Failed to format timestamp.")) -} + state.qdrant.client.upsert_points(upsert).await?; -fn validate_vector_dim(vec: &[f32], expected_dim: u32) -> Result<()> { - if vec.len() != expected_dim as usize { - return Err(eyre::eyre!( - "Embedding dimension {} does not match configured vector_dim {}.", - vec.len(), - expected_dim - )); - } Ok(()) } -fn format_vector_text(vec: &[f32]) -> String { - let mut out = String::from("["); - for (idx, value) in vec.iter().enumerate() { - if idx > 0 { - out.push(','); - } - out.push_str(&value.to_string()); - } - out.push(']'); - out -} - -fn encode_json(value: &T, label: &str) -> Result -where - T: Serialize, -{ - serde_json::to_value(value).map_err(|err| eyre::eyre!("Failed to encode {label}: {err}.")) -} - -async fn mark_done(db: &Db, outbox_id: uuid::Uuid) -> Result<()> { +async fn mark_failed(db: &Db, outbox_id: Uuid, attempts: i32, err: &Error) -> Result<()> { + let next_attempts = attempts.saturating_add(1); + let backoff = backoff_for_attempt(next_attempts); let now = OffsetDateTime::now_utc(); - sqlx::query("UPDATE indexing_outbox SET status = 'DONE', updated_at = $1 WHERE outbox_id = $2") - .bind(now) - .bind(outbox_id) - .execute(&db.pool) - .await?; + let available_at = now + backoff; + let error_text = sanitize_outbox_error(&err.to_string()); + + outbox::mark_indexing_outbox_failed( + db, + outbox_id, + next_attempts, + error_text.as_str(), + available_at, + now, + ) + .await?; + Ok(()) } -async fn mark_trace_done(db: &Db, outbox_id: uuid::Uuid) -> Result<()> { +async fn mark_doc_failed(db: &Db, outbox_id: Uuid, attempts: i32, err: &Error) -> Result<()> { + let next_attempts = attempts.saturating_add(1); + let backoff = backoff_for_attempt(next_attempts); let now = OffsetDateTime::now_utc(); - sqlx::query( - "UPDATE search_trace_outbox SET status = 'DONE', updated_at = $1 WHERE outbox_id = $2", + let available_at = now + backoff; + let error_text = sanitize_outbox_error(&err.to_string()); + + doc_outbox::mark_doc_indexing_outbox_failed( + db, + outbox_id, + next_attempts, + error_text.as_str(), + available_at, + now, ) - .bind(now) - .bind(outbox_id) - .execute(&db.pool) .await?; + Ok(()) } -async fn mark_failed( - db: &Db, - outbox_id: uuid::Uuid, - attempts: i32, - err: &color_eyre::Report, -) -> Result<()> { +async fn mark_trace_failed(db: &Db, outbox_id: Uuid, attempts: i32, err: &Error) -> Result<()> { let next_attempts = attempts.saturating_add(1); let backoff = backoff_for_attempt(next_attempts); let now = OffsetDateTime::now_utc(); let available_at = now + backoff; - sqlx::query( - "UPDATE indexing_outbox \ - SET status = 'FAILED', attempts = $1, last_error = $2, available_at = $3, updated_at = $4 \ - WHERE outbox_id = $5", + let error_text = sanitize_outbox_error(&err.to_string()); + + outbox::mark_trace_outbox_failed( + db, + outbox_id, + next_attempts, + error_text.as_str(), + available_at, + now, ) - .bind(next_attempts) - .bind(err.to_string()) - .bind(available_at) - .bind(now) - .bind(outbox_id) - .execute(&db.pool) .await?; + Ok(()) } -async fn mark_trace_failed( +async fn mark_consolidation_failed( db: &Db, - outbox_id: uuid::Uuid, + job_id: Uuid, attempts: i32, - err: &color_eyre::Report, + err: &Error, ) -> Result<()> { let next_attempts = attempts.saturating_add(1); let backoff = backoff_for_attempt(next_attempts); let now = OffsetDateTime::now_utc(); let available_at = now + backoff; - sqlx::query( - "UPDATE search_trace_outbox \ - SET status = 'FAILED', attempts = $1, last_error = $2, available_at = $3, updated_at = $4 \ - WHERE outbox_id = $5", + let error_text = sanitize_outbox_error(&err.to_string()); + + consolidation::mark_consolidation_run_job_failed( + db, + job_id, + next_attempts, + error_text.as_str(), + available_at, + now, ) - .bind(next_attempts) - .bind(err.to_string()) - .bind(available_at) - .bind(now) - .bind(outbox_id) - .execute(&db.pool) .await?; - Ok(()) -} - -fn backoff_for_attempt(attempt: i32) -> Duration { - let attempts = attempt.max(1) as u32; - let exp = attempts.saturating_sub(1).min(6); - let base = BASE_BACKOFF_MS.saturating_mul(1 << exp); - let capped = base.min(MAX_BACKOFF_MS); - Duration::milliseconds(capped) -} -fn to_std_duration(duration: Duration) -> StdDuration { - let millis = duration.whole_milliseconds(); - if millis <= 0 { - return StdDuration::from_millis(0); - } - StdDuration::from_millis(millis as u64) + Ok(()) } #[cfg(test)] mod tests { - use super::*; + use serde_json; + use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + + use crate::worker::{self}; #[test] fn pooled_vector_is_mean_of_chunks() { let chunks = vec![vec![1.0_f32, 3.0_f32], vec![3.0_f32, 5.0_f32]]; - let pooled = mean_pool(&chunks).expect("Expected pooled vector."); + let pooled = worker::mean_pool(&chunks).expect("Expected pooled vector."); + assert_eq!(pooled, vec![2.0_f32, 4.0_f32]); } + + #[test] + fn project_doc_ref_fields_falls_back_to_created_at_timestamp() { + let created_at = OffsetDateTime::parse("2025-01-01T00:00:00Z", &Rfc3339) + .expect("Failed to parse fallback timestamp."); + let (doc_ts, thread_id, domain, repo) = worker::project_doc_ref_fields( + &serde_json::json!({"thread_id": ""}), + created_at, + "knowledge", + ) + .expect("Expected projection."); + + assert_eq!(doc_ts, created_at.format(&Rfc3339).expect("Failed to format fallback doc_ts.")); + assert!(thread_id.is_none()); + assert!(domain.is_none()); + assert!(repo.is_none()); + } + + #[test] + fn project_doc_ref_fields_prefers_source_ref_ts() { + let created_at = OffsetDateTime::parse("2025-01-01T00:00:00Z", &Rfc3339) + .expect("Failed to parse fallback timestamp."); + let source_ref = serde_json::json!({ + "ts": "2025-01-01T01:02:03Z", + "doc_ts": "2020-01-01T00:00:00Z", + "thread_id": "thread-42", + "domain": "example.com", + "repo": "org/repo" + }); + let (doc_ts, thread_id, domain, repo) = + worker::project_doc_ref_fields(&source_ref, created_at, "chat") + .expect("Expected projection."); + + assert_eq!(doc_ts, "2025-01-01T01:02:03Z"); + assert_eq!(thread_id.as_deref(), Some("thread-42")); + assert!(domain.is_none()); + assert!(repo.is_none()); + } + + #[test] + fn project_doc_ref_fields_uses_legacy_doc_ts_when_ts_is_missing() { + let created_at = OffsetDateTime::parse("2025-01-01T00:00:00Z", &Rfc3339) + .expect("Failed to parse fallback timestamp."); + let source_ref = serde_json::json!({ + "doc_ts": "2025-01-01T02:03:04Z", + "thread_id": "legacy-thread", + "domain": "legacy.example", + "repo": "legacy/repo" + }); + let (doc_ts, thread_id, domain, repo) = + worker::project_doc_ref_fields(&source_ref, created_at, "knowledge") + .expect("Expected projection."); + + assert_eq!(doc_ts, "2025-01-01T02:03:04Z"); + assert!(thread_id.is_none()); + assert!(domain.is_none()); + assert!(repo.is_none()); + } + + #[test] + fn project_doc_ref_fields_gates_optional_ref_fields_by_doc_type() { + let created_at = OffsetDateTime::parse("2025-01-01T00:00:00Z", &Rfc3339) + .expect("Failed to parse fallback timestamp."); + let source_ref = serde_json::json!({ + "thread_id": "thread-42", + "domain": "example.com", + "repo": "org/repo", + }); + let (doc_ts_for_knowledge, thread_id_knowledge, domain_knowledge, repo_knowledge) = + worker::project_doc_ref_fields(&source_ref, created_at, "knowledge") + .expect("Expected projection."); + + assert_eq!( + doc_ts_for_knowledge, + created_at.format(&Rfc3339).expect("Failed to format fallback doc_ts.") + ); + assert!(thread_id_knowledge.is_none()); + assert!(domain_knowledge.is_none()); + assert!(repo_knowledge.is_none()); + + let chat_projection = worker::project_doc_ref_fields(&source_ref, created_at, "chat") + .expect("Expected projection."); + + assert_eq!(chat_projection.1.as_deref(), Some("thread-42")); + assert!(chat_projection.2.is_none()); + assert!(chat_projection.3.is_none()); + + let search_projection = worker::project_doc_ref_fields(&source_ref, created_at, "search") + .expect("Expected projection."); + + assert!(search_projection.1.is_none()); + assert_eq!(search_projection.2.as_deref(), Some("example.com")); + assert!(search_projection.3.is_none()); + + let dev_projection = worker::project_doc_ref_fields(&source_ref, created_at, "dev") + .expect("Expected projection."); + + assert!(dev_projection.1.is_none()); + assert!(dev_projection.2.is_none()); + assert_eq!(dev_projection.3.as_deref(), Some("org/repo")); + } } diff --git a/build.rs b/build.rs index 75952ad0..d37f7bdc 100644 --- a/build.rs +++ b/build.rs @@ -1,15 +1,18 @@ -// std +#![allow(missing_docs)] + use std::error::Error; -// crates.io -use vergen_gitcl::{CargoBuilder, Emitter, GitclBuilder}; + +use vergen_gitcl::{Cargo, Emitter, Gitcl}; fn main() -> Result<(), Box> { let mut emitter = Emitter::default(); - emitter.add_instructions(&CargoBuilder::default().target_triple(true).build()?)?; + println!("cargo:rustc-env=VERGEN_GIT_SHA=unknown"); + + emitter.add_instructions(&Cargo::builder().target_triple(true).build())?; // Disable the git version if installed from . - if emitter.add_instructions(&GitclBuilder::default().sha(true).build()?).is_err() { + if emitter.add_instructions(&Gitcl::builder().sha(false).build()).is_err() { println!("cargo:rustc-env=VERGEN_GIT_SHA=crates.io"); } diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 00000000..b1e6109e --- /dev/null +++ b/clippy.toml @@ -0,0 +1,3 @@ +allow-unwrap-in-tests = true +too-many-lines-threshold = 120 +warn-on-all-wildcard-imports = true diff --git a/config/local/elf.docker.toml b/config/local/elf.docker.toml new file mode 100644 index 00000000..ec186717 --- /dev/null +++ b/config/local/elf.docker.toml @@ -0,0 +1,213 @@ +[service] +admin_bind = "127.0.0.1:51891" +http_bind = "127.0.0.1:51892" +log_level = "info" +mcp_bind = "127.0.0.1:51893" + +[storage.postgres] +dsn = "postgres://elf_dev:elf_dev_password@127.0.0.1:51888/elf_local" +pool_max_conns = 10 + +[storage.qdrant] +collection = "elf_local_notes" +docs_collection = "elf_local_doc_chunks" +url = "http://127.0.0.1:51890" +vector_dim = 256 + +[mcp] +agent_id = "local-agent" +project_id = "local-project" +read_profile = "private_plus_project" +tenant_id = "local-tenant" + +[providers.embedding] +api_base = "http://127.0.0.1" +api_key = "local-dev-placeholder" +default_headers = {} +dimensions = 256 +model = "local-hash" +path = "/embeddings" +provider_id = "local" +timeout_ms = 1_000 + +[providers.rerank] +api_base = "http://127.0.0.1" +api_key = "local-dev-placeholder" +default_headers = {} +model = "local-token-overlap" +path = "/rerank" +provider_id = "local" +timeout_ms = 1_000 + +[providers.llm_extractor] +api_base = "http://127.0.0.1" +api_key = "local-dev-placeholder" +default_headers = {} +model = "local-disabled" +path = "/chat/completions" +provider_id = "local-disabled" +temperature = 0.0 +timeout_ms = 1_000 + +[scopes] +allowed = ["agent_private", "org_shared", "project_shared"] + +[scopes.read_profiles] +all_scopes = ["agent_private", "org_shared", "project_shared"] +private_only = ["agent_private"] +private_plus_project = ["agent_private", "project_shared"] + +[scopes.precedence] +agent_private = 30 +org_shared = 10 +project_shared = 20 + +[scopes.write_allowed] +agent_private = true +org_shared = true +project_shared = true + +[memory] +candidate_k = 60 +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = 12 +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.9 +min_importance = 0.75 +note_type = "preference" +scope = "agent_private" + +[chunking] +enabled = true +max_tokens = 512 +overlap_tokens = 128 +tokenizer_repo = "config/local/tokenizer.wordlevel.json" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "off" + +[search.dynamic] +min_candidates = 10 +min_top_score = 0.12 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = false +expansion_ttl_days = 7 +max_payload_bytes = 262_144 +rerank_ttl_days = 7 + +[search.explain] +candidate_retention_days = 2 +capture_candidates = false +retention_days = 7 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 60.0 +tie_breaker_weight = 0.1 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1_024 +min_ratio = 0.3 +weight = 0.05 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.05 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.05 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.5 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_keys = [] +auth_mode = "off" +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true + +[context] +scope_boost_weight = 0.0 + +[context.project_descriptions] +"local-tenant:local-project" = "Local ELF development stack." + +[context.scope_descriptions] +agent_private = "Local private notes for one development agent." +org_shared = "Local organization-shared development notes." +project_shared = "Local project-shared development notes." diff --git a/config/local/tokenizer.wordlevel.json b/config/local/tokenizer.wordlevel.json new file mode 100644 index 00000000..631ac318 --- /dev/null +++ b/config/local/tokenizer.wordlevel.json @@ -0,0 +1,19 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "[UNK]": 0 + }, + "unk_token": "[UNK]" + } +} diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml new file mode 100644 index 00000000..5dc3180e --- /dev/null +++ b/docker-compose.baseline.yml @@ -0,0 +1,171 @@ +name: elf-live-baseline + +services: + postgres: + image: pgvector/pgvector:pg18 + environment: + POSTGRES_DB: postgres + POSTGRES_PASSWORD: elf_dev_password + POSTGRES_USER: elf_dev + healthcheck: + test: + - CMD-SHELL + - pg_isready -U elf_dev -d postgres + interval: 2s + timeout: 5s + retries: 30 + volumes: + - elf-live-baseline-postgres-data:/var/lib/postgresql + + qdrant: + image: qdrant/qdrant:v1.16.3 + volumes: + - elf-live-baseline-qdrant-data:/qdrant/storage + + lightrag-mock-provider: + profiles: + - lightrag + image: python:3.13-slim + environment: + ELF_LIGHTRAG_MOCK_EMBEDDING_DIM: ${ELF_LIGHTRAG_EMBEDDING_DIM:-64} + ELF_LIGHTRAG_MOCK_HOST: 0.0.0.0 + ELF_LIGHTRAG_MOCK_PORT: 8080 + command: + - python + - /app/scripts/lightrag-mock-openai-provider.py + volumes: + - ./scripts/lightrag-mock-openai-provider.py:/app/scripts/lightrag-mock-openai-provider.py:ro + + lightrag: + profiles: + - lightrag + image: ${ELF_LIGHTRAG_IMAGE:-ghcr.io/hkuds/lightrag:latest} + depends_on: + - lightrag-mock-provider + environment: + WORKING_DIR: /app/data/rag_storage + INPUT_DIR: /app/data/inputs + PROMPT_DIR: /app/data/prompts + HOST: 0.0.0.0 + PORT: 9621 + LLM_BINDING: ${ELF_LIGHTRAG_LLM_BINDING:-openai} + LLM_BINDING_HOST: ${ELF_LIGHTRAG_LLM_BINDING_HOST:-http://lightrag-mock-provider:8080/v1} + LLM_BINDING_API_KEY: ${ELF_LIGHTRAG_LLM_BINDING_API_KEY:-local-key} + LLM_MODEL: ${ELF_LIGHTRAG_LLM_MODEL:-elf-lightrag-mock} + EMBEDDING_BINDING: ${ELF_LIGHTRAG_EMBEDDING_BINDING:-openai} + EMBEDDING_BINDING_HOST: ${ELF_LIGHTRAG_EMBEDDING_BINDING_HOST:-http://lightrag-mock-provider:8080/v1} + EMBEDDING_BINDING_API_KEY: ${ELF_LIGHTRAG_EMBEDDING_BINDING_API_KEY:-local-key} + EMBEDDING_MODEL: ${ELF_LIGHTRAG_EMBEDDING_MODEL:-elf-lightrag-mock-embedding} + EMBEDDING_DIM: ${ELF_LIGHTRAG_EMBEDDING_DIM:-64} + RERANK_BY_DEFAULT: ${ELF_LIGHTRAG_RERANK_BY_DEFAULT:-False} + RERANK_BINDING: ${ELF_LIGHTRAG_RERANK_BINDING:-cohere} + RERANK_BINDING_HOST: ${ELF_LIGHTRAG_RERANK_BINDING_HOST:-http://lightrag-mock-provider:8080/rerank} + RERANK_BINDING_API_KEY: ${ELF_LIGHTRAG_RERANK_BINDING_API_KEY:-local-key} + RERANK_MODEL: ${ELF_LIGHTRAG_RERANK_MODEL:-elf-lightrag-mock-rerank} + MAX_ASYNC_LLM: ${ELF_LIGHTRAG_MAX_ASYNC_LLM:-1} + MAX_ASYNC_RERANK: ${ELF_LIGHTRAG_MAX_ASYNC_RERANK:-1} + MAX_PARALLEL_INSERT: ${ELF_LIGHTRAG_MAX_PARALLEL_INSERT:-1} + CHUNK_SIZE: ${ELF_LIGHTRAG_CHUNK_SIZE:-320} + CHUNK_OVERLAP_SIZE: ${ELF_LIGHTRAG_CHUNK_OVERLAP_SIZE:-32} + volumes: + - elf-live-baseline-lightrag-rag-storage:/app/data/rag_storage + - elf-live-baseline-lightrag-inputs:/app/data/inputs + - elf-live-baseline-lightrag-prompts:/app/data/prompts + + graphiti-falkordb: + profiles: + - graphiti-zep + image: ${ELF_GRAPHITI_ZEP_FALKORDB_IMAGE:-falkordb/falkordb:edge} + volumes: + - elf-live-baseline-graphiti-falkordb:/data + + baseline-runner: + build: + context: . + dockerfile: docker/baseline/Dockerfile + depends_on: + postgres: + condition: service_healthy + qdrant: + condition: service_started + environment: + CARGO_HOME: /usr/local/cargo + ELF_BASELINE_ELF_HEAD: ${ELF_BASELINE_ELF_HEAD:-unknown} + DASHSCOPE_API_BASE: ${DASHSCOPE_API_BASE:-} + DASHSCOPE_API_KEY: ${DASHSCOPE_API_KEY:-} + DASHSCOPE_EMBEDDING_DIMENSIONS: ${DASHSCOPE_EMBEDDING_DIMENSIONS:-} + EMBEDDING_API_BASE: ${EMBEDDING_API_BASE:-} + EMBEDDING_API_KEY: ${EMBEDDING_API_KEY:-} + EMBEDDING_DIMENSIONS: ${EMBEDDING_DIMENSIONS:-} + EMBEDDING_MODEL: ${EMBEDDING_MODEL:-} + EMBEDDING_PATH: ${EMBEDDING_PATH:-} + EMBEDDING_PROVIDER_ID: ${EMBEDDING_PROVIDER_ID:-} + EMBEDDING_TIMEOUT_MS: ${EMBEDDING_TIMEOUT_MS:-} + ELF_BASELINE_CONCURRENT_NOTES: ${ELF_BASELINE_CONCURRENT_NOTES:-} + ELF_BASELINE_COST_PER_1K_TOKENS_USD: ${ELF_BASELINE_COST_PER_1K_TOKENS_USD:-} + ELF_BASELINE_ELF_EMBEDDING_API_BASE: ${ELF_BASELINE_ELF_EMBEDDING_API_BASE:-} + ELF_BASELINE_ELF_EMBEDDING_API_KEY: ${ELF_BASELINE_ELF_EMBEDDING_API_KEY:-} + ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS: ${ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS:-} + ELF_BASELINE_ELF_EMBEDDING_MODE: ${ELF_BASELINE_ELF_EMBEDDING_MODE:-local} + ELF_BASELINE_ELF_EMBEDDING_MODEL: ${ELF_BASELINE_ELF_EMBEDDING_MODEL:-} + ELF_BASELINE_ELF_EMBEDDING_PATH: ${ELF_BASELINE_ELF_EMBEDDING_PATH:-} + ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID: ${ELF_BASELINE_ELF_EMBEDDING_PROVIDER_ID:-} + ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS: ${ELF_BASELINE_ELF_EMBEDDING_TIMEOUT_MS:-} + ELF_BASELINE_ELF_TIMEOUT_SECONDS: ${ELF_BASELINE_ELF_TIMEOUT_SECONDS:-} + ELF_BASELINE_BACKFILL_BATCH_SIZE: ${ELF_BASELINE_BACKFILL_BATCH_SIZE:-} + ELF_BASELINE_BACKFILL_CHECKPOINT: ${ELF_BASELINE_BACKFILL_CHECKPOINT:-} + ELF_BASELINE_BACKFILL_DOCS: ${ELF_BASELINE_BACKFILL_DOCS:-2000} + ELF_BASELINE_BACKFILL_INTERRUPT_AFTER: ${ELF_BASELINE_BACKFILL_INTERRUPT_AFTER:-} + ELF_BASELINE_BACKFILL_RESUME_PROBE: ${ELF_BASELINE_BACKFILL_RESUME_PROBE:-} + ELF_BASELINE_MAX_ELF_RSS_KB: ${ELF_BASELINE_MAX_ELF_RSS_KB:-1500000} + ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600} + ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER: ${ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER:-} + ELF_MEM0_OPENMEMORY_EXPORT_USER_ID: ${ELF_MEM0_OPENMEMORY_EXPORT_USER_ID:-} + ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-} + ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION: ${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-} + ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke} + ELF_BASELINE_PROJECTS: ${ELF_BASELINE_PROJECTS:-all} + ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST: ${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-} + ELF_BASELINE_REPORT_DIR: /workspace/tmp/live-baseline + ELF_BASELINE_SCALE_DOCS: ${ELF_BASELINE_SCALE_DOCS:-120} + ELF_BASELINE_SOAK_PROBE_INTERVAL_MS: ${ELF_BASELINE_SOAK_PROBE_INTERVAL_MS:-} + ELF_BASELINE_SOAK_ROUNDS: ${ELF_BASELINE_SOAK_ROUNDS:-} + ELF_BASELINE_SOAK_SECONDS: ${ELF_BASELINE_SOAK_SECONDS:-} + ELF_BASELINE_STRESS_DOCS: ${ELF_BASELINE_STRESS_DOCS:-480} + ELF_BASELINE_TOP_K: ${ELF_BASELINE_TOP_K:-10} + ELF_BASELINE_WORKER_CONCURRENCY: ${ELF_BASELINE_WORKER_CONCURRENCY:-} + QWEN_API_KEY: ${QWEN_API_KEY:-} + QWEN_EMBEDDING_API_BASE: ${QWEN_EMBEDDING_API_BASE:-} + QWEN_EMBEDDING_DIMENSIONS: ${QWEN_EMBEDDING_DIMENSIONS:-} + QWEN_EMBEDDING_MODEL: ${QWEN_EMBEDDING_MODEL:-} + QWEN_EMBEDDING_PATH: ${QWEN_EMBEDDING_PATH:-} + QWEN_EMBEDDING_PROVIDER_ID: ${QWEN_EMBEDDING_PROVIDER_ID:-} + QWEN_EMBEDDING_TIMEOUT_MS: ${QWEN_EMBEDDING_TIMEOUT_MS:-} + ELF_PG_DSN: postgres://elf_dev:elf_dev_password@postgres:5432/postgres + ELF_QDRANT_GRPC_URL: http://qdrant:6334 + ELF_QDRANT_HTTP_URL: http://qdrant:6333 + RUSTUP_HOME: /usr/local/rustup + volumes: + - elf-live-baseline-npm-cache:/root/.npm + - elf-live-baseline-pip-cache:/root/.cache/pip + - elf-live-baseline-huggingface-cache:/root/.cache/huggingface + - elf-live-baseline-qmd-cache:/root/.cache/qmd + - elf-live-baseline-cargo-git:/usr/local/cargo/git + - elf-live-baseline-cargo-registry:/usr/local/cargo/registry + - elf-live-baseline-target:/workspace/target + - ./tmp:/workspace/tmp + +volumes: + elf-live-baseline-cargo-git: + elf-live-baseline-cargo-registry: + elf-live-baseline-graphiti-falkordb: + elf-live-baseline-huggingface-cache: + elf-live-baseline-lightrag-inputs: + elf-live-baseline-lightrag-prompts: + elf-live-baseline-lightrag-rag-storage: + elf-live-baseline-npm-cache: + elf-live-baseline-pip-cache: + elf-live-baseline-postgres-data: + elf-live-baseline-qmd-cache: + elf-live-baseline-qdrant-data: + elf-live-baseline-target: diff --git a/docker-compose.parity.yml b/docker-compose.parity.yml new file mode 100644 index 00000000..98530def --- /dev/null +++ b/docker-compose.parity.yml @@ -0,0 +1,53 @@ +name: elf-parity-gate + +services: + postgres: + image: pgvector/pgvector:pg18 + environment: + POSTGRES_DB: postgres + POSTGRES_PASSWORD: elf_dev_password + POSTGRES_USER: elf_dev + healthcheck: + test: + - CMD-SHELL + - pg_isready -U elf_dev -d postgres + interval: 2s + timeout: 5s + retries: 30 + volumes: + - elf-parity-postgres-data:/var/lib/postgresql + + qdrant: + image: qdrant/qdrant:v1.16.3 + volumes: + - elf-parity-qdrant-data:/qdrant/storage + + parity-runner: + build: + context: . + dockerfile: docker/parity/Dockerfile + depends_on: + postgres: + condition: service_healthy + qdrant: + condition: service_started + environment: + CARGO_HOME: /usr/local/cargo + ELF_HARNESS_COLLECTION: elf_parity_consolidation + ELF_HARNESS_DB_NAME: elf_parity_consolidation + ELF_HARNESS_RUN_ID: parity-docker + ELF_PG_DSN: postgres://elf_dev:elf_dev_password@postgres:5432/postgres + ELF_QDRANT_GRPC_URL: http://qdrant:6334 + ELF_QDRANT_HTTP_URL: http://qdrant:6333 + volumes: + - elf-parity-cargo-registry:/usr/local/cargo/registry + - elf-parity-cargo-git:/usr/local/cargo/git + - elf-parity-target:/workspace/target + - ./tmp/parity:/workspace/tmp/parity + +volumes: + elf-parity-cargo-git: + elf-parity-cargo-registry: + elf-parity-postgres-data: + elf-parity-qdrant-data: + elf-parity-target: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..6a5009fa --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,32 @@ +name: ${ELF_COMPOSE_PROJECT:-elf-local-dev} + +services: + postgres: + image: pgvector/pgvector:pg18 + environment: + POSTGRES_DB: ${ELF_POSTGRES_DB:-elf_local} + POSTGRES_USER: ${ELF_POSTGRES_USER:-elf_dev} + POSTGRES_PASSWORD: ${ELF_POSTGRES_PASSWORD:-elf_dev_password} + ports: + - "${ELF_POSTGRES_BIND:-127.0.0.1}:${ELF_POSTGRES_PORT:-51888}:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U \"$${POSTGRES_USER}\" -d \"$${POSTGRES_DB}\""] + interval: 10s + timeout: 5s + retries: 10 + volumes: + - elf-postgres-data:/var/lib/postgresql + + qdrant: + image: qdrant/qdrant:v1.16.3 + ports: + - "${ELF_QDRANT_BIND:-127.0.0.1}:${ELF_QDRANT_REST_PORT:-51889}:6333" + - "${ELF_QDRANT_BIND:-127.0.0.1}:${ELF_QDRANT_GRPC_PORT:-51890}:6334" + volumes: + - elf-qdrant-data:/qdrant/storage + +volumes: + elf-postgres-data: + name: ${ELF_POSTGRES_VOLUME:-elf-postgres-data} + elf-qdrant-data: + name: ${ELF_QDRANT_VOLUME:-elf-qdrant-data} diff --git a/docker/baseline/Dockerfile b/docker/baseline/Dockerfile new file mode 100644 index 00000000..1384eb15 --- /dev/null +++ b/docker/baseline/Dockerfile @@ -0,0 +1,37 @@ +FROM node:22-bookworm + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + build-essential \ + ca-certificates \ + clang \ + cmake \ + curl \ + git \ + jq \ + libssl-dev \ + pkg-config \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + ripgrep \ + sqlite3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CARGO_HOME=/usr/local/cargo +ENV RUSTUP_HOME=/usr/local/rustup +ENV PATH=/usr/local/cargo/bin:$PATH + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --profile minimal --default-toolchain stable \ + && chmod -R a+w "${CARGO_HOME}" "${RUSTUP_HOME}" + +RUN npm install -g bun pnpm tsx + +WORKDIR /workspace + +COPY . /workspace + +CMD ["bash", "scripts/live-baseline-benchmark.sh"] diff --git a/docker/parity/Dockerfile b/docker/parity/Dockerfile new file mode 100644 index 00000000..8f8a740d --- /dev/null +++ b/docker/parity/Dockerfile @@ -0,0 +1,23 @@ +FROM rust:1-bookworm + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + clang \ + cmake \ + curl \ + git \ + jq \ + libssl-dev \ + perl \ + pkg-config \ + postgresql-client \ + protobuf-compiler \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /workspace + +CMD ["bash", "scripts/parity-docker-gate.sh"] diff --git a/docs/governance.md b/docs/governance.md index c829e9f6..e2b3fe1e 100644 --- a/docs/governance.md +++ b/docs/governance.md @@ -1,63 +1,105 @@ # Documentation Governance -Purpose: Define how documentation is organized, updated, and kept consistent across this -repository. +Purpose: Define how agent-facing documentation is organized, updated, and kept consistent +across this repository. +Status: normative +Read this when: You are creating, moving, splitting, or revising repository documentation. +Not this document: System behavior contracts or operational runbooks for one subsystem. +Defines: Document classes, placement rules, routing headers, and docs update workflow. + +Audience: All documentation under `docs/` is written for AI agents and LLM workflows. +The split between `spec` and `guide` is by task shape, not by reader type. ## Principles -- Write documentation that is clear, concise, retrieval-friendly, and LLM-first. -- Keep contracts and invariants in `docs/spec/`; keep runbooks and how-to guidance in - `docs/guide/`. -- Avoid duplicating authoritative content. Link to the source of truth instead. +- Optimize for retrieval, routing, and execution. +- Keep one authoritative document per topic. +- Separate normative truth from procedural steps. +- Prefer explicit section labels and stable links over prose-heavy narrative. +- Let structure emerge from real topics. Avoid premature folder taxonomies. -## Document classes and ownership +## Document classes -| Class | Location | Source of truth for | Update trigger | -| --- | --- | --- | --- | -| Spec | `docs/spec/` | Contracts, schemas, pipeline behavior, invariants | Any behavior or schema change | -| Operational docs | `docs/guide/` | Runbooks, pipeline walkthroughs, maintenance | When operating procedures change | -| Plans | `docs/plans/` | Draft plans and design notes (non-normative) | As-needed, may drift | +| Class | Location | Answers | Source of truth for | Update trigger | +| --- | --- | --- | --- | --- | +| Spec | `docs/spec/` | What must be true? | Contracts, schemas, invariants, required behavior | Any behavior or schema change | +| Guide | `docs/guide/` | What should I do? | Runbooks, migrations, validation, troubleshooting | Any procedure or operational change | +| Research runs | `docs/research/` | Which evidence-backed research run reached what state? | Machine-readable hypotheses, evidence, trade-offs, challenge records, and terminal decision state | A research workflow needs durable replayable state | +| Plan artifacts | `docs/plans/` | Which saved plan artifact should a planning tool or execution workflow use? | Tool-managed planning outputs | As emitted or updated by the relevant tool | ## Placement rules -- If it defines a contract, it belongs in `docs/spec/`. -- If it explains how to run or maintain a system, it belongs in `docs/guide/`. -- If it is temporary or exploratory, it belongs in `docs/plans/`. -- Module documentation must live under `docs/guide/` and be linked from `docs/guide/index.md`. - Do not add module-level README files. -- Do not duplicate the same content in both spec and guide files. Spec defines what must be true; - guide explains how to operate or implement it. When in doubt, link to the source of truth. +- If a document defines correctness, it belongs in `docs/spec/`. +- If a document defines actions, it belongs in `docs/guide/`. +- If a document is non-normative decision support, comparison, or research input, treat it + as guide-class material and store it under `docs/guide/`. +- If a research workflow requires a machine-readable run file with replayable events, + store that run file under `docs/research/` and link to it from the relevant guide. +- Do not treat `docs/plans/` as a general-purpose docs bucket. +- Use `docs/plans/` only for artifacts produced or consumed by planning tools or + workflows that explicitly depend on saved plan files. +- Do not duplicate the same authoritative content across documents. Link to the source + of truth instead. +- A guide may summarize why a step exists, but normative statements still live in the + governing spec. + +## Document contracts + +Every document should start with a short routing header. + +Spec header: + +- `Purpose` +- `Status: normative` +- `Read this when` +- `Not this document` +- `Defines` + +Guide header: + +- `Goal` +- `Read this when` +- `Inputs` or `Preconditions` +- `Depends on` +- `Outputs` or `Verification` + +## Structure rules + +- Prefer shallow paths by default. +- Add subfolders only when they mirror stable system boundaries or improve retrieval. +- Use descriptive `snake_case` file names. +- Do not require fixed filename prefixes unless a real ambiguity appears. +- Do not create empty folders, empty indexes, or placeholder documents to satisfy a + taxonomy. ## Canonical entry points -- Repository overview: `README.md` (the only README in the repository). -- Specs: `docs/spec/index.md`. -- Operational docs: `docs/guide/index.md`. -- Unified documentation index: `docs/index.md`. - -## Compatibility note - -Legacy paths are no longer maintained. Use `docs/` paths for all references. +- Unified documentation router: `docs/index.md` +- Normative router: `docs/spec/index.md` +- Procedural router: `docs/guide/index.md` +- Repo task and automation entrypoints: `Makefile.toml` ## LLM reading guidance -When answering questions about system behavior: +When answering a repository question: -1. Read `AGENTS.md` for tool and scope rules. -2. Use `docs/spec/index.md` for contracts and invariants. -3. Use `docs/guide/index.md` for runbooks and operational workflows. +1. Read `docs/index.md` for routing. +2. Route by question type: + - "What must be true?" -> `docs/spec/index.md` + - "What should I do?" -> `docs/guide/index.md` +3. Read `Makefile.toml` when the task depends on repository automation or named tasks. +4. Use `docs/research/` only when the task explicitly concerns a machine-readable + research run file used by a research workflow. +5. Use `docs/plans/` only when the task explicitly concerns a saved plan artifact used by + a planning tool or execution workflow. ## Update workflow -- Behavior or schema change: update the relevant `docs/spec/` doc. -- Procedure change: update the relevant `docs/guide/` guide. -- Avoid copying long sections between documents. Link instead. - -## Naming conventions - -- Spec files use descriptive `snake_case` names with stable prefixes (`system_`, `t0_`, `t1_`, - `trace_`, `search_`). -- Guide files use descriptive `snake_case` names within their category folders - (`development/`, `operations/`, `pipelines/`, `testing/`). -- Plan files use `YYYY-MM-DD__.md` with `snake_case` topics (for example, - `2026-01-01_cryptopotato_crawler_plan.md`). +- Behavior or schema change: update the relevant spec. +- Procedure change: update the relevant guide. +- If a change touches both truth and procedure, update both documents and keep their + boundary explicit. +- When a guide starts carrying normative content, move that content into spec and link + to it. +- Do not impose local document-header requirements on files under `docs/plans/`; those + files are owned by the planning tool or workflow that created them. diff --git a/docs/guide/agent-setup.md b/docs/guide/agent-setup.md new file mode 100644 index 00000000..57257017 --- /dev/null +++ b/docs/guide/agent-setup.md @@ -0,0 +1,168 @@ +# Agent Setup Guide + +Goal: Help an agent install and run ELF locally with minimal back-and-forth. +Read this when: You need a practical local setup flow from an existing repository checkout. +Inputs: This repository checkout plus Docker Compose or separately managed Postgres/Qdrant, and optional provider credentials. +Depends on: `Makefile.toml`, `docker-compose.yml`, `config/local/elf.docker.toml`, `elf.example.toml`, and `docs/guide/getting_started.md`. +Verification: ELF services start, required dependencies are reachable, and the local workflow can continue. + +This guide is written for AI agents helping a human operator install and run ELF locally with minimal back-and-forth. +It assumes you have access to this repository checkout. + +## What You Are Setting Up + +ELF is a Rust workspace that typically runs: + +- `elf-api`: HTTP API service. +- `elf-worker`: background worker that indexes notes into Qdrant. +- `elf-mcp` (optional): an MCP server that forwards to `elf-api`. +- `elf-eval` (optional): an evaluation tool for retrieval quality. + +ELF requires: + +- Postgres with `pgvector` (source of truth). +- Qdrant (derived index; safe to rebuild). + +Important: The ELF config has no implicit defaults. All required config fields must be explicitly present in your TOML. + +## Minimal Owner Inputs + +For the checked-in Docker local stack, no owner inputs are required. Use `docker-compose.yml` +and `config/local/elf.docker.toml` from `docs/guide/getting_started.md`. + +For separately managed dependencies or provider-backed development, ask the owner for: + +1. Postgres DSN for the target database (for example `postgres://user:pass@host:5432/elf`). +2. Qdrant endpoints: + - REST base URL (default Qdrant REST: `http://127.0.0.1:6333`). + - gRPC base URL (default Qdrant gRPC: `http://127.0.0.1:6334`). +3. Provider choices: + - Embedding provider config. + - Rerank provider config. + - LLM extractor provider config (required by config; only needed at runtime if the operator uses `add_event` or other LLM-backed features). +4. Whether `elf-api` should bind only to loopback, and whether to enable API/admin auth tokens. + +If the owner cannot provide provider endpoints/keys yet, you can still run a local-only development setup for embedding and rerank by setting: + +- `providers.embedding.provider_id = "local"` +- `providers.rerank.provider_id = "local"` + +Then set `search.expansion.mode = "off"` to avoid LLM-backed query expansion. The extractor config must still be present and non-empty, but should not be used in this mode. + +## Prerequisites + +The machine must have: + +- Rust toolchain (pinned by `rust-toolchain.toml`). +- Docker Compose for the checked-in local dependency stack, or separately running Postgres and Qdrant. +- `psql` available on PATH. +- Running Postgres instance with `pgvector` installed/enabled when not using Compose. +- Running Qdrant instance when not using Compose. + +For the repository harness scripts: + +- `curl` +- `jq` or `jaq` +- `taplo` + +## Create The Config + +For the checked-in Docker local stack, use the strict-valid local config directly: + +```sh +config/local/elf.docker.toml +``` + +For provider-backed development, copy the template: + +```sh +cp elf.example.toml elf.toml +``` + +Then edit `elf.toml`: + +- Set `[storage.postgres].dsn` to your Postgres DSN. +- Set `[storage.qdrant].url` to your Qdrant gRPC base URL. +- Set `[storage.qdrant].collection` to a collection name (for example `mem_notes_v2`). +- Ensure `[chunking].tokenizer_repo` is a non-empty Hugging Face tokenizer repo name (for example `gpt2`). +- Fill all `[providers.*]` blocks. Keys must be non-empty strings. +- Set `security.auth_mode` explicitly: + - Use `"off"` only for local loopback development. + - Use `"static_keys"` with non-empty `security.auth_keys` for authenticated access (`Authorization: Bearer `). + +## Initialize Storage + +For the checked-in Docker local stack, start dependencies and then start `elf-api` or +`elf-worker`; the services auto-create the Postgres schema and Qdrant collections. + +```sh +docker compose -f docker-compose.yml up -d postgres qdrant +``` + +When using separately managed Qdrant and you need to pre-create collections before +service startup, initialize them through the REST endpoint: + +```sh +export ELF_QDRANT_HTTP_URL="http://127.0.0.1:6333" +export ELF_QDRANT_COLLECTION="mem_notes_v2" +export ELF_QDRANT_DOCS_COLLECTION="doc_chunks_v1" +export ELF_QDRANT_VECTOR_DIM="4096" +./qdrant/init.sh +``` + +Notes: + +- Qdrant REST and gRPC ports often differ. The `ELF_QDRANT_HTTP_URL` above must be the REST base URL. +- `storage.qdrant.url` in `elf.toml` must be the gRPC base URL. +- The Qdrant vector dimension must match the embedding dimension configured in `elf.toml`. + +## Start Services + +Start each in a separate terminal: + +```sh +cargo run -p elf-worker -- -c config/local/elf.docker.toml +cargo run -p elf-api -- -c config/local/elf.docker.toml +``` + +Optional: + +```sh +cargo run -p elf-mcp -- -c config/local/elf.docker.toml +``` + +Replace `config/local/elf.docker.toml` with `elf.toml` when using a provider-backed config. + +## Verify + +```sh +curl -fsS http://127.0.0.1:51892/health +``` + +Adjust the port to match `service.http_bind`. + +## Run E2E Harness (Optional) + +The context misranking harness creates and drops a dedicated database and Qdrant collection. It requires: + +- `ELF_PG_DSN` (a base DSN that typically ends with `/postgres`) +- `ELF_QDRANT_GRPC_URL` (Qdrant gRPC base URL) +- `ELF_QDRANT_HTTP_URL` (Qdrant REST base URL) + +Example: + +```sh +ELF_PG_DSN="postgres://elf_dev:elf_dev_password@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ +cargo make test-e2e +``` + +## Troubleshooting + +- Config parse errors: + - ELF config has no implicit defaults. Fix missing fields in the TOML (the error message will name the missing field). +- API never becomes healthy: + - Check the API log and confirm Postgres and Qdrant are reachable. +- Qdrant collection errors: + - Confirm the REST URL is correct, and rerun `./qdrant/init.sh`. diff --git a/docs/guide/agent_skills_cookbook.md b/docs/guide/agent_skills_cookbook.md new file mode 100644 index 00000000..ef3238d7 --- /dev/null +++ b/docs/guide/agent_skills_cookbook.md @@ -0,0 +1,397 @@ +# Agent Skills Cookbook (MCP-first) + +Goal: Provide reference agent-side workflows for using ELF via MCP in a consistent, auditable, facts-first way. +Read this when: You are designing or operating agent workflows on top of ELF MCP or HTTP APIs. +Inputs: A working ELF deployment or design target plus the relevant ELF service contracts. +Depends on: `docs/spec/system_elf_memory_service_v2.md` and related MCP-facing specs. +Outputs: Reusable workflow patterns that stay within the ELF contract without redefining it. + +Scope: This is a guide/playbook. It is non-normative and does not change the ELF system contract. + +## 0) Contract: MCP vs Skills + +### MCP tools (capability surface) + +MCP tools are the model-controlled capability surface that forwards to ELF HTTP endpoints. +Treat every tool call as potentially attacker-influenced and rely on server-side enforcement. + +MCP tools must: + +- Be minimal and explicit (no hidden policy). +- Fail closed with stable error codes when a capability is disabled. +- Return structured, machine-readable outputs. + +Hard guarantees that must be enforced by ELF (server-side), not by skills: + +- English-only input boundary. +- Tenant/project/agent scoping and sharing grants. +- Size caps and quotas. +- Deterministic behavior where specified (e.g., `elf_notes_ingest` never calls an LLM). +- Auditability / provenance surfaces. + +### Skills (workflow + policy) + +Skills are agent-side workflows and policies that decide when and how to use tools. +Skills may call LLMs for summarization/planning, but they must be designed so that a tool cannot be misused if a skill is manipulated. + +Skills should: + +- Prefer facts-first memory (short notes) over storing raw long text in notes. +- Store long-form evidence in Doc Extension v1 and attach a pointer in the note `source_ref`. +- Hydrate evidence only when needed, using progressive disclosure (L0 -> L1 -> L2). +- Minimize writes, choose stable keys only when appropriate, and keep scopes explicit. + +## 1) Tool glossary (MCP) + +Memory (Core): + +- `elf_notes_ingest` (deterministic; never calls an LLM) +- `elf_events_ingest` (LLM extraction; evidence-bound) +- `elf_searches_create` (`mode: quick_find|planned_search`) +- `elf_searches_get` / `elf_searches_timeline` / `elf_searches_notes` +- `elf_notes_list` / `elf_notes_get` / `elf_notes_patch` / `elf_notes_delete` +- `elf_notes_publish` / `elf_notes_unpublish` +- `elf_space_grants_list` / `elf_space_grant_upsert` / `elf_space_grant_revoke` + +Docs (Extension v1): + +- `elf_docs_put` +- `elf_docs_get` +- `elf_docs_search_l0` (discovery/backfill/debug; not a full search platform) +- `elf_docs_excerpts_get` (bounded evidence hydration) + +Note: In the current MCP adapter, `read_profile` is configured on the MCP server and is not client-controlled for search/doc search tools. + +## 2) Data contract: facts-first + evidence pointers + +### Notes are facts-first + +Notes should be compact English statements suitable for retrieval: + +- One atomic fact per note where possible. +- Use stable `key` only for durable, updatable truths (preferences, constraints, decisions, profiles). +- Use unkeyed notes for one-off facts that should not overwrite. + +### Evidence is hydrated via `source_ref` + +When a note depends on long-form evidence, attach a versioned pointer in `source_ref`. + +Recommended convention: + +- `source_ref.schema = "source_ref/v1"` +- `source_ref.resolver = "elf_doc_ext/v1"` +- Include `source_ref.ref.doc_id` (required) and optional selector hints: + - `source_ref.ref.chunk_id` (from `elf_docs_search_l0`), or + - `source_ref.locator.quote` selector (exact + optional prefix/suffix), and optional `source_ref.locator.position` fallback. + +Keep `source_ref` ASCII-safe and stable over time. + +## 3) Workflow: doc_ingest (long evidence -> compact notes) + +Goal: Persist a long evidence source in Doc Extension v1 and store compact facts in Core notes with a pointer back to the evidence. + +Steps: + +1. Canonicalize upstream inputs to English (ELF rejects non-English at the API boundary). +2. Store the long evidence with `elf_docs_put`. +3. Extract a small number of durable facts (agent-side) and write them via `elf_notes_ingest`. +4. Attach a `source_ref` pointer (`source_ref.ref.doc_id` + optional selector hints) to each note. +5. Pass `explain` in docs endpoints only when you need debug diagnostics. + +Minimal example: `elf_docs_put` + +```json +{ + "scope": "project_shared", + "title": "Decision record: search routing", + "source_ref": { + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-28T00:00:00Z" + }, + "content": "Long-form English evidence text..." +} +``` + +Minimal example: `elf_notes_ingest` (facts-first notes with pointers) + +```json +{ + "scope": "project_shared", + "notes": [ + { + "type": "decision", + "key": "doc.v1.routing_scope", + "text": "Doc Extension v1 supports only docs_search_l0 discovery; all evidence reading uses docs_excerpts_get.", + "importance": 0.7, + "confidence": 0.8, + "ttl_days": null, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "00000000-0000-0000-0000-000000000000" + } + } + } + ] +} +``` + +Operational guidance: + +- Prefer <= 3–7 notes per doc ingest unless you have a strong reason (avoid memory spam). +- If the fact is expected to evolve, provide a stable `key` so updates are possible. +- If the doc is sensitive, choose `agent_private` scope and only publish explicitly later. + +## 4) Workflow: hydrate_context (note hit -> bounded excerpt) + +Goal: Given a retrieved note, hydrate supporting evidence only when needed and only in bounded windows. + +Recommended strategy: + +1. Retrieve candidate notes via `elf_searches_create` with `mode=quick_find` (fast) or `mode=planned_search` (planning-enabled flow). +2. If you need to cite/verify, resolve the note `source_ref`: + - If it includes `source_ref.ref.doc_id` + `source_ref.ref.chunk_id` or selector hints: call `elf_docs_excerpts_get` directly. + - Include `locator` fields from `source_ref` as available: `quote` and/or `position`. + - Otherwise: call `elf_docs_search_l0` to find a relevant chunk_id, then hydrate using `elf_docs_excerpts_get`. +3. Use progressive disclosure: + - Start with `level = "L1"` and upgrade to `L2` only when the first excerpt is insufficient. + - Use `level = "L0"` for tight, cheapest verification checks (`~256` bytes). + +### Progressive note hydration with `elf_searches_notes` + +After obtaining candidate `note_id` values from search, call `elf_searches_notes` to progressively load note payload: + +1. Start with `payload_level: "l0"` for cheapest context. + - Returns compact note text summary. + - `structured` is `null`. + - `source_ref` is `{}`. +2. Upgrade to `payload_level: "l1"` when you need the note summary field from `structured.summary`. + - Returns `structured`. + - `source_ref` is still `{}`. +3. Upgrade to `payload_level: "l2"` only when you need full evidence context and editable detail. + - Returns full note text. + - Returns `structured`. + - Returns full `source_ref`. + +Payload-level semantics for `elf_searches_notes`: + +| payload_level | text | structured | source_ref | +| --- | --- | --- | --- | +| l0 | summary | null | {} | +| l1 | structured summary when available | object | {} | +| l2 | full text | object | full object | + +Optional debug mode: + +- Pass `explain: true` in `elf_docs_search_l0` or `elf_docs_excerpts_get` when you need to collect trace diagnostics. +- Keep an eye on `trace_id` and optional `trajectory` for observability. +- Use `locator` from excerpts to persist preferred selectors for reruns. + +Minimal example: `elf_docs_search_l0` (discovery) + +```json +{ + "query": "Why do we avoid a full doc search platform in v1?" +} +``` + +Minimal example: `elf_docs_excerpts_get` (hydration) + +```json +{ + "doc_id": "00000000-0000-0000-0000-000000000000", + "level": "L1", + "chunk_id": "00000000-0000-0000-0000-000000000000" +} +``` + +Verification guidance: + +- Prefer `verified=true` excerpts as evidence. +- Treat `verified=false` as best-effort context and avoid using it as hard proof without revalidation. + +## 5) Workflow: memory_write_policy (when to write and how) + +Goal: Keep writes minimal, consistent, and update-friendly. + +### Choose `elf_notes_ingest` vs `elf_events_ingest` + +- Use `elf_notes_ingest` when: + - You already have a compact English fact to store. + - You want deterministic behavior and strict control over stored text. + - You are ingesting outputs of other tools (docs, logs) after agent-side normalization. + +- Use `elf_events_ingest` when: + - You want the server to run its LLM extractor to produce evidence-bound notes. + - You have strong evidence text and can provide verifiable quotes. + +### Keys + +- Use a stable key for: + - preferences: editor, language, workflow defaults + - constraints: build rules, security rules, invariants + - decisions: architectural choices, selected options, adopted conventions + - profiles: stable descriptions of agents/projects + +- Avoid keys for: + - one-off facts that should not overwrite each other + - uncertain observations + +### Scope + +- `agent_private`: private scratchpad and personal preferences. +- `project_shared`: shared team memory inside a project. +- `org_shared`: shared memory across projects inside a tenant (publish explicitly). + +## 6) Workflow: share_workflow (publish + grants) + +Goal: Make shared memory explicit and reversible. + +Pattern: + +1. Keep drafts `agent_private`. +2. When stable, publish to `project_shared` or `org_shared` using `elf_notes_publish`. +3. Grant explicit read access to other agents using `elf_space_grant_upsert`. +4. Revoke or unpublish when needed. + +Reminder: sharing is enforced by scopes + grants. Treat this as part of the memory contract, not an optional convention. + +Note: Sharing tools operate on `space` values `team_shared` and `org_shared` (where `team_shared` corresponds to project-level sharing). + +Minimal examples: + +Publish a note to team-shared space: + +```json +{ + "note_id": "00000000-0000-0000-0000-000000000000", + "space": "team_shared" +} +``` + +Grant access to a specific agent: + +```json +{ + "space": "team_shared", + "grantee_kind": "agent", + "grantee_agent_id": "agent_abc123" +} +``` + +Revoke that grant: + +```json +{ + "space": "team_shared", + "grantee_kind": "agent", + "grantee_agent_id": "agent_abc123" +} +``` + +## 7) Workflow: reflect_consolidate (episodic -> stable facts) + +Goal: Periodically reduce memory noise and keep stable truths current. + +Simple loop (agent-side): + +1. Pull recent/high-hit notes (`elf_notes_list` with filters) and recent decisions (stable key prefixes). +2. Identify duplicates, conflicts, and near-expiry items. +3. Produce a small set of updates: + - update stable-key notes when the truth changed + - deprecate or delete notes that are no longer valid +4. Optionally attach a doc pointer explaining why the consolidation happened. + +Non-goal: This loop must not be required for ELF correctness. It is an optimization for better context usage. + +Minimal example: `elf_notes_list` (pull candidates) + +```json +{ + "scope": "project_shared", + "status": "active", + "type": "decision" +} +``` + +## 8) Failure modes and safety checklist + +- Prompt injection: assume an attacker can influence skill reasoning. Tool-side authz and input gates must still protect you. +- Over-writing: do not introduce stable keys unless you are willing to overwrite. +- Excessive writes: cap how many notes you ingest per session/doc. +- Hydration blowups: start at L1; upgrade to L2 only on demand. +- Drift: keep workflows centralized and versioned. When tool contracts change, update the cookbook first. + +## 9) Prompt templates (agent-side) + +These templates are optional. They are provided to reduce drift across agents. +Do not treat them as server contracts. + +### Template: extract facts from a doc into `elf_notes_ingest` JSON + +System: + +You are a memory normalization engine for a facts-first agent memory system. +Output must be valid JSON only. +Output must match the schema described below exactly. +All text must be English only. +Each note text must be a single compact sentence. +Prefer stable keys only for durable truths (preferences, constraints, decisions, profiles). + +User: + +Return JSON matching this schema: +{ + "scope": "agent_private|project_shared|org_shared", + "notes": [ + { + "type": "preference|constraint|decision|profile|fact|plan", + "key": "string|null", + "text": "string", + "importance": 0.0, + "confidence": 0.0, + "ttl_days": "integer|null", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "uuid" + } + } + } + ] +} + +Constraints: +- MAX_NOTES = 7 +- Every note must include a `source_ref.ref.doc_id` pointer to . + +Doc title: +Doc content: +<CONTENT> + +### Template: consolidation pass (suggest patches or deletes) + +System: + +You are a memory consolidation engine. +Decide a minimal set of safe changes to reduce duplicates and keep stable keys accurate. +All output must be English only. + +User: + +Given these notes (JSON), produce a plan (English bullets) that includes: +- Which notes to delete (note_id) +- Which notes to patch (note_id + new text) +- Which new stable-key notes to add (notes_ingest JSON) + +Notes: +<NOTES_JSON> + +## 10) Pinned references (internal) + +- Core contract: `docs/spec/system_elf_memory_service_v2.md` +- Doc Extension v1 design: `docs/plans/2026-02-24-doc-ext-v1-design.md` +- Doc pointer resolver: `docs/spec/system_source_ref_doc_pointer_v1.md` diff --git a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md new file mode 100644 index 00000000..9551adeb --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-live-baseline-report.md @@ -0,0 +1,237 @@ +# Live Baseline Benchmark Report - 2026-06-09 + +Goal: Preserve the checked-in evidence snapshot behind the README benchmark claims. +Read this when: You need the June 9, 2026 live baseline result, pass/fail reasons, or +the next benchmark iteration backlog. +Inputs: Docker-only benchmark reports generated by `cargo make baseline-live-docker`. +Depends on: `docs/guide/benchmarking/live_baseline_benchmark.md`, +`docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and +`scripts/live-baseline-report-to-md.sh`. +Verification: Re-run the commands in this report and compare +`tmp/live-baseline/live-baseline-report.json`. + +## Executive Summary + +- ELF passed the production-provider stress run with `Qwen3-Embedding-8B`, + 4096-dimensional embeddings, 480 documents, 16 queries, and `8/8` encoded checks. +- In the all-project smoke comparison, ELF and qmd passed every encoded check. + agentmemory passed same-corpus retrieval but had a typed `lifecycle_fail` on update + replacement and blocked/incomplete durable cold-start coverage in the current mocked + adapter. mem0, memsearch, and claude-mem returned `wrong_result` same-corpus + retrieval results in the encoded smoke. OpenViking was `incomplete` because its local + embedding dependency could not complete in the Docker runner. +- Under the encoded service-style benchmark checks, ELF passed all ELF checks that were + run. Under the encoded local CLI smoke checks, qmd passed all qmd checks that were + run. +- This report records results for the checked-in Docker benchmark contract. It does not + evaluate dimensions that are not encoded in the runner. + +## ELF Production-Provider Stress Run + +| Field | Value | +| --- | --- | +| Run ID | `live-baseline-20260609010854` | +| Generated at | `2026-06-09T01:28:17Z` | +| Project filter | `ELF` | +| Corpus profile | `stress` | +| Documents | `480` | +| Queries | `16` | +| Verdict | `pass` | +| Same-corpus summary | `1/1 pass` | +| Full check summary | `8/8 pass` | +| Elapsed | `1163` seconds | +| Embedding mode | `provider` | +| Embedding model | `Qwen3-Embedding-8B` | +| Embedding dimensions | `4096` | +| Embedding API path | `https://ai.gitee.com/v1/embeddings` | +| Timeout | `30000` ms | + +Encoded checks covered: + +- same-corpus retrieval for all 16 stress queries; +- worker indexing for the 480-document corpus; +- update replacement; +- delete suppression; +- cold-start recovery over the same stores; +- concurrent write/search behavior; +- stress-profile soak behavior; +- resource envelope under the configured stress threshold. + +Re-run command: + +```sh +set -a +source .env +set +a + +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_PROFILE=stress \ +ELF_BASELINE_MAX_ELF_SECONDS=1800 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +## All-Project Smoke Comparison + +| Field | Value | +| --- | --- | +| Run ID | `live-baseline-20260609022837` | +| Generated at | `2026-06-09T02:42:37Z` | +| Project filter | `all` | +| Corpus profile | `smoke` | +| Documents | `3` | +| Queries | `3` | +| Aggregate verdict | `fail` | +| Project summary | `2 pass`, `3 wrong_result`, `1 lifecycle_fail`, `1 incomplete` | +| Same-corpus summary | `3 pass`, `3 wrong_result`, `1 incomplete` | +| Full check summary | `17 pass`, `3 wrong_result`, `1 lifecycle_fail`, `4 incomplete` | + +The aggregate verdict is `fail` because the top-level report only passes when every +selected project passes every encoded project check. + +| Project | Status | Retrieval | Checks | Elapsed | Interpretation | +| --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `7/7` | `57s` | Service-backed provider run passed retrieval, worker indexing, lifecycle, recovery, and concurrency checks. | +| qmd | `pass` | `retrieval_pass` | `4/4` | `53s` | Local CLI hybrid retrieval baseline passed retrieval, update, delete, and cold-start checks. | +| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` | `38s` | Retrieval passed, but update replacement failed because the old marker remained searchable; durable cold-start is blocked by the current in-memory adapter. | +| memsearch | `wrong_result` | `retrieval_wrong_result` | `2/4` | `169s` | Local search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. | +| mem0 | `wrong_result` | `retrieval_wrong_result` | `2/4` | `41s` | Local add/search ran, update and cold-start passed, but same-corpus retrieval missed expected evidence. | +| OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | `385s` | The local embed install path hit a `llama-cpp-python` build/import failure in Docker, so retrieval was not evaluated. | +| claude-mem | `wrong_result` | `retrieval_wrong_result` | `0/1` | `97s` | Same-corpus repository search ran but did not return expected evidence. | + +Typed adapter behavior interpretation for this snapshot: + +| Project | Storage | Retrieval | Update | Delete/Expire | Cold Start | Scale/Stress | +| --- | --- | --- | --- | --- | --- | --- | +| ELF | `real` | `real` | `real` | `real` | `real` | `real` | +| qmd | `real` | `real` | `real` | `real` | `real` | `real path via ELF_BASELINE_PROJECTS=qmd and scale/stress profiles` | +| agentmemory | `mocked` | `mocked` | `mocked` | `mocked` | `blocked` | `incomplete` | +| memsearch | `real` | `real` | `real` | `real` | `real` | `incomplete` | +| mem0 | `real` | `real` | `real` | `real` | `real` | `incomplete` | +| OpenViking | `incomplete` | `incomplete` | `not_encoded` | `not_encoded` | `not_encoded` | `blocked` | +| claude-mem | `mocked` | `mocked` | `not_encoded` | `not_encoded` | `not_encoded` | `incomplete` | + +Re-run command: + +```sh +set -a +source .env +set +a + +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +ELF_BASELINE_PROFILE=smoke \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +## Result Semantics + +- `pass`: the project installed and every encoded retrieval, lifecycle, recovery, and + resource check for the selected corpus profile passed. +- `wrong_result`: a retrieval check completed but returned the wrong memory or missed + expected evidence. +- `lifecycle_fail`: same-corpus retrieval may pass, but an encoded update, delete, + cold-start, persistence, or related lifecycle check failed. +- `incomplete`: setup or a declared check could not complete because install, runtime, + dependency, or adapter wiring failed in Docker. +- `blocked`: a safe check cannot run without external credentials, manual setup, + durable runtime wiring, or host integration outside this run. +- `not_encoded`: the capability is not covered by the current adapter, so no pass/fail + claim is allowed. + +`incomplete`, `blocked`, and `not_encoded` are not passes. They mean the benchmark +needs more wiring or runtime support before making a quality claim for that project or +capability. + +## Interpretation + +The benchmark is intentionally stricter than a feature checklist. It exercises whether a +project can ingest the same corpus, return expected evidence for the same queries, and +preserve basic lifecycle behavior under the runner's encoded contract. + +## Retrieval Observability + +Generated live-baseline reports include per-query ELF trace IDs when the ELF service +path runs. Open the admin viewer at `/viewer`, paste a trace ID into the Traces panel, +and inspect the full trace bundle to compare candidates, fusion/rerank terms, relation +context, provider metadata, and selected final results without raw SQL. + +ELF checks covered in this run: + +- production-provider embeddings through the same service path used by ELF; +- Postgres source-of-truth with Qdrant as a rebuildable derived index; +- worker-produced chunks and embeddings, not direct in-memory fixture shortcuts; +- explicit update, delete, cold-start, concurrency, soak, and resource checks; +- report metadata that records corpus profile, document count, query count, project + status, check summaries, adapter behavior metadata, elapsed seconds, and embedding + configuration. + +qmd was the external project that passed every encoded smoke check. agentmemory passed +same-corpus retrieval, failed update replacement, and has blocked durable cold-start +coverage because the current adapter uses an in-memory SDK/KV mock. mem0, memsearch, +and claude-mem returned wrong same-corpus retrieval results. OpenViking was not +retrieval-evaluated because the Docker local embedding install path did not complete; +retry requires a pinned or otherwise Docker-compatible `llama-cpp-python` local +embedding dependency. + +## Speed And Production Stance + +The 480-document ELF stress run took 1163 seconds, roughly 19.4 minutes, or about 2.4 +seconds per document end-to-end. That includes the service path, provider embedding +calls, worker indexing, Qdrant rebuild/search, lifecycle checks, soak, and container +overhead. Whether that is acceptable depends on the production workflow: it is a +cold/backfill measurement, not an interactive-ingest target. + +This report is benchmark evidence, not the production operating procedure. Use +`docs/guide/single_user_production.md` for Docker Compose production start, stop, +health, backup, restore, Qdrant rebuild, rollback, provider config handling, and +cleanup commands. + +Throughput work should focus on: + +- micro-batching provider embedding requests; +- multiple outbox worker lanes with leases or `FOR UPDATE SKIP LOCKED`; +- batch Qdrant upserts; +- a bulk import mode that defers or relaxes semantic deduplication; +- vector handoff so an ingest-time embedding can be reused by the worker. + +## Next Benchmark Iterations + +- Add a sanitized private corpus that reflects real coding-agent memory cases. +- Add scale/stress matrix runs for qmd and the other external projects once their smoke + adapters are stable. +- Split elapsed time into install, ingest, embedding, indexing, query, and lifecycle + phases. +- Add recall@k, MRR, and false-positive measurements instead of only pass/fail expected + evidence checks. +- Add a batch-loading benchmark for ELF after provider micro-batching and parallel + worker lanes land. +- Deepen external lifecycle checks for OpenViking and claude-mem after their local + runtime paths can complete in Docker. + +## Publish Workflow + +Generate a fresh aggregate JSON: + +```sh +cargo make baseline-live-docker +``` + +Convert the latest JSON report into Markdown: + +```sh +ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +cargo make baseline-live-report +``` + +Clean Docker-owned state: + +```sh +cargo make clean-baseline-live-docker +``` + +The only host report directory is `tmp/live-baseline/`. Raw generated JSON stays there +and is not committed by default. diff --git a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md new file mode 100644 index 00000000..4b7944c6 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md @@ -0,0 +1,181 @@ +# Real-World Job Benchmark Report + +Goal: Publish a Markdown summary for one generated real_world_job benchmark report. +Read this when: You need a durable smoke report for real-world agent memory job fixtures. +Inputs: `tmp/real-world-job/real-world-job-operator-ux-report.json`. +Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Verification: Compare this Markdown summary with the source JSON before committing. + +## Summary + +- Run ID: `real-world-job-operator-ux` +- Generated at: `2026-06-10T02:56:58.31558Z` +- Runner version: `0.2.0-5d527b9c5a0bd90b88b905d337f658b7d9eddd05-aarch64-apple-darwin` +- Corpus profile: `synthetic` +- Adapter: `fixture_operator_ux` (offline_fixture_response) +- Jobs: `5` +- Suites with encoded jobs: `1` +- Suites with `not_encoded` status: `10` +- Status summary: `5` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `0` blocked, `0` not_encoded, `0` unsupported_claim +- Unsupported claim count: `0` +- Wrong-result count: `0` +- Stale-answer count: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` +- Evidence coverage: `6/6` (`1.000`) +- Source-ref coverage: `6/6` (`1.000`) +- Quote coverage: `6/6` (`1.000`) +- Stale retrieval count: `0` +- Scope correctness: `0/0` (`0.000`), violations `0` +- Redaction leak count: `0` +- Qdrant rebuild cases: `0` encoded, `0` pass +- Expected evidence recall: `1.000` (6/6) +- Irrelevant context ratio: `0.000` (0 irrelevant) +- Trace explainability: `1` job(s), `0` wrong-result stage attribution(s) +- Consolidation source mutation count: `0` +- Mean score: `1.000` +- Mean latency: `3.100 ms` +- Cost: `0.000 USD` +- Operator-debug jobs: `5` +- Raw SQL needed: `0` +- Trace-incomplete debug jobs: `0` +- Operator UX gaps: `0` +- Private corpus redaction: `no_private_corpus` + +## Capture And Integration Coverage + +The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims. + +| Class | Behaviors | +| --- | --- | +| real | - | +| fixture-backed | - | +| mocked | - | +| blocked | - | +| not encoded | No capture/integration behavior was declared by encoded fixtures. | + +## Suites + +| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| knowledge_compilation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| operator_debugging_ux | `pass` | 5 | `1.000` | `1.000` | `0.000` | 1 | 0 | 0 | 0 | 0 | 0 | 0 | All 5 encoded job(s) passed. | +| capture_integration | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| personalization | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | + +## Jobs + +| Suite | Job | Status | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-dropped-expected` | `trace-dropped-expected` | `filter.read_profile` | 0 | 0 | `false` | `false` | 0 | 0 | `2.400 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-provider-latency-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-provider-timeout` | `trace-provider-timeout` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `4.800 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-rebuild-changed-results-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-before-rebuild, trace-after-rebuild` | `trace-after-rebuild, trace-before-rebuild` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `3.300 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-relation-context-mislead-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-relation-context` | `trace-relation-context` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.900 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-rerank-bad-candidate-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-rerank-promotion` | `trace-rerank-promotion` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.100 ms` | `0.000 USD` | + +## Operator Debugging UX + +| Job | Failure Mode | Trace Evidence | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps | +| --- | --- | --- | ---: | --- | --- | --- | --- | --- | +| operator-debug-dropped-evidence-001 | expected_evidence_dropped | `11111111-1111-4111-8111-111111111111`<br>[viewer](/viewer?trace_id=11111111-1111-4111-8111-111111111111)<br>[bundle](/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | visible in Retrieval Funnel and Replay Candidates | `complete` | `clear` | `none` | +| operator-debug-provider-latency-001 | provider_latency_or_failure | `33333333-3333-4333-8333-333333333333`<br>[viewer](/viewer?trace_id=33333333-3333-4333-8333-333333333333)<br>[bundle](/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | visible as low recall counts rather than a post-recall drop | `complete` | `clear` | `none` | +| operator-debug-rebuild-changed-results-001 | rebuild_changed_results | `44444444-4444-4444-8444-444444444444`<br>[viewer](/viewer?trace_id=44444444-4444-4444-8444-444444444444)<br>[bundle](/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 5 | `false` | visible by comparing before and after trace candidates | `complete` | `clear` | `none` | +| operator-debug-relation-context-mislead-001 | relation_context_misled_search | `55555555-5555-4555-8555-555555555555`<br>[viewer](/viewer?trace_id=55555555-5555-4555-8555-555555555555)<br>[bundle](/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | not dropped; misleading context is visible on selected result | `complete` | `clear` | `none` | +| operator-debug-rerank-bad-candidate-001 | rerank_promoted_bad_candidate | `22222222-2222-4222-8222-222222222222`<br>[viewer](/viewer?trace_id=22222222-2222-4222-8222-222222222222)<br>[bundle](/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | not dropped; visible with lower final rank in Replay Candidates | `complete` | `clear` | `none` | + +### Operator Debug Details + +#### `operator-debug-dropped-evidence-001` + +- Root cause: The expected candidate survived recall but was removed by the read-profile scope filter before final selection. +- Viewer panels: `Trace, Retrieval Funnel, Replay Candidates, Stage Details` +- CLI steps: `open viewer trace link -> compare recall before and after filter -> inspect replay candidates -> repair read profile or grant` +- Trace evidence: `trace-dropped-expected` + +#### `operator-debug-provider-latency-001` + +- Root cause: Provider latency forced fallback behavior, shrinking expanded-query recall. +- Viewer panels: `Providers And Ranking, Stage Summary, Stage Details` +- CLI steps: `open trace bundle -> inspect provider metadata -> compare expanded queries -> raise timeout or repair provider health` +- Trace evidence: `trace-provider-timeout` + +#### `operator-debug-rebuild-changed-results-001` + +- Root cause: Rebuild removed stale derived-index state and restored source-of-truth-backed ranking. +- Viewer panels: `Trace, Replay Candidates, Selected Final Results` +- CLI steps: `open before trace -> open after trace -> compare replay candidates -> confirm active note selected -> keep Qdrant rebuild as repair` +- Trace evidence: `trace-before-rebuild, trace-after-rebuild` + +#### `operator-debug-relation-context-mislead-001` + +- Root cause: A deprecated graph relation remained visible in relation_context and conflicted with the selected note text. +- Viewer panels: `Selected Final Results, Relation Context, Stage Details` +- CLI steps: `open trace link -> inspect selected result relation count -> open Relation Context -> invalidate stale relation fact` +- Trace evidence: `trace-relation-context` + +#### `operator-debug-rerank-bad-candidate-001` + +- Root cause: The correct item was in the candidate set, but rerank.score elevated a cross-project decoy. +- Viewer panels: `Selected Final Results, Replay Candidates, Providers And Ranking` +- CLI steps: `open trace bundle -> compare retrieval rank with final rank -> inspect rerank score -> tighten scope or rerank inputs` +- Trace evidence: `trace-rerank-promotion` + +## Memory Evolution + +- Stale answers: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` + +| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | Follow-up | +| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | + +## Trace Explainability + +| Suite | Job | Trace | Failure Stage | Reason | Stage Evidence | +| --- | --- | --- | --- | --- | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | `11111111-1111-4111-8111-111111111111` | `filter.read_profile` | Expected evidence survived recall.candidates but was removed by the read-profile scope filter before final selection. | recall.candidates kept=trace-dropped-expected+trace-dropped-decoy demoted= dropped= distractors=trace-dropped-decoy; filter.read_profile kept=trace-dropped-decoy demoted= dropped=trace-dropped-expected distractors=trace-dropped-decoy; selection.final kept=trace-dropped-decoy demoted= dropped=trace-dropped-expected distractors=trace-dropped-decoy | + +## Unsupported Claims + +No unsupported claims were produced by encoded jobs. + +## Follow-Ups + +No benchmark follow-ups were declared by encoded jobs. + +## Result Semantics + +This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms. +It is a real-world job fixture report, not a Docker live-baseline report. +Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. + +The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs. + +- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. +- `wrong_result`: a job completed but missed required answer or evidence expectations. +- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. +- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed. + +For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims. + +## Suites With `not_encoded` Status + +- `trust_source_of_truth` +- `work_resume` +- `project_decisions` +- `retrieval` +- `memory_evolution` +- `consolidation` +- `knowledge_compilation` +- `capture_integration` +- `production_ops` +- `personalization` diff --git a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md new file mode 100644 index 00000000..5dda8783 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md @@ -0,0 +1,299 @@ +# Production Adoption Gate Report - June 9, 2026 + +Goal: Record the XY-836 full comparison gate and personal production adoption decision. +Read this when: You need the fresh evidence behind the June 9, 2026 ELF production +adoption claim. +Inputs: P0 benchmark and runbook PRs, live Docker benchmark reports, provider-backed +benchmark runs, and the single-user restore proof. +Depends on: `live_baseline_benchmark.md`, `single_user_production.md`, +`comparison_external_projects.md`, `research_projects_inventory.md`, and +`Makefile.toml`. +Outputs: Production adoption verdict, exact benchmark commands, run ids, limitations, +and README-level claim boundaries. + +## Decision + +ELF is ready for personal production use with bounded caveats. + +The gate supports use as a single-user, self-hosted memory service when operated through +the checked-in Docker Compose production runbook, with backups enabled, Qdrant treated as +rebuildable, and retrieval debugging done through search traces and viewer/admin trace +surfaces rather than raw SQL. + +The caveats are material: + +- No private production corpus manifest was available in this lane. The + `baseline-production-private` task failed closed at its manifest guard, so this report + does not claim a private-corpus pass. +- External comparison remains an objective adapter matrix, not an overall superiority + claim. qmd and ELF passed the encoded smoke checks; agentmemory, memsearch, mem0, + OpenViking, and claude-mem retained typed failures or incomplete states. +- The 2,000-document provider backfill passed but took 2,804 seconds end to end. Large + imports should be planned as batch jobs, not interactive operations. + +Because the private-corpus criterion allows an explicitly bounded result, this gate does +not create a new P0 blocker. If private-corpus proof is required before a specific +deployment, supply `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` and rerun +`cargo make baseline-production-private` before relying on private retrieval quality. + +## P0 Inputs + +The current branch is based on the post-observability mainline. The named P0 lanes were +merged before this gate: + +| Issue | PR | Evidence read | +| --- | --- | --- | +| `XY-819` | `#126` | Single-user production backup and restore runbook. | +| `XY-818` | `#127` | Private production corpus benchmark task and manifest guard. | +| `XY-817` | `#128` | Resumable batch ingest and backfill benchmark. | +| `XY-820` | `#130` | Typed lifecycle and adapter failure states. | +| `XY-825` | `#131` | Additional single-user restore and Qdrant rebuild proof. | +| `XY-27` | `#132` | Retrieval observability panels and trace candidate precision repair. | + +## Fresh Commands + +Provider credentials were loaded from an untracked local environment file. Secret values +were not printed or committed. The command forms below assume equivalent provider +environment variables are present in the shell. + +Private manifest guard: + +```sh +cargo make baseline-production-private +``` + +Result: failed closed before the benchmark runner because +`ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` was not set. + +Production-synthetic provider run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_MAX_ELF_SECONDS=1200 \ +cargo make baseline-production-synthetic +``` + +All-project smoke provider run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_PROFILE=smoke \ +cargo make baseline-live-docker +``` + +ELF provider stress run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_PROJECTS=ELF \ +ELF_BASELINE_PROFILE=stress \ +ELF_BASELINE_MAX_ELF_SECONDS=1800 \ +ELF_BASELINE_ELF_TIMEOUT_SECONDS=1800 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +cargo make baseline-live-docker +``` + +ELF provider backfill run: + +```sh +set -a +source .env +set +a +EMBEDDING_MODEL=Qwen3-Embedding-8B \ +EMBEDDING_DIMENSIONS=4096 \ +EMBEDDING_TIMEOUT_MS=30000 \ +ELF_BASELINE_ELF_EMBEDDING_MODE=provider \ +ELF_BASELINE_ELF_TIMEOUT_SECONDS=3600 \ +ELF_BASELINE_MAX_ELF_SECONDS=3600 \ +cargo make baseline-backfill-docker +``` + +Single-user restore proof: + +```sh +awk '/^bash <<'\''EOF'\''$/{flag=1; next} flag && /^EOF$/{exit} flag {print}' \ + docs/guide/single_user_production.md \ + | perl -0pe 's#tmp/single-user-restore-proof#tmp/xy836-single-user-restore-proof#g; s/51988/52988/g; s/51989/52989/g; s/51990/52990/g; s/51991/52991/g; s/51992/52992/g; s/51993/52993/g; s/elf-restore-proof/elf-xy836-restore-proof/g' \ + > tmp/xy836-restore-proof.sh +bash tmp/xy836-restore-proof.sh +``` + +The proof used alternate local ports because the default proof port range was occupied +on this machine. + +## ELF Evidence + +All provider-backed ELF runs used: + +- Provider id: `provider` +- Embedding model: `Qwen3-Embedding-8B` +- Embedding dimensions: `4096` +- Timeout: `30000` ms +- API path: `/embeddings` + +| Run | Profile | Corpus | Status | Checks | Retrieval | Elapsed | Query result | Backfill and resume | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| `live-baseline-20260609083644` | `production-synthetic` | `synthetic-coding-agent-prod-corpus-2026-06-09`, 8 docs, 6 queries | `pass` | `8/8` | `retrieval_pass` | 59 s | 6/6 pass, mean 937.120 ms | 8/8 completed in 8.134 s, resume 4 -> 8, 0 duplicates | +| `live-baseline-20260609090719` | `stress` | generated public, 480 docs, 16 queries | `pass` | `9/9` | `retrieval_pass` | 779 s | 16/16 pass, mean 1128.144 ms | 480/480 completed in 508.835 s, resume 240 -> 480, 0 duplicates | +| `live-baseline-20260609092144` | `backfill` | generated public, 2000 docs, 16 queries | `pass` | `9/9` | `retrieval_pass` | 2804 s | 16/16 pass, mean 1214.454 ms | 2000/2000 completed in 2061.396 s, resume 1000 -> 2000, 0 duplicates | + +The 2,000-document backfill also passed: + +- `resumable_backfill_no_duplicates` +- `same_corpus_retrieval` +- `async_worker_indexing_e2e` +- `update_replaces_note_text` +- `delete_suppresses_retrieval` +- `cold_start_recovery_search` +- `concurrent_write_search_e2e` +- `soak_stability_e2e` +- `resource_envelope` + +The resource envelope check measured 2,793.629 seconds against a 3,600-second limit and +167,652 KB RSS against a 1,500,000 KB limit. + +## Recovery Evidence + +The single-user production proof wrote a note, searched it, recreated the Docker +Compose dependency stack from backup, rebuilt Qdrant from Postgres-held vectors, and +searched again. + +| Step | Evidence | +| --- | --- | +| Note ingest | `ADD`, `remember`, note id `bfaa2f40-e076-490e-ae5a-dd88cf6b6179` | +| Search before restore | 1 result, key `single_user_restore_probe`, trace `535e49be-250f-483c-8845-b4116e591dac`, score 1.148 | +| Qdrant rebuild after restore | `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0` | +| Search after restore | 1 result, key `single_user_restore_probe`, trace `e995263d-8f0e-4472-9a32-354d5cceed33`, score 1.1479998 | + +This satisfies the adoption criterion that Postgres backups, restore, and Qdrant rebuild +are tested without treating Qdrant as a source of truth. + +## External Comparison + +Fresh all-project smoke run: `live-baseline-20260609083814`. + +Corpus: generated public smoke, 3 docs, 3 queries. + +Aggregate verdict: `fail`, because the matrix is strict and external adapters retained +typed failures. The strict failure is useful evidence; it prevents hiding incomplete +adapter states. + +Full encoded check summary: 26 total, 16 pass, 3 fail, 2 wrong-result, 1 lifecycle-fail, +2 incomplete, 1 blocked, 4 not encoded. + +| Project | Status | Retrieval | Checks | Elapsed | Storage | Interpretation | +| --- | --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `8/8` | 33 s | real | Added corpus, rebuilt Qdrant, returned expected evidence, and passed lifecycle checks. | +| qmd | `pass` | `retrieval_pass` | `4/4` | 59 s | real | Passed same-corpus retrieval, update, delete, and cold-start checks through persisted local collection files. | +| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` | 46 s | mocked | Same-corpus retrieval passed, but update left old text searchable and cold-start recovery is blocked by in-memory harness storage. | +| memsearch | `incomplete` | `invalid_json_result` | `0/1` | 432 s | real | Command completed but did not produce a valid benchmark result. | +| mem0 | `incomplete` | `invalid_json_result` | `2/4` | 462 s | real | Local FastEmbed/Qdrant search missed expected same-corpus results; delete remains not encoded. | +| OpenViking | `incomplete` | `local_embed_install_failed` | `0/1` | 513 s | incomplete | Local embedding install hit a llama-cpp-python build/import failure, so same-corpus local retrieval could not run. | +| claude-mem | `incomplete` | `invalid_json_result` | `0/4` | 107 s | mocked | Repository search missed expected same-corpus results and lifecycle behaviors remain mostly not encoded. | + +## Observability Evidence + +The gate is based on main after `XY-27`, which added read-only viewer retrieval +observability panels and a precision repair for trace candidate scores. The fresh +benchmark runs returned trace ids for every ELF search, and the search responses include +retrieval trajectory summaries. + +Representative provider stress traces: + +| Query | Trace id | +| --- | --- | +| `q-auth` | `7be1b5ce-3676-4625-8221-dcf0204669bf` | +| `q-auth-alt` | `79585c67-cdb8-46f8-bad1-d277295c1e0f` | +| `q-database` | `0cc7d130-fe51-436e-a5b0-971997ba8cb7` | +| `q-database-alt` | `4ffaf8cd-4b0d-4b3d-8154-56551538e81a` | +| `q-deploy` | `c770346e-d563-4ad0-aae6-f56dff334669` | +| `q-deploy-alt` | `84121528-c038-490b-bbc5-3352bcb9a2f5` | + +Representative restore proof traces: + +- Before restore: `535e49be-250f-483c-8845-b4116e591dac` +- After restore: `e995263d-8f0e-4472-9a32-354d5cceed33` + +This is sufficient for the personal production gate: a wrong result can be debugged via +the returned trace id, trajectory stages, trace bundle/admin endpoints, and the viewer +panels without raw SQL. + +## Adoption Criteria + +| Criterion | Result | Evidence and limitation | +| --- | --- | --- | +| Private production corpus benchmark has a passing or explicitly bounded result. | Bounded caveat | `cargo make baseline-production-private` failed closed because `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` was unset. No private-corpus pass is claimed. | +| Backfill/resume proves predictable large import behavior. | Pass | `live-baseline-20260609092144`: 2000/2000 completed, resume 1000 -> 2000, zero duplicates, resource envelope passed. | +| Docker Compose backup, restore, and Qdrant rebuild are tested. | Pass | Single-user restore proof rebuilt 1 Qdrant point with 0 missing vectors and recovered searchable results. | +| Retrieval observability can debug wrong results without raw SQL. | Pass | `XY-27` landed, trace ids are returned in benchmark and restore runs, and trajectory summaries are present in search responses. | +| External comparison uses typed failure states and does not rely on mocked adapter results as proof. | Pass | `live-baseline-20260609083814` reports real, mocked, blocked, incomplete, wrong-result, and lifecycle-fail states explicitly. | + +## Follow-Up Queue + +No P0 Decodex lane needs to be requeued from this gate. + +Recommended non-blocking follow-ups: + +- Rerun `baseline-production-private` when an operator-owned private manifest is + available, and publish a private-corpus addendum that does not expose private text. +- Treat `docs/spec/real_world_agent_memory_benchmark_v1.md` as the future-work + contract for job-level memory evaluation. This report does not claim any pass under + that new suite because no real-world job runner was encoded in this gate. +- Keep qmd as the strongest external local baseline for routing/fusion/debuggability + comparison work. +- Treat agentmemory, memsearch, mem0, OpenViking, and claude-mem adapter failures as + typed benchmark improvement opportunities only if external parity coverage remains a + roadmap goal. + +## Post-Gate Repeatability Extension + +XY-850 extends the live-baseline runner after this gate without changing the gate's +historical verdict. The private-corpus result remains bounded until an operator-owned +manifest is supplied. + +New repeatable paths: + +- `cargo make baseline-production-private-addendum` runs the private profile and writes + a safe Markdown addendum to `tmp/live-baseline/private-production-addendum.md` by + default. It still fails closed when `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` is + absent. +- `cargo make baseline-backfill-10k-docker` runs an ELF-only 10k generated backfill + resume profile. +- `ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker` runs the + guarded 100k profile. Without the guard, the task exits before starting Docker work. +- `cargo make baseline-soak-docker` runs an explicit ELF-only soak profile, defaulting + to one hour unless `ELF_BASELINE_SOAK_SECONDS` is set. + +New report fields include duplicate-source count, checkpoint resume state, latency +mean/P50/P95/P99/max, RSS and disk-size proxies, a planning-only cost proxy, and +operator-case commands for provider outage, migration rollback, Docker Compose +start/stop/upgrade, Postgres restore, Qdrant rebuild, and unattended soak. + +## Runner Repairs Made By This Gate + +Two small runner fixes were required to collect the fresh evidence: + +- `build.rs` now provides a fallback `VERGEN_GIT_SHA=unknown` before vergen emits git + metadata, so Docker benchmark builds work when the copied context is not a usable git + checkout. +- `baseline-backfill-docker` now resolves default environment values inside the shell + instead of relying on `${VAR:-default}` in the `cargo-make` TOML string, which avoided + malformed values such as `-backfill`. diff --git a/docs/guide/benchmarking/2026-06-09-production-corpus-report.md b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md new file mode 100644 index 00000000..b050f1df --- /dev/null +++ b/docs/guide/benchmarking/2026-06-09-production-corpus-report.md @@ -0,0 +1,60 @@ +# Live Baseline Benchmark Report + +Goal: Publish a Markdown summary for one generated live baseline aggregate report. +Read this when: You need a durable, reviewable summary of a live baseline JSON report. +Inputs: `tmp/live-baseline/live-baseline-report.json`. +Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`. +Verification: Compare this Markdown summary with the source JSON before committing. + +## Summary + +- Run ID: `live-baseline-20260609045306` +- Generated at: `2026-06-09T04:53:18Z` +- Verdict: `pass` +- Project filter: `ELF` +- Corpus profile: `production-synthetic` +- Corpus track: `synthetic_production` +- Corpus manifest: `synthetic-coding-agent-prod-corpus-2026-06-09` +- Documents: `8` +- Queries: `6` +- Wrong-result count: `0` +- Query latency mean: `7.137632833333334 ms` +- Project summary: `1 pass`, `0 fail`, `0 incomplete` +- Same-corpus summary: `1 pass`, `0 fail`, `0 incomplete` +- Full check summary: `7/7 pass` + +This report is production-corpus benchmark evidence only. Use +`docs/guide/single_user_production.md` for the single-user Docker Compose production +runbook, including backup, restore, Qdrant rebuild, rollback, provider config +handling, and cleanup commands. + +## Projects + +| Project | Status | Retrieval | Checks | Elapsed | Reason | +| --- | --- | --- | --- | --- | --- | +| ELF | `pass` | `retrieval_pass` | `7/7` | `12s` | ELF added the corpus, rebuilt Qdrant, and returned expected evidence for every query | + +## Embedding + +| Project | Mode | Provider | Model | Dimensions | Timeout | API Base | Path | +| --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `local` | `local` | `local-hash` | `256` | `1000ms` | `http://127.0.0.1` | `/embeddings` | + +## Query Evidence + +| Project | Query | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency | +| --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `q-resume-lane` | `resume_lane` | `issue-xy812-resume` | `` | `issue-xy812-resume` | `true` | `9.213627 ms` | +| ELF | `q-recover-exact-command` | `recover_exact_command` | `worktree-xy791-repair` | `runbook-live-baseline` | `worktree-xy791-repair` | `true` | `6.424872 ms` | +| ELF | `q-explain-stale-blocker` | `explain_stale_blocker` | `blocker-stale-qwen-key` | `` | `blocker-stale-qwen-key` | `true` | `7.749393 ms` | +| ELF | `q-find-prior-decision` | `find_prior_decision` | `decision-qdrant-derived` | `` | `decision-qdrant-derived` | `true` | `6.66385 ms` | +| ELF | `q-compare-project-status` | `compare_project_status` | `pr-110-review` | `recovery-xy640-ledger` | `recovery-xy640-ledger` | `true` | `6.344976 ms` | +| ELF | `q-detect-contradiction-update` | `detect_contradiction_update` | `decision-xy818-supersedes` | `` | `decision-xy818-supersedes` | `true` | `6.429079 ms` | + +## Result Semantics + +- `pass`: every encoded check for the selected project and profile passed. +- `fail`: clone, install, import, build, retrieval, lifecycle, recovery, concurrency, soak, resource-envelope, or another declared check failed. +- `incomplete`: the encoded check could not complete without extra provider keys, host integration, native dependency support, durable runtime wiring, or more adapter work. + +`incomplete` is not a pass; treat it as benchmark wiring debt. diff --git a/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md b/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md new file mode 100644 index 00000000..7a3dfa4e --- /dev/null +++ b/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md @@ -0,0 +1,72 @@ +# Live Real-World Adapter Sweep Report - June 10, 2026 + +Goal: Publish the XY-880 full-suite live real-world sweep evidence for ELF and qmd. +Read this when: You need the current live_real_world adapter evidence after the +representative XY-868 slice was expanded across the encoded real-world suite corpus. +Inputs: `cargo make real-world-memory-live-adapters`, +`apps/elf-eval/fixtures/real_world_memory/`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, and +`docs/guide/benchmarking/live_baseline_benchmark.md`. +Verification: `cargo make real-world-memory-live-adapters` ran on branch +`y/elf-xy-880` and wrote the generated reports under +`tmp/real-world-memory/live-adapters/`. + +## Summary + +The live adapter command now runs ELF and qmd against the full checked-in +`real_world_memory` fixture corpus, not only the original three-job representative +slice. Each adapter produced 38 live materialized job records across all 11 encoded +suites. + +This is a full-suite sweep, not a full-suite live pass. The generated reports preserve +typed non-pass states instead of upgrading unsupported suite capabilities into wins. + +| Adapter | Jobs | Pass | Wrong result | Incomplete | Blocked | Not encoded | Mean score | Evidence recall | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live real-world service adapter | 38 | 18 | 5 | 1 | 2 | 12 | 0.514 | 41/75 | +| qmd live real-world CLI adapter | 38 | 18 | 5 | 1 | 2 | 12 | 0.512 | 41/75 | + +## Suite Results + +| Suite | ELF live status | qmd live status | Interpretation | +| --- | --- | --- | --- | +| `trust_source_of_truth` | `pass` | `pass` | Both adapters retrieved the restore/Qdrant rebuild proof evidence. | +| `work_resume` | `pass` | `pass` | Both adapters passed all work-resume continuity jobs. | +| `project_decisions` | `pass` | `pass` | Both adapters passed all project-decision jobs. | +| `retrieval` | `pass` | `pass` | Both adapters passed all retrieval jobs. | +| `memory_evolution` | `wrong_result` | `wrong_result` | Both adapters passed the delete/TTL case but failed current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links. | +| `consolidation` | `not_encoded` | `not_encoded` | The live sweep does not generate or review consolidation proposals. | +| `knowledge_compilation` | `not_encoded` | `not_encoded` | The live sweep does not generate derived knowledge pages. | +| `operator_debugging_ux` | `not_encoded` | `not_encoded` | The live sweep does not hydrate full operator trace/viewer diagnostics. | +| `capture_integration` | `not_encoded` | `not_encoded` | The live sweep does not exercise capture integrations or write-policy redaction boundaries. | +| `production_ops` | `incomplete` | `incomplete` | The live sweep does not run backup/restore, private corpus, provider credential, or backfill operations; the existing cold-start dependency remains incomplete and credential/private-manifest jobs remain blocked. | +| `personalization` | `pass` | `pass` | Both adapters retrieved the scoped preference evidence. | + +## Claim Boundary + +- ELF and qmd still have targeted live pass evidence for the original + `work_resume`, `retrieval`, and `project_decisions` slice. +- ELF and qmd now also have full-suite live sweep evidence with typed non-pass states. +- Neither adapter has a full-suite live pass. +- This report does not claim private-corpus production proof, provider-backed + production-ops proof, broad RAG/graph adapter parity, or overall external + superiority. + +## Artifacts + +Generated artifacts are intentionally under `tmp/`: + +```text +tmp/real-world-memory/live-adapters/elf-materialization.json +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-materialization.json +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + +The checked-in manifest records this evidence in +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. diff --git a/docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md b/docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md new file mode 100644 index 00000000..5826e2f2 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md @@ -0,0 +1,138 @@ +# Post-Adapter Production Adoption Refresh - June 10, 2026 + +Goal: Publish the XY-884 post-adapter production adoption refresh after the live +real-world sweep, OpenViking dependency refresh, and RAG/graph research-gate pass. +Read this when: You need the current decision on whether ELF is ready for personal +production use under the latest checked-in benchmark evidence. +Inputs: `2026-06-09-production-adoption-gate-report.md`, +`2026-06-10-real-world-comparison-report.md`, +`2026-06-10-live-real-world-sweep-report.md`, +`docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/live_baseline_benchmark.md`, and +`docs/guide/single_user_production.md`. +Outputs: Current production adoption decision, evidence-class separation, accepted +caveats, and follow-up issue routing. + +## Decision + +Adopt with bounded caveats. + +ELF remains ready for personal production use as a single-user, self-hosted memory +service when operated through the checked-in production runbook, with Postgres treated +as the source of truth, Qdrant treated as rebuildable, backups enabled, and search +trace/viewer surfaces used for retrieval debugging. + +The post-adapter evidence does not upgrade the decision to an unconditional production +pass. It also does not downgrade the June 9 adoption gate. The new evidence mainly +sharpens the claim boundary: + +- ELF and qmd now have full-suite live real-world sweep records, but both are typed + non-pass sweeps, not full-suite live passes. +- The OpenViking cold-start dependency boundary is resolved for classification: the + pinned Docker local embedding path reaches `add_resource` and `find`, while the + current OpenViking same-corpus result remains `wrong_result` because expected + evidence terms are missed. +- The RAG/graph D1/D2 research gates produced adapter candidates and typed blockers, + but no RAG/graph record has become live adapter evidence. +- Private-corpus and credentialed production-ops checks remain operator-owned + boundaries. No private-corpus pass is claimed. + +## Required Input Status + +| Required input | Current outcome | Decision impact | +| --- | --- | --- | +| Full live real-world sweep results for ELF/qmd or typed blockers | Available. ELF and qmd each produced 38 `live_real_world` jobs across 11 suites: 18 pass, 5 wrong_result, 1 incomplete, 2 blocked, and 12 not_encoded. | Supports adoption only with caveats; it proves live sweep coverage, not full-suite live parity. | +| Cold-start/OpenViking dependency issue outcome | Available. The production-ops cold-start dependency fixture is pass; OpenViking now reaches the pinned Docker local embedding path and records `wrong_result` instead of setup failure when evidence terms are missed. | Removes setup uncertainty from the adoption decision, but leaves OpenViking context-trajectory quality as a non-blocking gap. | +| RAG/graph D1/D2 research gate outcome | Available. RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify are adapter candidates; Letta, LangGraph, nanograph, and llm-wiki are research-only; gbrain is blocked. | Follow-up adapter work is concrete, but research gates remain non-live evidence. | +| Current production/private-corpus evidence and caveats | Available. Provider-backed synthetic, stress, backfill, and restore proof passed; private corpus failed closed because no operator-owned manifest was supplied. | Keeps the June 9 decision: personal production adoption is acceptable with bounded private-corpus and credential caveats. | + +## Evidence Classes + +| Evidence class | Current evidence | Use in this decision | Claim boundary | +| --- | --- | --- | --- | +| Fixture-backed | `cargo make real-world-memory` reports 38 jobs across 11 suites with 36 pass and 2 blocked production-ops operator boundaries. | Shows the real-world benchmark contract is encoded and ELF fixture behavior is strong outside operator-owned gates. | Fixture scoring is not the same as live service execution. | +| Live adapter | `cargo make real-world-memory-live-adapters` produced full-suite ELF and qmd live sweeps with typed non-pass states preserved. | Confirms live adapters can materialize every encoded job record for ELF and qmd. | Not a full-suite live pass, not private-corpus proof, and not broad external superiority. | +| Private corpus | `baseline-production-private` failed closed at the missing manifest guard. | Accepted caveat for personal use when no operator-owned private manifest exists. | No private-corpus retrieval-quality pass is claimed. | +| Credentialed | Provider-backed ELF synthetic, stress, and backfill runs passed with `Qwen3-Embedding-8B`; provider-backed production-ops fixture jobs remain blocked without routed credentials. | Supports production-provider retrieval and backfill evidence while preserving credential boundaries. | No credentialed production-ops pass is claimed for paths that need unavailable operator credentials. | +| Blocked | Production-ops still contains private manifest and provider credential boundaries; gbrain lacks a proven Docker-local brain repo/database path. | These are explicit accepted caveats or research-gate blockers, not hidden failures. | Blocked states must remain typed until the missing operator or setup input exists. | +| Research gate | RAG/graph records contain setup, resource, retry, and evidence-output metadata plus XY-882 verdicts. | Gives concrete follow-up routing for the next adapter pack. | Research-gate records must not be counted as fixture-backed, live-baseline, or live-real-world pass evidence. | + +## Production Evidence + +The June 9 production adoption gate remains the production baseline: + +| Run | Scope | Result | +| --- | --- | --- | +| Production synthetic provider run | 8 documents, 6 queries, `Qwen3-Embedding-8B`, 4096-dimensional embeddings | `8/8` checks, `retrieval_pass`, `pass` in 59 seconds | +| Provider stress run | 480 generated public documents, 16 queries | `9/9` checks, `retrieval_pass`, `pass` in 779 seconds | +| Provider backfill run | 2,000 generated public documents, 16 queries | `9/9` checks, resume 1,000 -> 2,000, zero duplicate source notes, `pass` in 2,804 seconds | +| Single-user restore proof | Docker Compose backup/restore plus Qdrant rebuild | `rebuilt_count=1`, `missing_vector_count=0`, `error_count=0`, restored search result recovered | +| Private production corpus | Operator-owned manifest required | Failed closed before benchmark execution; no private-corpus pass claimed | + +This is enough for personal production use when the operator accepts the documented +private-corpus and credential boundaries. It is not enough for a deployment that +requires private-corpus quality proof before launch. + +## Live Sweep Evidence + +The full live real-world sweep is useful precisely because it does not flatten typed +outcomes into an artificial win. + +| Adapter | Jobs | Pass | Wrong result | Incomplete | Blocked | Not encoded | Evidence recall | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live real-world service adapter | 38 | 18 | 5 | 1 | 2 | 12 | 41/75 | +| qmd live real-world CLI adapter | 38 | 18 | 5 | 1 | 2 | 12 | 41/75 | + +Both adapters pass the targeted `work_resume`, `project_decisions`, and `retrieval` +suites. Both fail or skip the same broader areas that need more adapter behavior: +current-versus-historical conflict evidence, consolidation proposal generation, +derived knowledge pages, full operator trace hydration, capture/write-policy +integration, and credential/private production operations. + +The adoption impact is bounded: ELF has enough production and recovery evidence for +single-user use, but not enough full-suite live evidence to claim broad real-world +memory parity. + +## RAG And Graph Gates + +XY-882 made the RAG/graph research gates decision-ready: + +| Project | Verdict | Follow-up | +| --- | --- | --- | +| RAGFlow | `adapter_candidate` | [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter) | +| LightRAG | `adapter_candidate` | [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter) | +| GraphRAG | `adapter_candidate` | [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter) | +| Graphiti/Zep | `adapter_candidate` | [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter) | +| graphify | `adapter_candidate` | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) | +| Letta | `research_only` | No implementation issue until a contained evidence export path is selected. | +| LangGraph | `research_only` | No implementation issue; keep as checkpoint/replay reference. | +| nanograph | `research_only` | No implementation issue; keep as graph-lite DX reference. | +| llm-wiki | `research_only` | No implementation issue until a contained plugin or instruction harness exists. | +| gbrain | `blocked` | No implementation issue until a Docker-local brain repo and database path is proven. | + +These follow-ups are concrete adapter-work routing, not production blockers for ELF +personal use. + +## Accepted Caveats And Follow-Ups + +| Gap | Classification | Disposition | +| --- | --- | --- | +| Private production corpus quality | Accepted caveat | Rerun `cargo make baseline-production-private` or `cargo make baseline-production-private-addendum` when an operator-owned sanitized manifest is available. | +| Credentialed production-ops proof | Accepted caveat | Keep typed `blocked` until routed provider credentials are supplied for the specific production-ops gate. | +| Full-suite live real-world pass | Accepted caveat | Current live sweep is intentionally non-pass; use it to target future adapter coverage rather than to block personal production use. | +| OpenViking evidence-bearing retrieval output | Accepted caveat | Setup is no longer the primary blocker; future work should improve same-corpus evidence output before treating OpenViking as a strong runnable context-trajectory baseline. | +| RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify live adapter evidence | Concrete follow-ups | Use XY-885 through XY-889 and require Docker-contained runs with evidence-linked outputs before any live pass claim. | +| Letta, LangGraph, nanograph, and llm-wiki executable adapter coverage | Accepted research-only caveat | Keep as design references until a contained output contract is selected. | +| gbrain contained setup | Concrete blocker | Revisit only after Docker-local repository/database setup proof exists. | + +## Current Adoption Statement + +ELF is ready to use personally in production with bounded caveats. Use it when the +operator accepts the checked-in single-user production runbook, backup/restore proof, +provider-backed synthetic/stress/backfill evidence, and explicit private-corpus and +credential boundaries. + +Do not claim that ELF has passed a private production corpus, credentialed +production-ops gate, full-suite live real-world parity, or RAG/graph adapter parity. diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md new file mode 100644 index 00000000..2868b4b8 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md @@ -0,0 +1,215 @@ +# Real-World Comparison Report - June 10, 2026 + +Goal: Publish the post-P1 real-world agent memory benchmark evidence and adoption +implications. +Read this when: You need the checked-in evidence behind README-level real-world +benchmark claims after XY-833 and XY-861 through XY-864 landed. +Inputs: Generated reports under `tmp/real-world-memory/` and `tmp/real-world-job/`, +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, +and the live-baseline reports linked from this guide. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and +`docs/guide/benchmarking/live_baseline_benchmark.md`. +Verification: The original commands listed below were run from branch `y/elf-xy-865`. +XY-881 refreshed `cargo make real-world-memory`, `cargo make real-world-memory-production-ops`, +and `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker` from branch +`y/elf-xy-881`. Tables below include that refresh where the OpenViking cold-start +dependency boundary is discussed. + +Postscript: XY-880 superseded the live-adapter state in this report for ELF and qmd. +The successor evidence is +`docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`: ELF and qmd now +emit full-suite live sweep records, but neither has a full-suite live pass. + +## Context + +Dependency batch state at report time: + +| Issue | Result | PR | +| --- | --- | --- | +| XY-833 operator-debugging UX repair | Done | `https://github.com/hack-ink/ELF/pull/147` | +| XY-861 project-decision suite | Done | `https://github.com/hack-ink/ELF/pull/151` | +| XY-862 production-ops suite | Done | `https://github.com/hack-ink/ELF/pull/148` | +| XY-863 graph temporal validity | Done | `https://github.com/hack-ink/ELF/pull/150` | +| XY-864 external adapter comparison contract | Done | `https://github.com/hack-ink/ELF/pull/149` | + +This report is for the XY-865 branch `y/elf-xy-865` and PR title +`XY-865: [ELF benchmark vNext P1] Publish real-world comparison report and adoption plan`. + +No private-corpus or credentialed provider checks were run for this report because no +operator-owned private manifest or routed provider credentials were supplied. Those +paths remain typed `blocked` boundaries, not passes. + +## Commands + +| Command | Generated artifact | Run ID | Generated at | +| --- | --- | --- | --- | +| `cargo make real-world-memory` | `tmp/real-world-memory/real-world-memory-report.{json,md}` | `real-world-memory` | `2026-06-10T08:47:44.086502Z` | +| `cargo make real-world-memory-project-decisions` | `tmp/real-world-memory/project-decisions/report.{json,md}` | `real-world-memory-project-decisions` | `2026-06-10T04:21:52.403238Z` | +| `cargo make real-world-memory-production-ops` | `tmp/real-world-memory/production-ops-report.{json,md}` | `real-world-memory-production-ops` | `2026-06-10T08:47:18.205778Z` | +| `cargo make real-world-memory-evolution` | `tmp/real-world-memory/evolution-report.{json,md}` | `real-world-memory-evolution` | `2026-06-10T04:22:06.325152Z` | +| `cargo make real-world-job-operator-ux` | `tmp/real-world-job/real-world-job-operator-ux-report.{json,md}` | `real-world-job-operator-ux` | `2026-06-10T04:22:12.28938Z` | + +The refreshed real-world-memory reports used runner version +`0.2.0-a8b25d00880bd3cf04707c3b2b328cd20a585396-aarch64-apple-darwin`. + +## Aggregate Result + +`cargo make real-world-memory` now reports `38` jobs across all `11` encoded real-world +suites: + +| Metric | Value | +| --- | ---: | +| Pass | `36` | +| Incomplete | `0` | +| Blocked | `2` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Not encoded | `0` | +| Unsupported claim | `0` | +| Mean score | `0.947` | +| Evidence coverage | `84/84` (`1.000`) | +| Source-ref coverage | `84/84` (`1.000`) | +| Quote coverage | `84/84` (`1.000`) | +| Expected evidence recall | `77/77` (`1.000`) | +| Redaction leaks | `0` | +| Scope violations | `0` | +| Temporal validity gaps | `0` | +| Qdrant rebuild cases | `2/2` pass | + +Suite-level outcomes: + +| Suite | Jobs | Status | Mean score | Interpretation | +| --- | ---: | --- | ---: | --- | +| `trust_source_of_truth` | 1 | `pass` | `1.000` | Source-of-truth rebuild fixture passed. | +| `work_resume` | 5 | `pass` | `1.000` | Resume and exact next-action fixtures passed. | +| `project_decisions` | 5 | `pass` | `1.000` | Current decisions, reversals, rationale, and caveats passed. | +| `retrieval` | 5 | `pass` | `1.000` | Retrieval fixtures with distractors and obsolete context passed. | +| `memory_evolution` | 6 | `pass` | `1.000` | Current-vs-historical and temporal relation validity passed. | +| `consolidation` | 4 | `pass` | `1.000` | Proposal-only consolidation passed with `0` source mutations. | +| `knowledge_compilation` | 2 | `pass` | `1.000` | Derived page fixtures passed with citation/rebuild checks. | +| `operator_debugging_ux` | 1 | `pass` | `1.000` | Aggregate stage-attribution fixture passed. | +| `capture_integration` | 2 | `pass` | `1.000` | Redaction and capture-boundary fixtures passed. | +| `production_ops` | 6 | `blocked` | `0.667` | Four jobs passed, including the pinned OpenViking cold-start classification, and two operator-owned boundaries remain `blocked`. | +| `personalization` | 1 | `pass` | `1.000` | Scoped preference correction passed. | + +## Focused P1 Slices + +| Command | Jobs | Status summary | Evidence notes | +| --- | ---: | --- | --- | +| `cargo make real-world-memory-project-decisions` | 5 | `5` pass | Current decision, historical/reversed decision, validation gate, tradeoff rationale, and private-manifest caveat all passed. | +| `cargo make real-world-memory-evolution` | 5 | `5` pass | Temporal relation validity is now encoded and passing; stale answers `0`, conflict detections `5`, update rationales `5`. | +| `cargo make real-world-job-operator-ux` | 5 | `5` pass | Dropped evidence, rerank promotion, provider latency, rebuild change, and misleading relation-context debug cases passed with raw SQL needed `0`. | +| `cargo make real-world-memory-production-ops` | 6 | `4` pass, `0` incomplete, `2` blocked | Restore/Qdrant rebuild, interrupted backfill resume, resource envelope, and pinned OpenViking cold-start classification passed; provider credentials and private manifest remain typed non-pass boundaries. | + +## External Adapter Evidence + +The real-world runner loads +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +That manifest is an evidence ledger, not a leaderboard. It keeps four evidence classes +separate: + +| Evidence class | Count | Meaning | +| --- | ---: | --- | +| `fixture_backed` | 1 | ELF fixture scoring through checked-in real-world jobs. | +| `live_baseline_only` | 6 | Docker same-corpus/lifecycle evidence from the live-baseline runner only. | +| `live_real_world` | 2 | ELF and qmd adapters execute the full encoded-suite `real_world_job` sweep with typed non-pass states preserved. | +| `research_gate` | 12 | Source/setup/runtime/resource/retry metadata for future adapter paths; not fixture-backed or live execution evidence. | + +XY-882 added D1/D2 feasibility verdicts inside the research-gate lane. RAGFlow +([XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter)), +LightRAG +([XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter)), +GraphRAG +([XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter)), +Graphiti/Zep +([XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter)), +and graphify +([XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter)) +are now adapter implementation candidates because they have scoped Docker boundaries +and evidence-linked output contracts. Letta, LangGraph, nanograph, and llm-wiki remain +`research_only`; gbrain remains `blocked` until a Docker-local brain repo and database +path is proven. These verdicts do not change any record into live adapter pass +evidence. + +Adapter-level status after refreshing the manifest: + +| Project | Evidence class | Overall status | What is proven | What is not proven | +| --- | --- | --- | --- | --- | +| ELF | `fixture_backed` | `blocked` | Fixture-backed real-world scoring passes every non-operator-owned suite and preserves the production-ops credential/private-manifest boundaries. | Fixture-backed scoring is not live-service behavior; cite `elf_live_real_world` for service-runtime sweep evidence. | +| ELF | `live_real_world` | `wrong_result` | The Docker live sweep materializes all encoded real_world_job records through ElfService, worker indexing, and search_raw; the original targeted answer-retrieval slice still passes. | This is not a full-suite live pass or private-corpus proof; typed wrong_result, incomplete, blocked, and not_encoded states remain visible. | +| qmd | `live_baseline_only` | `pass` | Docker same-corpus retrieval, update, delete, and cold-start live-baseline checks pass. | Same-corpus checks are not real-world job scoring; cite `qmd_live_real_world` for service-runtime sweep evidence. | +| qmd | `live_real_world` | `wrong_result` | The Docker live sweep indexes the encoded real_world_job corpora through qmd collection add/update/embed/query and preserves per-suite scoring evidence. | This is not a full-suite live pass or broad RAG/graph adapter coverage; typed wrong_result, incomplete, blocked, and not_encoded states remain visible. | +| agentmemory | `live_baseline_only` | `lifecycle_fail` | Same-corpus retrieval can run through current adapter. | Durable storage/cold-start lifecycle and real-world suites are blocked by the current in-memory adapter path. | +| mem0/OpenMemory | `live_baseline_only` | `wrong_result` | Local OSS setup is represented separately from hosted/OpenMemory claims. | Same-corpus retrieval was not a clean pass and no real-world job adapter is encoded. | +| memsearch | `live_baseline_only` | `wrong_result` | Markdown-first design remains a source-of-truth ergonomics reference. | Same-corpus retrieval was not a clean pass and real-world suites are incomplete/not encoded. | +| OpenViking | `live_baseline_only` | `wrong_result` | The Docker local-embedding setup is pinned and reaches `add_resource`/`find`. | The same-corpus smoke still misses expected evidence terms; no real-world job adapter or context-trajectory suite is claimed. | +| claude-mem | `live_baseline_only` | `wrong_result` | Progressive disclosure and local viewer remain UX references. | Current Docker evidence is not a clean same-corpus pass and progressive disclosure jobs are not encoded. | +| qmd deep profile | `research_gate` | `not_encoded` | The stress-profile command path and source metadata are recorded for a future deeper retrieval-debug run. | No expanded qmd stress artifact or broader real-world suite pass is checked in. | +| OpenViking deep profile | `research_gate` | `not_encoded` | The deeper context-trajectory gate can reuse the pinned Docker local-embedding setup path. | No hierarchical trajectory suite result is claimed until evidence-bearing same-corpus output is fixed. | +| RAGFlow, LightRAG, GraphRAG | `research_gate` | `blocked` | Official sources, setup/resource/retry expectations, and XY-882 adapter-candidate verdicts are recorded. | Docker runtime proof and real_world_job evidence-output mapping are still required before any live adapter claim. | +| Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify | `research_gate` | `not_encoded` | XY-882 records Graphiti/Zep and graphify as adapter candidates, Letta/LangGraph/nanograph/llm-wiki as research-only, and gbrain as blocked. | No Docker-isolated `real_world_job` adapter has run for these projects. | + +External summary counters: `21` adapter records, `19` non-ELF adapter records, +`21` Docker-default, `0` host-global-install requirements, `2` live real-world +adapters, and `12` research-gate records. Overall adapter statuses are `1` pass, +`6` wrong_result, `1` lifecycle_fail, `0` incomplete, `4` blocked, and +`9` not_encoded. +Real-world suite statuses are tracked separately as `20` pass, `3` wrong_result, +`7` incomplete, `11` blocked, and `40` not_encoded, so a setup boundary is not hidden +behind an aggregate status. + +## Remaining Gaps + +Every remaining non-pass state is either a follow-up or an explicit non-goal for this +report: + +| Gap | Status | Follow-up or non-goal | +| --- | --- | --- | +| ELF production-ops cold-start dependency fixture | `pass` | XY-881 pins the Docker OpenViking local embedding path and preserves setup failures as `incomplete` if the wheel/import boundary fails on another platform. | +| ELF provider-backed production-ops gate | `blocked` | Run only with routed operator credentials; credentials were not supplied for this report. | +| ELF private production corpus | `blocked` | Supply an operator-owned sanitized private manifest; private-corpus checks were a non-goal without that manifest. | +| Full ELF live-service real-world sweep | `wrong_result` | XY-880 expanded `elf_live_real_world` to the full encoded suite corpus; the result is intentionally typed non-pass rather than a full-suite live pass. | +| Full qmd real-world job sweep | `wrong_result` | XY-880 expanded `qmd_live_real_world` to the full encoded suite corpus; the result is intentionally typed non-pass rather than broad real-world suite parity. | +| agentmemory durable lifecycle | `lifecycle_fail` / `blocked` | `[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed`. | +| mem0/OpenMemory same-corpus and real-world coverage | `wrong_result` / `not_encoded` | Add/fix a local OSS adapter before claiming lifecycle, personalization, or OpenMemory UI parity. | +| memsearch same-corpus and real-world coverage | `wrong_result` / `incomplete` | Fix Docker same-corpus retrieval/reindex evidence before scoring Markdown-first real-world jobs. | +| OpenViking Docker local embedding path | `wrong_result` | The pinned dependency path reaches `add_resource`/`find`; the remaining follow-up is evidence-bearing retrieval output, not setup. | +| claude-mem durable/progressive-disclosure adapter | `wrong_result` / `not_encoded` | Add durable local repository and progressive-disclosure job coverage before UX parity claims. | +| RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify adapters | `research_gate` adapter candidates | Follow-up issues [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter), [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), and [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) must run only Docker-contained adapter smokes that emit evidence-linked outputs before any live result claim. | +| Letta, LangGraph, nanograph, and llm-wiki adapters | `research_only` research gates | Keep as architecture or workflow references until a contained output contract is selected. | +| gbrain adapter | `blocked` research gate | Revisit only after a Docker-local brain repo and database path can be proven without operator-owned state. | + +## Adoption Implications + +What ELF is better at in the current evidence: + +- Evidence-bound writes, deterministic ingestion boundaries, source-of-truth discipline, + rebuildable Qdrant indexing, scoped service APIs, and audited fixture-backed real-world + provenance are stronger than the currently tested alternatives. +- The P1 fixture batch removed the previous real-world `wrong_result` and `not_encoded` + aggregate gaps for project decisions, temporal relation validity, and operator + debugging UX. + +Where ELF is comparable or still being tested: + +- qmd remains the strongest local retrieval-debug baseline. It passes current + live-baseline checks and now has targeted live real-world job evidence, while ELF has + the stronger evidence/provenance service contract. +- The fixture-backed retrieval and memory-evolution suites pass, but this is not the + same as proving every external project on the same real-world jobs. + +Where ELF is behind or not yet proven: + +- Only ELF and qmd have targeted live real-world adapter evidence; no external project + has full-suite live real-world parity yet. +- Production-ops is intentionally not a full pass because credentialed and private + corpus checks need operator-owned inputs. +- ELF still needs to absorb external strengths: qmd-style local debug knobs, + agentmemory/claude-mem/OpenMemory-style continuity and viewer ergonomics, + OpenViking-style context trajectory, mem0-style entity history, and memsearch-style + canonical local-store ergonomics. + +The current adoption statement is therefore: ELF is the best-supported foundation in +this repository for high-trust evidence-linked agent memory, but this report does not +claim overall external superiority or private-corpus production proof. diff --git a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md new file mode 100644 index 00000000..185ab65b --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md @@ -0,0 +1,75 @@ +# Capture/Write-Policy Live Report - June 11, 2026 + +Goal: Record the XY-933 live capture/write-policy evidence and competitor claim +boundaries. +Read this when: You need to know whether ELF has live evidence for capture redaction, +exclusions, source ids, evidence binding, and no secret leakage. +Inputs: `cargo make real-world-memory`, `cargo make real-world-memory-live-adapters`, +`apps/elf-eval/fixtures/real_world_memory/capture_integration/`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Outputs: Scenario-level capture results, live artifacts, and typed blocker reasons for +agentmemory and claude-mem capture breadth. + +## Verdict + +ELF now has live capture/write-policy self-check evidence. The ELF live service adapter +passes all 4 `capture_integration` jobs with zero redaction leaks and full required +evidence/source-ref/quote coverage. + +This is not a broad capture-hook superiority claim. ELF has a live self-check for the +currently encoded capture/write-policy suite, while qmd keeps those jobs typed +`not_encoded`; that makes qmd untested on this surface, not an ELF-over-qmd win. +Against agentmemory and claude-mem capture breadth, the comparison is still blocked +or untested because no durable local adapter evidence exists for their hook/viewer +capture paths. + +## Fresh Runs + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-memory` | pass | `tmp/real-world-memory/real-world-memory-report.json` | +| `cargo make real-world-memory-live-adapters` | pass | `tmp/real-world-memory/live-adapters/summary.json` | + +## ELF Capture Results + +| Job | Live status | Evidence coverage | Source-ref coverage | Redaction leaks | Capture evidence | +| --- | --- | ---: | ---: | ---: | --- | +| `capture-redaction-exclusion-001` | `pass` | `2/2` | `2/2` | `0` | Stores public decision and write-policy audit; excludes private text. | +| `capture-source-id-binding-001` | `pass` | `2/2` | `2/2` | `0` | Preserves `capture:issue-comment-42` and `capture:command-log-7`. | +| `capture-write-policy-redaction-001` | `pass` | `2/2` | `2/2` | `0` | Applies one write-policy redaction and preserves `capture:terminal-log-17`. | +| `capture-integration-boundaries-001` | `pass` | `4/4` | `4/4` | `0` | Preserves the no-live boundary for external hooks and viewer flows. | + +The ELF materialization artifact records: + +- stored evidence ids for captured public items; +- excluded evidence ids for private or trap inputs; +- runtime `source_ref` metadata returned by search, including copied source ids; +- write-policy audit, exclusion, and redaction counts; +- generated answers that contain no redaction trap text. + +## Comparison Boundary + +| Compared target | Position | Reason | +| --- | --- | --- | +| qmd live real-world adapter | `untested` | ELF executes and passes 4/4 live capture jobs; qmd keeps the same jobs typed `not_encoded`, so this remains an ELF self-check rather than a qmd comparison result. | +| agentmemory capture hooks | `blocked` | The current Docker baseline uses a process-local StateKV Map and in-memory index. No durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound output. | +| claude-mem capture/viewer flows | `blocked` | The checked evidence exercises repository storage, lifecycle, progressive disclosure, and same-corpus retrieval only. Hooks, timeline, observations, viewer capture, and automatic capture review need a Docker-contained hook/viewer runner before scoring. | + +## Claims Allowed + +- ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, + evidence binding, and no secret leakage. +- qmd remains `not_encoded` for capture/write-policy jobs in the full live sweep. +- agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a + durable local capture artifact. +- claude-mem capture breadth is blocked until a Docker-contained hook/viewer capture + runner exists. + +## Claims Not Allowed + +- Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth. +- Do not use host-global hooks as benchmark evidence. +- Do not weaken ELF write-policy, redaction, or evidence-binding constraints for + benchmark convenience. +- Do not convert fixture-backed or live-baseline-only capture references into a live + real-world competitor pass. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md new file mode 100644 index 00000000..12aeeb01 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -0,0 +1,185 @@ +# Competitor-Strength Adoption Report - June 11, 2026 + +Goal: Publish the final benchmark vNext adoption decision and scenario matrix for +ELF against tracked open-source memory, RAG, graph, and agent-continuity projects. +Read this when: You need the current production-adoption answer, the scenario-level +win/tie/loss/not-tested matrix, or the optimization queue behind future ELF work. +Inputs: `2026-06-11-measurement-coverage-audit.md`, +`2026-06-11-first-generation-oss-adapter-promotion-report.md`, +`2026-06-11-qmd-openviking-strength-profile-report.md`, +`2026-06-11-temporal-history-competitor-gap-report.md`, +`2026-06-11-graph-rag-scored-smoke-adapter-report.md`, +`2026-06-11-mem0-openmemory-history-ui-export-report.md`, +`2026-06-11-first-generation-oss-continuity-source-store-report.md`, and +`2026-06-10-production-adoption-refresh.md`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md` and the current +external adapter manifest. +Outputs: Adoption decision, evidence-class boundaries, scenario matrix, follow-up +optimization queue, and the machine-readable companion file +`docs/research/2026-06-11-competitor-strength-adoption-report.json`. + +## Adoption Decision + +ELF is adoptable for bounded personal production use. + +The verdict is `adopt_with_bounded_caveats`, not broad competitor superiority. The +supporting evidence is strongest where ELF was designed to be strong: source-of-truth +discipline, evidence-bound writes, rebuildable Qdrant derivations, backup/restore, +backfill, and typed benchmark reporting. Those properties are stronger than the +measured alternatives in the current evidence set. + +The remaining caveats are material: + +- Full-suite live real-world pass parity is not proven. +- Live temporal reconciliation is still a measured loss: five of six + `memory_evolution` jobs are `wrong_result`. +- Private-corpus production quality is blocked until an operator-owned manifest + exists. +- Credentialed provider production-ops gates are blocked until explicit provider + setup exists. +- Several competitor strengths remain `not_tested` or blocked: OpenMemory + UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform + behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival + memory, and broad graph/RAG navigation remain unproven. XY-929 adds a + representative graph/RAG fixture slice with typed blockers, one incomplete LightRAG + job, and one graphify wrong_result job, but it does not create any broad graph/RAG + win, tie, or loss claim. XY-928 encodes OpenViking staged trajectory, hierarchy + selection, and recursive/context expansion as blocked fixtures + behind same-corpus evidence output and missing staged artifacts. XY-927 adds + fixture-only `core_archival_memory` coverage, but Letta scenario rows remain + blocked or `not_tested` until the selected contained export/readback path exists. + mem0 local OSS preference history is measured separately and is an ELF loss on the + current correction history + scenario. The XY-923 follow-up also scores qmd's immediate top-10/replay artifact + ergonomics as stronger than ELF's default stress report, while expansion, fusion, + and rerank remain untested. XY-932 adds a narrow live operator-debug slice where + ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory + UI/export remains blocked and claude-mem viewer workflows remain blocked until + Docker-contained hook/viewer evidence exists. XY-925 + now adds fixture-backed first-generation OSS prompt coverage and typed blockers for + agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and + claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator + surfaces; those rows still do not create live external real-world suite passes. + XY-933 adds an ELF live capture/write-policy self-check, but agentmemory capture + breadth is blocked by mocked/in-memory storage and claude-mem hook/viewer capture + remains blocked until Docker-contained hook/viewer evidence exists. + +## Evidence Classes + +This report keeps evidence classes separate. Do not convert fixture passes, +same-corpus smokes, research gates, blocked setup, unsupported shapes, wrong +results, or lifecycle failures into one aggregate leaderboard. + +| Evidence class | Meaning | +| --- | --- | +| `fixture_backed` | Checked-in real-world fixtures pass through the benchmark runner. | +| `live_baseline_only` | Docker same-corpus or lifecycle checks ran, but not full real-world jobs. | +| `live_real_world` | A runtime or CLI adapter produced scored real-world job records. | +| `smoke_only` | A tiny setup or output-shape smoke ran. | +| `research_gate` | Source/setup/resource/output-contract evidence exists only as research. | +| `blocked` | A credential, private input, provider, or setup boundary is missing. | +| `incomplete` | Setup reached a partial adapter path but did not reach the behavioral scoring surface. | +| `unsupported` | The project shape is not comparable for the scenario. | +| `not_encoded` | The benchmark does not yet cover the scenario. | +| `wrong_result` | The system ran but produced the wrong memory answer or evidence. | +| `lifecycle_fail` | Update/delete/reload/persistence behavior failed. | + +## Source Artifacts + +| Command or run | Artifact | Supported claim | +| --- | --- | --- | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952, XY-953, and XY-954 fixture updates | ELF fixture aggregate covers 60 jobs across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs, 1 passing `memory_summary` source-trace job, 4 passing `proactive_brief` suggestion jobs plus 1 private-corpus blocker, and 4 passing `scheduled_memory` task-readback jobs plus 1 private/provider scheduler blocker. | +| `cargo make real-world-memory-scheduled` | `tmp/real-world-memory/scheduled/report.json` and `2026-06-16-scheduled-memory-task-scoring-report.md` | The scheduled-memory fixture scores weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and private/provider scheduler blocker scenarios with evidence refs, freshness/currentness markers, action rationale, execution trace/readback, source-mutation guards, and stale/tombstone guards; this is fixture-backed contract evidence, not hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus parity. | +| `cargo make real-world-memory-summary` | `tmp/real-world-memory/memory-summary/report.json` | The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity. | +| `cargo make real-world-memory-proactive-brief` | `tmp/real-world-memory/proactive-brief/report.json` and `2026-06-16-proactive-brief-scoring-report.md` | The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity. | +| `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | +| `cargo make real-world-memory-live-adapters` | `2026-06-11-measurement-coverage-audit.md` | ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs. | +| `cargo make real-world-memory-live-adapters` | `2026-06-11-capture-write-policy-live-report.md` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists. | +| `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/summary.json` | The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance. | +| `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `2026-06-11-first-generation-oss-adapter-promotion-report.md` | mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result. | +| `cargo make real-world-first-generation-oss` | `2026-06-11-first-generation-oss-continuity-source-store-report.md` | First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes. | +| `cargo make openmemory-ui-export-readback` | `2026-06-11-mem0-openmemory-history-ui-export-report.md` | mem0 local OSS passes preference correction history, entity-scoped personalization, local `get_all` export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`, and hosted Platform export remains non-goal. | +| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal` | `2026-06-11-temporal-history-competitor-gap-report.md` | Graphiti/Zep temporal smoke remains blocked by `provider_api_key_missing`. | +| `cargo make smoke-graphify-docker-graph-report` | `2026-06-11-graph-rag-scored-smoke-adapter-report.md` | graphify reaches tiny Docker graph/report scoring but remains wrong_result. | +| `cargo make real-world-memory-graph-rag` | `tmp/real-world-memory/graph-rag/report.json` | Representative graph/RAG fixtures produce typed non-pass reports: RAGFlow, GraphRAG, and Graphiti/Zep blocked; LightRAG incomplete with comparison blocked; graphify wrong_result; llm-wiki not_tested; gbrain blocked; private/hosted profiles non_goal. | +| `cargo make baseline-production-synthetic`, `cargo make baseline-backfill-docker`, backup/restore, Qdrant rebuild proof | `2026-06-10-production-adoption-refresh.md` | ELF has provider synthetic, stress, backfill, restore, and rebuild evidence; private-corpus proof is blocked by missing operator-owned manifest. | +| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` plus ELF trace-bundle and qmd CLI replay commands | `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md` | Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact. | + +## Scenario Matrix + +| Scenario | ELF outcome | Evidence classes | Measured claim | Follow-up | +| --- | --- | --- | --- | --- | +| Source-of-truth rebuild and evidence-bound writes | `win` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust-source jobs pass, and production restore/rebuild proof exists. | None | +| Work resume and coding-agent continuity | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF and qmd both pass encoded live `work_resume` jobs. XY-925 selects agentmemory's next durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains `not_encoded`, and OpenViking continuity trajectory remains `blocked`. | XY-928 | +| Project decisions and reversals | `tie` | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF and qmd both pass encoded `project_decisions` jobs. The ELF `core_archival_memory` fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence. | XY-927 | +| Retrieval quality | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only` | ELF and qmd both pass encoded live retrieval and stress/same-corpus retrieval evidence. | XY-923 | +| Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | +| Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | +| Consolidation/proposal review | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF fixture consolidation passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors. | XY-934 | +| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 | +| Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 | +| Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | +| Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | +| Private corpus and provider boundaries | `blocked` | `blocked` | Private production profile fails closed without an operator-owned manifest; provider-backed production-ops gates require explicit credentials. | XY-930 | +| Personalization and scoped preferences | `tie` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `not_encoded` | ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss. | XY-927 | +| Context trajectory and hierarchical retrieval | `not_tested` | `fixture_backed`, `live_baseline_only`, `research_gate`, `wrong_result`, `blocked` | OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons. | XY-928 | +| Core-vs-archival memory | `blocked` | `fixture_backed`, `research_gate`, `blocked`, `not_encoded` | ELF now has 6 fixture-backed `core_archival_memory` jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not tested until its contained export/readback artifact maps core and archival source ids. | XY-927 | +| Graph/RAG navigation and citations | `not_tested` | `smoke_only`, `research_gate`, `blocked`, `incomplete`, `wrong_result`, `not_encoded` | `cargo make real-world-memory-graph-rag` adds representative citation, graph-summary, temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures. The slice is typed non-pass: RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with comparison blocked; graphify is wrong_result; llm-wiki is not_tested; gbrain is blocked. Broad graph/RAG navigation and citation quality remain not_tested. | XY-929 | + +## Follow-Up Queue + +| Issue | Priority | State | Gap | +| --- | --- | --- | --- | +| XY-905 | P0 | Backlog | Live temporal reconciliation answer and trace contract. | +| XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | +| XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | +| XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. | +| XY-926 | P1 | Partial live suites encoded | ELF live knowledge-page scoring is encoded; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. | +| XY-934 | P1 | ELF live self-check encoded | Live consolidation proposal scoring is encoded for ELF with lineage, confidence/usefulness, unsupported-claim flags, and review-action audit; direct competitor runners remain untested or product-reference only. | +| XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | +| XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | +| XY-928 | P1 | Encoded blocked fixtures | OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist. | +| XY-929 | P2 | Representative fixture slice encoded; live contracts still blocked or typed non-pass | Graph/RAG adapters now have representative citation/navigation/lint fixtures, but live evidence-linked output contracts are still blocked, incomplete, wrong_result, not_tested, or non_goal. | +| XY-930 | P1 | Backlog | Private-corpus and credentialed production gates after operator inputs exist. | +| XY-906 | Ops | Todo | Decodex registered-project review-config schema drift blocks Decodex loading of ELF. | + +## Allowed Claims + +- ELF is adoptable for bounded personal production use with caveats. +- ELF has the strongest measured source-of-truth, rebuild, restore, and backfill + evidence among the tracked systems. +- ELF ties qmd on encoded live retrieval, work-resume, project-decisions, and + personalization slices. +- ELF fixture-backed `core_archival_memory` coverage passes attachment, scope, + provenance, stale-core detection, archival fallback, and project-decision recovery + jobs separately from archival search. +- ELF has a narrow live operator-debug win over qmd for trace hydration, + candidate-drop visibility, and selected-but-not-narrated evidence, with + replay-command availability and repair-action clarity tied. +- ELF live capture/write-policy self-checks pass for redaction, exclusions, source + ids, evidence binding, and no secret leakage. +- ELF has a live temporal reconciliation loss against the benchmark expectation: + five memory-evolution jobs remain `wrong_result`. +- Most competitor strengths outside qmd retrieval are `not_tested`, `blocked`, + `incomplete`, `smoke_only`, or `research_gate`. + +## Claims Not Allowed + +- Do not claim ELF broadly beats qmd. +- Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system + or retrieval-quality win. +- Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted + behavior, or graph memory. The local OSS correction-history scenario is currently + an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted + behavior plus graph memory remain outside measured local OSS evidence. +- Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow + ELF/qmd operator-debug slice. +- Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the + current comparison is blocked for their hook/viewer capture paths. +- Do not claim ELF beats OpenViking on staged context trajectory. +- Do not claim ELF beats Letta on core-vs-archival memory. +- Do not claim graph/RAG parity from smoke-only or typed non-pass representative + evidence. +- Do not promote `fixture_backed`, `live_baseline_only`, `smoke_only`, + `research_gate`, `blocked`, `incomplete`, `wrong_result`, `lifecycle_fail`, + `unsupported`, or `not_encoded` states into a generic pass/fail score. diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md new file mode 100644 index 00000000..6402b188 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -0,0 +1,173 @@ +# Competitor-Strength Evidence Matrix - June 11, 2026 + +Goal: Define a durable competitor-strength matrix so ELF benchmark claims are tied to +measured evidence classes, typed blockers, and explicit next measurement gates. +Read this when: You need to decide whether ELF can claim a win, tie, loss, gap, or +non-claim against a tracked memory, RAG, or graph project. +Inputs: `docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md`, +`docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, +`docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`, +`docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md`, +`docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`, +`docs/guide/research/external_memory_improvement_plan.md`, +`docs/guide/research/research_projects_inventory.md`, +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, +and `Makefile.toml`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/live_baseline_benchmark.md`, and the current external adapter +manifest. +Outputs: Human-readable matrix, claim boundaries, scenario next-measurement gates, +and the machine-readable companion file +`docs/research/2026-06-11-xy-897-competitor-strength-matrix.json`. + +## Decision Boundary + +Do not claim that ELF beats, ties, or loses to a competitor unless the named scenario +is encoded and run at a comparable evidence class. + +Current boundary: + +- ELF and qmd have full-suite `live_real_world` sweeps, but neither has a full-suite + live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, + 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, + 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 60 jobs + across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or + OpenViking context-trajectory measurement gates. The `core_archival_memory` suite + contributes 6 fixture-only passes for ELF core-block behavior; it does not create + an ELF-over-Letta claim. The `memory_summary` suite contributes one fixture-backed + source-trace pass; it does not create managed-memory parity evidence. The + `proactive_brief` suite contributes four fixture-backed source-linked suggestion + passes and one private-corpus blocker; it does not create Pulse or hosted + managed-memory parity. The `scheduled_memory` suite contributes four fixture-backed + scheduled task readbacks plus one private/provider scheduler blocker; it does not + create hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed + private-corpus parity. This proves the fixture contract, not live-service parity. +- qmd is the strongest measured local retrieval-debug comparison, but the current + evidence still separates its same-corpus/live-retrieval strengths from the full-suite + live non-pass sweep. +- Most other projects are `live_baseline_only` or `research_gate`. They must not be + treated as beaten until a comparable scenario is encoded and run. +- Private-corpus and credentialed production-ops checks remain operator-owned + `blocked` states. + +## Current Ledger Summary + +The current manifest has 23 adapter records across 16 external projects plus ELF. +Evidence-class counts: 1 `fixture_backed`, 6 `live_baseline_only`, 5 +`live_real_world`, and 11 `research_gate`. Overall adapter-status counts: 4 `pass`, +6 `wrong_result`, 1 `lifecycle_fail`, 7 `blocked`, and 5 `not_encoded`. + +## State Taxonomy + +This report uses the benchmark's snake_case state names. Hyphenated prose names map +directly to these states: fixture-backed -> `fixture_backed`, +live-baseline -> `live_baseline_only`, live-real-world -> `live_real_world`, +research-gate -> `research_gate`, wrong-result -> `wrong_result`, +lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. + +| State | Meaning | Claim boundary | +| --- | --- | --- | +| `fixture_backed` | Checked-in real-world jobs or fixture responses are scored by the benchmark runner. | Useful for contract coverage, not live runtime proof. | +| `live_baseline_only` | Docker same-corpus or lifecycle checks ran, but no real-world job suite was scored for that project. | Cannot imply real-world job parity. | +| `live_real_world` | A runtime or CLI adapter materialized and scored real-world job records. | Can support scenario claims only for the encoded suite statuses. | +| `research_gate` | Source, setup, resource, retry, or output-contract metadata exists. | Follow-up routing only; not pass evidence. | +| `blocked` | Safe measurement needs unavailable credentials, private data, setup proof, or external dependency. | Keep typed until the missing input exists. | +| `unsupported` | Capability is outside the project shape or requires a non-comparable path. | Do not turn into a loss. | +| `wrong_result` | The system ran but missed expected memory, answer, or evidence terms. | Behavioral non-pass. | +| `lifecycle_fail` | Retrieval may work, but update/delete/reload/persistence/cold-start behavior fails. | Lifecycle non-pass, not a retrieval win. | +| `incomplete` | The run did not reach the behavioral check because setup or runtime failed. | Setup/runtime non-pass, not quality evidence. | +| `not_encoded` | The scenario is not currently covered. | No pass/fail claim is allowed. | + +## Project Matrix + +| Project | Strongest user-facing scenario | Current evidence | Measured status and proof | Unsupported or blocked status | Required benchmark before ELF claim | Borrow if stronger | +| --- | --- | --- | --- | --- | --- | --- | +| ELF | Evidence-linked source-of-truth memory service with real-world fixtures and live retrieval sweeps. | `live_real_world`; supporting `fixture_backed`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/elf-report.md`; live capture/write-policy suite passes 4/4 with zero redaction leaks. Narrow operator-debug pass: `cargo make real-world-job-operator-ux-live-adapters`, `tmp/real-world-job/operator-ux-live-adapters/elf-report.md`. Fixture contract: `cargo make real-world-memory`, `tmp/real-world-memory/real-world-memory-report.json`. | `blocked`: private manifest and provider credentials; broader live suites remain `wrong_result`, `blocked`, or `not_encoded`; the narrow operator-debug and live capture/write-policy slices now pass. | Full-suite live pass plus separate private-corpus, credentialed production-ops proof, and durable external capture-hook comparisons. | Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, agentmemory/claude-mem capture breadth, and graph/RAG navigation. | +| qmd | Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics. | `live_real_world`; supporting `live_baseline_only` and `research_gate`. | `wrong_result` full live sweep: `cargo make real-world-memory-live-adapters`, `tmp/real-world-memory/live-adapters/qmd-report.md`; targeted retrieval suites pass; the narrow operator-debug slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | `not_encoded`: deep profile and non-retrieval live behavior are not encoded; memory_evolution is `wrong_result`. | Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist. | Weighted fusion, rerank explanation, local debug knobs, and command-line replay. | +| agentmemory | Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle. | `live_baseline_only`. | `lifecycle_fail`: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. | `blocked`: durable cold-start, capture-hook persistence, and real-world adapter coverage are missing; current Docker baseline uses a process-local StateKV Map and in-memory index. | Durable local adapter with update, delete, cold-start reload, work_resume, capture/write-policy, and lifecycle-staleness jobs. | Cross-agent hooks, packaging, continuity scenarios, and viewer affordances. | +| mem0/OpenMemory | Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory. | `live_baseline_only`. | `pass`: fresh scoped run `cargo make openmemory-ui-export-readback`, `tmp/live-baseline/live-baseline-report.json`, with mem0 `8/8` local SDK checks passing; `blocked`: OpenMemory export-helper setup probe emits `tmp/live-baseline/mem0-openmemory-ui-export.json` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. | `blocked`: OpenMemory UI/export cannot be compared until a compose/import path loads the same corpus into the product app; `unsupported`: hosted Platform export; `not_encoded`: optional graph memory and real-world prompt adapter coverage. | Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK `get_all`; keep hosted Platform and graph memory opt-in/non-goal unless explicitly enabled. | Entity-scoped history, lifecycle surfaces, async update ergonomics, and OpenMemory inspection UX. | +| memsearch | Markdown-first canonical store with rebuildable local index and practical hybrid retrieval. | `live_baseline_only`; XY-925 `fixture_backed`. | `pass`: fresh scoped run `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`, with memsearch `4/4` local checks passing. XY-925 adds fixture-backed source-store and retrieval-debug prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. | `not_encoded`: no live memsearch runtime adapter executes real-world prompt scoring; memory-evolution prompt adapters remain not encoded; TTL/expiry is unsupported by the current CLI path. | Promote the fixture-backed source-store and retrieval-debug prompts into a live memsearch real-world adapter before any suite-level win/loss claim; keep TTL/expiry as unsupported unless a comparable path exists. | Canonical markdown store, local reindex clarity, and user-inspectable source files. | +| OpenViking | Filesystem-like context trajectory, hierarchical retrieval, and staged context loading. | `live_baseline_only`; supporting `fixture_backed` and `research_gate`. | `wrong_result`: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`; `blocked`: checked-in `context_trajectory` fixtures cover staged retrieval, hierarchy selection, and recursive/context expansion gates. | `blocked`: hierarchical context trajectory is encoded but blocked until same-corpus evidence ids match and staged artifacts are materialized. | Make evidence-bearing same-corpus output pass, then score staged trajectory and hierarchy expansion. | `viking://`-style context model, trajectory readback, and staged retrieval planning. | +| claude-mem | Progressive disclosure, automatic capture loop, repository-local lifecycle, and local viewer workflow. | `live_baseline_only`; XY-925 `fixture_backed`. | `wrong_result`: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`, `tmp/live-baseline/live-baseline-report.json`. XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts through `cargo make real-world-first-generation-oss`, `tmp/real-world-memory/first-generation-oss/report.json`. | `blocked`: hook capture and viewer/operator workflows still lack a Docker-contained runner; retrieval remains `wrong_result`, and the repair prompt lists rerun/inspection targets `tmp/live-baseline/claude-mem.log` and `tmp/live-baseline/claude-mem-checks.json`. | Promote durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, and progressive-disclosure prompts into a live claude-mem adapter before any broader UX claim. | Progressive disclosure, automatic capture review loops, and local viewer/operator comfort. | +| RAGFlow | Full RAG application workflow with document, chunk, and reference evidence handles. | `research_gate`. | `blocked`: `ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker`, `tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json`. | `blocked`: Docker resource envelope and adapter output mapping still need proof. | XY-885 tiny Docker evidence-smoke adapter mapping `reference.chunks` to scored evidence. | Document/chunk references, resource-envelope reporting, and RAG app evidence handles. | +| LightRAG | Lightweight graph/RAG context export with source file-path citation shape. | `research_gate`. | `blocked`: `ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context`, `tmp/real-world-memory/lightrag-context/summary.json`. | `blocked`: Docker service setup and context export are not proven. | XY-886 Docker context-export adapter with explicit provider config and source citation mapping. | Context-only query modes, graph-aware retrieval layout, and file-path citation readback. | +| GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker`, `tmp/real-world-memory/graphrag-smoke/summary.json`. | `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | +| Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. | `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | +| Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `blocked`: the selected comparison contract is a Docker-only benchmark-created agent export that returns core block JSON, archival search/readback JSON, and source ids; no materialized export exists yet. | `blocked`: no Letta materializer currently creates the benchmark agent, imports the ELF `core_archival_memory` fixture corpus, or exports comparable core and archival evidence. | Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists. | Core memory block ergonomics, archival separation, and shared operating context readback. | +| LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a standalone memory backend adapter. | Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. | Checkpoint replay, deterministic regression, and state-diff evaluation patterns. | +| nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | +| llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | +| gbrain | Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: Docker-local brain repo and database path are missing. | Prove Docker-local repository/database setup, then encode compiled_truth/timeline and operator-continuity jobs. | Compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation. | +| graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | Scored tiny `live_real_world` smoke; not broad graph-quality proof. | `wrong_result`: `cargo make smoke-graphify-docker-graph-report`, `tmp/real-world-memory/graphify-smoke/graphify-report.json`. | `not_encoded`: broad graph navigation, multimodal, private-corpus, and large-corpus quality remain outside the tiny smoke. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | + +## Scenario Matrix + +| Scenario | Current ELF evidence | Strongest competitor/reference | Current competitor evidence | Next measurement before claim | +| --- | --- | --- | --- | --- | +| Retrieval/debug | Fixture retrieval passes; live retrieval passes. | qmd. | qmd live retrieval passes and live baseline passes, but full-suite live status is `wrong_result`. | Run qmd deep profile and ELF/qmd trace-level replay with expansion, fusion, rerank, and candidate-drop diagnostics. | +| Work resume | Fixture and live work_resume pass. | agentmemory, claude-mem, OpenViking. | agentmemory `lifecycle_fail`; claude-mem work_resume remains `not_encoded` pending a durable repository-backed adapter; OpenViking work_resume is `not_encoded`. | Encode durable work_resume adapters or keep each blocked with lifecycle/setup evidence. | +| Project decisions | Fixture and live project_decisions pass; the ELF core-archival fixture also scores project-decision recovery through core routing plus archival rationale. | qmd, Letta. | qmd live project_decisions pass; Letta project-decision recovery is `research_gate` `not_tested` or `blocked` until the contained export path exists. | Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario. | +| Source-of-truth | Fixture and live trust_source_of_truth pass. | memsearch. | memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed. | Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim. | +| Temporal/current-vs-historical memory | Fixture memory_evolution passes; live memory_evolution is `wrong_result`. | Graphiti/Zep, mem0/OpenMemory. | Graphiti/Zep is `research_gate` `blocked`; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK `get_all` now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are `not_encoded`. | Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888. | +| Consolidation | Fixture consolidation passes; XY-934 adds ELF live service-backed proposal scoring with lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit. | managed dreaming, Always-On Memory Agent patterns, agentmemory, llm-wiki. | No direct live competitor runner emits comparable consolidation artifacts; qmd remains `not_encoded`. | Keep competitor comparisons reference-only until a contained runner emits source ids, confidence, unsupported-claim flags, and review-action audit artifacts. | +| Knowledge pages | Fixture knowledge_compilation passes; live knowledge_compilation is `not_encoded`. | llm-wiki, gbrain, GraphRAG, graphify. | llm-wiki and gbrain are `research_gate` `not_encoded` or `blocked`; GraphRAG is `blocked`; graphify has a tiny scored smoke `wrong_result`. | Encode live derived-page rebuild/lint scoring and run contained knowledge/RAG adapters only after setup proof. | +| Operator debugging | Fixture operator_debugging_ux passes, and the narrow live operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity. | qmd, claude-mem, OpenMemory. | qmd ties replay-command availability and repair-action clarity but is `wrong_result` for trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked. | Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | +| Capture/write policy | Fixture capture_integration passes; ELF live capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding. | agentmemory, claude-mem. | agentmemory and claude-mem hook capture remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist. | Run durable agentmemory and claude-mem capture-hook jobs proving redaction, exclusion, evidence binding, source ids, and no secret leakage. | +| Production ops | Fixture production_ops has 4 pass and 2 blocked; live production_ops is `blocked`; production adoption has provider/backfill/restore evidence. | ELF production gate, qmd, RAG/RAGFlow resource gates. | qmd live production_ops is `blocked`; RAG/resource gates are `research_gate` `blocked`. | Rerun private-corpus and credentialed gates only when operator-owned manifest and credentials exist. | +| Personalization | Fixture and live personalization pass. | mem0/OpenMemory, Letta. | mem0/OpenMemory local OSS entity-scoped personalization now passes, so scoped preference behavior is a measured tie; OpenMemory UI/export remains blocked, hosted Platform export is non-goal, optional graph memory remains outside local OSS scoring, and Letta personalization is `research_gate` `not_encoded`. | Add OpenMemory product app import/export and contained Letta scoped-preference readback before broader personalization superiority claims. | +| Context trajectory | ELF has trace direction but no comparable staged trajectory scenario. | OpenViking. | OpenViking setup is pinned, same-corpus retrieval is `wrong_result`, and staged/hierarchy/recursive trajectory jobs are encoded as `blocked`. | Make OpenViking evidence-bearing retrieval pass, then score staged context trajectory outputs. | +| Core-vs-archival memory | Fixture `core_archival_memory` passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. | Letta. | Letta is `research_gate` `blocked`/`not_tested` until the selected contained export/readback artifact exists. | Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present. | +| Graph/RAG navigation | ELF relation context is not enough to claim graph/RAG navigation parity. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain `research_gate` blocked/incomplete without explicit setup; graphify has only a tiny scored smoke `wrong_result`. | Run larger contained graph/RAG adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | + +## Parallelizable Benchmark Follow-Ups + +These workstreams can proceed after this matrix lands because the claim boundaries are +now explicit: + +| Workstream | Issue or candidate | Parallelizable | Blocked by | Measurement | +| --- | --- | --- | --- | --- | +| qmd deep retrieval/debug profile | New benchmark issue | yes | None after this matrix lands. | Stress profile plus trace-level retrieval-debug artifacts for qmd and ELF. | +| agentmemory durable lifecycle adapter | `[ELF benchmark P0] Make external adapters lifecycle-durable and fail-typed` | yes | Durable local adapter path selection. | Update, delete, cold-start reload, work_resume, and capture/write-policy jobs. | +| agentmemory/claude-mem capture-hook breadth | Follow-up after XY-933 | yes | Docker-contained hook/viewer capture path with durable artifacts. | Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting. | +| mem0/OpenMemory history and UI coverage | New adapter repair issue | yes | Comparable local OSS path for history/UI/readback evidence. | Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs. | +| memsearch source-of-truth live adapter coverage | New adapter repair issue | yes | Fixture-backed source-store and retrieval-debug prompts are encoded by XY-925; live prompt execution remains missing. | Runtime adapter execution for the existing source-of-truth rebuild/reload and retrieval-debug prompt jobs without converting baseline smoke into suite pass claims. | +| OpenViking context trajectory | XY-928 encoded blocked fixtures | yes | Evidence-bearing same-corpus retrieval output and staged artifacts. | Hierarchical expansion, staged trajectory, recursive/context expansion, and comparable ELF trace/session evidence jobs. | +| claude-mem hook/viewer runtime coverage | New adapter issue | yes | Fixture-backed progressive-disclosure and retrieval-repair prompts are encoded by XY-925; hook capture and viewer/operator workflows remain blocked. | Work resume, operator debugging, capture/write-policy, viewer/operator, and live progressive-disclosure adapter execution. | +| RAGFlow evidence smoke | XY-885 | yes | Resource envelope accepted for tiny Docker smoke. | `reference.chunks` to benchmark evidence mapping. | +| LightRAG context export | XY-886 | yes | Docker service setup and explicit provider config. | Retrieved context export and source file-path citations. | +| GraphRAG cost-bounded adapter | XY-887 | yes | Tiny corpus cost/resource envelope. | Document, text-unit, graph-summary, and citation output tables. | +| Graphiti/Zep temporal graph adapter | XY-888 | yes | Docker-local graph store setup. | Current/historical/future fact validity and evidence ids. | +| graphify graph report adapter | XY-889 plus post-XY-900 expansion | yes | Representative graph/RAG jobs beyond the tiny scored smoke. | `graph.json` and `GRAPH_REPORT` evidence mapped to scored graph navigation and knowledge synthesis ids. | +| Private corpus and credentialed production ops | Operator-owned benchmark gates | no | Sanitized private manifest and routed provider credentials. | Private-corpus retrieval quality and credentialed production-ops evidence. | +| Letta, LangGraph, nanograph, llm-wiki direct adapters | Letta export artifact blocked; others research-only until output contract | no | Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract. | Run only after comparable output exists; otherwise keep as product-reference evidence. | + +## Validation Contract + +Consistency checks for this report should verify: + +- The Markdown project matrix includes every project currently present in + `memory_projects_manifest.json`: ELF, qmd, agentmemory, mem0/OpenMemory, memsearch, + OpenViking, claude-mem, RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, + nanograph, llm-wiki, gbrain, and graphify. +- The machine-readable matrix has the same project set and includes every required + scenario id: `retrieval_debug`, `work_resume`, `project_decisions`, + `source_of_truth`, `temporal_current_historical`, `consolidation`, + `knowledge_pages`, `operator_debugging`, `capture_write_policy`, `production_ops`, + `personalization`, `context_trajectory`, `core_vs_archival_memory`, and + `graph_rag_navigation`. +- Evidence states remain typed. Do not collapse `research_gate`, `blocked`, + `unsupported`, `wrong_result`, `lifecycle_fail`, `incomplete`, or `not_encoded` + into pass/fail aggregates. + +## Claim Rules + +- A project can be called stronger only for a named scenario with comparable measured + evidence. +- `research_gate` plus setup metadata can justify a follow-up adapter issue, not a + product win. +- A blocked measurement is not a hidden loss. Keep the typed reason and rerun only when + the missing operator or setup input exists. +- If a project remains stronger on user-facing workflow but lacks comparable measured + evidence, record what ELF should borrow and add a benchmark gate before changing any + README-level claim. diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md new file mode 100644 index 00000000..7c03cb74 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -0,0 +1,313 @@ +# ELF Iteration Direction From Competitor Benchmarks - June 11, 2026 + +Goal: Convert the current benchmark evidence and competitor-strength matrix into an +iteration direction for ELF without overstating wins. +Read this when: You need to decide what ELF should learn from adjacent memory, +RAG, graph, and agent-continuity projects. +Inputs: `2026-06-11-competitor-strength-evidence-matrix.md`, +`2026-06-10-live-real-world-sweep-report.md`, +`2026-06-10-production-adoption-refresh.md`, +`2026-06-10-real-world-comparison-report.md`, +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, +and `docs/guide/research/external_memory_improvement_plan.md`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`. +Outputs: Current measured data, scenario gaps, and a prioritized optimization +direction for future ELF work. + +## Executive Judgment + +ELF is a credible personal-production foundation for a high-trust memory service, but +the current evidence does not prove broad superiority over all tracked projects. + +The strongest current statement is: + +- ELF is ahead on source-of-truth discipline, evidence-bound writes, rebuildable + derived indexes, typed failure reporting, and checked-in production-operation + evidence. +- ELF and qmd are tied on the encoded live retrieval, work-resume, and + project-decision slices. ELF does not yet beat qmd's local retrieval-debug + ergonomics, but ELF now has a narrow live operator-debug win over qmd on trace + hydration and candidate-drop visibility. +- Many competitor strengths are still undermeasured: OpenViking context trajectory, + mem0/OpenMemory entity history and UI, agentmemory and claude-mem capture breadth, + Letta core-vs-archival memory, Graphiti/Zep temporal graph behavior, and + llm-wiki/gbrain/graphify knowledge workflows. +- The right next strategy is not to replace ELF with any one project. It is to keep + ELF's evidence-bound core and absorb the best measured or plausible product + patterns behind benchmark gates. + +## Current Measured Data + +### Fixture-Backed ELF Aggregate + +`cargo make real-world-memory` currently reports: + +| Metric | Value | +| --- | ---: | +| Jobs | `55` | +| Encoded suites | `15` | +| Pass | `49` | +| Blocked | `6` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Incomplete | `0` | +| Not encoded | `0` | +| Unsupported claim | `0` | +| Mean score | `0.891` | +| Evidence coverage | `123/123` | +| Source-ref coverage | `123/123` | +| Quote coverage | `123/123` | +| Expected evidence recall | `115/115` | + +This proves the fixture contract is broad and well controlled. It does not prove that +every live adapter or every competitor runtime passes those scenarios. +The new `proactive_brief` fixture slice contributes four passing evidence-linked +suggestion jobs and one typed private-corpus blocker tied to XY-930; it does not +prove Pulse or hosted managed-memory parity. + +### Live Real-World Sweep + +`cargo make real-world-memory-live-adapters` produced comparable full-suite live +sweeps for ELF and qmd: + +| Adapter | Jobs | Pass | Wrong result | Incomplete | Blocked | Not encoded | Mean score | Evidence recall | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `40` | `22` | `5` | `0` | `2` | `11` | `0.599` | `50/80` | +| qmd live CLI adapter | `40` | `17` | `6` | `0` | `2` | `15` | `0.461` | `38/80` | + +Interpretation: + +- ELF is five passes ahead in this full live sweep because qmd misses the delete/TTL + tombstone job and keeps the capture/write-policy suite typed `not_encoded`. +- Both pass `trust_source_of_truth`, `work_resume`, `project_decisions`, + `retrieval`, and `personalization`. +- Both fail most `memory_evolution` live conflict evidence with `wrong_result`. +- ELF now passes live `capture_integration`. A separate XY-934 narrow run adds live + consolidation proposal review evidence for ELF; qmd keeps consolidation + `not_encoded` in the live sweep. Knowledge compilation and production-ops operator + boundaries remain typed `not_encoded` or `blocked`. Operator debugging has a + separate narrow live slice: ELF passes it, while qmd remains `wrong_result` for + trace hydration and candidate-drop stage visibility. + +### Production Evidence + +ELF has the strongest production-operation evidence among the tracked systems: + +| Run | Scope | Result | +| --- | --- | --- | +| Provider synthetic | 8 documents, 6 queries, Qwen3-Embedding-8B, 4096 dimensions | `8/8`, `pass`, 59 seconds | +| Provider stress | 480 generated documents, 16 queries | `9/9`, `pass`, 779 seconds | +| Provider backfill | 2,000 generated documents, 16 queries, resume 1,000 -> 2,000 | `9/9`, `pass`, 2,804 seconds | +| Restore proof | Docker Compose backup/restore plus Qdrant rebuild | restored note searchable, zero rebuild errors | +| Private production corpus | operator-owned manifest required | failed closed, no pass claimed | + +This is enough to support personal production use with bounded caveats. It is not a +private-corpus quality proof. + +### External Adapter Ledger + +The current adapter manifest records 23 adapter records across 17 projects: + +| Evidence class | Count | Meaning | +| --- | ---: | --- | +| `fixture_backed` | `1` | ELF real-world fixture scoring. | +| `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | +| `live_real_world` | `5` | ELF and qmd full-suite live sweeps, graphify's tiny scored Docker smoke, and the narrow ELF/qmd operator-debug live slice. | +| `research_gate` | `11` | Source/setup/resource/output-contract evidence only. | + +Overall adapter statuses: + +| Status | Count | +| --- | ---: | +| `pass` | `4` | +| `wrong_result` | `6` | +| `lifecycle_fail` | `1` | +| `blocked` | `7` | +| `not_encoded` | `5` | + +The ledger is intentionally not a leaderboard. It prevents fixture evidence, +same-corpus checks, research gates, and live real-world runs from being collapsed into +one misleading score. + +## Scenario Conclusions + +| Scenario | Current position | What ELF should learn next | +| --- | --- | --- | +| Retrieval/debug | ELF and qmd are tied on encoded live retrieval; qmd remains the stronger debug UX reference. | Add trace-level replay, expansion/fusion/rerank knobs, candidate-drop diagnosis, and command-line replay. | +| Work resume | ELF live work-resume passes; continuity-oriented competitors are undermeasured. | Borrow agentmemory/claude-mem capture breadth and OpenViking staged context, but require durable adapter proof. | +| Project decisions | ELF and qmd live project-decision suites pass; ELF fixture-backed `core_archival_memory` also scores project-decision recovery, while Letta remains blocked without export evidence. | Run the Letta core/archival export/readback contract before treating project-decision recovery as comparable. | +| Source of truth | ELF has the strongest measured source-of-truth evidence. | Borrow memsearch's local canonical-store ergonomics without making files or vectors authoritative. | +| Temporal memory | ELF fixture passes, but live memory evolution is wrong_result. | Prioritize current-vs-historical evidence links and Graphiti/Zep-style validity windows. | +| Consolidation | ELF fixture passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit; direct competitor runners remain untested. | Keep derived proposal review as the safety boundary and add competitor/reference runners only when they emit comparable artifacts. | +| Memory summaries and knowledge pages | ELF fixture pages pass, and XY-952 adds a fixture-backed `memory_summary` source-trace contract; live top-of-mind behavior and live knowledge generation are not encoded. | Borrow llm-wiki lint/query-save loops, gbrain timelines, graphify reports, and managed-memory review patterns behind source-linked summary and rebuild/lint benchmarks. | +| Operator debugging | Fixture UX passes and the narrow live trace/viewer slice is scored: ELF passes, qmd ties replay/repair clarity but is wrong_result for trace hydration and candidate-drop visibility. | Expand coverage to OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | +| Capture/write policy | ELF live capture/write-policy self-check passes with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem is `not_encoded`. | Borrow agentmemory/claude-mem capture breadth only after durable local hook/viewer evidence exists, while preserving redaction and evidence binding. | +| Production ops | ELF has the strongest checked-in evidence, with private/credential gates blocked. | Keep Docker-first production proof and add private corpus only when an operator-owned manifest exists. | +| Personalization | ELF live personalization passes; mem0/OpenMemory ties the entity-scoped personalization smoke but still lacks a broader real-world prompt adapter, and Letta scoped preference readback remains not tested until its contained export path exists. | Add broader entity/preference history and UI readback before claiming stronger personalization. | +| Context trajectory | Not comparable yet; OpenViking remains the reference. | Score staged retrieval, hierarchy expansion, and trajectory readback. | +| Core-vs-archival | ELF fixture-backed `core_archival_memory` passes 6/6, but Letta remains blocked/not tested because no contained export artifact exists. | Borrow Letta's core memory block shape while keeping any win/tie/loss claim gated on exported core block, archival readback, and source-id evidence. | +| Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research gates; graphify has a tiny scored `wrong_result` smoke. | Run larger contained graph/RAG adapters before any broad graph-navigation claim. | + +## Project Guidance Matrix + +| Project | Current evidence | User-facing strength | ELF direction | +| --- | --- | --- | --- | +| ELF | `fixture_backed` plus `live_real_world`; live full sweep is `wrong_result`; live capture/write-policy self-check passes. | Evidence-linked memory service, strict provenance, rebuildable Qdrant, production backfill/restore proof. | Keep this as the core; do not weaken source-of-truth, write-policy, or typed failure semantics while adding product ergonomics. | +| qmd | `live_real_world` plus `live_baseline_only`; targeted retrieval passes, full sweep is `wrong_result`. | Local retrieval-debug workflow, transparent CLI, weighted fusion, rerank, replayable commands. | Treat qmd as the retrieval-debug bar. ELF should match its introspection and local replay without becoming CLI-only. | +| agentmemory | `live_baseline_only`; current status is `lifecycle_fail`; capture breadth comparison is blocked by process-local StateKV Map and in-memory index. | Coding-agent continuity, hooks, MCP/REST packaging, viewer/console observability. | Borrow capture breadth and continuity UX, but require durable lifecycle and capture artifact proof before claims. | +| mem0/OpenMemory | `live_baseline_only`; basic local smoke now passes, while entity/preference history, hosted ecosystem, graph memory, and OpenMemory UI remain untested locally. | Entity-scoped memory, lifecycle/history surfaces, hosted ecosystem, OpenMemory UI. | Add entity/preference history and UI readback patterns, while keeping hosted claims out of local OSS benchmarks. | +| memsearch | `live_baseline_only`; canonical Markdown reindex/reload smoke now passes, while real-world source-of-truth prompts remain unencoded. | Markdown-first canonical store and local reindex clarity. | Borrow local inspectability and canonical-file ergonomics, not file-as-authority semantics. | +| OpenViking | `live_baseline_only`, `fixture_backed`, and `research_gate`; current status is `wrong_result` for same-corpus evidence and `blocked` for fixture-backed trajectory gates. | Filesystem-like context model, hierarchy, staged context trajectory. | Add staged retrieval and trajectory scoring after same-corpus evidence output is correct. | +| claude-mem | `live_baseline_only`; current status is `wrong_result`; hook/viewer capture breadth is not encoded. | Progressive disclosure, automatic capture, local viewer workflow. | Borrow progressive disclosure and viewer comfort; benchmark capture and operator-debugging live paths before claims. | +| RAGFlow | `research_gate`; current status is `blocked`. | Full RAG application workflow with document/chunk/reference handles. | Use as a resource-aware RAG adapter benchmark, not as a current ELF competitor win/loss. | +| LightRAG | `research_gate`; current status is `blocked`. | Lightweight graph/RAG context export and source-path citation shape. | Borrow context-export ideas for graph/RAG navigation after Docker proof. | +| GraphRAG | `research_gate`; current status is `blocked`. | Graph summaries, document/text-unit tables, local/global search separation. | Borrow graph summary artifacts for knowledge pages and graph navigation after cost-bounded output proof. | +| Graphiti/Zep | `research_gate`; current status is `blocked`. | Temporal graph facts, validity windows, current-vs-historical answers. | Use as the semantic model for ELF temporal memory and relation validity benchmarks. | +| Letta | `research_gate`; current status is `blocked` until the selected contained export/readback artifact exists. | Core memory blocks versus archival memory. | Keep ELF's fixture-backed core block coverage separate from Letta comparison claims; compare Letta only after exported core and archival evidence exists. | +| LangGraph | `research_gate`; current status is `not_encoded` or `unsupported` as a direct memory backend. | Checkpoint, replay, fork, and regression debugging for agent state. | Borrow replay/regression patterns for benchmark infrastructure, not as direct memory parity. | +| nanograph | `research_gate`; current status is `not_encoded` or `unsupported` as a full memory backend. | Typed graph schema and query ergonomics. | Borrow graph-lite DX and typed relation query ideas. | +| llm-wiki | `research_gate`; current status is `not_encoded`. | Maintained wiki pages, query-save, lint, and repair loops. | Use as a reference for rebuildable, cited knowledge pages. | +| gbrain | `research_gate`; current status is `not_encoded` and setup-blocked. | Compiled truth pages, timelines, and human-operable knowledge navigation. | Borrow current-truth plus timeline presentation after Docker-local setup proof exists. | +| graphify | `live_real_world`; tiny scored smoke is `wrong_result`. | `graph.json`, `GRAPH_REPORT`, source-location graph navigation. | Treat the tiny smoke as bounded non-pass evidence and expand only after representative graph/RAG jobs map to evidence ids. | + +## Optimization Direction + +### P0 - Close Measured Quality Gaps + +These are the highest leverage because current evidence already shows an ELF gap, a +close competitor surface, or a still-unmeasured product strength. + +1. Live memory evolution correctness + - Current state: fixture pass, live `wrong_result`. + - Borrow from: Graphiti/Zep validity windows, mem0 history, ELF ingest-decision + audit rows. + - Target: live answers cite both current and historical conflict evidence, not only + current retrieved text. + - Benchmark gate: live `memory_evolution` pass for ELF before superiority claims. + +2. qmd-level retrieval debugging + - Current state: ELF and qmd tie on encoded results; qmd remains stronger in + local debug ergonomics. + - Borrow from: qmd weighted fusion, rerank explanation, local replay commands. + - Target: every wrong result can be traced through expansion, dense retrieval, + sparse retrieval, fusion, rerank, graph context, and final selection. + - Benchmark gate: qmd deep profile plus ELF/qmd trace-level replay report. + +3. Live operator debugging UX + - Current state: fixture pass; narrow live ELF/qmd slice scored with ELF `pass` + and qmd `wrong_result`. + - Borrow from: claude-mem viewer, OpenMemory inspector, qmd command output. + - Target: no raw SQL needed to explain a bad memory result, across service traces, + CLI replay, and bounded local viewer surfaces. + - Benchmark gate: add OpenMemory and claude-mem UI/export or viewer runners before + claiming broader operator-debug UX superiority. + +### P1 - Turn ELF Into A Better Daily Memory Product + +These improve day-to-day usefulness while preserving ELF's evidence-bound core. + +1. Capture and continuity + - Borrow from: agentmemory hook breadth and claude-mem automatic capture review. + - Current state: ELF live capture/write-policy self-check passes; agentmemory is + blocked and claude-mem is not encoded for capture breadth. + - ELF shape: live ingestion must continue to preserve redaction, excluded spans, + source ids, and write-policy audit. + - Benchmark gate: durable agentmemory and claude-mem capture-hook runners with + no secret leakage and evidence-bound output. + +2. Reviewable consolidation + - Borrow from: managed memory dreaming and Always-On Memory Agent scheduling. + - Current state: ELF now has live service-backed proposal scoring for the + consolidation fixture slice; direct competitor/reference runners are still + untested. + - ELF shape: derived proposals only; source notes are not silently rewritten. + - Benchmark gate: preserve lineage, confidence, unsupported-claim flags, + apply/defer/discard audit, and zero source mutations; do not add scheduling until + it can remain derived and reviewable. + +3. Knowledge pages + - Borrow from: llm-wiki, gbrain, graphify, and GraphRAG. + - ELF shape: project/entity/concept pages are rebuilt from authoritative notes and + linted for unsupported or stale sections. + - Benchmark gate: live knowledge-page rebuild/lint report, not fixture-only proof. + +4. Core memory blocks + - Borrow from: Letta core memory versus archival memory. + - ELF shape: scoped read-only blocks with provenance and attachment rules, separate + from archival search. + - Benchmark gate: ELF fixture jobs now prove attachment, scope, provenance, + stale-core detection, archival fallback, and project-decision recovery; Letta + comparison remains gated on exported core block, archival readback, and source-id + evidence. + +### P2 - Expand External Comparison Without Fake Wins + +These are needed for broad credibility but should not block personal production use. + +1. RAG and graph adapters + - Current state: RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed + research gates; graphify has a tiny scored `wrong_result` smoke. + - Benchmark gate: Docker-contained adapters must emit evidence-linked outputs + before any live pass claim. + +2. OpenViking context trajectory + - Current state: setup is pinned, same-corpus retrieval is `wrong_result`, and + staged trajectory, hierarchy selection, and recursive/context expansion are + encoded as `blocked` fixtures. + - Benchmark gate: evidence-bearing retrieval pass, then staged hierarchy/trajectory + scoring. + +3. mem0/OpenMemory and memsearch coverage + - Current state: both now pass the basic local OSS smoke, but their strongest + real-world scenarios remain unencoded. + - Benchmark gate: score mem0/OpenMemory entity history and UI readback, plus + memsearch source-of-truth and retrieval-debug workflows. + +## What Not To Claim Yet + +Do not claim: + +- ELF beats qmd overall. ELF is five passes ahead in the fresh aggregate because qmd + misses the delete/TTL tombstone job and keeps capture/write-policy jobs + `not_encoded`, but neither adapter has full-suite live pass evidence and qmd still + owns stronger local retrieval-debug ergonomics. +- ELF has full-suite live real-world pass evidence. It does not. +- ELF has private-corpus production quality proof. The private profile currently + fails closed without an operator-owned manifest. +- ELF beats OpenViking on context trajectory. The scenario is encoded as blocked, not + scored. +- ELF beats mem0/OpenMemory on hosted memory, entity history, UI, or optional graph + memory. Those scenarios are not encoded; the operator-debug win is only against + qmd on a narrow trace/replay slice. +- ELF beats Letta on core-vs-archival memory. ELF has fixture-backed coverage, but + Letta remains blocked/not tested until the selected contained export/readback path + produces comparable source-id-mapped evidence. +- ELF beats RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, or graphify on graph/RAG + navigation. Current evidence is research-gate or blocked except graphify's tiny + non-pass smoke. + +## Suggested Report Cadence + +Use this cadence for future benchmark-driven iteration: + +1. Keep `2026-06-11-competitor-strength-evidence-matrix.md` as the claim gate. +2. Keep this report as the optimization direction. +3. For each new adapter or suite, publish a dated benchmark report only when the run + changes a README-level claim or a production-adoption decision. +4. Every report must classify evidence as `fixture_backed`, `live_baseline_only`, + `live_real_world`, or `research_gate`. +5. Do not promote a reference project into a win/loss claim until the relevant + scenario is encoded and run at a comparable evidence class. + +## Recommended Next Reports + +The next reporting work should be ordered by decision value: + +1. ELF/qmd retrieval-debug deep profile. +2. ELF live memory-evolution repair report. +3. OpenMemory and claude-mem operator-debug UI/export runners. +4. agentmemory and claude-mem capture-hook breadth report. +5. OpenViking context-trajectory report after evidence-bearing retrieval works. +6. RAG/graph adapter pack report after Docker-contained outputs map to evidence ids. + +These are report and measurement directions, not implementation commitments. diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md new file mode 100644 index 00000000..bf4e53a1 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md @@ -0,0 +1,211 @@ +# ELF/qmd Memory-Evolution Diagnostic - June 11, 2026 + +Goal: Explain the fresh live memory-evolution failures for ELF and qmd, and turn the +measured gaps into benchmark and optimization directions without implementing those +optimizations here. +Read this when: You need to decide whether ELF currently beats qmd on +current-vs-historical memory, supersession, delete/tombstone handling, or temporal +relation validity. +Inputs: Fresh local runs of `cargo make real-world-memory-evolution` and +`cargo make real-world-memory-live-adapters` on commit `87a388b`. +Outputs: Fixture evidence, live ELF/qmd job-level diagnosis, claim boundaries, and +future iteration directions. + +## Executive Judgment + +ELF does not yet have a production-quality live memory-evolution win. The fixture +suite passes, but the live adapter path still fails five of six current-vs-historical +jobs. + +The narrow fresh result is: + +- Fixture memory-evolution: `5/5` pass. +- ELF live memory-evolution: `1/6` pass, `5/6` wrong_result. +- qmd live memory-evolution: `0/6` pass, `6/6` wrong_result. + +ELF is better than qmd on this fresh live slice only in a limited sense: ELF retrieves +all required memory-evolution evidence and passes the delete/TTL tombstone job; qmd +misses three required evidence links and fails the delete/TTL job. + +That is not enough to claim ELF has solved memory evolution. The main live ELF gap is +not basic retrieval. ELF retrieves the current evidence, rationale evidence, and often +the relevant historical evidence, but the answer and trace do not explicitly encode +that a historical fact was superseded, invalidated, or preserved as history. The +scorer therefore records no conflict detection and assigns `0.0` lifecycle behavior +on the five supersession jobs. + +For a memory system meant to support real agents, this is a P0 product-quality gap: +users do not only ask for the newest note. They ask what changed, why, what used to be +true, which source is current, and whether an old conclusion is stale. + +## Fresh Runs + +| Command | Result | Runtime | +| --- | --- | ---: | +| `cargo make real-world-memory-evolution` | pass | 50.34 seconds | +| `cargo make real-world-memory-live-adapters` | pass | 112.26 seconds | + +The live adapter command emitted repeated Qdrant client/server compatibility warnings, +but it completed and wrote ELF and qmd reports. Treat the warning as benchmark-harness +risk, not as a run failure. + +## Fixture Baseline + +`cargo make real-world-memory-evolution` proves the benchmark contract itself can +score the intended behavior: + +| Metric | Value | +| --- | ---: | +| Jobs | `5` | +| Pass | `5` | +| Wrong result | `0` | +| Mean score | `1.000` | +| Expected evidence recall | `11/11` | +| Evidence coverage | `11/11` | +| Conflict detections | `5` | +| Update rationales available | `5` | +| History-readback encoded jobs | `1` | + +This is fixture evidence. It proves the scenario contract is encoded and scored. It +does not prove the ELF live service or qmd CLI path can produce the same behavior. + +## Live Full-Sweep Context + +The fresh live sweep changed the qmd full-suite shape compared with the previous +coverage audit: + +| Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | Expected evidence recall | Evidence coverage | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `38` | `18` | `5` | `2` | `13` | `0.525` | `8.620 ms` | `41/77` | `48/84` | +| qmd live CLI adapter | `38` | `17` | `6` | `2` | `13` | `0.486` | `691.163 ms` | `38/77` | `45/84` | + +Do not turn this into a broad win claim. The difference is explained by this +memory-evolution slice: qmd failed the delete/TTL job that ELF passed. + +## Live Memory-Evolution Result + +| Adapter | Jobs | Pass | Wrong result | Mean score | Expected evidence matched | Produced evidence | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `6` | `1` | `5` | `0.492` | `13/13` | `13` | +| qmd live CLI adapter | `6` | `0` | `6` | `0.325` | `10/13` | `10` | + +### Job Matrix + +| Job | ELF status | ELF score | qmd status | qmd score | Diagnosis | +| --- | --- | ---: | --- | ---: | --- | +| `memory-evolution-benchmark-verdict-001` | wrong_result | `0.40` | wrong_result | `0.15` | ELF retrieved current verdict, caveat, and rationale, but did not cite the old not-ready verdict as historical. qmd also missed the private-corpus caveat evidence. | +| `memory-evolution-deploy-method-001` | wrong_result | `0.40` | wrong_result | `0.40` | Both retrieved current production runbook and supersession rationale, but neither explicitly preserved the old quickstart path as historical conflict evidence. | +| `memory-evolution-issue-state-001` | wrong_result | `0.40` | wrong_result | `0.40` | Both answered the current done state and resolution rationale, but neither surfaced the earlier blocked state as superseded history. | +| `memory-evolution-preference-001` | wrong_result | `0.40` | wrong_result | `0.15` | ELF retrieved current preference and rationale, but did not preserve the old terse preference as historical. qmd only returned the rationale evidence. | +| `memory-evolution-relation-temporal-001` | wrong_result | `0.35` | wrong_result | `0.35` | Both retrieved current and historical owners, but neither produced a scored temporal-validity explanation or update rationale. | +| `memory-evolution-delete-ttl-001` | pass | `1.00` | wrong_result | `0.50` | ELF retrieved both tombstone and current plan evidence. qmd retrieved only the current plan and missed the tombstone. | + +### Dimension Pattern + +For ELF's five wrong-result jobs, the pattern is consistent: + +| Dimension | Score pattern | +| --- | --- | +| `answer_correctness` | `0.0` on all five wrong-result jobs | +| `evidence_grounding` | `1.0` on all five wrong-result jobs | +| `lifecycle_behavior` | `0.0` on all five wrong-result jobs | +| `trap_avoidance` | `1.0` on all five wrong-result jobs | + +That means ELF usually finds the right evidence and avoids stale facts as current, but +the answer is not lifecycle-aware enough. It does not represent the historical version +as a first-class part of the answer, so the benchmark cannot credit conflict +detection. + +qmd has the same lifecycle pattern, plus evidence misses: + +| qmd miss | Effect | +| --- | --- | +| `verdict-bounded-private-caveat` missing | Benchmark verdict job drops to `0.15`. | +| `pref-current-concise-rationale` missing | Preference job drops to `0.15`. | +| `delete-tombstone` missing | Delete/TTL job is `wrong_result` despite answering the current plan. | + +## What This Says About ELF + +ELF currently looks strong at current-fact retrieval and typed source-of-truth +discipline. It is not yet strong enough at memory evolution. + +The missing product behavior is a temporal reconciliation layer: + +1. Detect that current and historical evidence both relate to the same claim. +2. Explain which evidence is current and which is historical. +3. Preserve old facts when the user asks what changed. +4. Mark superseded facts as no longer current without deleting their historical value. +5. Expose tombstones and invalidation evidence as answerable lifecycle facts. +6. Emit trace artifacts that show conflict candidates, current winner, historical + loser, and update rationale. + +This is why the fixture can pass while the live path fails. The fixture response is a +curated memory-evolution answer. The live adapters are retrieval-backed materializers, +not full temporal reconciliation engines. + +## What ELF Should Borrow + +These are optimization directions, not implemented changes in this report: + +| Source/reference | Useful idea for ELF | Benchmark gate before claiming progress | +| --- | --- | --- | +| Graphiti/Zep | Temporal fact validity windows, invalidation, and current/historical graph facts. | Run the Graphiti/Zep temporal graph adapter and compare current, historical, and future-validity jobs. | +| mem0/OpenMemory | Entity-scoped memory history and user-visible memory lifecycle inspection. | Add entity/preference history readback and UI/export evidence checks. | +| Letta | Core memory blocks separate from archival memory. | Add core-vs-archival jobs that distinguish always-loaded operating context from retrieved history. | +| qmd | Local replay and candidate inspection ergonomics. | Emit ELF trace hydration with conflict candidates, demoted historical facts, and replay commands. | +| Existing ELF production ops | Tombstone and deletion semantics. | Extend delete/TTL scoring from one isolated job into update/delete/recreate history cases. | + +## Next Benchmark And Report Directions + +1. Live temporal reconciliation report + - Score whether ELF can answer "what changed?" with current evidence, + historical evidence, and update rationale in the same answer. + - Include trace hydration for current winner, historical loser, and conflict + resolution reason. + +2. Graphiti/Zep temporal graph comparison + - Use the existing Graphiti/Zep research gate as the next real adapter target. + - The goal is not to copy a graph database blindly; it is to measure validity + windows and supersession semantics against ELF. + +3. mem0/OpenMemory history comparison + - Measure preference/entity history, correction, deletion, and user-visible + inspection. + - This directly maps to personal agent-memory expectations. + +4. qmd tombstone/delete diagnostic + - qmd is already the retrieval-debug reference, but it missed the delete tombstone + in this run. + - Keep this as a measured qmd gap before using qmd as a lifecycle reference. + +5. ELF trace-candidate conflict profile + - Add a report that shows top candidates for conflict jobs, not only final mapped + evidence ids. + - This should make it obvious whether historical evidence was absent, present but + unselected, or selected but not narrated. + +## Claim Boundaries + +Allowed claims: + +- The fixture memory-evolution suite passes. +- In the fresh live memory-evolution run, ELF outscored qmd and passed one job qmd + failed. +- ELF retrieved all required memory-evolution evidence in the live run. +- ELF still failed five of six live memory-evolution jobs because current-vs-historical + conflict detection was not encoded in the answer behavior. + +Not allowed: + +- Do not claim ELF has solved memory evolution. +- Do not claim ELF broadly beats qmd as a memory system. +- Do not promote fixture memory-evolution pass into live production proof. +- Do not treat Graphiti/Zep, mem0/OpenMemory, or Letta as beaten; their strongest + scenarios still need comparable adapter reports. + +## Bottom Line + +The next ELF iteration direction should prioritize temporal reconciliation over more +generic retrieval work. Retrieval is good enough to find the needed evidence in this +slice; the failing behavior is deciding and explaining how current, historical, +deleted, and superseded memories relate. diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md new file mode 100644 index 00000000..8054b3fe --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md @@ -0,0 +1,266 @@ +# ELF/qmd Retrieval-Debug Profile - June 11, 2026 + +Goal: Compare the measured retrieval-debug evidence for ELF and qmd without turning +retrieval success into a broader memory-system win claim. +Read this when: You need to decide what ELF should learn from qmd's retrieval and +debug workflow. +Inputs: Fresh local runs of `cargo make real-world-memory-live-adapters` and +`ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make +baseline-live-docker` on commit `38c586d`. +Outputs: Retrieval pass data, stress-profile data, debug artifact comparison, claim +boundaries, and ELF iteration directions. + +## Executive Judgment + +ELF and qmd are tied on the measured retrieval correctness surfaces in this report. +Both pass the encoded real-world retrieval suite and both pass the 480-document +generated-public stress baseline. + +qmd still remains the better retrieval-debug product reference because its CLI baseline +emits directly inspectable top-10 JSON results with files, line numbers, snippets, and +scores for every query. ELF emits stronger service and production-operation evidence, +including trace ids, backfill checkpoints, Qdrant rebuild proof, resource envelope, +and source-of-truth semantics, but the stress baseline report does not hydrate the full +candidate list behind each ELF trace. + +So the correct claim is: + +- ELF and qmd are tied on current encoded retrieval correctness. +- ELF is stronger on source-of-truth and production-style service lifecycle evidence. +- qmd is still the simpler local retrieval-debug reference. +- This report does not prove qmd rerank quality, ELF rerank quality, or expansion / + fusion superiority because the qmd real-world materializer and baseline use + `--no-rerank`, and no scored expansion/fusion/rerank debug suite exists yet. + +## Fresh Runs + +| Command | Result | Runtime | +| --- | --- | ---: | +| `cargo make real-world-memory-live-adapters` | pass | 116.76 seconds | +| `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | pass | 149.41 seconds | + +The stress baseline used the generated-public profile with 480 documents and 16 +queries. The live real-world adapter sweep used the checked-in real-world memory +fixtures. + +## Real-World Retrieval Suite + +Both adapters pass the same retrieval jobs: + +| Adapter | Retrieval jobs | Pass | Expected evidence | Matched evidence | Produced evidence | Mean score | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `5` | `5` | `6` | `6` | `6` | `1.000` | +| qmd live CLI adapter | `5` | `5` | `6` | `6` | `6` | `1.000` | + +The five retrieval jobs are: + +| Job | ELF | qmd | +| --- | --- | --- | +| `retrieval-alt-phrasing-001` | pass | pass | +| `retrieval-current-vs-obsolete-001` | pass | pass | +| `retrieval-distractor-heavy-001` | pass | pass | +| `retrieval-minimal-context-001` | pass | pass | +| `retrieval-multi-hop-routing-001` | pass | pass | + +Full live sweep context remains a non-pass for both systems: + +| Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `38` | `18` | `5` | `2` | `13` | `0.525` | `5.823 ms` | +| qmd live CLI adapter | `38` | `17` | `6` | `2` | `13` | `0.486` | `691.163 ms` | + +Do not overread the latency row. The ELF adapter is a service-runtime path and the qmd +adapter is a CLI materialization path; the row is useful as observed harness evidence, +not as an apples-to-apples product latency benchmark. The aggregate pass-count +difference comes from the memory-evolution delete/TTL tombstone job; it does not erase +qmd's local retrieval-debug ergonomics advantage. + +## Stress Baseline + +The stress baseline result: + +| Field | Value | +| --- | ---: | +| Profile | `stress` | +| Documents | `480` | +| Queries | `16` | +| Projects | `ELF,qmd` | +| Verdict | `pass` | +| Project statuses | `2/2 pass` | +| Full checks | `13/13 pass` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Blocked | `0` | +| Not encoded | `0` | + +### ELF Stress Result + +| Metric | Value | +| --- | ---: | +| Project elapsed | `81 s` | +| Query pass | `16/16` | +| Mean query latency | `29.808 ms` | +| p95 query latency | `31.298 ms` | +| Backfill source count | `480` | +| Backfill completed count | `480` | +| Resume attempts | `2` | +| Completed before resume | `240` | +| Completed after resume | `480` | +| Duplicate source notes | `0` | +| Qdrant rebuild scope | encoded in the pass criteria | +| Resource envelope elapsed | `71.303 s` | +| RSS | `54,724 KB` | +| Postgres database bytes | `19,338,943` | +| Estimated input tokens | `27,023` | + +ELF passed nine checks: + +| Check | Status | +| --- | --- | +| `resumable_backfill_no_duplicates` | pass | +| `same_corpus_retrieval` | pass | +| `async_worker_indexing_e2e` | pass | +| `update_replaces_note_text` | pass | +| `delete_suppresses_retrieval` | pass | +| `cold_start_recovery_search` | pass | +| `concurrent_write_search_e2e` | pass | +| `soak_stability_e2e` | pass | +| `resource_envelope` | pass | + +Every ELF stress query returned the expected evidence as the top evidence id. + +### qmd Stress Result + +| Metric | Value | +| --- | ---: | +| qmd commit | `636602409c862db077f38d9006df7f0bdca17ff3` | +| Project elapsed | `66 s` | +| Same-corpus query pass | `16/16` | +| Expected doc top-1 | `16/16` | +| Mean expected-doc rank | `1.000` | +| Mean distractors in top-10 | `7.938` | +| Lifecycle checks | `4/4 pass` | + +qmd passed four checks: + +| Check | Status | Evidence | +| --- | --- | --- | +| `same_corpus_retrieval` | pass | 16/16 queries matched expected evidence. | +| `update_replaces_note_text` | pass | updated marker `kid-v4` was found and old marker was absent. | +| `delete_suppresses_retrieval` | pass | deleted `deploy-memory.md` no longer matched. | +| `cold_start_recovery_search` | pass | fresh qmd query process retrieved persisted `database-memory.md`. | + +The qmd baseline report keeps per-query top-10 JSON results. This is the most concrete +measured qmd debug advantage in this report: an operator can inspect matched files, +scores, line numbers, snippets, and distractor density directly from the artifact. + +### Per-Query Stress Observations + +| Query | ELF matched top evidence | ELF latency | qmd expected rank | qmd top-10 distractors | +| --- | --- | ---: | ---: | ---: | +| `q-auth` | yes | `30.571 ms` | `1` | `6` | +| `q-auth-alt` | yes | `30.501 ms` | `1` | `7` | +| `q-database` | yes | `30.534 ms` | `1` | `8` | +| `q-database-alt` | yes | `31.281 ms` | `1` | `8` | +| `q-deploy` | yes | `29.958 ms` | `1` | `9` | +| `q-deploy-alt` | yes | `31.298 ms` | `1` | `8` | +| `q-retention` | yes | `30.434 ms` | `1` | `8` | +| `q-retention-alt` | yes | `29.194 ms` | `1` | `9` | +| `q-incident` | yes | `30.839 ms` | `1` | `7` | +| `q-incident-alt` | yes | `28.700 ms` | `1` | `9` | +| `q-billing` | yes | `30.092 ms` | `1` | `7` | +| `q-billing-alt` | yes | `28.855 ms` | `1` | `9` | +| `q-search` | yes | `29.480 ms` | `1` | `8` | +| `q-search-alt` | yes | `28.642 ms` | `1` | `7` | +| `q-recovery` | yes | `28.357 ms` | `1` | `8` | +| `q-recovery-alt` | yes | `28.188 ms` | `1` | `9` | + +## Debug Artifact Comparison + +| Debug surface | ELF evidence | qmd evidence | Current judgment | +| --- | --- | --- | --- | +| Per-query pass/fail | yes | yes | tied | +| Top expected evidence | yes, top evidence id per query | yes, expected file rank per query | tied on stress profile | +| Candidate list in report | partial: trace id, top snippet, returned count | yes: top-10 file, line, score, snippet | qmd stronger in the checked-in report artifact | +| Trace/replay surface | service trace ids exist | CLI command replay is explicit | different strengths; not directly scored | +| Update/delete/cold-start | yes, service lifecycle checks | yes, collection lifecycle checks | tied on encoded lifecycle correctness | +| Backfill/rebuild/resource envelope | yes | not represented in qmd baseline | ELF stronger | +| Rerank evidence | not scored here | not scored here; qmd path uses `--no-rerank` | non-claim | +| Expansion/fusion evidence | not scored here | structured `lex:` plus `vec:` query is used, but fusion internals are not scored | non-claim | +| Operator-debugging UX suite | live `not_encoded` | live `not_encoded` | non-claim | + +## What ELF Should Learn From qmd + +1. Put the ranked candidate list in the default benchmark artifact. + - The qmd artifact makes the top-10 result set immediately visible. + - ELF has trace ids, but a reader still needs another trace-hydration step to see + the candidate list and dropped/demoted candidates. + +2. Make replay commands short and local. + - qmd's measured surface is `collection add`, `update`, `embed -f`, and + `query --json`. + - ELF should keep service correctness, but benchmark reports should also emit a + concise replay command for each failed or suspicious query. + +3. Score distractor density and candidate-drop behavior. + - qmd returned the expected doc at rank 1 for every stress query, while still + returning an average of 7.938 distractor documents in the top 10. + - ELF should expose equivalent candidate-density metrics from trace candidates so + the report can distinguish "correct top result" from "clean ranked context." + +4. Separate retrieval correctness from retrieval-debug ergonomics. + - Correctness is currently tied on encoded retrieval jobs. + - Ergonomics are not tied until ELF produces qmd-like immediate debug artifacts and + qmd operator-debugging jobs are actually scored. + +## Claim Boundaries + +Allowed claims: + +- ELF and qmd both pass the encoded real-world retrieval suite. +- ELF and qmd both pass the 480-document generated-public stress same-corpus + retrieval profile. +- qmd provides stronger directly inspectable top-10 query artifacts in the current + stress baseline report. +- ELF provides stronger service lifecycle, backfill, rebuild, resource, and + source-of-truth evidence in the same stress baseline. + +Not allowed yet: + +- ELF beats qmd retrieval overall. +- qmd beats ELF as a memory system overall. +- Either system has a full live real-world suite pass. +- Either system has measured rerank superiority from this report. +- Either system has measured expansion/fusion superiority from this report. +- qmd operator-debugging UX is proven by the live real-world suite; it is still + `not_encoded`. + +## Next Measurement Work + +The next report should close the remaining retrieval-debug gaps before making stronger +claims: + +1. Hydrate ELF trace candidates into the stress report. + - Include kept, dropped, demoted, sparse/dense, final rank, and snippet fields. + +2. Add qmd query latency and candidate-density aggregates to the project summary. + - The raw qmd top-10 rows exist, but the summary currently lacks query latency and + candidate-density counters. + +3. Add a rerank-on qmd profile or explicitly keep qmd rerank as unmeasured. + - Current qmd materialization uses `--no-rerank`. + +4. Add a scored operator-debugging retrieval job for both systems. + - The job should ask why a result was wrong or why a distractor appeared, not only + whether the top result was correct. + +5. Add an expansion/fusion trace profile. + - Score lex-only, vec-only, hybrid, fusion, and final ranking stages separately. + +## Bottom Line + +This profile strengthens the evidence base but does not close the competitiveness +goal. Retrieval correctness is currently tied between ELF and qmd on encoded data. +ELF's next useful iteration direction is not "more retrieval" in the abstract; it is +qmd-level immediate retrieval debugging while preserving ELF's stronger +source-of-truth, trace, backfill, and production-operation model. diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md new file mode 100644 index 00000000..189566c2 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md @@ -0,0 +1,160 @@ +# ELF/qmd Trace Replay Diagnostics Report - June 11, 2026 + +Goal: Compare ELF and qmd on trace-level replay and wrong-result diagnostics while +keeping retrieval correctness as a separate guardrail. +Read this when: You need the XY-923 report lane for qmd top-10 replay artifacts, +ELF trace/admin bundle surfaces, and typed wrong-result diagnosis classes. +Inputs: The June 11 ELF/qmd retrieval-debug profile, qmd/OpenViking strength profile, +memory-evolution diagnostic, competitor-strength adoption report, live baseline +runner, ELF trace replay code, and the ELF service trace/admin contract. +Outputs: Scenario-level `win`, `tie`, `loss`, `not_tested`, `blocked`, or +`non_goal` outcomes plus concrete replay commands and artifact paths. + +Machine-readable companion: +`docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json`. + +## Executive Judgment + +Retrieval correctness is still tied: ELF and qmd both pass the encoded live retrieval +suite and both pass the 480-document generated-public stress baseline. + +Trace-level debugging is not tied. In the current checked-in artifacts, qmd is ahead +on immediate local replay ergonomics because the baseline keeps top-10 JSON rows with +files, scores, line numbers, snippets, and distractor visibility, and the replay path +is a short CLI sequence. ELF has a deeper service trace model and admin bundle +surfaces, but the stress report still does not hydrate the equivalent candidate list +by default. + +The resulting narrow position: + +- Retrieval correctness: `tie`. +- Default per-query candidate artifact: ELF `loss` against qmd. +- Replay command locality: ELF `loss` against qmd. +- ELF trace/admin replay surface: `tie` as an available but different replay surface, + not a default-artifact win. +- Operator-debug trace hydration and candidate-drop visibility: ELF `win` against qmd + in the narrow XY-932 live slice; replay-command availability and repair-action + clarity are `tie`. +- Expansion, dense/sparse contribution, fusion, and candidate-drop diagnostics: + `not_tested` outside the operator-debug slice until comparable stage artifacts are + emitted. +- Rerank stage scoring: `non_goal` for the current qmd stress path because it uses + `--no-rerank`. +- Wrong-result selected-but-not-narrated diagnosis: `tie` on typed non-pass + classification, not on answer quality. + +This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. + +## Replay Artifact Manifest + +| System | Replay surface | Command | Artifact | +| --- | --- | --- | --- | +| ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| ELF | Admin trace bundle hydration | `curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant>' -H 'X-ELF-Project-Id: <project>' -H 'X-ELF-Agent-Id: <agent>'` | `elf.trace_bundle/v1` response from the admin service | +| ELF | Trace ranking replay | `cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>` | JSON trace compare output over `search_trace_candidates` | +| ELF | Operator-debug live trace slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` and `summary.json` | +| qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| qmd | Per-query CLI replay | `npx tsx src/cli/qmd.ts query 'lex: <query>\nvec: <query>' -c elfbench --json --no-rerank --min-score 0 -n 10` | JSON top-10 rows with `file`, line/snippet/score fields when qmd returns them | +| qmd | Lifecycle replay | `npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank` | `tmp/live-baseline/qmd-query.json` checks for update, delete, and cold-start recovery | +| qmd | Operator-debug live replay slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` and `summary.json` | + +## Scenario Outcomes + +| Scenario | Evidence | Result type | ELF outcome | Diagnostic judgment | +| --- | --- | --- | --- | --- | +| Retrieval correctness guardrail | `live_real_world`, `live_baseline_only` | `pass` | `tie` | Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics. | +| Default top-10 candidate artifact | `live_baseline_only` | `pass` | `loss` | qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report. | +| Replay command locality | `live_baseline_only` | `pass` | `loss` | qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids. | +| Trace/admin replay surface availability | `implementation_reference` | `not_encoded` | `tie` | ELF has admin trace bundles and `elf-eval` trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality. | +| Operator-debug trace hydration | `live_real_world` | `pass` | `win` | ELF live operator-debug jobs generate trace ids, viewer URLs, admin trace-bundle URLs, and `trace_available=true`; qmd generates local replay commands but no service trace hydration surface. | +| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` | ELF emits admin trace-bundle curl commands and qmd emits local CLI query replay commands for the same operator-debugging scenarios; this scores command availability, not equivalent UI quality. | +| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` | ELF exposes dropped-candidate visibility through generated operator-debug metadata without direct SQL assumptions; qmd exposes top-k replay rows but no intermediate candidate-drop stages in this slice. | +| Operator-debug repair-action clarity | `live_real_world` | `pass` | `tie` | Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists. | +| Operator-debug selected-but-not-narrated evidence | `live_real_world` | `pass` | `win` | The operator-debug slice now scores selected-but-not-narrated evidence as a trace/answer-composition repair surface without direct database inspection. | +| Query expansion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows expansion variants or dynamic expansion decisions for both systems. | +| Dense/sparse channel attribution | `research_gate` | `not_encoded` | `not_tested` | ELF uses dense plus BM25 and qmd uses structured `lex:` plus `vec:`, but the scored artifacts do not expose comparable per-channel contribution. | +| Fusion attribution | `research_gate` | `not_encoded` | `not_tested` | No comparable artifact shows fusion inputs, RRF/weighted-fusion contributions, or fusion-stage candidate drops. | +| Rerank attribution | `live_baseline_only` | `non_goal` | `non_goal` | The current qmd stress and materializer paths use `--no-rerank`; no rerank-on comparison is claimed. | +| Candidate-drop diagnostics | `research_gate` | `not_encoded` | `not_tested` | `retrieved_but_dropped` is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles. | +| Selected-but-not-narrated wrong results | `live_real_world` | `wrong_result` | `tie` | Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing. | +| Evidence-absent and tombstone diagnosis | `live_real_world` | `wrong_result` | `win` | ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone. | + +Summary: `4` ELF wins, `5` ties, `2` ELF losses, `4` not-tested scenarios, `0` +blocked scenarios, and `1` non-goal scenario. The losses are local-debug artifact +losses only. They do not change the retrieval-correctness tie. + +## Stage Scoring Notes + +| Stage | Current score | Reason | +| --- | --- | --- | +| Expansion | `not_tested` | The current artifacts do not expose comparable expansion variants or dynamic expansion decisions. | +| Dense retrieval | `not_tested` | The systems have dense/vector surfaces, but no comparable scored dense-only contribution artifact. | +| Sparse retrieval | `not_tested` | qmd `lex:` and ELF BM25 are present in command or service design, but contribution and drops are not scored. | +| Fusion | `not_tested` | Fusion candidates and final fusion deltas are not materialized comparably. | +| Rerank | `non_goal` | qmd uses `--no-rerank` in the current path; rerank superiority is out of scope for this run. | +| Candidate drops | `not_tested` globally; `win` in operator-debug slice | No current stress/default report can prove retrieved-but-dropped evidence for qmd, but the XY-932 operator-debug slice scores ELF candidate-drop visibility without direct SQL assumptions. | +| Selected-but-not-narrated | `tie` | Both systems have typed memory-evolution wrong-result rows where evidence is selected or available but not narrated as lifecycle history. | +| Operator-debug selected-but-not-narrated | `win` | The XY-932 operator-debug job proves selected-but-not-narrated evidence is visible as a trace/answer-composition repair surface in ELF but not in qmd's generated service-trace metadata. | +| Replay commands | `loss` | qmd's local CLI replay is shorter and directly tied to top-10 JSON output. | + +## Typed Non-Pass States + +The report preserves the wrong-result classes from the June 11 diagnostics: + +| Class | Current coverage | +| --- | --- | +| `evidence_absent` | Observed for qmd on verdict caveat, preference rationale, and delete tombstone misses. | +| `retrieved_but_dropped` | Defined globally as `not_tested`; observed as an ELF operator-debug visibility win in the narrow XY-932 slice. | +| `selected_but_not_narrated` | Observed for both ELF and qmd on supersession and temporal-validity jobs; additionally scored as an ELF operator-debug visibility win in the narrow XY-932 slice. | +| `contradicted_by_lifecycle_evidence` | Observed when current, historical, supersession, or tombstone evidence makes the answer incomplete. | + +These states are typed evidence, not leaderboard shortcuts. A `wrong_result` with +good evidence recall is still a wrong result. + +## Claim Boundaries + +Allowed: + +- ELF and qmd remain tied on encoded retrieval correctness. +- qmd currently wins the default local-debug artifact surface: top-10 rows plus short + CLI replay. +- ELF has useful service trace/admin replay surfaces, but they are not yet hydrated + into the default stress report as qmd-like candidate artifacts. +- ELF narrowly wins the live operator-debug trace hydration and candidate-drop + visibility slice against qmd; qmd still ties replay-command and repair-action + clarity. +- ELF narrowly wins the memory-evolution evidence-retention slice because qmd misses + the delete tombstone and two other required evidence links. +- Expansion, dense/sparse contribution, fusion, rerank-on quality, and + broad retrieved-but-dropped candidate diagnosis outside the operator-debug slice + remain unproven. + +Not allowed: + +- Do not claim qmd beats ELF as a memory system overall. +- Do not claim ELF beats qmd retrieval overall. +- Do not turn qmd top-10 ergonomics into a retrieval-quality win. +- Do not treat ELF trace/admin endpoint availability as proof that the default + benchmark report has qmd-level candidate visibility. +- Do not score rerank superiority from a qmd `--no-rerank` run. +- Do not collapse `not_tested`, `non_goal`, or `wrong_result` into pass evidence. +- Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win + over OpenMemory or claude-mem; OpenMemory UI/export remains blocked, and + claude-mem UI repair paths remain blocked until Docker-contained hook/viewer + evidence exists. + +## Follow-Up Gate + +The next measurement should emit one candidate-replay artifact per suspicious query +with: + +1. Expansion variants and whether the original query was included. +2. Dense-only and sparse-only candidate sets. +3. Fusion rank and score contribution. +4. Rerank score, or an explicit rerank-disabled marker. +5. Final selected items. +6. Dropped or demoted expected evidence. +7. A one-command replay line for both ELF and qmd. + +Until that exists, the current evidence supports a qmd local-debug artifact win, not a +broad product or retrieval win. diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md b/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md new file mode 100644 index 00000000..63b44b2b --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md @@ -0,0 +1,119 @@ +# First-Generation OSS Adapter Promotion Report - June 11, 2026 + +Goal: Promote first-generation OSS memory baselines into scenario-level adapter +evidence without converting live-baseline-only runs into real-world suite wins. +Read this when: You need the current XY-898 status for agentmemory, mem0/OpenMemory, +memsearch, and claude-mem scenario evidence. +Inputs: Fresh scoped Docker baseline run, updated external adapter manifest, and the +June 11 temporal/history competitor-gap report. +Outputs: Scenario judgments, ELF win/tie/loss/untested positions, and next adapter +gates. + +## Scope Boundary + +This is benchmark/report evidence only. No ELF retrieval, ranking, memory-quality, or +service behavior optimization is implemented here. + +Update after XY-924: mem0/OpenMemory history and local SDK export-style readback are +now measured in +`2026-06-11-mem0-openmemory-history-ui-export-report.md`. The basic lifecycle result +in this report remains valid, but the mem0 history/UI rows below are historical +pre-XY-924 gaps and must not be treated as the current complete mem0 comparison. + +The updated external adapter manifest now includes scenario-level judgments for the +first-generation OSS memory projects. These judgments are intentionally narrower than +suite passes: + +- `live_baseline_only` pass evidence proves the encoded Docker same-corpus or + lifecycle smoke for that project. +- It does not prove `real_world_job` suite parity unless a project adapter actually + executes real-world prompts and scoring. +- Hosted mem0 Platform behavior, OpenMemory UI, host-global hooks, and + operator-owned credentials remain out of scope for local OSS evidence. + +## Fresh Run + +| Command | Result | Runtime | Artifact | +| --- | --- | ---: | --- | +| `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | fail with typed non-pass projects | 295.74 seconds | `tmp/live-baseline/live-baseline-report.json` | + +The aggregate failed because two projects remained typed non-pass, not because setup +collapsed: + +| Project | Status | Retrieval | Checks | Scenario meaning | +| --- | --- | --- | ---: | --- | +| ELF | `pass` | `retrieval_pass` | `8/8` pass | Baseline reference for same-class scenario comparisons; no ELF optimization change was made. | +| agentmemory | `lifecycle_fail` | `retrieval_pass` | `2/4` pass, `1` lifecycle_fail, `1` blocked | Same-corpus retrieval runs, but update supersession and durable cold-start are not proven through the in-memory mock. | +| mem0/OpenMemory | `pass` | `retrieval_pass` | `4/4` pass | Basic local OSS same-corpus, update, delete, and cold-start smoke passes. | +| memsearch | `pass` | `retrieval_pass` | `4/4` pass | Canonical Markdown reindex/update/delete/reload smoke passes. | +| claude-mem | `wrong_result` | `retrieval_wrong_result` | `4/5` pass | Durable repository lifecycle, detail hydration, and reload pass, but same-corpus retrieval misses expected evidence. | + +## Scenario Judgments + +| Project | Scenario | Status | ELF position | Evidence boundary | +| --- | --- | --- | --- | --- | +| agentmemory | basic same-corpus retrieval | `pass` | `untested` | Baseline retrieval passes through an in-memory mock; no durable continuity claim. | +| agentmemory | durable update/reload lifecycle | `lifecycle_fail` | `wins` | Update supersession fails and cold-start is blocked; ELF has broader encoded local lifecycle proof. | +| agentmemory | work-resume capture continuity | `blocked` | `untested` | Needs a durable local session/capture path before fair scoring. | +| mem0/OpenMemory | basic local lifecycle | `pass` | `ties` | ELF and mem0 both pass the encoded local lifecycle smoke; mem0 is no longer a basic-smoke failure. | +| mem0/OpenMemory | preference/entity history | `not_encoded` | `untested` | History, correction chains, entity scope, and deletion audit are not scored. | +| mem0/OpenMemory | OpenMemory UI/export readback | `not_encoded` | `untested` | Local OSS UI/export readback is not executed; hosted behavior remains out of scope. | +| memsearch | canonical Markdown reindex/reload | `pass` | `untested` | Baseline reindex/update/delete/reload passes over the canonical file store; ELF has no directly comparable canonical Markdown source-store scenario in this run. | +| memsearch | TTL/expiry lifecycle | `unsupported` | `untested` | The encoded CLI path has reindex/delete but no TTL/expiry behavior; unsupported competitor evidence does not create an ELF win/loss without a comparable scenario artifact. | +| memsearch | real-world prompt adapter | `not_encoded` | `untested` | No memsearch real_world_job prompt adapter is encoded. | +| claude-mem | same-corpus retrieval | `wrong_result` | `wins` | The durable repository path runs but misses expected retrieval evidence. | +| claude-mem | repository lifecycle reload | `pass` | `ties` | Update, delete, and cold-start reload pass over Docker-local SQLite. | +| claude-mem | progressive-disclosure detail hydration | `pass` | `untested` | Search-to-detail/source hydration passes, but ELF has no directly comparable claude-mem-style progressive-disclosure scenario here. | +| claude-mem | hook capture viewer workflow | `not_encoded` | `untested` | Hooks, viewer, timeline, and observations are not executed. | + +Summary: 13 scenario judgments: 5 `pass`, 1 `wrong_result`, 1 `lifecycle_fail`, +1 `blocked`, 1 `unsupported`, and 4 `not_encoded`. ELF positions are 2 `wins`, +2 `ties`, 0 `loses`, and 9 `untested`. + +## Manifest And Report Changes + +The external adapter manifest is now +`real-world-memory-project-adapters-2026-06-11` and includes `scenarios[]` records +with: + +- `scenario_id` +- optional `suite_id` +- typed scenario `status` +- `elf_position`: `wins`, `ties`, `loses`, or `untested` +- evidence text plus optional command/artifact pointers + +`real_world_job_benchmark` now preserves these fields in generated reports and +renders an **Adapter Scenario Judgments** table. This makes the report input capable +of saying whether ELF wins, ties, loses, or remains untested per scenario without +changing the real-world suite status rules. + +## Claim Boundaries + +Allowed: + +- mem0/OpenMemory passes the current basic local OSS lifecycle smoke. +- memsearch passes the current canonical Markdown reindex/reload smoke. +- agentmemory remains non-pass for durable lifecycle because the current adapter uses + an in-memory mock and cannot prove cold-start recovery. +- claude-mem remains wrong-result for same-corpus retrieval while preserving useful + passed evidence for repository lifecycle and detail hydration. + +Not allowed: + +- Do not claim hosted OpenMemory behavior from local OSS evidence. +- Do not claim mem0/OpenMemory history, UI/export, hosted, or graph-memory parity. +- Do not claim memsearch source-of-truth real-world suite parity from baseline smoke. +- Do not claim claude-mem hook/viewer/capture parity from repository-only checks. +- Do not collapse `wrong_result`, `lifecycle_fail`, `blocked`, `unsupported`, + `not_encoded`, and `incomplete` into one generic failure bucket. + +## Next Gates + +- agentmemory: select a durable local KV/index/session path before work-resume and + capture jobs. +- mem0/OpenMemory: encode preference/entity history, deletion audit, UI/export + readback, and optional graph memory for local OSS only. +- memsearch: encode real-world source-of-truth and retrieval-debug prompt jobs over + the canonical Markdown store. +- claude-mem: fix or explain same-corpus retrieval misses, then encode hook capture, + viewer/operator, and progressive-disclosure jobs. diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md new file mode 100644 index 00000000..80e944cc --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md @@ -0,0 +1,99 @@ +# First-Generation OSS Continuity and Source-Store Report - June 11, 2026 + +Goal: Expand first-generation OSS adapter coverage for durable continuity, +canonical source-store, retrieval-debug, progressive-disclosure, hook capture, and +viewer/operator surfaces without promoting smoke evidence into real-world suite pass +evidence. +Read this when: You need the XY-925 result for agentmemory, memsearch, and +claude-mem after the XY-898 first-generation adapter promotion. +Inputs: `cargo make real-world-first-generation-oss`, the external adapter manifest, +and the June 11 first-generation OSS adapter promotion report. +Outputs: Fixture-backed prompt coverage, scenario-level comparison outcomes, typed +blockers, and updated claim boundaries. + +## Scope Boundary + +This is benchmark/report coverage only. It does not change ELF retrieval behavior, +external project code, or baseline adapter runtime behavior. + +The new first-generation fixture slice lives outside +`apps/elf-eval/fixtures/real_world_memory/`, so it is not counted as the aggregate ELF +real-world suite. The slice exists to encode comparable prompt shapes and blockers for +external OSS adapter surfaces while the external adapter manifest keeps evidence +classes explicit. + +## Fresh Run + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-first-generation-oss` | pass | `tmp/real-world-memory/first-generation-oss/report.json` | + +Generated report summary: + +| Metric | Value | +| --- | ---: | +| Jobs | 6 | +| Encoded suites | 4 | +| Pass | 4 | +| Blocked | 2 | +| Evidence coverage | 12/12 | +| Source-ref coverage | 12/12 | +| Quote coverage | 12/12 | +| Operator-debug jobs | 2 | +| Raw SQL needed | 0 | + +External adapter manifest scenario outcomes now preserve every normalized outcome: + +| Outcome | Count | +| --- | ---: | +| win | 9 | +| tie | 9 | +| loss | 1 | +| not_tested | 8 | +| blocked | 6 | +| non_goal | 3 | + +## Scenario Additions + +| Project | Scenario | Status | Outcome | Evidence | +| --- | --- | --- | --- | --- | +| agentmemory | `durable_work_resume_local_path` | `blocked` | `blocked` | The selected comparable path is a Docker-local session directory that persists the SDK KV/index and observation log across a fresh process. | +| agentmemory | `capture_write_policy_hooks` | `blocked` | `blocked` | Live hook observations and write-policy audit evidence are required before scoring capture/write-policy jobs. | +| memsearch | `markdown_source_store_rebuild_reload_prompt` | `pass` | `not_tested` | The prompt fixture covers canonical Markdown as source of truth and `memsearch index` as derived rebuild/reload behavior. | +| memsearch | `markdown_retrieval_debug_prompt` | `pass` | `not_tested` | The prompt fixture covers CLI replay plus Markdown source inspection while keeping staged trace bundles not encoded. | +| claude-mem | `retrieval_repair_artifact_path` | `wrong_result` | `win` | The repair prompt preserves the same-corpus retrieval miss and names rerun/inspection targets `tmp/live-baseline/claude-mem.log` and `tmp/live-baseline/claude-mem-checks.json`. | +| claude-mem | `progressive_disclosure_prompt` | `pass` | `not_tested` | The prompt fixture covers repository search-to-detail/source hydration on durable SQLite. | +| claude-mem | `hook_capture_viewer_workflow` | `blocked` | `blocked` | The current Docker baseline uses repository classes only and does not execute hooks, timeline capture, or viewer workflows. | +| claude-mem | `viewer_operator_workflow` | `blocked` | `blocked` | A fair viewer/operator comparison needs Docker-contained readback over the same durable SQLite corpus. | + +## Claim Boundaries + +Allowed: + +- agentmemory has a selected durable local path for future work-resume and + capture/write-policy scoring. +- memsearch now has checked-in source-store and retrieval-debug prompt coverage over + the canonical Markdown store. +- claude-mem has checked-in progressive-disclosure and retrieval-repair prompt + coverage for the Docker-contained repository path. +- claude-mem hook capture and viewer/operator workflows remain typed blockers. + +Not allowed: + +- Do not claim agentmemory durable continuity from the in-memory same-corpus smoke. +- Do not claim memsearch full real-world suite parity from Markdown reindex/reload + smoke or fixture-backed prompt coverage. +- Do not claim claude-mem retrieval passed; same-corpus retrieval remains + `wrong_result`. +- Do not claim claude-mem hooks or viewer workflows pass from repository + class-level hydration evidence. + +## Touched Artifacts + +- `Makefile.toml`: adds `cargo make real-world-first-generation-oss`. +- `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/`: + checked-in prompt and blocker fixtures. +- `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`: + updated scenario rows and explicit `comparison_outcome` values. +- `docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json`: + machine-readable companion report. diff --git a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md new file mode 100644 index 00000000..290092d3 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md @@ -0,0 +1,152 @@ +# Graph/RAG Scored Smoke Adapter Report - June 11, 2026 + +Goal: Record the XY-900 promotion of graph/RAG Docker smokes and the XY-929 +representative fixture slice into scored or typed `real_world_job` adapter evidence +without upgrading smoke or typed non-pass evidence into broad quality claims. +Read this when: You need to decide whether ELF currently wins, ties, loses, or remains +untested against RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify graph/RAG +strengths. +Inputs: `memory_projects_manifest.json`, the graph/RAG smoke and representative +fixture commands in `Makefile.toml`, and the generated report contracts. +Outputs: Scored-smoke status, representative typed non-pass status, claim boundary, +blocker taxonomy, and next measurement gate for each in-scope project. + +## Verdict + +XY-900 promotes the in-scope Docker smokes into scored adapter evidence where the smoke +already has enough generated evidence ids to evaluate a bounded job. This is still +smoke-only evidence. + +Current graph/RAG quality comparison remains mostly untested. ELF cannot claim a win, +tie, or loss against the in-scope graph/RAG strengths from smoke evidence alone. +`graphify` is the current exception only in the narrow sense that its Docker smoke +reaches graph/report output and scores one tiny `knowledge_compilation` job as +`wrong_result`; that is a bounded graphify non-pass, not an ELF victory claim. + +Graphiti/Zep remains the temporal-validity reference. The default checked-in smoke is +typed `blocked` before live execution because `ELF_GRAPHITI_ZEP_SMOKE_START=1` and +`ELF_GRAPHITI_ZEP_SMOKE_RUN=1` are not set. When that live path is explicitly enabled +without provider credentials, the blocker remains `provider_api_key_missing`; no +hosted Zep service or unrecorded provider credentials are used or implied. + +XY-929 adds a representative external-adapter fixture slice for graph/RAG navigation, +citations, graph summaries, temporal validity, graph reports, stale-source lint, and +unsupported-claim handling. The slice intentionally remains typed non-pass: 5 jobs, +0 pass, 3 blocked, 1 incomplete, and 1 wrong_result. It strengthens the reporting +contract, not the quality claim. + +## Scored Smoke Status + +| Project | Scored scenario | Command | Current scored status | Claim boundary | +| --- | --- | --- | --- | --- | +| RAGFlow | `retrieval`: reference chunks mapped to generated evidence ids | `cargo make smoke-ragflow-docker` | `blocked` or `incomplete` by execution boundary | Smoke-only. No RAGFlow quality claim until returned reference chunks map to `ragflow-smoke-anchor`. | +| LightRAG | `retrieval`: context/source export mapped to fixture evidence ids | `cargo make smoke-lightrag-docker-context` | `incomplete` when the API service is not started | Smoke-only. No graph-RAG quality claim until context or references map to generated evidence ids. | +| GraphRAG | `knowledge_compilation`: output tables mapped to generated evidence ids | `cargo make smoke-graphrag-docker` | `blocked` | Smoke-only. No graph-navigation or synthesis claim until output tables map to generated evidence ids. | +| Graphiti/Zep | `memory_evolution`: current and historical validity facts | `cargo make smoke-graphiti-zep-docker-temporal` | `blocked` before live opt-in; `provider_api_key_missing` when live path is enabled without explicit credentials | Provider-bound. No ELF-over-Graphiti/Zep claim until temporal output maps to scored evidence ids. | +| graphify | `knowledge_compilation`: `graph.json`, `GRAPH_REPORT.md`, and query output mapping | `cargo make smoke-graphify-docker-graph-report` | `wrong_result` after setup/run pass | Scored tiny smoke. The graph/report output maps to evidence ids, but the job remains non-pass; no broad graph-navigation quality claim follows. | + +## Artifact Contract + +Each promoted smoke now writes a generated fixture and scored report: + +| Project | Generated report | +| --- | --- | +| RAGFlow | `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` and `.md` | +| LightRAG | `tmp/real-world-memory/lightrag-context/lightrag-report.json` and `.md` | +| GraphRAG | `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` and `.md` | +| Graphiti/Zep | `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` and `.md` | +| graphify | `tmp/real-world-memory/graphify-smoke/graphify-report.json` and `.md` | + +## Representative Fixture Slice + +Run the representative graph/RAG slice separately from the heavyweight live adapter +sweep: + +```sh +cargo make real-world-memory-graph-rag +``` + +Artifacts: + +```text +tmp/real-world-memory/graph-rag/report.json +tmp/real-world-memory/graph-rag/report.md +``` + +Current focused report summary: + +| Metric | Value | +| --- | --- | +| Jobs | 5 | +| Pass | 0 | +| Blocked | 3 | +| Incomplete | 1 | +| Wrong result | 1 | +| Temporal validity not encoded | 1 | + +Representative job outcomes: + +| Project | Representative contract | Job status | ELF outcome | Boundary | +| --- | --- | --- | --- | --- | +| RAGFlow | Reference chunks must map generated document ids, chunk ids, content, and document metadata to benchmark evidence ids. | `blocked` | `blocked` | Resource/API setup and returned reference chunks are still missing. | +| LightRAG | Context/source export must expose generated file paths, snippets, or reference content mapped to evidence ids. | `incomplete` | `blocked` | The opt-in Docker API export is not available by default, so comparison remains blocked. | +| GraphRAG | Output tables must map documents, text units, communities, reports, entities, and relationships to generated evidence ids. | `blocked` | `blocked` | Provider-backed Docker output tables are required before citation or synthesis scoring can pass. | +| Graphiti/Zep | Current and historical graph facts must carry validity windows and evidence ids. | `blocked` | `blocked` | Temporal validity is not encoded without provider-backed current/historical output. | +| graphify | `graph.json`, source-location report sections, unsupported-claim lint, and stale-source lint are scored. | `wrong_result` | `not_tested` | The representative job reaches scoring but misses stale-source/answer requirements; no ELF victory or graphify quality conclusion follows. | +| llm-wiki | Citation-bearing wiki/page generation with stale-source and unsupported-claim lint. | `not_encoded` | `not_tested` | No contained output contract exists yet. | +| gbrain | Compiled-truth or timeline export with evidence-linked page sections. | `blocked` | `blocked` | Docker-local setup and export readback remain missing. | +| Private, hosted, or large-corpus graph/RAG profiles | Provider, private data, or hosted service behavior. | `not_encoded` | `non_goal` | These profiles are outside the generated public representative lane unless explicitly authorized. | + +The aggregate live-adapter sweep can include these reports through explicit opt-in +flags. These flags include an adapter in the aggregate report; provider-backed, +service-started, or resource-heavy live attempts still require the adapter-specific +controls listed by each smoke task: + +- `ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP=1` +- `ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY=1` + +Default `cargo make real-world-memory-live-adapters` still runs ELF and qmd only. That +keeps heavyweight services, provider-backed runs, and graph/report installs out of the +default sweep unless explicitly requested. + +## Typed Limits + +Resource, runtime, provider, and setup limits remain first-class report states: + +- `blocked`: live execution requires explicit resource opt-in, provider credentials, + a Docker service profile, or a generated output that is not yet available. +- `incomplete`: setup or service reachability failed before the behavioral check. +- `wrong_result`: the smoke reached scoring but failed required answer or rubric + signals, including unmapped evidence where applicable. +- `pass`: the smoke reached output and all required generated evidence ids mapped. +- `not_encoded`: broad quality, scale, private corpus, hosted-service behavior, and + non-smoke suites remain outside the current adapter. + +## Claim Rules + +Allowed: + +- Say the in-scope graph/RAG smokes now produce scored `real_world_job` adapter reports + or typed non-pass reports. +- Say the XY-929 representative slice produces typed non-pass reports for RAGFlow, + LightRAG, GraphRAG, Graphiti/Zep, graphify, llm-wiki, and gbrain claim boundaries. +- Say graph/RAG quality remains untested where live output has not mapped to generated + evidence ids or where scored output remains typed non-pass. +- Say graphify reached a tiny Docker graph/report smoke and currently scores + `wrong_result`. +- Say Graphiti/Zep remains blocked by default live-run opt-in, and provider-blocked + when that live path is explicitly enabled without credentials; it remains the + temporal-validity reference. + +Not allowed: + +- Do not call a smoke pass a broad RAG, graph, temporal, or production-quality pass. +- Do not call a representative blocked, incomplete, wrong_result, or not_encoded job a + broad RAG, graph, temporal, or production-quality result. +- Do not claim ELF beats Graphiti/Zep, RAGFlow, LightRAG, GraphRAG, or graphify on + their graph/RAG strengths from these smoke or representative non-pass reports. +- Do not use hosted/cloud-only results, host-global installs, private corpora, or + unrecorded credentials as evidence for this lane. diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md new file mode 100644 index 00000000..841e945f --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -0,0 +1,277 @@ +# ELF Benchmark Measurement Coverage Audit - June 11, 2026 + +Goal: Record what is actually measured today, where competitor comparisons are still +not comparable, and which measurement reports should guide future ELF iteration. +Read this when: You need to answer whether ELF has enough empirical evidence to +claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and +agent-continuity projects. +Inputs: Fresh local runs of `cargo make real-world-memory-core-archival`, +`cargo make real-world-memory`, and retained XY-933 +`cargo make real-world-memory-live-adapters` evidence after XY-927 +core-vs-archival fixture coverage, XY-928 OpenViking context-trajectory fixture +encoding, and live capture/write-policy scoring, plus +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, +`2026-06-11-competitor-strength-evidence-matrix.md`, and +`2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. +Outputs: Fresh measured counters, scenario coverage, project coverage, and the next +measurement reports needed before stronger ELF claims. + +## Executive Judgment + +The benchmark program is useful and already prevents misleading claims, but the +current measured comparison is not complete enough to say ELF beats or ties every +tracked project's strongest scenario. + +What is proven today: + +- ELF has a strong fixture-backed real-world benchmark contract: 49 jobs across 13 + suites, 44 pass, 5 blocked operator or measurement-gate boundaries, and no wrong + results in the fixture aggregate. The new `core_archival_memory` suite contributes + 6 passing jobs for core block attachment, scope, provenance, stale-core detection, + archival fallback, and project-decision recovery. The added XY-928 + `context_trajectory` jobs are blocked OpenViking staged/hierarchy/recursive gates, + not ELF wins. +- ELF and qmd have comparable full-suite live real-world sweeps, but neither has a + full-suite live pass. ELF is five passes ahead in the fresh aggregate because qmd + misses the memory-evolution delete/TTL tombstone job and the capture/write-policy + suite is now ELF-only live evidence. +- ELF now has live capture/write-policy self-check evidence for redaction, exclusions, + source ids, evidence binding, and no secret leakage. This is not a broad + capture-hook win over agentmemory or claude-mem: agentmemory comparison is blocked + by mocked/in-memory storage, and claude-mem hook/viewer capture remains blocked + until Docker-contained hook/viewer evidence exists. +- ELF is ahead on production-operation evidence among tracked systems because it has + checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild + evidence. +- The current comparison still undermeasures most competitor strengths. OpenViking + trajectory, mem0/OpenMemory entity history and UI, Letta product export/readback + for core-vs-archival memory, Graphiti/Zep temporal graph behavior, graph/RAG + navigation, agentmemory and claude-mem continuity/capture breadth, and knowledge-page + workflows remain non-claims. + The separate XY-932 operator-debug live slice now scores ELF against qmd for trace + hydration and candidate-drop visibility, but does not cover OpenMemory or + claude-mem UI flows. + +So the current adoption decision can remain "credible for bounded personal +production," but the competitiveness objective remains open. + +## Fresh Runs + +These commands were run in the current benchmark lanes after adapter-report +consistency repairs, the XY-927 core-vs-archival fixture update, the XY-928 +OpenViking context-trajectory fixture update, and XY-933 live capture/write-policy +scoring: + +| Command | Result | Runtime | +| --- | --- | ---: | +| `cargo make real-world-memory-core-archival` | pass | 12.14 seconds | +| `cargo make real-world-memory` | pass | 11.09 seconds | +| `cargo make real-world-memory-live-adapters` | pass | 137.66 seconds | + +The live adapter run emitted repeated Qdrant client/server compatibility warnings, but +the command completed successfully and produced ELF and qmd JSON/Markdown reports. +Treat that warning as a measurement-harness risk to keep visible, not as a current run +failure. + +## Fixture Aggregate + +`cargo make real-world-memory` produced: + +| Metric | Value | +| --- | ---: | +| Jobs | `49` | +| Encoded suites | `13` | +| Pass | `44` | +| Blocked | `5` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Incomplete | `0` | +| Not encoded | `0` | +| Unsupported claim | `0` | +| Mean score | `0.898` | +| Mean latency | `3.940 ms` | +| Expected evidence recall | `100/100` | +| Evidence coverage | `111/111` | +| Source-ref coverage | `111/111` | +| Quote coverage | `111/111` | + +This proves fixture contract breadth and scoring behavior. It does not prove every +live adapter or competitor runtime can complete those jobs. + +## Live ELF/qmd Sweep + +`cargo make real-world-memory-live-adapters` produced: + +XY-934 update: the June 11 consolidation row below is superseded for ELF by +`docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`. +ELF now has live service-backed consolidation proposal scoring for the 4 checked-in +consolidation jobs; qmd remains typed `not_encoded` for this suite. + +| Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | Evidence recall | Evidence coverage | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `40` | `22` | `5` | `2` | `11` | `0.599` | `6.980 ms` | `50/80` | `58/88` | +| qmd live CLI adapter | `40` | `17` | `6` | `2` | `15` | `0.461` | `792.543 ms` | `38/80` | `45/88` | + +This supports an ELF lead in the current full live sweep count, but not a broad +ELF-over-qmd claim. The lead is concentrated in the ELF-only capture/write-policy +self-check plus the delete/TTL tombstone case. qmd remains the stronger retrieval-debug +UX reference, and its deep profile is still not encoded. + +### Live Suite Breakdown + +ELF and qmd have the same status shape outside `memory_evolution` and +`capture_integration`. The memory-evolution difference is +`memory-evolution-delete-ttl-001`: ELF passes that job while qmd reports +`wrong_result`, leaving ELF at five memory-evolution wrong results and qmd at six. The +capture difference is that ELF now executes the capture/write-policy jobs through its +service runtime, while qmd keeps those jobs typed `not_encoded`. + +| Suite | Jobs | ELF breakdown | qmd breakdown | +| --- | ---: | --- | --- | +| `trust_source_of_truth` | `1` | `pass:1` | `pass:1` | +| `work_resume` | `5` | `pass:5` | `pass:5` | +| `retrieval` | `5` | `pass:5` | `pass:5` | +| `project_decisions` | `5` | `pass:5` | `pass:5` | +| `personalization` | `1` | `pass:1` | `pass:1` | +| `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | `wrong_result:6` | +| `capture_integration` | `4` | `pass:4` | `not_encoded:4` | +| `consolidation` | `4` | `not_encoded:4` | `not_encoded:4` | +| `knowledge_compilation` | `2` | `not_encoded:2` | `not_encoded:2` | +| `operator_debugging_ux` | `1` | `not_encoded:1` | `not_encoded:1` | +| `production_ops` | `6` | `blocked:2`, `not_encoded:4` | `blocked:2`, `not_encoded:4` | + +The ELF live wrong results are five memory-evolution jobs. qmd has those same conflict +evidence failures plus the delete/TTL tombstone miss. The live adapters retrieve +current evidence in several cases but do not yet provide the required historical +conflict evidence links for current-vs-historical reasoning. + +## External Adapter Ledger + +The checked-in manifest records 23 adapter records across 17 unique project names. + +| Evidence class | Adapter records | Meaning | +| --- | ---: | --- | +| `fixture_backed` | `1` | ELF fixture scoring only. | +| `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. | +| `live_real_world` | `5` | ELF and qmd live real-world sweeps, graphify's tiny scored Docker smoke, and the narrow ELF/qmd operator-debug live slice. | +| `research_gate` | `11` | Setup, source, resource, or output-contract gate only. | + +| Overall status | Adapter records | +| --- | ---: | +| `pass` | `4` | +| `wrong_result` | `6` | +| `lifecycle_fail` | `1` | +| `blocked` | `7` | +| `not_encoded` | `5` | + +The generated JSON report emits `external_project_count: 16`, matching the unique +non-ELF project-name count from the manifest. The companion audit JSON separately +records `unique_project_names: 17` for the full project list including ELF. + +## Project Coverage + +| Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | +| --- | --- | --- | --- | --- | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 5 blocked operator or measurement-gate boundaries; live full sweep is `wrong_result`; live capture/write-policy, live consolidation proposal scoring, and narrow operator-debug slices pass. | Full live memory evolution, live knowledge pages, live production ops, competitor capture hooks, OpenViking staged trajectory artifacts, and broader operator UI runners. | Memory-evolution diagnostic report, then knowledge reports plus agentmemory/claude-mem capture, OpenViking staged trajectory artifacts, and OpenMemory/claude-mem UI runners. | +| qmd | `live_real_world` plus `live_baseline_only` | Fresh full sweep is five passes behind ELF because qmd misses the delete/TTL tombstone job and keeps capture/write-policy jobs typed `not_encoded`; same-corpus baseline passes; narrow operator-debug live slice ties replay commands but is `wrong_result` for trace hydration and candidate-drop visibility. | Deep retrieval-debug ergonomics and trace replay beyond the narrow operator-debug slice. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | +| agentmemory | `live_baseline_only` | `lifecycle_fail`; capture comparison is `blocked` because the Docker baseline uses a process-local StateKV Map and in-memory index, with no durable local session/capture path for source ids, exclusions, write-policy audit, or evidence-bound output. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | +| mem0/OpenMemory | `live_baseline_only` | Basic local smoke and local OSS history/readback pass; OpenMemory UI/export is blocked, hosted Platform export is a non-goal, and optional graph plus broader prompt coverage remain `not_encoded`. | Entity history, lifecycle UI, OpenMemory inspection. | Entity-history, deletion-audit, and UI/export readback report. | +| memsearch | `live_baseline_only`; XY-925 `fixture_backed` | Basic canonical Markdown reindex/reload smoke passes, and XY-925 adds fixture-backed source-store and retrieval-debug prompts without claiming a live memsearch adapter pass. | Markdown canonical store and local reindex clarity. | Runtime source-of-truth and retrieval-debug adapter execution over the existing prompt jobs. | +| OpenViking | `live_baseline_only` plus `fixture_backed` and `research_gate` | Same-corpus retrieval is `wrong_result`; staged retrieval, hierarchy selection, and recursive/context expansion are encoded as blocked fixtures. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then materialized staged trajectory report. | +| claude-mem | `live_baseline_only`; XY-925 `fixture_backed` | Same-corpus retrieval remains `wrong_result`; XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts, with hook capture and viewer/operator workflows still blocked. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, capture/write-policy, and viewer/operator runtime report. | +| RAGFlow | `research_gate` | `blocked`. | RAG app workflow with document/chunk references. | Tiny Docker evidence-smoke with `reference.chunks` mapped to evidence ids. | +| LightRAG | `research_gate` | `blocked`. | Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | +| GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | +| Graphiti/Zep | `research_gate` | `blocked`. | Temporal graph facts and validity windows. | Docker-local temporal graph adapter report for current and historical facts. | +| Letta | `research_gate` | `blocked` for the selected contained export/readback path; scenario rows remain `not_tested` or `blocked`. | Core memory blocks versus archival memory. | Implement the Docker-only export/readback adapter before any Letta win/tie/loss claim. | +| LangGraph | `research_gate` | `not_encoded`; direct memory backend is unsupported. | Checkpoint replay and fork/regression debugging. | Treat as benchmark-infra reference unless a memory-output contract emerges. | +| nanograph | `research_gate` | `not_encoded`; full memory backend is unsupported. | Typed graph schema and query ergonomics. | Typed relation query report only if evidence ids can be emitted. | +| llm-wiki | `research_gate` | `not_encoded`. | Wiki/page generation, query-save, lint and repair loops. | Contained page-generation report with citation and unsupported-claim lint. | +| gbrain | `research_gate` | `not_encoded`; setup path is blocked. | Compiled truth pages, timelines, and brain navigation. | Docker-local brain repo setup proof, then compiled-truth/timeline report. | +| graphify | `live_real_world` | Tiny scored smoke is `wrong_result`. | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT`. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. | + +## Scenario Coverage And Claims + +| Scenario | Current measured position | Claim allowed today | Missing measurement | +| --- | --- | --- | --- | +| Retrieval/debug | ELF and qmd live retrieval pass; qmd same-corpus baseline passes. | Tie on encoded live retrieval; no ELF-over-qmd UX claim. | qmd/ELF deep trace replay and debug ergonomics scoring. | +| Work resume | ELF and qmd live pass. | ELF is credible on encoded work resume. | agentmemory, claude-mem, and OpenViking comparable continuity adapters. | +| Project decisions | ELF and qmd live pass; ELF fixture coverage also passes core routing plus archival rationale recovery. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory export and scoring. | +| Source of truth | ELF and qmd live pass; ELF has stronger production restore/rebuild evidence. | ELF has strongest measured source-of-truth discipline. | memsearch source-of-truth reindex/reload evidence. | +| Memory evolution | ELF live fails 5/6 jobs; qmd live fails 6/6 jobs after missing the delete/TTL tombstone evidence; fixture aggregate passes. | No broad live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | +| Consolidation | Fixture aggregate passes; XY-934 adds ELF live service-backed proposal scoring, while qmd remains `not_encoded`. | ELF self-check claim only; no direct competitor win. | Contained competitor/reference runners only when they emit source ids, confidence, unsupported-claim flags, and review-action audit. | +| Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | +| Operator debugging | Fixture aggregate passes; narrow ELF/qmd live operator-debug slice is scored with ELF `pass` and qmd `wrong_result`. | Narrow ELF/qmd live claim only: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; replay-command and repair-action clarity are tied. | OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim. | +| Capture/write policy | Fixture aggregate passes; ELF live service adapter passes 4/4 capture jobs with zero redaction leaks; qmd is `not_encoded`; agentmemory is `blocked`; claude-mem hook/viewer capture is `blocked`. | ELF has live self-check evidence for redaction, exclusions, source ids, evidence binding, and no secret leakage. Against agentmemory/claude-mem capture breadth, the comparison remains blocked until durable hook/viewer evidence exists. | Durable agentmemory and claude-mem capture-hook runners with evidence-bound output. | +| Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | +| Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | +| Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | +| Core-vs-archival memory | ELF fixture suite passes 6/6; Letta comparison is blocked until export/readback evidence exists. | Fixture-only ELF core-block claim; no ELF-over-Letta claim. | Letta contained export/readback artifact with core block JSON, archival search/readback JSON, and source ids. | +| Graph/RAG navigation | RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain typed research gates; graphify has a tiny scored `wrong_result` smoke. | No graph/RAG parity claim; only graphify's bounded non-pass smoke can be cited. | Larger contained RAG/graph adapters with evidence-linked outputs before any ELF graph/RAG win, tie, or loss claim. | + +## Next Measurement Reports + +Order these by decision value, not implementation convenience: + +1. ELF/qmd retrieval-debug deep profile + - Why: qmd is the closest measured live competitor and still stronger as a + debugging reference. + - Output: trace-level comparison of expansion, dense/sparse retrieval, fusion, + rerank, dropped candidates, and command-line replay. + +2. ELF/qmd live memory-evolution diagnostic + - Why: ELF currently fails 5/6 live memory-evolution jobs and qmd fails 6/6, + including the delete/TTL tombstone case. + - Output: per-job evidence-link failure analysis for current-vs-historical facts, + supersession, and relation temporal validity. + +3. External capture-hook report for agentmemory and claude-mem + - Why: ELF now has a live capture/write-policy self-check, but the strongest + agentmemory and claude-mem capture-breadth claims are still blocked. + - Output: durable local capture artifacts, source ids, redaction/exclusion audit, + and typed blocker reasons when hooks or viewer capture cannot run in Docker. + +4. Continuity and context-trajectory report + - Why: agentmemory, claude-mem, and OpenViking represent real user expectations + around automatic capture, progressive disclosure, and staged context. + - Output: comparable work-resume/capture/trajectory jobs or typed blockers. + +5. Personalization and core-memory report + - Why: mem0/OpenMemory and Letta represent product expectations ELF should absorb + before claiming better personalization or operating context. + - Output: entity history, preference correction, UI/readback, core-vs-archival, + and project-decision scoring. + +6. Knowledge and graph/RAG report pack + - Why: llm-wiki, gbrain, graphify, GraphRAG, LightRAG, RAGFlow, and Graphiti/Zep + cover knowledge synthesis and graph navigation that ELF currently cannot claim. + - Output: Docker-contained artifacts mapped to evidence ids, or typed setup and + resource blockers. + +Before publishing the next aggregate report, keep `external_project_count` aligned +with unique non-ELF project names so readers do not confuse project coverage with +adapter-record coverage. + +## Fail Criteria + +Use these criteria for future reports: + +- `pass`: comparable scenario is encoded, run, and evidence-backed. +- `wrong_result`: the system ran but answered with wrong, stale, unsupported, or + insufficiently evidenced memory. +- `not_encoded`: the runner does not yet exercise the scenario. This is not a win or + loss. +- `blocked`: safe measurement needs missing credentials, private data, resource + envelope acceptance, setup proof, or an export contract. +- `unsupported`: the project shape is not a direct memory-system comparison target. +- Fixture evidence cannot be promoted into live runtime evidence. +- Live baseline evidence cannot be promoted into real-world job evidence. +- Research-gate evidence cannot be promoted into pass/fail product quality evidence. + +## Bottom Line + +ELF is on a strong path because its benchmark methodology is stricter than a normal +leaderboard, and its production evidence is unusually concrete. The next work is not +to declare victory. The next work is to measure the strongest user-facing patterns in +adjacent projects, then decide which ones ELF should absorb behind fresh benchmark +gates. diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md new file mode 100644 index 00000000..9200bb86 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -0,0 +1,179 @@ +# mem0/OpenMemory History and UI Export Report - June 11, 2026 + +Goal: Add scenario-level mem0/OpenMemory history, personalization, deletion-audit, +local SDK export-readback, and bounded OpenMemory export-helper setup evidence without +promoting basic lifecycle smoke into UI or hosted Platform claims. +Read this when: You need the current XY-924 comparison between ELF and +mem0/OpenMemory for entity-scoped history, preference correction, deletion audit, +personalization, OpenMemory inspection/export, hosted Platform export, or optional +graph memory. +Inputs: Fresh scoped mem0 Docker baseline run, refreshed real-world external adapter +manifest, generated real-world memory report, and the June 11 first-generation, +temporal/history, and competitor-strength reports. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`scripts/live-baseline-benchmark.sh`, and +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +Outputs: Per-scenario outcomes using `win`, `tie`, `loss`, `not_tested`, `blocked`, +and `non_goal`, plus command and artifact evidence for each measured claim. +Machine-readable companion: `docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json`. + +## Executive Judgment + +The XY-924 objective is now encoded for the reproducible local OSS SDK surface, and +XY-931 adds a separate bounded OpenMemory export-helper setup probe. + +mem0/OpenMemory now has fresh local OSS evidence for behavior beyond the basic +lifecycle smoke: + +- `preference_correction_history`: `pass` +- `entity_scoped_personalization`: `pass` +- `local_get_all_export_readback`: `pass` +- `delete_history_audit_readback`: `pass` +- `openmemory_ui_export_readback`: `blocked` + +The comparison is intentionally narrower than a hosted/OpenMemory product verdict. +The local run measures the mem0 OSS SDK and local FastEmbed/Qdrant/history paths in +Docker. The new product-UX setup probe detects the OpenMemory tree, UI package, +compose file, and export helper, then records a setup blocker: the export helper needs +Docker access to a running OpenMemory product container, while the baseline runner +only has the SDK Qdrant/history artifacts. It does not claim browser/dashboard +readback, hosted mem0 Platform export jobs, or optional graph memory. + +## Fresh Evidence + +| Command | Result | Runtime | Artifact | +| --- | --- | ---: | --- | +| `cargo make openmemory-ui-export-readback` | `pass` for SDK baseline; OpenMemory export-helper setup probe `blocked` with `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER` | 35.14 seconds wall; 33 seconds project runtime | `tmp/live-baseline/live-baseline-report.json`, `tmp/live-baseline/mem0-checks.json`, `tmp/live-baseline/mem0-openmemory-ui-export.json`, `tmp/live-baseline/mem0-openmemory-export-attempt.log` | +| `cargo make real-world-memory` | `pass`; refreshed external adapter report published | 7.97 seconds | `tmp/real-world-memory/real-world-memory-report.json`, `tmp/real-world-memory/real-world-memory-report.md` | + +Fresh mem0/OpenMemory run id: `live-baseline-20260611122416`. + +Generated external adapter summary for all external adapter manifest rows: + +- Scenario statuses: `unsupported=2`, `blocked=2`, `wrong_result=1`, + `lifecycle_fail=1`, `pass=9`, `not_encoded=3`. +- Legacy ELF positions: `wins=2`, `ties=4`, `loses=1`, `untested=11`. +- Normalized comparison outcomes: `win=2`, `tie=4`, `loss=1`, + `not_tested=8`, `blocked=1`, `non_goal=2`. + +mem0/OpenMemory rows in this report contain eight scenarios: `loss=1`, +`tie=3`, `not_tested=1`, `blocked=1`, and `non_goal=2`. + +## Scenario Outcomes + +| Scenario | mem0/OpenMemory evidence | ELF comparison outcome | Status | Command | Artifact | +| --- | --- | --- | --- | --- | --- | +| Basic local lifecycle | mem0 passes same-corpus retrieval, update, delete, and cold-start reload in the prior first-generation baseline. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json` | +| Preference correction history | `Memory.history` exposes explicit `ADD` and `UPDATE` preference records; search returns only the current correction. | `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | +| OpenMemory UI/export readback | The bounded export-helper setup probe finds OpenMemory product files but the export helper cannot run because Docker is unavailable inside the baseline runner. It does not reach browser/dashboard readback or same-corpus product app database validation. | `blocked` | `blocked` | `cargo make openmemory-ui-export-readback` | `tmp/live-baseline/mem0-openmemory-ui-export.json`, `tmp/live-baseline/mem0-openmemory-export-attempt.log` | +| Hosted mem0 Platform export | Hosted Platform export is outside local OSS evidence. | `non_goal` | `unsupported` | Not run; local OSS comparison only. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| Optional graph memory | Graph memory is not enabled in the default local OSS run. | `non_goal` | `not_encoded` | Not run; opt-in scenario gate. | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | + +## Evidence Details + +The fresh mem0 check artifact records eight passing checks: + +- `same_corpus_retrieval` +- `update_replaces_note_text` +- `preference_correction_history` +- `entity_scoped_personalization` +- `local_get_all_export_readback` +- `delete_suppresses_retrieval` +- `delete_history_audit_readback` +- `cold_start_recovery_search` + +The `preference_correction_history` check verifies all of: + +- history is available; +- history contains the original preference; +- history contains the corrected preference; +- history contains explicit `ADD` and `UPDATE` events; +- search contains the corrected preference; +- search omits the old preference. + +The `delete_history_audit_readback` check verifies all of: + +- history is available; +- history contains a delete event; +- search suppresses the deleted memory. + +The local SDK export-style readback check is intentionally named separately from UI +export. It only proves local `get_all` scoped readback through the OSS SDK. + +The OpenMemory export-helper setup probe records: + +- OpenMemory tree present: `true`; +- UI package present: `true`; +- compose file present: `true`; +- export helper present: `true`; +- sunsetting notice present: `true`; +- SDK `get_all` status: `pass`; +- export attempt command: + `timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id elf-history-user --container openmemory-openmemory-mcp-1`; +- export attempt exit code: `1`; +- reason code: `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`. + +The attempt log contains `docker: command not found` before the helper reports that +`openmemory-openmemory-mcp-1` is not running. The concrete next action is to add a +dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus +into the OpenMemory app database, starts API/UI with explicit local or provider +configuration, then reruns the export helper and validates exported memories. + +## Source And Product Boundary + +Official mem0 documentation distinguishes the OSS/self-hosted surface from hosted +Platform API paths. The OSS REST page documents CRUD/search/update/delete/reset +operations by `user_id`, `agent_id`, or `run_id`, an OpenAPI explorer at `/docs`, and +memory history endpoints. The export guide distinguishes bulk `get_all()`, semantic +search, structured exports, and Platform UI exports. + +This report uses those docs only to set the claim boundary: + +- local OSS SDK `history`, `search`, and `get_all` behavior is measurable here; +- OpenMemory browser/dashboard export is not reached here; the current evidence is a + bounded export-helper setup probe blocked by setup; +- hosted Platform export is a `non_goal` for this local OSS lane; +- optional graph memory remains an opt-in scenario, not a default pass/fail claim. + +References: + +- Mem0 OSS REST API Server: `https://docs.mem0.ai/open-source/features/rest-api` +- Mem0 Export Stored Memories: `https://docs.mem0.ai/cookbooks/essentials/exporting-memories` + +## Claim Boundaries + +Allowed: + +- mem0/OpenMemory local OSS passes the new encoded history, correction, + personalization, deletion-audit, and local `get_all` readback checks in run + `live-baseline-20260611122416`. +- ELF currently has a measured `loss` against mem0 on the preference correction + history dimension because the June 11 temporal/history report records ELF's live + memory-evolution preference job as `wrong_result`. +- ELF and mem0 currently `tie` on the encoded entity-scoped personalization and + delete-audit surfaces. +- OpenMemory UI/export readback is `blocked` by a concrete setup blocker: + `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER`; ELF cannot compare against this product-UX + scenario yet. +- Hosted mem0 Platform export and optional graph memory are `non_goal` for this + local OSS comparison. + +Not allowed: + +- Do not reuse the basic lifecycle pass as history, UI, hosted, or graph-memory + evidence. +- Do not claim OpenMemory UI/export quality from local SDK `get_all`. +- Do not claim hosted mem0 Platform behavior from the local OSS run. +- Do not treat optional graph memory as a default mem0 pass or ELF loss. +- Do not convert `blocked`, `unsupported`, `not_encoded`, or `non_goal` scenarios + into wins or losses. + +## Follow-Up Gate + +The next fair UI/export comparison requires extending the bounded runner so it starts +OpenMemory, loads the same local memories into the OpenMemory app database, captures +authenticated inspection/export readback, and publishes a browser/API artifact. That +is separate from the local SDK `get_all` export-style readback added here. diff --git a/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md b/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md new file mode 100644 index 00000000..693ce98d --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md @@ -0,0 +1,133 @@ +# qmd and OpenViking Strength-Profile Report - June 11, 2026 + +Goal: Compare ELF against qmd and OpenViking on their actual strengths without +turning broad live-sweep or smoke results into unsupported win claims. +Read this when: You need the XY-899 scenario-level qmd retrieval-debug and +OpenViking context-trajectory benchmark/report outcome. +Inputs: The June 11 retrieval-debug, memory-evolution, and temporal-history reports, +the real-world benchmark spec, the external adapter manifest, and +`scripts/real-world-live-adapters.sh`. +Outputs: Scenario-level win/tie/loss/not-tested judgments, qmd wrong-result +diagnosis taxonomy, OpenViking typed trajectory blockers, blocked context-trajectory +jobs, and claim boundaries. + +Machine-readable companion: +`docs/research/2026-06-11-qmd-openviking-strength-profile-report.json`. + +## Executive Judgment + +ELF does not have a broad win against either qmd or OpenViking on their strengths. + +The measured qmd judgment is narrower: + +- Retrieval quality: `tie`. ELF and qmd both pass the encoded live real-world + retrieval suite and both pass the 480-document stress retrieval baseline. +- Local query transparency: `not_tested`. qmd's current artifacts expose directly + inspectable top-10 JSON rows with files, line numbers, snippets, and scores. ELF + has stronger service traces and production-operation evidence, but the checked-in + stress report does not hydrate an equivalent candidate list, so no scored ELF loss + is claimed for this surface. +- Local replayability: `not_tested`. qmd has a concise observed CLI replay path, and + ELF has service traces plus admin bundle endpoints, but no scored replayability rule + compares those surfaces yet. +- Expansion/fusion/rerank controls: `not_tested`. The current qmd materializer and + stress run use `--no-rerank`; no scored expansion/fusion/rerank profile exists. + +The measured OpenViking judgment is split by surface: + +- Same-corpus evidence-bearing preconditions: `elf_win`. The pinned Docker local + embedding path reaches `add_resource`/`find`, but the OpenViking smoke remains + `wrong_result` because expected evidence terms are missed while ELF passes the + equivalent retrieval precondition. +- Context trajectory strengths: `blocked` / `not_tested`. The OpenViking + same-corpus artifact now exposes expected, matched, and missing evidence ids, and + the staged retrieval, hierarchy selection, and recursive/context expansion jobs are + encoded as blocked fixtures. +- Staged retrieval, hierarchy selection, and recursive/context expansion remain + unscored until OpenViking returns evidence-bearing same-corpus output and comparable + stage artifacts; no ELF win, tie, or loss is claimed against those strengths. + +## qmd Scenario Outcomes + +| Scenario | Evidence Class | Result Type | ELF Outcome | What It Means | +| --- | --- | --- | --- | --- | +| Retrieval quality | `live_real_world` | `pass` | `tie` | Both systems pass 5/5 live retrieval jobs with 6/6 expected evidence matched. | +| Local query transparency | `live_baseline_only` | `not_encoded` | `not_tested` | qmd exposes top-10 files, line numbers, snippets, scores, and distractor density directly in the stress artifact, but the equivalent ELF candidate-list surface is not encoded. | +| Expansion/fusion/rerank controls | `research_gate` | `not_encoded` | `not_tested` | No scored profile proves either system's expansion, fusion, or rerank superiority. | +| Stale context isolation | `live_real_world` | `pass` | `tie` | Both systems pass the encoded current-vs-obsolete and distractor-heavy retrieval jobs. | +| Update/delete/cold-start behavior | `live_baseline_only` | `pass` | `tie` | Equivalent update replacement, delete suppression, and cold-start recovery checks pass for both. | +| Operator-debug evidence | `live_real_world` | `not_encoded` | `not_tested` | The live sweep marks operator-debugging UX `not_encoded` for both systems. | +| Local replayability | `live_baseline_only` | `not_encoded` | `not_tested` | qmd has a shorter observed CLI replay path, but no scored replayability rule compares it with ELF's trace/admin replay surfaces yet. | +| Wrong-result diagnosis | `research_gate` | `not_encoded` | `not_tested` | The report classifies qmd memory-evolution failures, but qmd candidate-drop traces are not yet materialized and no pass evidence is claimed. | + +Summary: qmd strength-profile outcomes are `0` ELF wins, `3` ties, `0` ELF losses, +and `5` not-tested scenarios. This distinguishes retrieval quality from +debug/replay ergonomics: the retrieval result is tied, qmd remains the local +retrieval-debug UX reference, and query transparency plus replayability remain +unscored for comparative ELF win/loss claims. + +## qmd Wrong-Result Diagnosis + +The report adds a qmd diagnosis taxonomy with four classes: + +| Diagnosis Class | Current qmd Coverage | +| --- | --- | +| `evidence_absent` | Observed on the verdict caveat, preference rationale, and delete tombstone misses. | +| `retrieved_but_dropped` | Defined but not observed because current qmd live job artifacts do not expose candidate-stage traces. | +| `selected_but_not_narrated` | Observed on supersession jobs where qmd had evidence but did not narrate current-vs-historical state. | +| `contradicted_by_lifecycle_evidence` | Observed when current, historical, supersession, or tombstone evidence keeps the answer in typed `wrong_result` state. | + +The key qmd memory-evolution diagnosis is unchanged from the June 11 diagnostic: +qmd is `0/6` pass on live memory-evolution, misses three required evidence links, +and fails the delete/TTL tombstone job. The new report records that as typed +diagnosis evidence, not as a broad ELF-over-qmd claim. + +## OpenViking Scenario Outcomes + +| Scenario | Evidence Class | Result Type | ELF Outcome | Typed Blocker | +| --- | --- | --- | --- | --- | +| Docker local embedding setup | `live_baseline_only` | `pass` | `not_tested` | none | +| Same-corpus evidence-bearing retrieval precondition | `live_baseline_only` | `wrong_result` | `elf_win` | `output_missed_expected_terms` | +| Staged retrieval trajectory | `fixture_backed` | `blocked` | `not_tested` | `needs_evidence_bearing_same_corpus_output` | +| Hierarchy selection | `fixture_backed` | `blocked` | `not_tested` | `hierarchy_output_not_scored` | +| Recursive/context expansion | `fixture_backed` | `blocked` | `not_tested` | `recursive_expansion_not_materialized` | +| Missed expected terms evidence | `live_baseline_only` | `wrong_result` | `not_tested` | `retrieval_wrong_result` | + +Summary: OpenViking profile outcomes are `1` ELF win, `0` ties, `0` ELF losses, and +`5` not-tested scenarios. The single win is only the same-corpus evidence-bearing +precondition. The current smoke wrong-result is useful typed failure evidence, and the +three context-trajectory fixtures make the staged, hierarchy, and recursive jobs +visible as blocked work. They are not scored staged-trajectory comparisons, so +context-trajectory strengths remain not tested for win/tie/loss claims. + +## Claim Boundaries + +Allowed: + +- ELF ties qmd on the current encoded retrieval-correctness surfaces. +- qmd remains the local retrieval-debug UX reference on the currently evidenced query + transparency artifact ergonomics; query transparency and replayability are observed + but not scored as comparative ELF wins or losses. +- qmd expansion/fusion/rerank superiority is untested. +- OpenViking's Docker local embedding setup reaches runtime, and the baseline output + now exposes expected/matched/missing evidence ids, but context trajectory remains + blocked because evidence-bearing same-corpus retrieval is not passing and staged + artifacts are not materialized. +- ELF currently wins only the equivalent OpenViking same-corpus retrieval + precondition surface, not OpenViking's staged trajectory strengths. + +Not allowed: + +- Do not claim ELF broadly beats qmd. +- Do not claim qmd's debug ergonomics are equivalent to retrieval quality. +- Do not claim ELF beats OpenViking on staged retrieval, hierarchy, or recursive + context expansion. +- Do not turn `research_gate`, `blocked`, `not_encoded`, or `unsupported` surfaces + into wins or losses. + +## Validation Hook + +The checked-in consistency test reads the machine-readable companion report and +asserts the qmd/OpenViking scenario counts, diagnosis taxonomy, and bottom-line +claim boundaries. This keeps future report edits from silently converting untested +strength surfaces into pass claims. diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md new file mode 100644 index 00000000..40fca7fa --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -0,0 +1,292 @@ +# Temporal/History Competitor Gap Report - June 11, 2026 + +Goal: Turn the latest live measurements into a clear competitor-gap report and +future optimization direction for ELF without implementing optimization changes here. +Read this when: You need to decide whether ELF currently wins, ties, loses, or has +no comparable claim against qmd, mem0/OpenMemory, Graphiti/Zep, Letta, and adjacent +agent-memory projects on temporal history, lifecycle, and real-world memory use. +Inputs: Fresh local runs of Graphiti/Zep temporal smoke, ELF+mem0 live baseline, +fixture memory evolution, and ELF/qmd live real-world adapters on commit +`d6d9051`. +Outputs: Evidence-class boundaries, scenario judgments, claim limits, and a +prioritized benchmark-driven optimization plan. + +## Executive Judgment + +The overall goal is not complete. ELF does not yet have complete, comparable +benchmark wins across all tracked memory projects and all user-important memory +scenarios. + +Update after XY-924 and XY-931: mem0/OpenMemory local OSS history, local SDK +export-style readback, and a bounded OpenMemory export-helper setup probe are now measured +in `2026-06-11-mem0-openmemory-history-ui-export-report.md`. That report records mem0 +passes for preference correction history, entity-scoped personalization, deletion +audit history, and local `get_all` readback, while keeping OpenMemory UI/export +blocked by `DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER` and hosted Platform export plus +optional graph memory as local-lane non-goals. + +The current evidence supports a narrower judgment: + +- ELF remains a strong personal-production foundation because its core source of + truth, typed evidence, rebuild/backfill/restore story, and fixture benchmark + coverage are much more disciplined than most competitors. +- ELF now ties or beats mem0 only on the fresh basic local lifecycle smoke shape: + the combined Docker run passed `12/12` checks across ELF and mem0. This does not + measure OpenMemory UI, hosted behavior, entity history quality, optional graph + memory, or real-world temporal jobs. +- ELF narrowly beats qmd on the fresh live memory-evolution slice because ELF passes + the delete/TTL tombstone job that qmd fails, and ELF retrieves all required + memory-evolution evidence. This is still not a production-quality temporal memory + win because ELF fails five current-vs-historical jobs. +- Graphiti/Zep remains the strongest temporal-validity design reference, but the + local live smoke is typed `blocked` because no explicit provider API key was + configured. No ELF-over-Graphiti/Zep claim is allowed. +- Letta remains a core-vs-archival memory design reference. There is no contained + comparable live benchmark here, so no win, tie, or loss claim is allowed. + +The highest-value ELF direction is temporal reconciliation and lifecycle readback, +not more generic retrieval. In the failing temporal jobs ELF usually finds the +evidence but does not turn current, historical, superseded, and deleted facts into a +clear answer and trace. + +## Fresh Runs + +| Command | Result | Runtime | Main artifact | +| --- | --- | ---: | --- | +| `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal` | typed blocked | 3.5 seconds | `tmp/real-world-memory/graphiti-zep-smoke/summary.json` | +| `ELF_BASELINE_PROJECTS=ELF,mem0 cargo make baseline-live-docker` | pass | 50.14 seconds | `tmp/live-baseline/live-baseline-report.json` | +| `cargo make real-world-memory-evolution` | pass | 59.65 seconds | `tmp/real-world-memory/evolution-report.json` | +| `cargo make real-world-memory-live-adapters` | pass | 166.61 seconds | `tmp/real-world-memory/live-adapters/` | + +The Graphiti/Zep command did not use a hosted Zep service or unrecorded credentials. +It recorded a typed blocker: `provider_api_key_missing`. + +The ELF+mem0 baseline loaded the repository `.env` from the main checkout so the +container had the configured embedding environment. The report artifact still records +the local smoke embedding mode for this baseline path, so do not cite this run as a +4096-dimensional production-embedding quality test. + +## Evidence-Class Boundary + +| Evidence class | What it proves | What it does not prove | +| --- | --- | --- | +| Fixture memory-evolution pass | The benchmark contract can score current facts, historical facts, conflicts, update rationales, and history readback. | Live ELF or competitor runtime quality. | +| ELF/qmd live real-world adapters | Comparable live behavior for encoded suites in the checked-in runner. | Full memory-system superiority or unencoded suites. | +| ELF+mem0 live baseline | Basic Docker local same-corpus, update, delete, and reload lifecycle smoke. | OpenMemory UI, hosted behavior, real-world jobs, temporal history quality, or graph memory. | +| Graphiti/Zep typed blocker | The adapter has a Docker-local temporal smoke contract and typed provider boundary. | Live Graphiti/Zep search quality or ELF superiority over Graphiti/Zep. | +| Letta research-only state | Core-vs-archival memory is a relevant product pattern for ELF to borrow. | Comparable live results. | + +## Basic Local Lifecycle: ELF And mem0 + +The fresh `ELF,mem0` live-baseline run passed. + +| Project | Status | Checks | Runtime | What passed | +| --- | --- | ---: | ---: | --- | +| ELF | pass | `8/8` | 11 seconds | resumable backfill, same-corpus retrieval, async worker indexing, update, delete, cold-start reload, concurrent writes, resource envelope | +| mem0 | pass | `4/4` | 36 seconds | same-corpus retrieval, update, delete, cold-start reload | + +This updates the older mem0 local-baseline picture. For the basic Docker local +lifecycle smoke, mem0 should no longer be described as currently failing. + +It remains a limited comparison. ELF's smoke covers more local operational checks, +while mem0's strongest product claims are elsewhere: entity-scoped memory history, +OpenMemory inspection UX, hosted ecosystem behavior, and optional graph memory. Those +are not measured by this run. + +## Live Temporal Memory: ELF And qmd + +The fixture memory-evolution suite passed `5/5` with mean score `1.000`, expected +evidence `11/11`, conflict detection `5`, and update rationale count `5`. + +The fresh live adapters still fail the real temporal-history behavior. + +| Adapter | Jobs | Pass | Wrong-result jobs | Mean score | Expected evidence recall | Evidence coverage | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `38` | `18` | `5` | `0.525` | `41/77` | `48/84` | +| qmd live CLI adapter | `38` | `17` | `6` | `0.486` | `38/77` | `45/84` | + +For the `memory_evolution` suite: + +| Adapter | Encoded jobs | Job statuses | Score mean | Evidence recall | Diagnosis | +| --- | ---: | --- | ---: | ---: | --- | +| ELF live service adapter | `6` | `1` pass, `5` wrong_result | `0.492` | `1.000` | Finds the evidence, but does not narrate current-vs-historical conflict and lifecycle state. | +| qmd live CLI adapter | `6` | `0` pass, `6` wrong_result | `0.325` | `0.769` | Same lifecycle gap, plus missed evidence including the delete tombstone. | + +### Job-Level Pattern + +| Job | ELF | qmd | What the result means | +| --- | --- | --- | --- | +| `memory-evolution-benchmark-verdict-001` | wrong_result, `0.40`, evidence `3/3` | wrong_result, `0.15`, evidence `2/3` | ELF found current verdict, caveat, and rationale but did not represent the superseded verdict as historical. | +| `memory-evolution-deploy-method-001` | wrong_result, `0.40`, evidence `2/2` | wrong_result, `0.40`, evidence `2/2` | Both found current runbook and rationale, but neither preserved the old quickstart path as historical. | +| `memory-evolution-issue-state-001` | wrong_result, `0.40`, evidence `2/2` | wrong_result, `0.40`, evidence `2/2` | Both found current done state and rationale, but neither surfaced the earlier blocked state. | +| `memory-evolution-preference-001` | wrong_result, `0.40`, evidence `2/2` | wrong_result, `0.15`, evidence `1/2` | ELF found current preference and rationale, but did not preserve old preference history. | +| `memory-evolution-relation-temporal-001` | wrong_result, `0.35`, evidence `2/2` | wrong_result, `0.35`, evidence `2/2` | Both found current and old owners, but did not emit scored temporal-validity explanation. | +| `memory-evolution-delete-ttl-001` | pass, `1.00`, evidence `2/2` | wrong_result, `0.50`, evidence `1/2` | ELF found tombstone and current plan. qmd missed the tombstone. | + +The key ELF failure is not retrieval. The five wrong-result jobs all have evidence +grounding `1.0`, trap avoidance `1.0`, answer correctness `0.0`, and lifecycle +behavior `0.0`. ELF needs to reconcile and explain lifecycle state, not merely return +the right snippets. + +## Competitor Strengths And Current ELF Position + +| Scenario | Competitor/reference strength | Current evidence | ELF position | +| --- | --- | --- | --- | +| Basic local lifecycle | mem0 update/delete/reload | Fresh Docker baseline: ELF `8/8`, mem0 `4/4`, combined `12/12` | ELF ties or exceeds the encoded smoke surface, but does not beat OpenMemory UI/history/hosted claims. | +| Retrieval/debug | qmd transparent CLI, expansion/fusion/rerank/replay ergonomics | ELF/qmd live adapters pass retrieval suites; previous qmd debug profile exists | ELF is not clearly stronger. qmd remains the debug-UX bar. | +| Current-vs-historical memory | Graphiti/Zep temporal validity; mem0 history surfaces | ELF/qmd live memory-evolution wrong_result; Graphiti/Zep blocked; mem0 local OSS preference correction history now passes, but mem0 real-world prompt history is not encoded | ELF has a measured gap. It only narrowly beats qmd's current run and loses the local OSS preference-correction history scenario to mem0. | +| Delete/tombstone lifecycle | ELF production ops and qmd local replay | ELF passes delete/TTL job; qmd misses tombstone | ELF has a narrow measured win over qmd on this job. | +| Entity preference history | mem0/OpenMemory | XY-924 local OSS run passes mem0 preference correction history and entity-scoped personalization; XY-931 OpenMemory export-helper setup probe is blocked by missing Docker/OpenMemory product container access inside the baseline runner | ELF loses the preference-correction history scenario and ties the scoped-personalization scenario; no OpenMemory UI/export claim is allowed. | +| Core-vs-archival memory | Letta core memory blocks versus archival memory | Research-only, no contained live output | Not comparable. Borrow design only. | +| Context trajectory | OpenViking staged context and hierarchy | Existing adapter remains not encoded or wrong_result for trajectory | Not comparable. Need staged trajectory benchmark. | +| Capture and continuity | agentmemory, claude-mem hooks/viewers | Existing adapters are baseline-only and undermeasured | Not comparable. Need capture/write-policy and work-resume adapters. | +| Knowledge pages and graph/RAG navigation | llm-wiki, gbrain, graphify, RAGFlow, LightRAG, GraphRAG | llm-wiki/gbrain/GraphRAG/RAGFlow/LightRAG remain research-gate or blocked; graphify has a tiny scored `wrong_result` smoke | Not comparable for graph/RAG parity. Need larger Docker-contained evidence-linked adapters. | +| Production operation discipline | ELF backfill, restore, typed gates | Existing production adoption reports plus current benchmark discipline | ELF has the strongest measured local production-operation story, with private/provider gates still typed blocked. | + +## What ELF Should Borrow + +| Source | Best idea to absorb | Benchmark gate before any claim | +| --- | --- | --- | +| Graphiti/Zep | Validity windows, `valid_at`/`invalid_at`, current/historical/future fact separation, temporal relation provenance | Provider-backed Docker temporal smoke must map current, historical, and rationale facts to scored evidence ids. | +| mem0/OpenMemory | Entity-scoped memory history, user-visible lifecycle inspection, update/delete ergonomics | Local OSS history, correction, deletion, and SDK `get_all` readback are now scored; UI/export readback has a bounded export-helper setup probe but remains blocked until OpenMemory can run with the same corpus in its product app database. | +| Letta | Always-loaded core memory blocks separated from archival search | Add core-vs-archival jobs for attachment scope, provenance, fallback, and stale-core avoidance. | +| qmd | Local replay, candidate inspection, expansion/fusion/rerank debug knobs | ELF trace artifacts must show candidate generation, rerank, dropped evidence, conflict candidates, and replay commands. | +| OpenViking | Staged context trajectory and hierarchy | Encode trajectory jobs after evidence-bearing same-corpus output passes. | +| agentmemory and claude-mem | Capture breadth, continuity hooks, and viewer comfort | Live capture/write-policy benchmark must prove redaction, exclusion, source ids, and no secret leakage. | +| memsearch | User-inspectable canonical files and rebuild clarity | Source-of-truth/reindex benchmark must prove update/delete/reload without making derived vectors authoritative. | +| llm-wiki, gbrain, graphify, GraphRAG | Cited knowledge pages, timelines, graph reports, rebuild/lint loops | Knowledge-page rebuild/lint jobs must catch unsupported claims and stale sections. | + +## Optimization Direction + +These are future optimization directions, not implemented changes in this report. + +### P0 - Temporal Reconciliation Contract + +ELF should add an answer and trace contract for current-vs-historical memory: + +1. Identify current winner, historical loser, and update rationale for the same claim. +2. Preserve superseded facts as history instead of dropping or silently demoting them. +3. Expose tombstones and TTL invalidations as answerable lifecycle evidence. +4. Emit trace fields for conflict candidates, current selection, historical selection, + tombstone selection, and rationale selection. +5. Add scorer gates so a retrieved-but-not-narrated conflict remains `wrong_result`. + +Target benchmark: ELF live `memory_evolution` should pass all six jobs before any +claim that ELF has solved temporal memory. + +### P0 - mem0/OpenMemory History Comparison + +XY-924 moves the reproducible local OSS comparison past basic update/delete into +the product behavior users actually care about: + +1. preference history across correction events; +2. entity-scoped memory lookup and update; +3. local SDK inspection/export-style readback of memory lifecycle; +4. deletion versus historical audit readback; +5. optional graph-memory behavior only if the OSS path is reproducible in Docker. + +Target benchmark status: local OSS history jobs are now encoded with per-scenario +claims. OpenMemory UI/export readback has a bounded export-helper setup probe, but it +remains blocked until a dedicated OpenMemory compose/import path can load the same +corpus into the OpenMemory app database. Hosted Platform export plus optional graph +memory remain non-goals for the local OSS lane. + +### P0 - qmd-Level Debugging And Replay + +ELF should match qmd's practical debugging strengths: + +1. show query expansion, sparse/dense retrieval, fusion, rerank, and final selection; +2. mark candidate-drop reasons; +3. include replay commands that do not require raw SQL; +4. connect wrong-result scores to specific missing stages; +5. keep artifacts local and reproducible. + +Target benchmark: every wrong temporal or retrieval answer has a replayable trace that +explains whether evidence was absent, retrieved but dropped, selected but not narrated, +or contradicted by a higher-priority lifecycle fact. + +### P1 - Core Memory Blocks + +ELF should evaluate Letta-style core memory without weakening ELF's source-of-truth +discipline: + +1. scoped read-only core blocks; +2. provenance and source ids on every core assertion; +3. explicit attach/detach rules; +4. stale-core detection when archival evidence supersedes a core statement; +5. fallback to archival search when core memory is insufficient. + +Target benchmark: core-vs-archival jobs prove correct attachment, sharing, update +visibility, and stale-core avoidance. + +### P1 - Capture, Consolidation, And Knowledge Pages + +A good memory system is not only retrieval. ELF should benchmark and later optimize: + +1. safe capture/write policy with redaction and exclusion proof; +2. reviewable consolidation proposals with source lineage and unsupported-claim flags; +3. project/entity knowledge pages that rebuild from authoritative notes; +4. timelines for changed decisions, ownership, and production state; +5. operator UX that explains failures without raw database inspection. + +Target benchmark: live capture, consolidation, knowledge, and operator-debugging suites +must move from `not_encoded` or fixture-only to comparable live evidence. + +### P2 - Graph/RAG And Context-Trajectory Adapters + +Graph/RAG and context trajectory should be measured, not assumed: + +1. Graphiti/Zep for temporal graph facts; +2. RAGFlow, LightRAG, and GraphRAG for document/chunk/graph evidence handles; +3. graphify for graph-compressed navigation reports; +4. OpenViking for staged context trajectory; +5. llm-wiki and gbrain for maintained knowledge workflows. + +Target benchmark: each adapter must emit evidence-linked outputs from Docker-contained +or explicitly typed provider-backed runs before any ELF win/loss claim. + +## Claim Boundaries + +Allowed: + +- ELF+mem0 basic local lifecycle smoke passed in the fresh Docker baseline. +- ELF narrowly outperformed qmd on the fresh memory-evolution slice because ELF passed + delete/TTL and qmd did not. +- ELF still failed five of six live memory-evolution jobs. +- Graphiti/Zep temporal smoke is typed blocked due missing explicit provider key. +- Letta is a design reference, not a measured comparable competitor in this report. +- The next work should be benchmark/report driven before implementation work is + claimed successful. + +Not allowed: + +- Do not claim all goals are complete. +- Do not claim ELF beats all tracked memory projects. +- Do not claim ELF beats mem0/OpenMemory on UI/export, hosted behavior, entity + history, or graph memory. The current UI/export result is a setup blocker, not a + comparison win. +- Do not claim ELF beats Graphiti/Zep on temporal validity. +- Do not claim ELF beats Letta on core-vs-archival memory. +- Do not treat fixture pass, baseline smoke pass, and live real-world pass as the + same evidence class. + +## Next Concrete Report/Issue Directions + +1. Open or refine a P0 issue for ELF live temporal reconciliation and trace contract. +2. Follow up the XY-931 OpenMemory UI/export blocker with a Docker Compose/import + path that loads the same corpus into the OpenMemory product app database. +3. Open a P0 benchmark issue for ELF/qmd trace-level replay and wrong-result + diagnosis. +4. Open a P1 benchmark issue for Letta-style core-vs-archival memory. +5. Keep Graphiti/Zep provider-backed temporal smoke blocked until explicit provider + credentials are available, then rerun and compare validity-window behavior. +6. Keep graph/RAG and knowledge-page adapters as P2 until Docker-contained evidence + mappings are available. + +## Bottom Line + +ELF is not done competing. The evidence says ELF should keep its strict +source-of-truth and production-operation core, then absorb the best competitor ideas +behind benchmark gates. The immediate product-quality gap is temporal and lifecycle +memory: users need to know what is current, what changed, what was deleted, what is +historical, and why the system believes that answer. diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md new file mode 100644 index 00000000..9d1f9f7b --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -0,0 +1,160 @@ +# Dreaming-Readiness Stage Ledger - June 16, 2026 + +Goal: Define the Decodex benchmark gate for Dreaming-inspired ELF memory-system +optimization stages. +Read this when: You are starting or finishing a staged memory improvement lane and +need the baseline command matrix, typed evidence status, post-stage outcome, and +report shape required before claiming the stage improved. +Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 +competitor-strength, temporal-history, and iteration-direction reports, the XY-905 +June 16 live temporal reconciliation report, the consolidation proposal spec, the +memory summary spec, the XY-953 proactive brief scoring report, the XY-954 scheduled +memory task scoring report, and the checked-in real-world fixture suites. +Outputs: A stage-by-stage ledger that downstream issues can update with +`improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments. + +## Executive Judgment + +This ledger does not claim a broad product win. It records the gate later product +lanes must pass before they can claim a Dreaming or competitor-inspired stage is done, +and now includes the XY-905 post-stage result for live temporal reconciliation. + +Current stage status: + +- `improved`: current-vs-historical correctness, preference evolution, reviewable + consolidation, memory-summary/top-of-mind fixture readback, proactive brief fixture + scoring, and scheduled-memory task fixture scoring. +- `regressed`: none. +- `unchanged`: deletion/TTL/tombstone behavior and the final competitor retest + baseline. +- `blocked`: none. +- `not_tested`: none. + +The known live `memory_evolution` loss is now repaired for the encoded ELF live +adapter slice: the XY-905 run passes all six memory-evolution jobs and reports +current, historical, rationale, tombstone, invalidation, selected, dropped, and +non-narrated evidence fields. This is not a private-corpus, hosted memory, or broad +competitor-superiority claim. + +Reviewable consolidation is also improved for the narrow ELF self-check: XY-934 adds +service-backed proposal materialization, source lineage, confidence/usefulness, +unsupported-claim flags, apply/defer/discard audit transitions, and zero source +mutations. Direct competitor runners remain untested or product-reference only. + +Memory summary and top-of-mind behavior is improved only at the fixture-backed +contract level: XY-952 adds a reviewable `elf.memory_summary/v1` source-trace fixture +that distinguishes current top-of-mind, background, stale, superseded, tombstoned, and +derived project-profile entries. It does not prove live top-of-mind product behavior or +parity with managed memory products. + +Proactive brief readiness is improved only at the fixture-backed benchmark level: +XY-953 adds a direct `proactive_brief` suite with daily project brief, resume-work +brief, stale decision audit, stale plan/preference warning, and private-corpus refresh +blocker scenarios. It does not prove OpenAI Pulse parity, hosted managed-memory +parity, background scheduling, or private-corpus production quality. + +Scheduled-memory task readiness is improved only at the fixture-backed benchmark +level: XY-954 adds a direct `scheduled_memory` suite with weekly project status +summary, stale preference/plan audit, stale decision audit, knowledge-page refresh +suggestion, and private/provider scheduler blocker scenarios. It does not prove a +hosted scheduler, ChatGPT Tasks parity, Pulse parity, notification delivery, +provider-backed private-corpus quality, or silent source mutation safety. + +## Ledger Rules + +- Every downstream Dreaming or competitor-improvement stage must write a post-stage + JSON report and Markdown summary before claiming phase completion. +- The report must compare against the baseline counts in + `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. +- The comparison judgment must be one of `improved`, `regressed`, `unchanged`, + `blocked`, or `not_tested`. +- Typed non-pass labels stay typed. Do not collapse `wrong_result`, `blocked`, + `not_tested`, `not_encoded`, `incomplete`, `lifecycle_fail`, `unsupported`, or + `non_goal` into a single pass/fail label. +- Fixture-backed evidence proves benchmark shape only. It does not prove live product + behavior. +- Private-corpus and provider-backed gates remain typed blocked unless an operator + supplies explicit inputs; those boundaries are tied to XY-930. + +## Stage Command Matrix + +| Stage | Baseline command(s) | Required post-stage command(s) | Baseline counts | Post-stage counts | Judgment | Next optimization direction | +| --- | --- | --- | --- | --- | --- | --- | +| Current-vs-historical correctness | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters` | Same commands; publish post-stage JSON and Markdown evidence | `pass=1`, `wrong_result=5`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=6`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from benchmark materialization into service-native temporal reconciliation APIs and compare against mem0/OpenMemory history and Graphiti/Zep temporal graph evidence without broad superiority claims. | +| Preference evolution and correction history | `cargo make real-world-memory-evolution`; `cargo make real-world-memory-live-adapters`; `cargo make openmemory-ui-export-readback` | Same commands; include mem0/OpenMemory boundary evidence | `pass=0`, `wrong_result=1`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Measure preference correction against mem0/OpenMemory history and UI/export surfaces before making any broader history-quality claim. | +| Deletion, TTL, and tombstone behavior | `cargo make real-world-memory`; `cargo make real-world-memory-live-adapters` | Same commands | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `pass=1`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `unchanged` | Extend tombstone and TTL readback beyond the single encoded job into update/delete/recreate history cases. | +| Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | +| Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. | +| Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | `cargo make real-world-memory-proactive-brief`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/rationale coverage `1.000`; invalid-current and tombstone violations `0` | `improved` | Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind owned lanes and operator inputs. | +| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-scheduled`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/action/trace coverage `1.000`; invalid-current, unsupported-current, tombstone, and source-mutation violations `0` | `improved` | Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs. | +| Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. | + +## Evidence Anchors + +| Stage | Evidence file(s) | +| --- | --- | +| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` | +| Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | +| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | +| Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Proactive brief readiness | `docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `docs/research/2026-06-16-proactive-brief-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | +| Scheduled memory task readiness | `docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `docs/research/2026-06-16-scheduled-memory-task-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/research/2026-06-08-agent-memory-selection.json` | +| Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | + +## Report Shape For Downstream Issues + +Downstream stage reports should use the same fields as the JSON ledger: + +- `stage_id` +- `baseline_commands` +- `post_stage_commands` +- `evidence_files` +- `baseline_counts` +- `post_stage_counts` +- `comparison_judgment` +- `regression_rule` +- `improvement_rule` +- `next_optimization_direction` + +If a stage cannot run because credentials, private corpus, provider setup, or a +product surface is absent, record `blocked` or `not_tested` with the concrete blocker. +Do not silently drop the stage from the report. + +## Claim Boundaries + +Allowed: + +- The Dreaming-readiness gate exists and names required stage commands and evidence + files. +- The current ledger preserves typed non-pass states and records the XY-905 live + memory-evolution improvement. +- The current ledger records the XY-952 fixture-backed memory-summary/source-trace + contract improvement. +- The current ledger records the XY-953 fixture-backed proactive brief scoring + improvement with source refs, freshness/currentness markers, reject/defer rationale, + and typed private-corpus blocking. +- The current ledger records the XY-954 fixture-backed scheduled-memory scoring + improvement with source refs, freshness/currentness markers, action rationale, + completed trace readback, zero source mutations, and typed private/provider blocking. +- Fixture-backed knowledge and core/archival jobs can be used as regression guards for + report shape. +- Reviewable consolidation now has ELF live service-backed proposal scoring evidence, + with direct competitor runners still untested. + +Not allowed: + +- Do not claim this ledger proves preference history against mem0/OpenMemory, + live top-of-mind behavior, live proactive brief behavior, hosted scheduled tasks, + private-corpus gates, hosted memory, broad consolidation superiority, or competitor + adapters. +- Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity or + hosted managed-memory parity. +- Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks, Pulse, + hosted scheduler, notification, provider-backed private-corpus, or silent-mutation + parity. +- Do not claim ELF has full-suite live real-world pass evidence. +- Do not claim private-corpus or provider-backed production quality without the + operator-owned inputs required by XY-930. +- Do not claim fixture-only or smoke-only evidence proves broad competitor + superiority. diff --git a/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md b/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md new file mode 100644 index 00000000..4e7f8302 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md @@ -0,0 +1,86 @@ +# Live Consolidation Proposal Scoring Report - June 16, 2026 + +Goal: Record the XY-934 live consolidation proposal scoring evidence and product +reference boundaries. +Read this when: You need to know whether ELF has live evidence for reviewable +consolidation proposal generation, source lineage, confidence, unsupported-claim +flags, and apply/defer/discard review audit transitions. +Inputs: `cargo make real-world-memory-consolidation`, +`cargo make real-world-memory-live-consolidation`, +`apps/elf-eval/fixtures/real_world_memory/consolidation/`, +`apps/elf-eval/src/bin/real_world_live_adapter.rs`, and +`docs/spec/system_consolidation_proposals_v1.md`. +Outputs: Scenario-level consolidation results, live artifacts, and typed comparison +boundaries for managed dreaming and Always-On Memory Agent style references. + +## Verdict + +ELF now has service-backed live consolidation proposal scoring. The narrow live +command materializes all 4 `consolidation` jobs through `ElfService` consolidation +run creation, worker proposal materialization, and review-action audit transitions. + +This is not scheduled production consolidation and not live provider generation. The +run uses the deterministic fixture/manual proposal payload boundary required by +`elf.consolidation/v1`: source notes are immutable, proposals are derived outputs, and +review actions are explicit artifacts. + +## Fresh Runs + +| Command | Result | Artifact | +| --- | --- | --- | +| `cargo make real-world-memory-consolidation` | pass | `tmp/real-world-memory/consolidation/report.json` | +| `cargo make real-world-memory-live-consolidation` | pass | `tmp/real-world-memory/live-consolidation/summary.json` | + +## ELF Live Consolidation Results + +| Job | Live status | Source refs | Review action | Final review state | Unsupported claims | Source mutations | +| --- | --- | ---: | --- | --- | ---: | ---: | +| `consolidation-project-summary-apply-001` | `pass` | `2` | `apply` | `applied` | `0` | `0` | +| `consolidation-weekly-decision-summary-apply-001` | `pass` | `2` | `apply` | `applied` | `0` | `0` | +| `consolidation-preference-candidate-defer-001` | `pass` | `2` | `defer` | `archived` | `0` | `0` | +| `consolidation-contradiction-report-discard-001` | `pass` | `3` | `discard` | `rejected` | `1` | `0` | + +The generated benchmark report keeps the same consolidation metrics as the fixture +report: + +- `proposal_count = 4` +- `lineage_completeness = 1.0` +- `review_action_correctness = 1.0` +- `proposal_unsupported_claim_count = 1` +- `source_mutation_count = 0` +- `executable_gap_count = 0` + +The materialization artifact records service-backed run ids, proposal ids, source +lineage counts, unsupported-claim flag counts, review-event counts, review actions, +and final review states. It does not claim source memory rewrites. + +## Comparison Boundary + +| Compared target | Position | Reason | +| --- | --- | --- | +| qmd live real-world adapter | `untested` | qmd keeps consolidation jobs typed `not_encoded`; no qmd consolidation proposal generator or review-action audit runner exists in this benchmark. | +| Managed dreaming memory systems | `product_reference` | Managed dreaming motivates the proposal-review shape, but no contained runner emits comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | +| Always-On Memory Agent patterns | `product_reference` | Always-on scheduling remains a reference only. XY-934 does not implement scheduled consolidation and does not allow silent source-of-truth rewrites. | + +## Claims Allowed + +- ELF live consolidation self-checks pass for proposal materialization, source + lineage, confidence/usefulness thresholds, unsupported-claim flags, and + apply/defer/discard audit transitions. +- Fixture consolidation passes and live service-backed consolidation evidence are + separate evidence classes. +- qmd and other tracked projects remain untested or reference-only for live + consolidation proposal scoring until a contained runner emits comparable artifacts. +- Derived-output safety claims are tied to source lineage, immutable source snapshots, + zero source mutations, and review-action artifacts. + +## Claims Not Allowed + +- Do not claim scheduled production consolidation exists. +- Do not claim live provider-generated consolidation quality; the accepted + `elf.consolidation/v1` service boundary is deterministic fixture/manual proposal + materialization. +- Do not claim ELF broadly beats managed dreaming, Always-On Memory Agent, + agentmemory, qmd, or llm-wiki on consolidation without comparable contained live + runners. +- Do not mix knowledge-page rebuild/lint scoring into the consolidation claim. diff --git a/docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md b/docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md new file mode 100644 index 00000000..f4385ad3 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md @@ -0,0 +1,120 @@ +# Live Temporal Reconciliation Report - June 16, 2026 + +Goal: Record the XY-905 live memory-evolution before/after result and trace contract. +Read this when: You need the current evidence for ELF live current-vs-historical, +supersession, rationale, tombstone, and invalidation behavior. +Inputs: `cargo make real-world-memory-evolution`, `cargo make +real-world-memory-live-adapters`, and +`docs/research/2026-06-16-live-temporal-reconciliation-report.json`. +Outputs: A scoped benchmark result for ELF live `memory_evolution` only. + +## Executive Judgment + +XY-905 improves the encoded ELF live `memory_evolution` slice. The fresh Docker live +adapter sweep shows ELF passing all six memory-evolution jobs with current, +historical, rationale, tombstone, invalidation, selected, dropped, and non-narrated +evidence fields exposed. + +This is not a broad competitor-superiority claim. It does not prove ELF beats +Graphiti/Zep, mem0/OpenMemory, Letta, qmd broadly, hosted memory products, private +corpus gates, or provider-backed production quality. + +## Commands + +| Command | Result | Main artifact | +| --- | --- | --- | +| `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | pass | stdout | +| `cargo make real-world-memory-evolution` | pass | `tmp/real-world-memory/evolution-report.json` | +| `cargo make real-world-memory-live-adapters` | pass | `tmp/real-world-memory/live-adapters/summary.json` | + +The live adapter run completed in 187.57 seconds. It emitted the pre-existing Qdrant +client/server compatibility warning, but the command completed and wrote ELF and qmd +reports. + +## Before And After + +| Adapter | Stage | Jobs | Status counts | Score mean | Expected evidence recall | Judgment | +| --- | --- | ---: | --- | ---: | ---: | --- | +| ELF live service adapter | June 11 baseline | 6 | `pass=1`, `wrong_result=5` | `0.492` | `1.000` | baseline loss | +| ELF live service adapter | XY-905 post-stage | 6 | `pass=6`, `wrong_result=0` | `1.000` | `1.000` | improved | +| qmd live CLI adapter | June 11 baseline | 6 | `pass=0`, `wrong_result=6` | `0.325` | `0.769` | baseline non-pass | +| qmd live CLI adapter | XY-905 post-stage | 6 | `pass=0`, `wrong_result=6` | `0.325` | `0.769` | unchanged non-pass | + +ELF full live adapter summary after XY-905: 55 jobs, 40 pass, 0 wrong_result, 5 +blocked, 10 not_encoded, mean score 0.727, expected evidence recall 0.655. + +## ELF Memory Evolution Result + +| Job | Status | Selected lifecycle evidence | +| --- | --- | --- | +| `memory-evolution-benchmark-verdict-001` | pass | current verdict, historical not-ready verdict, update rationale | +| `memory-evolution-deploy-method-001` | pass | current production runbook, historical quickstart, supersession rationale | +| `memory-evolution-issue-state-001` | pass | current done state, historical blocked state, resolution rationale | +| `memory-evolution-preference-001` | pass | current preference, historical preference, rationale | +| `memory-evolution-relation-temporal-001` | pass | current owner, historical owner, temporal rationale | +| `memory-evolution-delete-ttl-001` | pass | current plan, tombstone, invalidation evidence | + +The suite reports conflict detection count `5`, update rationale availability count +`6`, temporal-validity not-encoded count `0`, and history-readback encoded count `1`. + +## Trace Contract + +The report JSON now exposes selected lifecycle evidence fields: + +- `selected_current_evidence` +- `selected_historical_evidence` +- `selected_rationale_evidence` +- `selected_tombstone_evidence` +- `selected_invalidation_evidence` +- `conflict_candidate_evidence` +- `retrieved_but_dropped_evidence` +- `selected_but_not_narrated_evidence` + +The ELF materialization artifact also records: + +- current winner evidence +- historical loser evidence +- supersession rationale evidence +- tombstone and invalidation evidence +- retrieved, selected, absent, retrieved-but-dropped, selected-but-not-narrated, and + lifecycle-demoted evidence ids + +The scorer still fails selected-but-not-narrated conflicts as `wrong_result`; the +targeted integration test mutates a passing preference fixture to select the +historical evidence without attaching it to the current-preference conflict claim and +confirms the job remains `wrong_result`. + +## Ledger Update + +The XY-951 ledger now records: + +- `current_vs_historical_correctness`: improved from `pass=1`, `wrong_result=5` to + `pass=6`, `wrong_result=0`. +- `preference_evolution`: improved from `pass=0`, `wrong_result=1` to `pass=1`, + `wrong_result=0`. +- `deletion_ttl_tombstone_behavior`: unchanged at `pass=1`, `wrong_result=0`, with + tombstone and invalidation evidence now explicit in report fields. + +## Claim Boundaries + +Allowed: + +- ELF live `memory_evolution` now passes all six encoded jobs in the XY-905 run. +- The trace/readback contract distinguishes current, historical, rationale, + tombstone, invalidation, selected, dropped, non-narrated, and lifecycle-demoted + evidence. +- qmd remains `wrong_result` on this memory-evolution slice in the same run. + +Not allowed: + +- Do not claim ELF broadly beats qmd as a memory system. +- Do not claim ELF beats Graphiti/Zep, mem0/OpenMemory, or Letta. +- Do not claim private-corpus, hosted memory, OpenMemory UI/export, or provider-backed + production quality from this issue. + +## Next Direction + +Move this reconciliation contract from benchmark materialization toward service-native +temporal answer/readback APIs. Then compare against mem0/OpenMemory history and +Graphiti/Zep temporal graph gates before making broader history or temporal-memory +claims. diff --git a/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md b/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md new file mode 100644 index 00000000..255c544d --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md @@ -0,0 +1,100 @@ +# Proactive Brief Scoring Report - June 16, 2026 + +Purpose: Publish the XY-953 fixture-backed proactive project brief scoring result. +Status: benchmark report +Read this when: You need the current proactive-brief fixture evidence, stage-ledger +delta, and claim boundaries. +Not this document: A scheduler design, morning-dashboard UI, private-corpus run, or +hosted managed-memory comparison. +Source: `docs/research/2026-06-16-proactive-brief-scoring-report.json`. + +## Summary + +`cargo make real-world-memory-proactive-brief` now scores a direct +`proactive_brief` fixture suite. The suite has 5 jobs: 4 pass, 1 blocked, 0 +wrong_result, and 0 unsupported-claim results. + +The four runnable jobs produce 5 suggestions across daily project brief, +resume-work brief, stale decision audit, and stale plan/preference warning scenarios. +Suggestion evidence-ref coverage is `5/5`; freshness/currentness coverage is `1.000`; +action-rationale coverage is `1.000`. The suite records 2 recommendations, 2 defers, +and 1 rejection, with 0 invalid-current suggestions and 0 tombstone violations. + +The private-corpus refresh scenario remains a typed blocker tied to XY-930 because no +operator-owned private production corpus manifest is available. This is intentional: +the benchmark must not require private corpus access and must not turn missing private +inputs into a fixture pass. + +## Fixture Results + +| Job | Status | Suggestion kind | Decision | Evidence and freshness outcome | +| --- | --- | --- | --- | --- | +| `proactive-daily-project-brief-001` | `pass` | `daily_project_brief` | `recommend` | Current source refs selected; stale Pulse-parity trap dropped. | +| `proactive-resume-work-brief-001` | `pass` | `resume_work` | `recommend` | Current handoff and validation refs selected; stale branch trap dropped. | +| `proactive-stale-decision-audit-001` | `pass` | `stale_decision_audit` | `defer` | Superseded decision is surfaced as stale, not current. | +| `proactive-stale-plan-preference-warning-001` | `pass` | `stale_plan_preference_warning` | `defer`, `reject` | Expired, superseded, and tombstoned sources are warning inputs, not current recommendations. | +| `proactive-private-corpus-refresh-blocked-001` | `blocked` | `private_corpus_refresh` | blocked | Private-corpus refresh stays blocked until XY-930 operator inputs exist. | + +## Aggregate Delta + +The root fixture aggregate after XY-953 is: + +| Metric | Value | +| --- | ---: | +| Jobs | `55` | +| Encoded suites | `15` | +| Pass | `49` | +| Blocked | `6` | +| Wrong result | `0` | +| Incomplete | `0` | +| Not encoded | `0` | +| Unsupported claim count | `0` | +| Evidence coverage | `123/123` | +| Source-ref coverage | `123/123` | +| Quote coverage | `123/123` | +| Expected evidence recall | `1.000` | +| Mean score | `0.891` | + +XY-951 stage-ledger delta for `proactive_brief_readiness`: + +| Baseline | After XY-953 | Judgment | +| --- | --- | --- | +| `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `improved` | + +## Regression Guards + +The proactive scorer fails or downgrades output when a suggestion: + +- lacks evidence refs, +- lacks freshness/currentness markers, +- lacks a reject/defer/recommend rationale, +- presents stale, superseded, expired, or tombstoned evidence as current, +- ignores TTL invalidations or tombstones, +- carries unsupported current-suggestion flags, +- or claims private-corpus, Pulse, or hosted managed-memory parity from fixture-only + output. + +## Claim Boundaries + +Allowed: + +- ELF now has fixture-backed proactive brief scoring for project briefs and stale + context warnings. +- Passing proactive suggestions include evidence refs, freshness/currentness markers, + and action rationale. +- The private-corpus refresh case is encoded as a typed blocker tied to XY-930. + +Not allowed: + +- Do not claim OpenAI Pulse parity. +- Do not claim hosted managed-memory parity. +- Do not claim scheduler, morning-dashboard, or background execution behavior. +- Do not claim private-corpus refresh quality without operator-owned inputs. +- Do not treat proactive suggestions as authoritative notes; they are derived, + source-linked output that must remain reviewable. + +## Next Direction + +Move from fixture-backed proactive brief scoring into service-native generated brief +readback and later live adapter materialization. Scheduling and private-corpus refresh +remain owned by their separate lanes and operator-input gates. diff --git a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md new file mode 100644 index 00000000..f0d5dedd --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md @@ -0,0 +1,400 @@ +# Real-World Job Benchmark Report + +Goal: Publish a Markdown summary for one generated real_world_job benchmark report. +Read this when: You need a durable smoke report for real-world agent memory job fixtures. +Inputs: `tmp/real-world-memory/scheduled/report.json`. +Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Verification: Compare this Markdown summary with the source JSON before committing. + +## Summary + +- Run ID: `real-world-memory-scheduled` +- Generated at: `2026-06-16T16:29:13.720856Z` +- Runner version: `0.2.0-7f08eb504271123fa861e24e6e6861227682acda-aarch64-apple-darwin` +- Corpus profile: `mixed` +- Adapter: `fixture_scheduled_memory` (offline_fixture_response) +- Jobs: `5` +- Suites with encoded jobs: `1` +- Suites with `not_encoded` status: `15` +- Status summary: `4` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `1` blocked, `0` not_encoded, `0` unsupported_claim +- Unsupported claim count: `0` +- Wrong-result count: `0` +- Stale-answer count: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` +- History readback encoded: `0` +- Evidence coverage: `10/10` (`1.000`) +- Source-ref coverage: `10/10` (`1.000`) +- Quote coverage: `10/10` (`1.000`) +- Stale retrieval count: `0` +- Scope correctness: `0/0` (`0.000`), violations `0` +- Redaction leak count: `0` +- Qdrant rebuild cases: `0` encoded, `0` pass +- Expected evidence recall: `1.000` (10/10) +- Irrelevant context ratio: `0.000` (0 irrelevant) +- Trace explainability: `0` job(s), `0` wrong-result stage attribution(s) +- Consolidation source mutation count: `0` +- Mean score: `0.800` +- Mean latency: `2.000 ms` +- Cost: `0.000 USD` +- Operator-debug jobs: `0` +- Raw SQL needed: `0` +- Trace-incomplete debug jobs: `0` +- Operator UX gaps: `0` +- Scheduled memory outputs: `5` across `4` task run(s) +- Scheduled memory evidence-ref coverage: `5/5` (`1.000`) +- Scheduled memory freshness/action/trace coverage: `1.000` / `1.000` / `1.000` +- Scheduled memory stale/currentness violations: `0` invalid current, `0` tombstone violation(s) +- Scheduled memory source mutations: `0` +- Private corpus redaction: `publish evidence ids and bounded score summaries only; do not publish private text` + +## External Adapter Coverage + +This section is manifest-backed. It records external adapter coverage and blockers, but it does not convert live-baseline retrieval results into real-world suite wins. + +- Manifest: `real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store` +- Docker default: `true` via `docker-compose.baseline.yml`; artifact dir `tmp/live-baseline/` +- Adapter records: `23` total, `16` external project(s), `23` Docker-default, `0` requiring host-global installs +- Evidence classes: `1` fixture-backed, `6` live-baseline-only, `5` live real-world, `11` research-gate +- Overall statuses: `blocked=7, wrong_result=6, lifecycle_fail=1, pass=4, not_encoded=5` +- Capability coverage statuses: `real=8, mocked=1, unsupported=6, blocked=22, wrong_result=10, pass=30, not_encoded=26` +- Real-world suite statuses: `blocked=23, wrong_result=7, pass=27, not_encoded=38` +- Scenario coverage statuses: `unsupported=3, blocked=12, incomplete=1, wrong_result=6, lifecycle_fail=1, pass=23, not_encoded=11` +- ELF scenario positions: `wins=10, ties=11, loses=1, untested=35` +- Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=17, blocked=13, non_goal=5` + +| Project | Adapter | Evidence Class | Overall | Setup | Run | Result | Docker | Suites | Evidence | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `elf_real_world_memory_fixture` | `fixture_backed` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `trust_source_of_truth`: `pass`<br>`work_resume`: `pass`<br>`project_decisions`: `pass`<br>`retrieval`: `pass`<br>`memory_evolution`: `pass`<br>`consolidation`: `pass`<br>`memory_summary`: `pass`<br>`proactive_brief`: `blocked`<br>`scheduled_memory`: `blocked`<br>`knowledge_compilation`: `pass`<br>`operator_debugging_ux`: `pass`<br>`capture_integration`: `pass`<br>`core_archival_memory`: `pass`<br>`production_ops`: `blocked`<br>`personalization`: `pass`<br>`context_trajectory`: `blocked` | setup: `cargo make real-world-memory`<br>result: `tmp/real-world-memory/real-world-memory-report.md` | +| ELF | `elf_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`<br>`work_resume`: `pass`<br>`retrieval`: `pass`<br>`project_decisions`: `pass`<br>`memory_evolution`: `wrong_result`<br>`consolidation`: `pass`<br>`knowledge_compilation`: `pass`<br>`operator_debugging_ux`: `pass`<br>`capture_integration`: `pass`<br>`production_ops`: `blocked`<br>`personalization`: `pass`<br>`core_archival_memory`: `not_encoded`<br>`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`<br>result: `tmp/real-world-memory/live-adapters/elf-report.md` | +| qmd | `qmd_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `retrieval`: `not_encoded`<br>`memory_evolution`: `not_encoded`<br>`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker`<br>result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| qmd | `qmd_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`<br>`work_resume`: `pass`<br>`retrieval`: `pass`<br>`project_decisions`: `pass`<br>`memory_evolution`: `wrong_result`<br>`consolidation`: `not_encoded`<br>`knowledge_compilation`: `not_encoded`<br>`operator_debugging_ux`: `wrong_result`<br>`capture_integration`: `not_encoded`<br>`production_ops`: `blocked`<br>`personalization`: `pass`<br>`core_archival_memory`: `not_encoded`<br>`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`<br>result: `tmp/real-world-memory/live-adapters/qmd-report.md` | +| ELF | `elf_operator_debug_live` | `live_real_world` | `pass` | `pass` | `pass` | `pass` | `true` | `operator_debugging_ux`: `pass` | setup: `cargo make real-world-job-operator-ux-live-adapters`<br>result: `tmp/real-world-job/operator-ux-live-adapters/elf-report.md` | +| qmd | `qmd_operator_debug_live` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `operator_debugging_ux`: `wrong_result` | setup: `cargo make real-world-job-operator-ux-live-adapters`<br>result: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.md` | +| agentmemory | `agentmemory_live_baseline` | `live_baseline_only` | `lifecycle_fail` | `pass` | `lifecycle_fail` | `lifecycle_fail` | `true` | `work_resume`: `blocked`<br>`capture_integration`: `blocked`<br>`memory_evolution`: `blocked` | setup: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`<br>result: `tmp/live-baseline/live-baseline-report.json` | +| mem0/OpenMemory | `mem0_openmemory_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `memory_evolution`: `not_encoded`<br>`personalization`: `not_encoded`<br>`operator_debugging_ux`: `blocked` | setup: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`<br>result: `tmp/live-baseline/live-baseline-report.json` | +| memsearch | `memsearch_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `trust_source_of_truth`: `not_encoded`<br>`retrieval`: `not_encoded`<br>`memory_evolution`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker`<br>result: `tmp/live-baseline/live-baseline-report.json` | +| OpenViking | `openviking_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `retrieval`: `wrong_result`<br>`work_resume`: `not_encoded`<br>`context_trajectory`: `blocked` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`<br>result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| claude-mem | `claude_mem_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `work_resume`: `not_encoded`<br>`operator_debugging_ux`: `blocked`<br>`capture_integration`: `blocked` | setup: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`<br>result: `tmp/live-baseline/live-baseline-report.json` | +| qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`<br>`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`<br>result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | +| OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`<br>`context_trajectory`: `blocked`<br>`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`<br>result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | +| RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`<br>`knowledge_compilation`: `not_encoded`<br>`production_ops`: `blocked` | setup: `cargo make smoke-ragflow-docker`<br>result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | +| LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`<br>`memory_evolution`: `not_encoded`<br>`operator_debugging_ux`: `not_encoded` | setup: `cargo make smoke-lightrag-docker-context`<br>result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | +| GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`<br>`retrieval`: `not_encoded`<br>`production_ops`: `not_encoded`<br>`memory_evolution`: `not_encoded` | setup: `cargo make smoke-graphrag-docker`<br>result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | +| Graphiti/Zep | `graphiti_zep_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `memory_evolution`: `blocked`<br>`retrieval`: `not_encoded`<br>`production_ops`: `not_encoded` | setup: `cargo make smoke-graphiti-zep-docker-temporal`<br>result: `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` | +| Letta | `letta_research_gate` | `research_gate` | `blocked` | `blocked` | `not_encoded` | `not_encoded` | `true` | `personalization`: `not_encoded`<br>`project_decisions`: `not_encoded`<br>`work_resume`: `not_encoded`<br>`core_archival_memory`: `blocked` | setup: `Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored.`<br>result: `No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed.` | +| LangGraph | `langgraph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `production_ops`: `not_encoded`<br>`work_resume`: `not_encoded` | setup: `LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter.`<br>result: `No production-ops or resume suite result is claimed.` | +| nanograph | `nanograph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `memory_evolution`: `not_encoded`<br>`retrieval`: `not_encoded` | setup: `nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented.`<br>result: `No graph temporal or retrieval-debug result is claimed.` | +| llm-wiki | `llm_wiki_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`<br>`work_resume`: `not_encoded` | setup: `llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented.`<br>result: `No knowledge page citation or lint result is claimed.` | +| gbrain | `gbrain_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`<br>`operator_debugging_ux`: `not_encoded` | setup: `gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented.`<br>result: `No knowledge-synthesis or operator-continuity result is claimed.` | +| graphify | `graphify_docker_smoke` | `live_real_world` | `wrong_result` | `pass` | `pass` | `wrong_result` | `true` | `knowledge_compilation`: `wrong_result`<br>`retrieval`: `blocked`<br>`work_resume`: `not_encoded` | setup: `cargo make smoke-graphify-docker-graph-report`<br>result: `tmp/real-world-memory/graphify-smoke/graphify-report.json` | + +### Adapter Capability Details + +| Adapter | Capability | Status | Evidence | +| --- | --- | --- | --- | +| `elf_real_world_memory_fixture` | real_world_job_fixture_scoring | `real` | The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output. | +| `elf_real_world_memory_fixture` | live_external_adapter_execution | `not_encoded` | The ELF fixture response path does not exercise an external memory project runtime. | +| `elf_real_world_memory_fixture` | docker_isolated_baseline | `pass` | ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence. | +| `elf_live_real_world` | real_world_job_adapter | `pass` | The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring. | +| `elf_live_real_world` | service_runtime_execution | `real` | The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker. | +| `elf_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `elf_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass. | +| `elf_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `elf_live_real_world` | typed_failure_reporting | `pass` | Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades. | +| `qmd_live_baseline` | same_corpus_retrieval | `pass` | qmd has an encoded Docker same-corpus retrieval adapter. | +| `qmd_live_baseline` | update_delete_cold_start | `pass` | qmd lifecycle smoke checks are encoded in the live-baseline runner. | +| `qmd_live_baseline` | real_world_job_adapter | `not_encoded` | This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep. | +| `qmd_live_real_world` | real_world_job_adapter | `pass` | qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts. | +| `qmd_live_real_world` | local_cli_retrieval | `real` | The adapter uses qmd collection add, update, embed -f, and query --json inside Docker. | +| `qmd_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `qmd_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked. | +| `qmd_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `qmd_live_real_world` | typed_failure_reporting | `pass` | qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts. | +| `elf_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures. | +| `elf_operator_debug_live` | trace_hydration_metadata | `pass` | Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true. | +| `elf_operator_debug_live` | replay_command_metadata | `pass` | Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required. | +| `elf_operator_debug_live` | candidate_drop_visibility | `pass` | The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection. | +| `elf_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This ELF live slice does not launch OpenMemory or claude-mem UI flows. | +| `qmd_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures. | +| `qmd_operator_debug_live` | local_replay_command_metadata | `pass` | Generated operator_debug records include qmd query replay commands tied to per-job collections. | +| `qmd_operator_debug_live` | trace_hydration_metadata | `wrong_result` | Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration. | +| `qmd_operator_debug_live` | candidate_drop_visibility | `wrong_result` | qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact. | +| `qmd_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This qmd live slice does not launch OpenMemory or claude-mem UI flows. | +| `agentmemory_live_baseline` | same_corpus_retrieval | `pass` | The current adapter can run mem::remember and mem::search against the shared corpus. | +| `agentmemory_live_baseline` | adapter_storage | `mocked` | The current adapter uses a process-local StateKV Map and in-memory index. | +| `agentmemory_live_baseline` | durable_cold_start | `blocked` | A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored. | +| `agentmemory_live_baseline` | durable_work_resume_capture_path | `blocked` | XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring. | +| `agentmemory_live_baseline` | write_policy_hook_capture | `blocked` | Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks. | +| `agentmemory_live_baseline` | real_world_job_adapter | `blocked` | XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists. | +| `mem0_openmemory_live_baseline` | local_storage | `real` | The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker. | +| `mem0_openmemory_live_baseline` | same_corpus_retrieval | `pass` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks. | +| `mem0_openmemory_live_baseline` | local_lifecycle_update_delete_reload | `pass` | The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing. | +| `mem0_openmemory_live_baseline` | preference_correction_history | `pass` | The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction. | +| `mem0_openmemory_live_baseline` | entity_scoped_personalization | `pass` | The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference. | +| `mem0_openmemory_live_baseline` | local_get_all_export_readback | `pass` | The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope. | +| `mem0_openmemory_live_baseline` | deletion_audit_history | `pass` | The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory. | +| `mem0_openmemory_live_baseline` | openmemory_ui_readback | `blocked` | XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence. | +| `mem0_openmemory_live_baseline` | hosted_managed_memory_claims | `unsupported` | Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record. | +| `mem0_openmemory_live_baseline` | real_world_job_adapter | `not_encoded` | No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring. | +| `mem0_openmemory_live_baseline` | optional_graph_memory | `not_encoded` | Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim. | +| `memsearch_live_baseline` | canonical_markdown_store | `real` | memsearch is tracked as a Markdown-first source-of-truth reference. | +| `memsearch_live_baseline` | same_corpus_retrieval | `pass` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks. | +| `memsearch_live_baseline` | reindex_update_delete_reload | `pass` | The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing. | +| `memsearch_live_baseline` | real_world_job_adapter | `not_encoded` | XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring. | +| `memsearch_live_baseline` | markdown_source_store_prompt_jobs | `pass` | The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary. | +| `openviking_live_baseline` | local_embed_setup | `pass` | Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run. | +| `openviking_live_baseline` | same_corpus_retrieval | `wrong_result` | OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query. | +| `openviking_live_baseline` | context_trajectory | `blocked` | OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized. | +| `openviking_live_baseline` | real_world_job_adapter | `not_encoded` | No OpenViking adapter currently executes real_world_job prompts and answer scoring. | +| `claude_mem_live_baseline` | same_corpus_retrieval | `wrong_result` | The current Docker adapter did not prove correct same-corpus retrieval. | +| `claude_mem_live_baseline` | durable_storage | `real` | The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search. | +| `claude_mem_live_baseline` | repository_lifecycle | `real` | The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks. | +| `claude_mem_live_baseline` | repository_progressive_disclosure | `real` | The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path. | +| `claude_mem_live_baseline` | progressive_disclosure_real_world_job | `pass` | XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately. | +| `claude_mem_live_baseline` | retrieval_repair_artifact | `wrong_result` | The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. | +| `claude_mem_live_baseline` | hook_capture_viewer_workflow | `blocked` | The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus. | +| `qmd_deep_profile_gate` | stress_profile_retrieval_debug | `not_encoded` | The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result. | +| `qmd_deep_profile_gate` | real_world_job_adapter | `not_encoded` | The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run. | +| `qmd_deep_profile_gate` | host_global_install_boundary | `unsupported` | Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs. | +| `openviking_deep_profile_gate` | docker_local_embed_setup | `pass` | The local embedding setup is pinned and reaches import/runtime in Docker. | +| `openviking_deep_profile_gate` | hierarchical_context_trajectory | `blocked` | Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts. | +| `openviking_deep_profile_gate` | host_global_install_boundary | `unsupported` | The adapter pack must not ask operators to install OpenViking dependencies globally on the host. | +| `ragflow_research_gate` | adapter_candidate_verdict | `not_encoded` | XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded. | +| `ragflow_research_gate` | docker_service_setup | `blocked` | The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior. | +| `ragflow_research_gate` | real_world_job_adapter | `blocked` | One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution. | +| `ragflow_research_gate` | quality_or_scale_claim | `not_encoded` | The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking. | +| `lightrag_research_gate` | docker_service_setup | `blocked` | The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs. | +| `lightrag_research_gate` | retrieved_context_export | `blocked` | The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable. | +| `lightrag_research_gate` | real_world_job_adapter | `blocked` | The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids. | +| `lightrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims. | +| `graphrag_research_gate` | indexing_resource_envelope | `blocked` | The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries. | +| `graphrag_research_gate` | source_citation_mapping | `blocked` | The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available. | +| `graphrag_research_gate` | real_world_job_adapter | `blocked` | The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids. | +| `graphrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing. | +| `graphiti_zep_research_gate` | temporal_graph_memory | `blocked` | The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output. | +| `graphiti_zep_research_gate` | docker_graph_store_setup | `blocked` | The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used. | +| `graphiti_zep_research_gate` | real_world_job_adapter | `blocked` | The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids. | +| `graphiti_zep_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance. | +| `letta_research_gate` | core_archival_memory | `blocked` | ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids. | +| `letta_research_gate` | docker_embedding_configuration | `blocked` | Docker setup requires explicit embedding configuration before archival retrieval can be tested. | +| `letta_research_gate` | real_world_job_adapter | `not_encoded` | No Letta materializer or scorer mapping exists. | +| `langgraph_research_gate` | checkpoint_replay_regression | `not_encoded` | Replay/fork behavior needs an agent graph harness before scoring. | +| `langgraph_research_gate` | standalone_memory_backend | `unsupported` | LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend. | +| `langgraph_research_gate` | real_world_job_adapter | `not_encoded` | No LangGraph benchmark materializer exists. | +| `nanograph_research_gate` | typed_graph_schema | `not_encoded` | Schema-as-code and typed query ergonomics need a benchmark harness. | +| `nanograph_research_gate` | memory_backend_comparison | `unsupported` | nanograph is a graph database reference, not a complete agent memory service. | +| `nanograph_research_gate` | real_world_job_adapter | `not_encoded` | No nanograph materializer exists. | +| `llm_wiki_research_gate` | knowledge_page_compilation | `not_encoded` | Wiki generation and citation lint are not executed by the runner. | +| `llm_wiki_research_gate` | live_service_runtime | `unsupported` | llm-wiki is a plugin/workflow reference rather than a service adapter. | +| `llm_wiki_research_gate` | real_world_job_adapter | `not_encoded` | No page materializer or scorer mapping exists. | +| `gbrain_research_gate` | compiled_truth_timeline | `not_encoded` | Compiled truth plus timeline output is a reference pattern but not scored. | +| `gbrain_research_gate` | postgres_backed_brain_repo | `blocked` | A Docker-local brain repo and Postgres setup path must be proven before execution. | +| `gbrain_research_gate` | real_world_job_adapter | `not_encoded` | No gbrain materializer exists. | +| `graphify_docker_smoke` | docker_cli_boundary | `pass` | The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks. | +| `graphify_docker_smoke` | graph_report_generation | `pass` | The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size. | +| `graphify_docker_smoke` | real_world_job_adapter | `wrong_result` | The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass. | +| `graphify_docker_smoke` | multimodal_code_graph | `not_encoded` | Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke. | +| `graphify_docker_smoke` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior. | + +### Adapter Scenario Judgments + +| Adapter | Scenario | Suite | Status | Outcome | Evidence | +| --- | --- | --- | --- | --- | --- | +| `elf_live_real_world` | `live_capture_write_policy` | `capture_integration` | `pass` | `tie` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.<br>command: `cargo make real-world-memory-live-adapters`<br>artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_consolidation_proposal_review` | `consolidation` | `pass` | `tie` | ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.<br>command: `cargo make real-world-memory-live-adapters`<br>artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_knowledge_page_rebuild_lint` | `knowledge_compilation` | `pass` | `tie` | ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.<br>command: `cargo make real-world-memory-live-adapters`<br>artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `full_sweep_operator_debug` | `operator_debugging_ux` | `pass` | `win` | ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.<br>command: `cargo make real-world-memory-live-adapters`<br>artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `pass` | `win` | ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `elf_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `pass` | `win` | ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `pass` | `win` | The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `qmd_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `qmd_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json` | +| `qmd_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `wrong_result` | `win` | qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.<br>command: `cargo make real-world-job-operator-ux-live-adapters`<br>artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `durable_update_reload_lifecycle` | `memory_evolution` | `lifecycle_fail` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `work_resume_capture_continuity` | `work_resume` | `blocked` | `blocked` | agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `tmp/real-world-memory/first-generation-oss/report.json` | +| `agentmemory_live_baseline` | `durable_work_resume_local_path` | `work_resume` | `blocked` | `blocked` | The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `agentmemory_live_baseline` | `capture_write_policy_hooks` | `capture_integration` | `blocked` | `blocked` | agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `mem0_openmemory_live_baseline` | `basic_local_lifecycle` | `memory_evolution` | `pass` | `tie` | Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `mem0_openmemory_live_baseline` | `preference_correction_history` | `personalization` | `pass` | `loss` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.<br>command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`<br>artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `entity_scoped_personalization` | `personalization` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.<br>command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`<br>artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| `mem0_openmemory_live_baseline` | `delete_audit_readback` | `memory_evolution` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.<br>command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`<br>artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `local_get_all_export_readback` | `operator_debugging_ux` | `pass` | `not_tested` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.<br>command: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/mem0-checks.json` | +| `mem0_openmemory_live_baseline` | `openmemory_ui_export_readback` | `operator_debugging_ux` | `blocked` | `blocked` | The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.<br>command: `cargo make openmemory-ui-export-readback`<br>artifact: `tmp/live-baseline/mem0-openmemory-ui-export.json` | +| `mem0_openmemory_live_baseline` | `hosted_platform_export` | `operator_debugging_ux` | `unsupported` | `non_goal` | Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `mem0_openmemory_live_baseline` | `optional_graph_memory` | `memory_evolution` | `not_encoded` | `non_goal` | Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `memsearch_live_baseline` | `canonical_markdown_reindex_reload` | `trust_source_of_truth` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `markdown_source_store_rebuild_reload_prompt` | `trust_source_of_truth` | `pass` | `not_tested` | XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json` | +| `memsearch_live_baseline` | `markdown_retrieval_debug_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json` | +| `memsearch_live_baseline` | `ttl_expiry_lifecycle` | `memory_evolution` | `unsupported` | `non_goal` | The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `real_world_prompt_adapter` | `retrieval` | `not_encoded` | `not_tested` | No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `claude_mem_live_baseline` | `same_corpus_retrieval` | `retrieval` | `wrong_result` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `retrieval_repair_artifact_path` | `retrieval` | `wrong_result` | `win` | XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json` | +| `claude_mem_live_baseline` | `repository_lifecycle_reload` | `memory_evolution` | `pass` | `tie` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_detail_hydration` | `operator_debugging_ux` | `pass` | `not_tested` | claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.<br>command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`<br>artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json` | +| `claude_mem_live_baseline` | `hook_capture_viewer_workflow` | `capture_integration` | `blocked` | `blocked` | The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `claude_mem_live_baseline` | `viewer_operator_workflow` | `operator_debugging_ux` | `blocked` | `blocked` | A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.<br>command: `cargo make real-world-first-generation-oss`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `ragflow_research_gate` | `reference_chunk_citation_mapping` | `retrieval` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.<br>command: `cargo make real-world-memory-graph-rag`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `private_or_large_corpus_ragflow_quality` | `retrieval` | `not_encoded` | `non_goal` | Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `lightrag_research_gate` | `context_source_reference_mapping` | `retrieval` | `incomplete` | `blocked` | XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.<br>command: `cargo make real-world-memory-graph-rag`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `graph_rag_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphrag_research_gate` | `output_table_citation_mapping` | `knowledge_compilation` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.<br>command: `cargo make real-world-memory-graph-rag`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json` | +| `graphrag_research_gate` | `graph_summary_synthesis_quality` | `knowledge_compilation` | `not_encoded` | `not_tested` | GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphiti_zep_research_gate` | `temporal_validity_window_mapping` | `memory_evolution` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.<br>command: `cargo make real-world-memory-graph-rag`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json` | +| `graphiti_zep_research_gate` | `hosted_zep_temporal_memory` | `memory_evolution` | `unsupported` | `non_goal` | Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `letta_research_gate` | `core_block_attachment_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json` | +| `letta_research_gate` | `core_block_scope_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json` | +| `letta_research_gate` | `core_block_provenance_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json` | +| `letta_research_gate` | `stale_core_detection` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json` | +| `letta_research_gate` | `archival_fallback_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json` | +| `letta_research_gate` | `core_archival_project_decision_recovery` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.<br>artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json` | +| `llm_wiki_research_gate` | `wiki_page_citation_lint` | `knowledge_compilation` | `not_encoded` | `not_tested` | llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `gbrain_research_gate` | `compiled_truth_timeline_export` | `knowledge_compilation` | `blocked` | `blocked` | gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphify_docker_smoke` | `graph_report_navigation_lint` | `knowledge_compilation` | `wrong_result` | `not_tested` | XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. This remains graphify non-pass evidence, not an ELF victory claim.<br>command: `cargo make real-world-memory-graph-rag`<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json` | +| `graphify_docker_smoke` | `broad_graph_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.<br>artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | + +### Adapter Execution Metadata + +| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth | +| --- | --- | --- | --- | --- | --- | --- | +| `openviking_live_baseline` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs.<br>[llama-cpp-python CPU wheel index](https://abetlen.github.io/llama-cpp-python/whl/cpu): Official prebuilt CPU wheel index used by the Docker-local embedding pin. | Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find. | docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required. | Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality. | Use the default pinned CPU wheel path first.; Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.; Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result. | not recorded | +| `qmd_deep_profile_gate` | [qmd repository](https://github.com/tobi/qmd): Official qmd source for local hybrid search, CLI setup, and query behavior. | Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles. | docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes. | CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims. | Run qmd stress profile in Docker and publish the artifact path.; Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims. | D2 reviewed; deep profile not encoded | +| `openviking_deep_profile_gate` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs. | Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring. | docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker. | Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time. | Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.; Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.; Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs. | D2 reviewed; local embedding setup pinned; blocked fixtures encoded | +| `ragflow_research_gate` | [RAGFlow repository](https://github.com/infiniflow/ragflow): Official source for RAGFlow service code and Docker Compose setup.<br>[RAGFlow docs](https://ragflow.io/docs/): Official deployment and setup documentation.<br>[RAGFlow HTTP API reference](https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md): Official reference for OpenAI-compatible responses with reference chunks and document metadata. | Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API. | Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs. | Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring. | Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.; Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.; Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids. | D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output | +| `lightrag_research_gate` | [LightRAG repository](https://github.com/HKUDS/LightRAG): Official source for LightRAG server, Docker, and retrieval modes.<br>[LightRAG Docker docs](https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md): Official Docker deployment reference.<br>[LightRAG API server docs](https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md): Official query-mode and context-output reference.<br>[LightRAG core programming docs](https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md): Official source-id and file-path citation reference. | Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export. | docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes. | The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts. | Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.; Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.; Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids. | D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output | +| `graphrag_research_gate` | [GraphRAG repository](https://github.com/microsoft/graphrag): Official Microsoft GraphRAG source and setup reference.<br>[GraphRAG docs](https://microsoft.github.io/graphrag/): Official documentation for indexing and querying.<br>[GraphRAG input docs](https://microsoft.github.io/graphrag/index/inputs/): Official input format and document metadata reference.<br>[GraphRAG output tables](https://microsoft.github.io/graphrag/index/outputs/): Official output schema with document, text unit, community, and relationship identifiers.<br>[GraphRAG local search docs](https://microsoft.github.io/graphrag/query/local_search/): Official local-search context and graph traversal reference. | Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt. | docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke. | The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries. | Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.; Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.; Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs. | D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output | +| `graphiti_zep_research_gate` | [Graphiti repository](https://github.com/getzep/graphiti): Official open-source temporal context graph engine.<br>[Zep Graphiti overview](https://www.getzep.com/platform/graphiti/): Official product documentation for temporal context graph behavior.<br>[Graphiti quick start](https://help.getzep.com/graphiti/getting-started/quick-start): Official setup, episode ingest, and search output reference.<br>[Graphiti FalkorDB configuration](https://help.getzep.com/graphiti/configuration/falkor-db-configuration): Official Docker-local FalkorDB setup reference.<br>[Graphiti fact triples](https://help.getzep.com/graphiti/working-with-data/adding-fact-triples): Official manual fact-triple ingest contract. | Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt. | docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke. | Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring. | Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.; Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.; Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass. | D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output | +| `letta_research_gate` | [Letta repository](https://github.com/letta-ai/letta): Official source for Letta stateful agents and memory.<br>[Letta Docker docs](https://docs.letta.com/guides/docker/): Official Docker deployment guide and embedding configuration boundary. | Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON. | Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency. | Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact. | Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.; Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.; Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids. | D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists | +| `langgraph_research_gate` | [LangGraph persistence docs](https://docs.langchain.com/oss/python/langgraph/persistence): Official documentation for checkpoints, replay, fork, and persistence behavior. | Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring. | Docker-only Python harness with checkpoint store under the artifact directory. | Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims. | Encode one replay/fork failure recovery job.; Keep LangGraph classified as replay reference unless memory retrieval is actually exercised. | D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded | +| `nanograph_research_gate` | [nanograph repository](https://github.com/nanograph/nanograph): Official source for on-device typed property graph behavior. | Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts. | Docker-only CLI run with graph folder under benchmark artifacts. | Light local graph runtime expected; record binary build/install time and graph artifact size. | Define a minimal schema for memory_evolution facts.; Score typed query output only if it cites fixture evidence IDs. | D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded | +| `llm_wiki_research_gate` | [llm-wiki repository](https://github.com/nvk/llm-wiki): Official source for the LLM Wiki plugin and knowledge-base workflow. | Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts. | Docker-only plugin or fixture materializer; no user-global Codex plugin install. | LLM generation cost depends on page build; record provider boundary and generated artifact size. | Prototype a fixture-only page build with explicit citations.; Do not score until generated sections can be mapped to evidence IDs. | D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded | +| `gbrain_research_gate` | [gbrain repository](https://github.com/garrytan/gbrain): Official source for brain repo and retrieval workflow.<br>[compiled truth guide](https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md): Official guide for compiled truth plus timeline behavior. | Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence. | Docker-only repository and database state with no operator-owned brain repo. | Postgres-backed sync and embedding choices must be explicit; record DB size and import time. | Prototype a tiny brain repo with one current-truth page and timeline.; Score only if compiled truth cites the source timeline evidence. | D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven | +| `graphify_docker_smoke` | [graphify repository](https://github.com/safishamsi/graphify): Official source for graphify graph extraction and query workflow.<br>[graphify README](https://github.com/safishamsi/graphify/blob/v3/README.md): Official CLI, output artifact, query, and source-location contract. | Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks. | docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke. | Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior. | Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.; Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.; Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids. | D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result | + +## Capture And Integration Coverage + +The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims. + +| Class | Behaviors | +| --- | --- | +| real | - | +| fixture-backed | - | +| mocked | - | +| blocked | - | +| not encoded | No capture/integration behavior was declared by encoded fixtures. | + +## Suites + +| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | History Readback | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_summary | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| proactive_brief | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| scheduled_memory | `blocked` | 5 | `0.800` | `1.000` | `0.000` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | At least one encoded job is blocked. | +| knowledge_compilation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| operator_debugging_ux | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| capture_integration | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| personalization | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| core_archival_memory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| context_trajectory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | + +## Jobs + +| Suite | Job | Status | Answer Type | Caveat Required | Refusal Required | Unknown Allowed | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- | +| scheduled_memory | scheduled-knowledge-page-refresh-suggestion-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-knowledge-page-stale-finding, scheduled-knowledge-reviewable-refresh` | `scheduled-knowledge-page-stale-finding, scheduled-knowledge-reviewable-refresh` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-private-provider-scheduler-blocked-001 | `blocked` | `scheduled_memory_task` | `true` | `true` | `true` | `0.000` | `1.000` | `0.000` | `` | `` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `-` | `-` | +| scheduled_memory | scheduled-stale-decision-audit-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-old-consolidation-only-decision, scheduled-current-direct-suite-decision` | `scheduled-current-direct-suite-decision, scheduled-old-consolidation-only-decision` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-stale-preference-plan-audit-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-stale-old-plan, scheduled-stale-plan-expired, scheduled-current-trace-plan, scheduled-current-reviewable-preference` | `scheduled-current-reviewable-preference, scheduled-current-trace-plan, scheduled-old-silent-mutation-preference, scheduled-stale-old-plan, scheduled-stale-plan-expired` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-weekly-project-status-summary-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-weekly-current-gate, scheduled-weekly-ledger-update` | `scheduled-weekly-current-gate, scheduled-weekly-ledger-update` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | + +## Operator Debugging UX + +No encoded job reported operator debugging evidence. + +## Memory Evolution + +- Stale answers: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` + +- History readback encoded: `0` + +| Suite | Job | Current Evidence | Historical Evidence | Tombstone/Invalidation | Selected Current | Selected Historical | Selected Rationale | Selected Tombstone/Invalidation | Selected But Not Narrated | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | History Readback | Follow-up | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | --- | + +## Trace Explainability + +No encoded job reported trace explainability metadata. + +## Scheduled Memory Metrics + +| Job | Task Runs | Outputs | Kinds | Evidence Coverage | Freshness | Action Rationale | Trace Coverage | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Source Mutations | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| scheduled-knowledge-page-refresh-suggestion-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-stale-decision-audit-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-stale-preference-plan-audit-001 | 1 | 2 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-weekly-project-status-summary-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | + +## Unsupported Claims + +No unsupported claims were produced by encoded jobs. + +## Follow-Ups + +| Suite | Job | Follow-up | Reason | +| --- | --- | --- | --- | +| scheduled_memory | scheduled-private-provider-scheduler-blocked-001 | XY-930 private/provider scheduled-memory input gate | Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist. | + +## Result Semantics + +This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms. +It is a real-world job fixture report, not a Docker live-baseline report. +Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. + +The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs. + +- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. +- `wrong_result`: a job completed but missed required answer or evidence expectations. +- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. +- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed. + +For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims. + +For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported. + +For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations. + +For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output. + +## Suites With `not_encoded` Status + +- `trust_source_of_truth` +- `work_resume` +- `project_decisions` +- `retrieval` +- `memory_evolution` +- `consolidation` +- `memory_summary` +- `proactive_brief` +- `knowledge_compilation` +- `operator_debugging_ux` +- `capture_integration` +- `production_ops` +- `personalization` +- `core_archival_memory` +- `context_trajectory` diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md new file mode 100644 index 00000000..56de3357 --- /dev/null +++ b/docs/guide/benchmarking/index.md @@ -0,0 +1,157 @@ +# Benchmarking Guide Index + +Goal: Route agents to live benchmark runbooks, report publication steps, and checked-in +benchmark evidence. +Read this when: You need to run, publish, interpret, or extend ELF benchmark evidence +against external memory systems. +Inputs: The benchmark question, selected corpus profile, and whether you need a runbook +or a saved evidence snapshot. +Depends on: `docs/index.md`, `docs/guide/index.md`, and `docs/governance.md`. +Outputs: The smallest benchmarking guide or report needed to continue. + +## Use This Index When + +- You need to run the live Docker-only benchmark matrix. +- You need to publish a Markdown report from a generated benchmark JSON report. +- You need the checked-in benchmark evidence behind README claims. +- You need to extend the benchmark matrix with new projects, profiles, or lifecycle + checks. + +Do not use benchmark commands as the production operating procedure. For single-user +Docker Compose production start, stop, backup, restore, Qdrant rebuild, rollback, and +cleanup, use `docs/guide/single_user_production.md`. + +## Guides And Reports + +- `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live + Docker-only benchmark matrix, including generated public and production-corpus + profiles, private addendum publication, opt-in 10k/100k backfill, and soak + profiles. +- `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9, + 2026 ELF production-provider stress run and all-project smoke comparison. +- `2026-06-09-production-corpus-report.md`: checked-in synthetic production-corpus + ELF adoption benchmark report with task queries and evidence IDs. +- `2026-06-09-production-adoption-gate-report.md`: XY-836 production adoption + decision report with fresh provider-backed synthetic, stress, backfill, restore, and + external adapter evidence. +- `2026-06-09-operator-debugging-ux-report.md`: checked-in real-world job + operator-debugging UX report with trace/viewer links, raw-SQL avoidance, root-cause + step counts, dropped-candidate visibility, and repair-action clarity. +- `2026-06-10-real-world-comparison-report.md`: checked-in post-P1 real-world + comparison report with aggregate fixture evidence, external-adapter evidence classes, + remaining typed gaps, and adoption implications. +- `2026-06-10-live-real-world-sweep-report.md`: XY-880 full-suite live real-world + sweep report for ELF and qmd, showing per-suite live pass and typed non-pass states + without claiming full-suite live parity. +- `2026-06-10-production-adoption-refresh.md`: XY-884 post-adapter production + adoption refresh that keeps the decision at adopt with bounded caveats and separates + fixture, live adapter, private corpus, credentialed, blocked, and research-gate + evidence. +- `2026-06-11-competitor-strength-evidence-matrix.md`: XY-897 competitor-strength + matrix contract that maps every tracked memory/RAG/graph project to its strongest + scenario, current evidence class, typed blockers, next measurement gate, and ELF + borrow-if-stronger direction. +- `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`: current + optimization-direction report that translates measured benchmark data and competitor + strengths into prioritized ELF iteration themes and explicit non-claims. +- `2026-06-11-measurement-coverage-audit.md`: fresh coverage audit that separates + current measured ELF/qmd data, fixture evidence including the XY-927 + `core_archival_memory` suite, external adapter ledger coverage, scenario non-claims, + and the next measurement reports needed before stronger competitor claims. +- `2026-06-11-elf-qmd-retrieval-debug-profile.md`: fresh ELF/qmd retrieval-debug + profile with real-world retrieval-suite evidence, 480-document stress baseline + evidence, qmd top-10 artifact inspection, and explicit rerank/fusion non-claims. +- `2026-06-11-elf-qmd-memory-evolution-diagnostic.md`: fresh ELF/qmd + memory-evolution diagnostic showing fixture pass, live ELF/qmd current-vs-historical + wrong-result patterns, qmd tombstone evidence miss, and temporal-reconciliation + iteration directions. +- `2026-06-11-temporal-history-competitor-gap-report.md`: fresh report-only + temporal/history competitor-gap report that updates the mem0 basic lifecycle result, + records Graphiti/Zep and Letta claim boundaries, and turns qmd, mem0/OpenMemory, + Graphiti/Zep, Letta, and adjacent project strengths into benchmark-gated ELF + optimization directions. +- `2026-06-11-qmd-openviking-strength-profile-report.md`: XY-899 strength-profile + report that separates qmd retrieval quality from debug/replay ergonomics, records + qmd wrong-result diagnosis classes, and preserves XY-928 OpenViking + context-trajectory surfaces as blocked/not-tested until scored staged, + hierarchical, and recursive evidence exists. +- `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: XY-923 trace-level + replay and wrong-result diagnostics report that scores qmd top-10/replay artifact + ergonomics against ELF trace/admin surfaces while keeping retrieval correctness, + rerank, fusion, candidate-drop, and typed non-pass boundaries separate. +- `2026-06-11-first-generation-oss-adapter-promotion-report.md`: XY-898 + first-generation OSS adapter promotion report that updates agentmemory, + mem0/OpenMemory, memsearch, and claude-mem with fresh scenario-level baseline + evidence and ELF win/tie/loss/untested positions without converting baseline-only + evidence into real-world suite wins. +- `2026-06-11-first-generation-oss-continuity-source-store-report.md`: XY-925 + follow-up report that adds first-generation OSS fixture-backed prompt coverage and + typed blockers for agentmemory durable continuity, memsearch canonical Markdown + source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, + hook, and viewer/operator surfaces. +- `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: XY-900 graph/RAG + scored-smoke adapter report, updated by XY-929 with a representative + graph/RAG fixture slice, that keeps RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, + graphify, llm-wiki, and gbrain outputs as scored or typed non-pass + `real_world_job` evidence without converting smoke or representative + non-pass evidence into quality claims. +- `2026-06-11-competitor-strength-adoption-report.md`: XY-901 final + competitor-strength adoption report, updated by XY-927 with fixture-backed + core-vs-archival coverage and by XY-929 with representative graph/RAG + typed non-pass fixtures, plus the bounded personal-production decision, + scenario-level win/tie/loss/not-tested matrix, claim boundaries, and + optimization issue queue. +- `2026-06-11-capture-write-policy-live-report.md`: XY-933 live capture/write-policy + report that scores ELF redaction, exclusions, source ids, evidence binding, and no + secret leakage while preserving typed blocked/untested boundaries for agentmemory + and claude-mem capture breadth. +- `2026-06-16-live-consolidation-proposal-scoring-report.md`: XY-934 live + consolidation proposal scoring report that separates fixture-backed consolidation + passes from service-backed live proposal materialization, lineage, confidence, + unsupported-claim flags, and apply/defer/discard audit evidence. +- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 + mem0/OpenMemory local OSS history, preference-correction, deletion-audit, + personalization, and export-readback comparison with normalized + win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph + non-claims. +- `2026-06-16-dreaming-readiness-stage-ledger.md`: XY-951 stage-gate ledger for + Dreaming-inspired memory improvements, with the required current baseline, + post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested + buckets, and machine-readable companion file + `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. +- `2026-06-16-proactive-brief-scoring-report.md`: XY-953 fixture-backed proactive + project brief scoring report with source refs, freshness/currentness markers, + reject/defer rationale, stale/tombstone guards, and the private-corpus blocker tied + to XY-930. +- `2026-06-16-scheduled-memory-task-scoring-report.md`: XY-954 fixture-backed + scheduled-memory task scoring report with source refs, freshness/currentness + markers, action rationale, execution trace/readback, source-mutation guards, and + the private/provider scheduler blocker tied to XY-930. +- `2026-06-16-live-temporal-reconciliation-report.md`: XY-905 live temporal + reconciliation follow-up showing ELF live `memory_evolution` moving from + `pass=1`, `wrong_result=5` to `pass=6`, `wrong_result=0`, with trace/readback + fields for selected current, historical, rationale, tombstone, invalidation, + dropped, and non-narrated evidence. +- `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world + agent memory benchmark contract, including suite taxonomy, typed report states, + knowledge-compilation fixture tasks, and the production-ops fixture target. +- `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution + jobs for current facts, historical facts, stale traps, conflicts, update rationales, + and temporal graph limitations. + +## Update Rules + +- Add a dated report when a new run changes README-level claims. +- Keep generated raw JSON under `tmp/live-baseline/`; commit only reviewed Markdown + summaries and durable scripts. +- Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; + commit fixture schemas, smoke fixtures, runner code, and durable docs only. +- Keep generated real-world memory trust/personalization/knowledge/production-ops JSON + and Markdown under `tmp/real-world-memory/`; commit fixtures, runner code, and + durable docs only. +- Link the newest decision-relevant report from README and this index. +- When benchmark semantics change, update `live_baseline_benchmark.md` and the + relevant spec before publishing a new result. +- Real-world job benchmark changes are governed by + `docs/spec/real_world_agent_memory_benchmark_v1.md`; keep this guide as routing and + do not duplicate the normative schema here. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md new file mode 100644 index 00000000..9d93a2d6 --- /dev/null +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -0,0 +1,558 @@ +# Live Baseline Benchmark + +Goal: Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF. +Read this when: You need evidence about which external projects actually run against a shared benchmark corpus. +Preconditions: Docker and Docker Compose are available on the host. +Depends on: `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, +`docs/spec/system_competitive_parity_gate_v1.md`, and +`docs/spec/production_corpus_manifest_v1.md`. +Verification: `cargo make baseline-live-docker` writes `tmp/live-baseline/live-baseline-report.json`; `cargo make baseline-live-report` can render that JSON into a checked-in Markdown report. + +## Scope + +This guide is for benchmark evidence, not for operating a personal production ELF service. For +single-user Docker Compose production start, stop, health, backup, restore, Qdrant rebuild, +rollback, and cleanup commands, use `docs/guide/single_user_production.md`. + +The runner covers ELF plus the six external projects in the README comparison table: + +- ELF +- agentmemory +- OpenViking +- mem0 +- qmd +- claude-mem +- memsearch + +For ELF, the runner uses Docker-owned Postgres and Qdrant, writes the shared corpus +through `add_note`, drains the worker indexing outbox into persisted chunks and +embeddings, rebuilds Qdrant from the worker-produced chunk tables, and verifies +`search_raw` against the shared query manifest. It also runs ELF service lifecycle +checks for note update, note delete, cold-start recovery, concurrent writes, +configurable soak stability, and a local resource envelope over the same Docker-owned +stores. By default these checks use the deterministic local embedding provider. Set +`ELF_BASELINE_ELF_EMBEDDING_MODE=provider` to run ELF through the configured +production embedding provider instead. + +For external projects, the runner clones current upstream `main` inside Docker, records +the exact commit SHA, reads the same generated corpus and query manifest, and runs a +same-corpus retrieval adapter when the project exposes a local API or CLI that can run +without provider keys. Each project record includes adapter metadata that marks storage +and behavior surfaces as `real`, `mocked`, `unsupported`, `blocked`, `incomplete`, or +`not_encoded`. + +Corpus profiles: + +- `smoke`: default, 3 documents and 3 query cases. +- `scale`: 120 documents by default, 8 query cases, and generated distractor notes + that make the check closer to a production retrieval benchmark. +- `stress`: 480 documents by default, 16 query cases, and alternate phrasings for + every needle query. +- `production-synthetic`: checked-in synthetic coding-agent production corpus with + issues, PRs, worktrees, runbooks, decisions, blockers, recovery notes, and + task-oriented queries. Fixture: + `apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json`. +- `production-private`: local private/sanitized production corpus manifest supplied by + `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST`. +- `backfill`: 2000 documents by default, 16 query cases, alternate phrasings for + every needle query, and ELF-only resumable backfill evidence. + +Use `ELF_BASELINE_SCALE_DOCS` and `ELF_BASELINE_STRESS_DOCS` to raise or lower the +generated corpus sizes. +Use `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` to supply a local manifest that follows +`docs/spec/production_corpus_manifest_v1.md`. The private profile fails closed when the +manifest path is absent, the file is missing, a referenced `local_path` is missing, or a +query references an unknown evidence ID. It does not fall back to the checked-in +synthetic fixture. +Use `ELF_BASELINE_BACKFILL_DOCS` to set the generated corpus size for the backfill +profile; values such as `10000` are supported for operator-controlled stress runs. +Use `cargo make baseline-backfill-10k-docker` for the checked-in 10k operator profile. +Use `cargo make baseline-backfill-100k-docker` only with +`ELF_BASELINE_ENABLE_EXPENSIVE=1`; the task fails closed without that explicit guard. +Use `ELF_BASELINE_CONCURRENT_NOTES`, `ELF_BASELINE_MAX_ELF_SECONDS`, and +`ELF_BASELINE_MAX_ELF_RSS_KB` to tune ELF's concurrent-write and resource-envelope +checks. +Use `ELF_BASELINE_SOAK_SECONDS`, `ELF_BASELINE_SOAK_ROUNDS`, and +`ELF_BASELINE_SOAK_PROBE_INTERVAL_MS` to tune ELF's repeated write/search soak +window. The smoke profile does not run soak by default; the scale/full profiles run a +short 15-second soak by default, and the stress profile runs a 60-second soak by +default. Use `cargo make baseline-soak-docker` for an explicit one-hour ELF-only soak, +or override `ELF_BASELINE_SOAK_SECONDS` for a shorter or longer operator-controlled +window. +Use `ELF_BASELINE_ELF_EMBEDDING_MODE=provider` plus +`ELF_BASELINE_ELF_EMBEDDING_API_BASE`, `ELF_BASELINE_ELF_EMBEDDING_API_KEY`, +`ELF_BASELINE_ELF_EMBEDDING_MODEL`, and +`ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS` to run ELF with a production embedding API. +The runner also accepts `QWEN_API_KEY`, `QWEN_EMBEDDING_API_BASE`, +`QWEN_EMBEDDING_MODEL`, `QWEN_EMBEDDING_DIMENSIONS`, and `QWEN_EMBEDDING_PATH` for +Qwen-compatible embedding configuration. Generic aliases `EMBEDDING_API_BASE`, +`EMBEDDING_API_KEY`, `EMBEDDING_MODEL`, `EMBEDDING_DIMENSIONS`, +`EMBEDDING_PROVIDER_ID`, `EMBEDDING_PATH`, and `EMBEDDING_TIMEOUT_MS` are also +supported. Provider-mode runs default to a 30-second embedding timeout unless an +explicit timeout env var is set. For Qwen3 production embedding runs, use +`Qwen3-Embedding-8B` with `EMBEDDING_DIMENSIONS=4096`. The aggregate report records +ELF's embedding mode, provider id, model, dimensions, timeout, API base, and path; it +never records the API key. +For ELF backfill runs, the runner writes a durable checkpoint file under the report +directory by default, intentionally interrupts the first pass unless +`ELF_BASELINE_BACKFILL_RESUME_PROBE=0`, then resumes from the checkpoint. Tune +`ELF_BASELINE_BACKFILL_BATCH_SIZE`, `ELF_BASELINE_BACKFILL_INTERRUPT_AFTER`, +`ELF_BASELINE_BACKFILL_CHECKPOINT`, and `ELF_BASELINE_WORKER_CONCURRENCY` when +measuring import and indexing throughput. +Set `ELF_BASELINE_COST_PER_1K_TOKENS_USD` to attach a planning-only cost proxy to +ELF reports. The proxy estimates input tokens from primary corpus note text plus +declared same-corpus query text; it is not a billing statement. + +The ELF report records: + +- duplicate source-note count and checkpoint resume state; +- query latency mean, P50, P95, P99, and max; +- local RSS, Postgres database bytes, corpus bytes, report-directory bytes, and + checkpoint-file bytes; +- the optional cost proxy described above; +- operator-case commands for private addendum, 10k/100k resume, provider outage, + Docker Compose start/stop/upgrade, migration rollback, Postgres restore, Qdrant + rebuild, and unattended soak. + +Current external same-corpus adapters: + +- agentmemory: writes every corpus document through `mem::remember`, queries through + `mem::search`, exercises `mem::forget` delete suppression, and probes + superseding by writing a revised memory through `mem::remember`. The current + adapter uses an in-memory SDK/KV mock, so behavior metadata is `mocked` and durable + cold-start recovery is recorded as `blocked` until a persistent agentmemory KV/index + path or hosted runtime is wired into the harness. +- qmd: adds the corpus as a collection, embeds it locally, and runs structured hybrid + `query --json` for every query case. It also works from a per-adapter corpus copy, + rewrites and deletes files in that copy, then reruns `qmd update`, `qmd embed -f`, + and fresh `qmd query` processes. +- memsearch: indexes the corpus with the local ONNX embedder and runs CLI search. + It also works from a per-adapter corpus copy, rewrites and deletes files in that + copy, then reruns `memsearch index` and fresh `memsearch search` processes. +- mem0: writes the corpus with `infer=false` and searches local FastEmbed + Qdrant + path storage. It also runs public `Memory.update`, `Memory.delete`, and a new + `Memory.from_config` over the same local paths from a per-adapter corpus copy. No + LLM inference is required. OpenMemory UI and hosted Platform behavior are not + counted as local OSS passes. +- claude-mem: writes every corpus document into a Docker-local durable SQLite memory + repository, runs repository search for every query case, updates one item, deletes + one item, reopens the same SQLite file with fresh repository instances, and checks + search-to-detail/source hydration. Hook, viewer, and full timeline progressive + disclosure remain separate from this local repository check. + +Current deeper checks: + +- ELF: same-corpus retrieval through worker-produced chunks, async worker indexing + completion, resumable checkpointed backfill without duplicate source notes, service + update replacement through the worker, service delete suppression through the worker, + cold-start search recovery after constructing a fresh service over the same Postgres + and Qdrant stores, concurrent write/search E2E, configurable repeated write/search + soak stability, and a configurable local resource envelope. +- qmd, memsearch, and mem0: same-corpus retrieval, update replacement, delete + suppression, and cold-start search recovery through their local public API or CLI + surfaces. +- agentmemory: same-corpus retrieval and delete suppression are exercised; update + replacement is probed through superseding `mem::remember`; cold-start recovery is + `blocked` because the current adapter runs against an in-memory SDK/KV mock. +- claude-mem: same-corpus retrieval, update replacement, delete suppression, + cold-start search recovery, and repository-level progressive detail/source + hydration through a durable local SQLite repository. Hook, viewer, and full timeline + progressive disclosure remain `not_encoded` until a real adapter executes those + surfaces. +- OpenViking: same-corpus retrieval only when its local runtime path can complete. + Update, delete, and recovery checks are `not_encoded` for this adapter. +- Concurrent write, soak stability, and resource-envelope checks are currently encoded + for ELF. They are not yet encoded for the external adapters. Multi-hour production + soak is still operator-controlled through `ELF_BASELINE_SOAK_SECONDS`; the checked-in + stress default is a bounded 60-second signal. + +OpenViking attempts the official `.[local-embed]` path plus `OpenViking.add_resource` +and `OpenViking.find`. The Docker runner first pins the local embedding dependency to +`llama-cpp-python==0.3.28` from the official CPU wheel index +`https://abetlen.github.io/llama-cpp-python/whl/cpu` and installs it with +`--only-binary llama-cpp-python`. Override +`ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION` or +`ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX` only when the pinned wheel is +unavailable for the Docker platform. If the pinned wheel cannot install or import, the +project is recorded as `incomplete` with +`retrieval_status = "local_embed_install_failed"` rather than as a retrieval failure. +When the pinned dependency reaches `add_resource`/`find`, evidence misses are recorded +as `wrong_result`/`retrieval_wrong_result`. This local dependency check is separate +from provider-backed ELF/Qwen3 embedding evidence. + +## Checked-In Reports + +- `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`: June 9, 2026 + production-provider ELF stress run and all-project smoke comparison. + +## Run + +```sh +cargo make baseline-live-docker +``` + +To run the scale profile: + +```sh +ELF_BASELINE_PROFILE=scale cargo make baseline-live-docker +ELF_BASELINE_PROFILE=scale ELF_BASELINE_SCALE_DOCS=240 cargo make baseline-live-docker +ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker +ELF_BASELINE_PROJECTS=ELF ELF_BASELINE_PROFILE=backfill cargo make baseline-live-docker +cargo make baseline-backfill-docker +cargo make baseline-backfill-10k-docker +ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker +ELF_BASELINE_SOAK_SECONDS=3600 cargo make baseline-soak-docker +``` + +To iterate on one or more project adapters without rerunning the full matrix: + +```sh +ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker +ELF_BASELINE_PROJECTS=ELF,memsearch cargo make baseline-live-docker +``` + +To run the checked-in synthetic production-style corpus through ELF: + +```sh +cargo make baseline-production-synthetic +``` + +To run a private local production corpus without committing private content: + +```sh +ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json \ +cargo make baseline-production-private +``` + +The private manifest can contain sanitized inline `text` fields or `local_path` fields +that point to local sanitized text/Markdown files. Keep private manifests and local +evidence under `tmp/` or outside the repository. `tmp/` is ignored by git. +The manifest `manifest_id`, evidence IDs, and query IDs are report-visible labels; keep +them lower-case ASCII identifiers and do not encode private text in those fields. + +To run the same private profile and publish a safe Markdown addendum under `tmp/`: + +```sh +ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json \ +cargo make baseline-production-private-addendum +``` + +The default addendum path is: + +```text +tmp/live-baseline/private-production-addendum.md +``` + +Override it with `ELF_BASELINE_PRIVATE_ADDENDUM`. The addendum intentionally reports +manifest id, evidence ids, task labels, checks, latency, backfill, resource, cost +proxy, and operator-case fields without embedding private evidence text or local +private file paths. Raw JSON and logs remain under `tmp/live-baseline/` and must be +reviewed before any manual copy into durable docs. + +The only host artifact is: + +```text +tmp/live-baseline/ +``` + +That directory contains the aggregate report, per-project logs, and the shared query +fixture used by the run. The aggregate report records `corpus.profile`, +`corpus.track`, `corpus.manifest_id`, `corpus.document_count`, and +`corpus.query_count` so generated public corpus results are not confused with +synthetic or private production-corpus results. Each project record includes +`elapsed_seconds` for rough local runtime comparison and an `adapter` metadata object +that distinguishes real, mocked, unsupported, blocked, incomplete, and not-encoded +behavior surfaces. ELF project records also include an `embedding` summary so +deterministic local and production-provider runs are not confused. ELF query records +include task, trace ID, expected evidence IDs, allowed alternate evidence IDs, top +evidence ID, wrong-result count, and per-query latency. Each ELF trace ID can be opened +from the admin viewer at `/viewer` by loading it in the Traces panel; the full trace +bundle shows stage-level candidates, rerank terms, relation context, and provider +metadata without raw SQL. Each project record also includes +`backfill` evidence with source count, completed count, batch size, worker +concurrency, resume state, duplicate-source count, and backfill elapsed seconds. Each +project record also includes `checks` and `check_summary`; the aggregate +`full_check_summary` is the adoption-relevant multi-check count. + +Production-ready claims must cite a concrete report path. A claim based only on +generated public `smoke`, `scale`, or `stress` profiles is not enough for personal +production adoption. Cite a `production-synthetic` report for fixture coverage, and +cite a `production-private` report when making a private-corpus production-readiness +claim. +If no operator-owned private manifest is supplied, the private-corpus path is a +bounded failure, not a pass. + +For job-level production-ops coverage under the real-world benchmark contract, run: + +```sh +cargo make real-world-memory-production-ops +``` + +That target parses checked-in fixture evidence for interrupted backfill resume, +backup/restore readback, cold-start recovery, resource-envelope interpretation, and +typed private-manifest, credential, and dependency boundaries. It does not run Docker, +private corpus data, or provider-backed credentials, and it must not be used as a +substitute for `baseline-production-private` when making a private-corpus readiness +claim. + +## Local CLI Wrappers + +The `elf` CLI delegates benchmark and backfill operations to the same `cargo make` tasks listed +above. It is a local convenience wrapper, not a second benchmark runner. + +Build the CLI: + +```sh +cargo build -p elf --bin elf +``` + +Run the default resumable backfill profile: + +```sh +target/debug/elf backfill +``` + +Override the generated document count or worker concurrency: + +```sh +target/debug/elf backfill --docs 2000 --worker-concurrency 4 +target/debug/elf backfill --ten-k +target/debug/elf backfill --hundred-k --enable-expensive +``` + +Run the live baseline or production corpus profiles through the CLI wrapper: + +```sh +target/debug/elf benchmark run --kind live --profile stress --projects ELF +target/debug/elf benchmark run --kind production-synthetic +target/debug/elf benchmark run \ + --kind production-private \ + --production-corpus-manifest tmp/private-production-corpus/manifest.json +``` + +Render a Markdown report from the generated JSON: + +```sh +target/debug/elf benchmark report \ + --report tmp/live-baseline/live-baseline-report.json \ + --out tmp/live-baseline/live-baseline-report.md +``` + +Add `--dry-run` to `backfill`, `benchmark run`, or `benchmark report` to print the resolved task and +environment as JSON without running Docker or writing a report. + +## Publish A Markdown Report + +After a run writes `tmp/live-baseline/live-baseline-report.json`, render a durable +Markdown summary: + +```sh +cargo make baseline-live-report +``` + +By default the task prints Markdown to stdout. To write a checked-in report: + +```sh +ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +cargo make baseline-live-report +``` + +The publisher summarizes one generated aggregate JSON report. For a combined report +that compares multiple runs, use the generated Markdown as input evidence and then add +the interpretation manually under `docs/guide/benchmarking/`. + +## Real-World Job Smoke + +The live-baseline runner and real-world job runner publish separate report schemas. +Live-baseline reports remain evidence for Docker retrieval and lifecycle checks only. +They are not real-world suite wins. +The real-world runner loads +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` +by default and records live-baseline-only external adapter evidence under +`external_adapters`; those records preserve the typed setup/run evidence but still +leave real-world suites as `not_encoded`, `blocked`, `incomplete`, `wrong_result`, or +`lifecycle_fail` until an adapter actually executes `real_world_job` prompts and +scoring. The same manifest can also contain `research_gate` records for future adapter +packs; those records provide source/setup/runtime/resource/retry guidance but are not +live-baseline evidence. +The manifest may also include scenario judgments with an ELF position of `wins`, +`ties`, `loses`, or `untested`; these are dimension-level report inputs and do not +upgrade live-baseline-only evidence into real-world suite pass evidence. + +The full live real-world adapter sweep for ELF and qmd is separate from the +same-corpus live baseline: + +```sh +cargo make real-world-memory-live-adapters +``` + +This task runs in `docker-compose.baseline.yml`, materializes generated +`adapter_response` fixtures through ELF's service runtime and qmd's local CLI +against the checked-in `real_world_memory` fixture corpus, then scores all encoded +suites. It preserves typed non-pass states and does not claim a full-suite live pass +when memory-evolution conflict evidence, production operations, capture integrations, +derived pages, consolidation proposals, or operator-debugging traces are not proven. +It publishes: + +```text +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + +To run the checked-in real-world job smoke fixture and render its Markdown report: + +```sh +cargo make smoke-real-world-job +``` + +To run the checked-in work-resume, source-of-truth, lifecycle, redaction, +capture-boundary, and personalization real-world memory fixtures: + +```sh +cargo make real-world-memory +``` + +Artifacts: + +```text +tmp/real-world-job/real-world-job-smoke-report.json +tmp/real-world-job/real-world-job-smoke-report.md +tmp/real-world-memory/real-world-memory-report.json +tmp/real-world-memory/real-world-memory-report.md +``` + +The smoke fixture suite lives under +`apps/elf-eval/fixtures/real_world_memory/work_resume/` and uses +`docs/spec/real_world_agent_memory_benchmark_v1.md` status terms, including +`unsupported_claim`. The checked-in slice includes work-resume continuity jobs and one +capture/integration boundary job. Suites without checked-in jobs are reported as +`not_encoded`. + +The broader real-world memory fixture set lives under +`apps/elf-eval/fixtures/real_world_memory/` and adds summary counters for evidence +coverage, source-ref coverage, quote coverage, stale retrievals, scope correctness, +redaction leaks, and Qdrant rebuild coverage. + +The memory evolution suite is a separate checked-in real-world job fixture set: + +```sh +cargo make real-world-memory-evolution +``` + +It lives under `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports +stale-answer count, conflict detection count, update rationale availability, temporal +validity encoding, and unsupported claims. Its relation-temporal fixture is encoded as +a normal pass/fail check for current versus historical graph-lite relation context. + +To run the checked-in retrieval-quality real-world fixtures: + +```sh +cargo make real-world-memory-retrieval +``` + +Artifacts: + +```text +tmp/real-world-memory/retrieval-report.json +tmp/real-world-memory/retrieval-report.md +``` + +The retrieval fixture lives under +`apps/elf-eval/fixtures/real_world_memory/retrieval/` and covers alternate phrasing, +distractor-heavy corpora, multi-hop routing questions, current-versus-obsolete context +selection, minimal sufficient context, and stage-level wrong-result explainability. +It is still an offline fixture report. qmd has a separate full live adapter sweep +through `cargo make real-world-memory-live-adapters`; OpenViking remains a reference +system unless an adapter actually runs and records typed evidence. + +To run the checked-in proposal-only consolidation fixtures: + +```sh +cargo make real-world-memory-consolidation +``` + +Artifacts: + +```text +tmp/real-world-memory/consolidation/report.json +tmp/real-world-memory/consolidation/report.md +``` + +The consolidation fixtures live under +`apps/elf-eval/fixtures/real_world_memory/consolidation/`. They score reviewable +proposal payloads, source lineage, review action outcomes, executable gaps, and source +mutation count. They do not claim live scheduled consolidation-worker generation. + +To run the checked-in knowledge-compilation and page-rebuild fixtures: + +```sh +cargo make real-world-memory-knowledge +``` + +Artifacts: + +```text +tmp/real-world-memory/knowledge-report.json +tmp/real-world-memory/knowledge-report.md +``` + +The knowledge fixtures live under +`apps/elf-eval/fixtures/real_world_memory/knowledge/`. They score derived page +citation coverage, stale-claim linting, rebuild determinism, backlink coverage, page +usefulness, and explicitly flagged unsupported summaries. Generated pages are +benchmark artifacts, not source-truth replacements. + +## Clean Up + +```sh +cargo make clean-baseline-live-docker +``` + +This removes Docker-managed Postgres, Qdrant, npm, pip, cargo, and target volumes used +by the live baseline runner. It does not remove the host report directory. + +## Result Semantics + +The result terms below belong to the current Docker live baseline. For the future +job-level suite contract, including `unsupported_claim`, see +`docs/spec/real_world_agent_memory_benchmark_v1.md`. + +- `pass`: the project installed and every encoded check for that project passed in the + selected corpus profile. +- `wrong_result`: a retrieval check completed but returned the wrong memory or missed + expected evidence. +- `lifecycle_fail`: same-corpus retrieval may pass, but an encoded update, delete, + cold-start, persistence, or related lifecycle check failed. +- `incomplete`: setup or a declared check could not complete because install, runtime, + dependency, or adapter wiring failed in Docker. +- `blocked`: a safe check cannot run without external credentials, manual setup, + durable runtime wiring, or host integration outside this run. +- `not_encoded`: the capability is not covered by the current adapter, so no pass/fail + claim is allowed. + +The top-level `verdict` is intentionally stricter than the per-project `status`: it +only returns `pass` when every selected project has `status = "pass"` and +`retrieval_status = "retrieval_pass"`. The `same_corpus_summary` field is the +retrieval count and does not treat lifecycle failures as retrieval failures. For +multi-check comparisons, read `full_check_summary`, each project's `checks`, and the +adapter behavior metadata. + +`incomplete`, `blocked`, and `not_encoded` are not passes. Treat them as evidence that +more benchmark wiring or upstream/runtime support is needed. + +## Failure Conditions + +A project status should be `wrong_result` when same-corpus retrieval runs but does not +return the expected evidence. A project status should be `lifecycle_fail` when +retrieval is not the failing condition but an encoded update, delete, cold-start, +persistence, concurrent, soak, or resource-envelope check completes and proves the +project did not meet the selected benchmark contract. + +Use `incomplete` when the runner cannot execute the declared check fairly because clone, +install, import, build, adapter wiring, native dependency support, or local runtime +setup failed. Use `blocked` when the check needs credentials, manual setup, durable +runtime integration, or host integration outside the issue scope. Use `not_encoded` +when the adapter simply does not cover the capability yet. diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md new file mode 100644 index 00000000..c4e5c141 --- /dev/null +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -0,0 +1,488 @@ +# Real-World Agent Memory Benchmark + +Goal: Explain the v1 real-world agent memory benchmark suite and route implementation +work to the governing spec. +Read this when: You need to create jobs, extend benchmark suites, interpret reports, +or understand why retrieval-only comparisons are insufficient. +Inputs: `docs/spec/real_world_agent_memory_benchmark_v1.md`, current live baseline +reports, external project comparison docs, and the intended user-job scenario. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`live_baseline_benchmark.md`, and `docs/guide/research/comparison_external_projects.md`. +Outputs: Operator-facing suite overview, bias explanation, and implementation routing. + +## Governing Spec + +The authoritative contract is: + +- `docs/spec/real_world_agent_memory_benchmark_v1.md` + +Use the spec for field names, suite ids, report states, scoring rules, and claim +boundaries. This guide is only an operator map. + +## Why This Suite Exists + +The current live baseline proves useful behavior: ELF and qmd can pass the encoded +Docker smoke checks, and ELF can pass provider-backed synthetic, stress, backfill, +restore, and lifecycle checks. That evidence remains valid for the existing benchmark. + +It is incomplete for real agent work. A memory system can retrieve the right chunk and +still fail the user's job by repeating completed work, trusting stale evidence, missing +a blocker, leaking private context, or inventing a decision that was never recorded. + +The real-world suite changes the unit from a query to a `real_world_job`: + +- corpus +- timeline +- prompt +- expected answer +- required evidence +- negative traps +- scoring rubric +- allowed uncertainty + +This shape rewards systems that help agents resume, decide, debug, update stale memory, +compile knowledge, and state honest uncertainty. + +## Suite Overview + +| Suite | What It Tests | Example Job | +| --- | --- | --- | +| Trust/source-of-truth | Provenance, rebuildability, and derived-index boundaries. | Restore a note after index rebuild and cite authoritative source evidence. | +| Work resume | Resuming agent work without repeating completed steps. | Identify the next action after a retained lane failure. | +| Project decisions | Current decisions, rationale, reversals, and caveats. | Explain why a benchmark gate uses typed failures. | +| Retrieval | Task-relevant search with decoys and alternates. | Answer a task query while avoiding near-duplicate project evidence. | +| Memory evolution | Update, delete, expiry, contradiction, and history behavior. | Report what superseded an old fact and suppress deleted memory. | +| Consolidation | Reviewable derived memories without hidden mutation. | Produce a proposal with lineage and unsupported-claim flags. | +| Knowledge compilation | Evidence-linked project/entity/concept pages. | Compile current project status with timeline and stale-section lint. | +| Operator debugging UX | Ability to diagnose wrong results without raw store access. | Show which retrieval stage dropped expected evidence. | +| Capture/integration | Accuracy of hooks, imports, exclusions, and write policies. | Capture a session decision while excluding private spans. | +| Production ops | Backfill, restore, cold start, resource, and bounded-failure behavior. | Resume interrupted import without duplicate source notes. | +| Personalization | Scoped preferences without cross-tenant leakage. | Apply the user's current preference and ignore another project's note. | +| Core/archival memory | Always-loaded core memory behavior kept separate from archival note search. | Detect a stale core block and fall back to archival evidence. | +| Context trajectory | Staged context trajectory, hierarchy selection, and recursive expansion. | Block OpenViking trajectory scoring until same-corpus evidence ids and comparable stage artifacts exist. | + +## External Reference Mapping + +The suite uses external strengths as references, not as winners: + +- ELF: evidence-bound writes, deterministic ingestion boundaries, source-of-truth plus + rebuildable index, production ops, and evaluation tooling. +- qmd: local retrieval quality, query expansion/routing, weighted fusion, rerank, and + transparent debug ergonomics. +- agentmemory: cross-agent hooks, coding-agent continuity, local viewer, consolidation + lifecycle, and observability console. +- claude-mem: progressive disclosure, automatic capture loop, local inspection, and + operator comfort. +- OpenViking: filesystem context model, hierarchical retrieval, staged trajectory, and + session iteration. +- mem0: multi-entity scoping, lifecycle history, optional graph context, hosted/OpenMemory + ecosystem, and personalization references. +- memsearch: Markdown-first source-of-truth pattern, incremental indexing, and practical + local hybrid retrieval. +- llm-wiki and gbrain: compiled knowledge pages, query-save/lint loops, current-truth + plus timeline shape. +- Always-On Memory Agent, Claude Dreams, and Gemini CLI Auto Memory: background + consolidation patterns, with ELF's requirement that derived outputs remain reviewable. +- Graphiti/Zep, Letta, LangGraph, graphify, and nanograph: temporal facts, core versus + archival memory, replay mindset, graph-compressed navigation, and typed graph ergonomics. + +## Report Interpretation + +A real-world benchmark report must preserve typed outcomes: + +- `pass` +- `wrong_result` +- `lifecycle_fail` +- `incomplete` +- `blocked` +- `not_encoded` +- `unsupported_claim` + +Do not collapse those terms into one leaderboard. `unsupported_claim` is especially +important: it means the system made a substantive claim that the corpus or evidence did +not support. That is a different and higher-risk failure than simply missing a result. + +## Implementation Routing + +Downstream runner issues can cite the spec directly. They should choose a small suite +slice first, then report every untouched suite as `not_encoded`. + +Recommended first increments: + +1. Encode one `work_resume` job over the synthetic production corpus. +2. Encode one `retrieval` job with decoys and required evidence. +3. Encode one `memory_evolution` job that proves update/delete/supersession behavior. +4. Add report output for `unsupported_claim` before broadening the suite count. + +Current checked-in smoke increment: + +```sh +cargo make smoke-real-world-job +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/work_resume/`, writes +`tmp/real-world-job/real-world-job-smoke-report.json`, and renders +`tmp/real-world-job/real-world-job-smoke-report.md`. + +The checked-in fixture slice covers stale worktree resume, Decodex/Linear lane status, +failed command recovery, PR review blocker recovery, exact next-action extraction, and +cross-tool capture boundaries. The smoke report includes suite id, job id, expected +evidence, produced answer/evidence, unsupported-claim count, wrong-result count, +latency/cost fields when available, capture/integration behavior classes, and typed +suite/job statuses. Untouched suites remain `not_encoded`. + +Current checked-in aggregate memory increment: + +```sh +cargo make real-world-memory +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/`, writes +`tmp/real-world-memory/real-world-memory-report.json`, and renders +`tmp/real-world-memory/real-world-memory-report.md`. + +This command recursively parses all checked-in `real_world_memory` fixture slices, +including the retrieval-quality slice below. The suite currently encodes: + +- `trust_source_of_truth`: evidence binding, source refs, and Qdrant rebuild from + Postgres-held chunk embeddings before answering. +- `work_resume`: stale worktree resume, Decodex/Linear lane status, failed command + recovery, PR review blocker recovery, and exact next-action extraction. +- `project_decisions`: accepted durable decisions, superseded/reversed decisions, + old-versus-current validation gates, tradeoff rationale, and bounded caveat or + uncertainty handling. +- `retrieval`: alternate phrasing, distractor-heavy retrieval, multi-hop routing, + current-versus-obsolete selection, and minimal sufficient context. +- `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, + issue status, deployment method, benchmark conclusion, and temporal relation cases. +- `operator_debugging_ux`: trace-backed stage attribution that identifies where + expected evidence was filtered, demoted, or selected against. +- `capture_integration`: write-policy audit behavior for redaction/private exclusion, + source-id preservation, evidence binding, no secret leakage, and fixture-backed + capture/integration boundary classification. +- `production_ops`: interrupted generated backfill resume, backup/restore plus + cold-start readback, resource-envelope interpretation, pinned OpenViking local + embedding runtime/wrong-result classification, missing private manifest `blocked` + classification, and provider credential boundary `blocked` classification. +- `personalization`: scoped stable preference correction without temporary or + cross-project preference leakage. +- `core_archival_memory`: core block attachment, scope, provenance, stale-core + detection, archival fallback, and project-decision recovery through core routing + plus archival rationale. +- `context_trajectory`: OpenViking staged retrieval, hierarchy selection, and + recursive/context expansion jobs encoded as `blocked` until same-corpus expected + evidence ids and comparable stage artifacts are available. + +The generated report includes evidence coverage, source-ref coverage, quote coverage, +unsupported-claim count, stale retrieval count, stale-answer count, conflict detection +count, update rationale availability, temporal validity encoding count, scope +correctness, redaction leak count, capture/integration behavior classes, Qdrant +rebuild case/pass counts, expected evidence recall, irrelevant context ratio, +latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace +explainability counters, production-ops blocked/wrong-result job states, and +private-corpus redaction policy. The fixtures include negative traps for stale +blockers, unsupported prior claims, stale deleted facts, stale historical facts, +cross-project preference leakage, private/redacted text leakage, obsolete retrieval +context, project-decision stale reuse, missing rationale, uncited current policy +claims, overconfident unsupported decision answers, distractor context, +index-only restore claims, private-corpus pass claims without a manifest, and +checked-in credential leakage. + +Current checked-in project-decisions increment: + +```sh +cargo make real-world-memory-project-decisions +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/project_decisions/`, writes +`tmp/real-world-memory/project-decisions/report.json`, and renders +`tmp/real-world-memory/project-decisions/report.md`. The fixture set covers: + +- accepted decision recovery with required rationale; +- superseded decision recovery where historical evidence must not become the current + answer; +- old-versus-current validation gate recovery; +- fixture-backed-first tradeoff rationale with an external-adapter parity caveat; +- missing private-manifest uncertainty where the correct answer is a bounded caveat. + +The report exposes `answer_type`, `requires_caveat`, `requires_refusal`, and +`can_answer_unknown` per job, and the memory-evolution table shows current evidence, +historical evidence, conflict detections, and update-rationale availability. These jobs +are fixture-backed only; they do not claim external adapter parity or private-corpus +validation. + +The report also loads the checked-in external adapter coverage manifest by default: + +```text +apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +``` + +That manifest records the first memory-project set plus expanded RAG and graph-memory +research gates. Its `external_adapters` report section distinguishes: + +- `fixture_backed`: checked-in real-world fixture scoring, such as the ELF fixture + response path. +- `live_baseline_only`: Docker live-baseline retrieval/lifecycle evidence that is not + a real-world suite win. +- `live_real_world`: external adapters that actually execute `real_world_job` + prompts and scoring. +- `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a + future adapter path, not fixture-backed or live execution evidence. + +Current fixture state: `cargo make real-world-memory` covers 60 jobs across 16 suites, +with 53 pass and 7 blocked. The `core_archival_memory` suite contributes six passing +fixture jobs for core block attachment, scope, provenance, stale-core detection, +archival fallback, and project-decision recovery. The `memory_summary` suite +contributes one passing fixture-backed source-trace job for reviewable current, +background, stale, superseded, tombstoned, and derived project-profile entries. The +`proactive_brief` suite contributes four passing source-linked proactive suggestions +and one typed private-corpus refresh blocker tied to XY-930. The blocked jobs are +production-ops operator boundaries, the private-corpus refresh blocker, the +private/provider scheduler blocker, plus the XY-928 OpenViking `context_trajectory` +gates for staged retrieval, hierarchy selection, and recursive context expansion. +The `scheduled_memory` suite contributes four passing source-linked scheduled task +readbacks plus one typed private/provider scheduler blocker tied to XY-930; it is not +hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus +parity evidence. + +Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full +checked-in suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter +materializes generated runtime answers for 55 jobs across 13 suites before scoring, +including the operator-debug fixture tree. +The original targeted `work_resume`, `retrieval`, and `project_decisions` slice still +passes. ELF now also passes live `capture_integration` self-checks for redaction, +exclusions, source ids, evidence binding, and no secret leakage; live consolidation +proposal review; live knowledge-page rebuild/lint; and live operator-debug trace +metadata. The full sweep is still not a full-suite pass: memory_evolution is +`wrong_result`, production_ops keeps operator-owned blocked boundaries, +core_archival_memory remains typed `not_encoded` for this live adapter path, and +context_trajectory remains blocked. qmd keeps `capture_integration`, consolidation, +knowledge_compilation, and core_archival_memory typed non-pass, is `wrong_result` for +operator-debug trace hydration, and still also keeps its separate `live_baseline_only` +same-corpus record for update/delete/cold-start checks; that record is not a +real-world suite win. agentmemory is blocked on durable upstream storage for lifecycle +proof and capture breadth. mem0/OpenMemory, memsearch, and claude-mem no longer share +one live-baseline boundary: mem0/OpenMemory and memsearch now pass scoped local +baseline paths, while OpenMemory product UI/export, hosted +Platform behavior, optional graph memory, memsearch real-world prompt/TTL coverage, +and claude-mem hook/viewer capture remain blocked, unsupported, not encoded, or +wrong-result for the checked-in adapter evidence. OpenViking now reaches its pinned +Docker local embedding setup but remains a same-corpus `wrong_result` until it +returns evidence-bearing retrieval output. The checked-in `context_trajectory` +fixtures keep OpenViking staged retrieval, hierarchy selection, and recursive/context +expansion blocked until same-corpus evidence ids match and staged artifacts are +materialized. +The expanded RAG and graph-memory records for RAGFlow, LightRAG, GraphRAG, +Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper +qmd/OpenViking profiles stay `research_gate`, typed non-pass, or not-encoded records +until Docker-contained or provider-backed evidence-linked outputs exist. XY-929 adds a +focused representative slice for graph/RAG navigation, citation mapping, graph +summaries, temporal validity, graph reports, stale-source lint, and unsupported-claim +handling: + +```sh +cargo make real-world-memory-graph-rag +``` + +Artifacts: + +```text +tmp/real-world-memory/graph-rag/report.json +tmp/real-world-memory/graph-rag/report.md +``` + +This slice is allowed to report blocked, incomplete, wrong_result, not_tested, and +non_goal outcomes. These typed states describe benchmark coverage; do not convert setup +weight, missing research, smoke output, or representative non-pass fixtures into broad +project quality rankings. + +To run the full live adapter sweep for ELF and qmd: + +```sh +cargo make real-world-memory-live-adapters +``` + +Artifacts: + +```text +tmp/real-world-memory/live-adapters/elf-materialization.json +tmp/real-world-memory/live-adapters/elf-report.json +tmp/real-world-memory/live-adapters/elf-report.md +tmp/real-world-memory/live-adapters/qmd-materialization.json +tmp/real-world-memory/live-adapters/qmd-report.json +tmp/real-world-memory/live-adapters/qmd-report.md +tmp/real-world-memory/live-adapters/summary.json +``` + +To run the fixture report without the manifest during local debugging: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures apps/elf-eval/fixtures/real_world_memory \ + --skip-external-adapter-manifest +``` + +To test an adapter-pack manifest before committing it: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures apps/elf-eval/fixtures/real_world_memory \ + --external-adapter-manifest path/to/manifest.json \ + --out tmp/real-world-memory/adapter-contract-report.json +``` + +Narrow memory evolution increment: + +```sh +cargo make real-world-memory-evolution +``` + +Artifacts: + +```text +tmp/real-world-memory/evolution-report.json +tmp/real-world-memory/evolution-report.md +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/evolution/` and reports only +the cases added for current-versus-historical interpretation and temporal staleness. +The relation temporal-validity fixture is encoded and scores current owner, +historical owner, update rationale, and stale-owner trap behavior. + +Current checked-in retrieval-quality increment: + +```sh +cargo make real-world-memory-retrieval +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/retrieval/`, writes +`tmp/real-world-memory/retrieval-report.json`, and renders +`tmp/real-world-memory/retrieval-report.md`. The fixture set covers alternate +phrasing, distractor-heavy retrieval, multi-hop routing, current-versus-obsolete +selection, minimal sufficient context, and trace-backed stage attribution for +operator debugging. Reports include expected evidence recall, irrelevant context ratio, +latency/cost, and optional trace explainability metadata. The qmd and OpenViking +references in these fixtures are design references only; no parity claim is allowed +unless an external adapter run actually provides evidence. + +Operator debugging UX increment: + +```sh +cargo make real-world-job-operator-ux +``` + +Artifacts: + +```text +tmp/real-world-job/real-world-job-operator-ux-report.json +tmp/real-world-job/real-world-job-operator-ux-report.md +``` + +The operator UX fixtures live under +`apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/`. They cover dropped +expected evidence, rerank promotion of a bad candidate, provider latency or failure, +Qdrant rebuild result changes, and misleading relation context. Reports include direct +viewer and admin trace bundle links, steps to root cause, whether raw SQL was needed, +dropped-candidate visibility, trace completeness, repair-action clarity, and any +encoded UX gaps. + +Checked-in evidence snapshot: +`docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md`. + +The same `real-world-memory` target also includes the current consolidation fixtures +under the same fixture root. + +Current checked-in consolidation increment: + +```sh +cargo make real-world-memory-consolidation +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/consolidation/`, writes +`tmp/real-world-memory/consolidation/report.json`, and renders +`tmp/real-world-memory/consolidation/report.md`. The consolidation report includes +proposal usefulness, lineage completeness, review action correctness, proposal +unsupported-claim count, executable gap count, and source mutation count. Source +mutation count must remain `0` for proposal-only cases. + +These fixtures use the same reviewable proposal shape as the runtime manual/fixture +consolidation service. They remain offline fixture responses and do not claim scheduled +provider-backed proposal generation. + +Current live consolidation increment: + +```sh +cargo make real-world-memory-live-consolidation +``` + +This runs only `apps/elf-eval/fixtures/real_world_memory/consolidation/` through the +ELF live service adapter and writes: + +```text +tmp/real-world-memory/live-consolidation/elf-materialization.json +tmp/real-world-memory/live-consolidation/elf-report.json +tmp/real-world-memory/live-consolidation/elf-report.md +tmp/real-world-memory/live-consolidation/summary.json +``` + +The live increment proves service-backed proposal materialization and review audit for +the current checked-in consolidation jobs. It does not implement scheduled production +consolidation, live provider-generated proposal quality, source-of-truth rewrites, or +knowledge-page rebuild/lint scoring. + +Current checked-in knowledge-compilation increment: + +```sh +cargo make real-world-memory-knowledge +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/knowledge/`, writes +`tmp/real-world-memory/knowledge-report.json`, and renders +`tmp/real-world-memory/knowledge-report.md`. The fixtures include synthetic project, +entity, concept, and issue-timeline page artifacts. Generated pages are benchmark +artifacts only: every section must cite source evidence or timeline events, or it must +be explicitly flagged unsupported. The report publishes citation coverage, stale claim +detection, rebuild determinism, aggregate backlink counts and page coverage, page +usefulness, unsupported summary count, and untraced section count. + +Current checked-in production-ops increment: + +```sh +cargo make real-world-memory-production-ops +``` + +Artifacts: + +```text +tmp/real-world-memory/production-ops-report.json +tmp/real-world-memory/production-ops-report.md +``` + +The production-ops fixtures live under +`apps/elf-eval/fixtures/real_world_memory/production_ops/`. They encode user-job +readback over existing public benchmark and restore evidence: interrupted backfill +resume from checkpoint, clean-run comparison, backup/restore readback, Qdrant rebuild +from Postgres-held vectors, cold-start search recovery, and resource-envelope +interpretation. + +The same slice deliberately keeps non-pass boundaries typed. A missing private +production manifest is `blocked`, unavailable provider credentials are `blocked`, and +the OpenViking cold-start dependency fixture now records a pinned Docker-local +embedding path that reaches `OpenViking.add_resource` and `OpenViking.find` but returns +`wrong_result` evidence for the smoke queries. If the pinned wheel cannot install or +import on a Docker platform, that setup boundary remains `incomplete`. These states +are evidence for operator caveats, not proof of private-corpus, provider-backed +production, or external-adapter quality success. + +This suite does not run private corpus data, does not require or publish credentials, +does not perform live Docker restore/backfill work, and does not reinterpret older +live-baseline reports as real-world production-ops wins. For personal production +adoption, cite both the relevant live-baseline or restore proof and this real-world +fixture report; rerun `baseline-production-private` with an operator-owned manifest +before claiming private-corpus retrieval quality. + +Do not treat the full live adapter sweep as a private-corpus or production-ops +adoption verdict. It is a full-suite sweep with typed non-pass states, not a +full-suite pass. diff --git a/docs/guide/benchmarking/real_world_memory_evolution.md b/docs/guide/benchmarking/real_world_memory_evolution.md new file mode 100644 index 00000000..af578a15 --- /dev/null +++ b/docs/guide/benchmarking/real_world_memory_evolution.md @@ -0,0 +1,69 @@ +# Real-World Memory Evolution Benchmark + +Goal: Run and interpret the checked-in memory evolution real-world job fixtures. +Read this when: You need to test current facts, historical facts, stale facts, +conflicts, corrected memories, and temporal relation validity. +Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`, +`apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`. +Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, +`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and +`docs/guide/research/comparison_external_projects.md`. +Outputs: `tmp/real-world-memory/evolution-report.json` and +`tmp/real-world-memory/evolution-report.md`. + +## Scope + +This suite is part of the real-world job benchmark family. It is not a Docker +live-baseline retrieval matrix and does not claim private production readiness. + +The checked-in fixture set covers: + +- User preference supersession, using mem0-style memory history and Letta-style + current operating memory as reference patterns. +- Issue state evolution from blocked to done. +- Production deployment guidance superseding a local smoke quickstart. +- Benchmark adoption verdict reversal with a bounded private-corpus caveat. +- Relation fact current-versus-historical ownership with graph-lite temporal + validity encoded as a normal pass/fail fixture. + +The relation case borrows from Graphiti/Zep temporal validity and nanograph typed +query ergonomics while preserving ELF's Postgres source-of-truth and evidence-link +requirements. + +## Run + +```sh +cargo make real-world-memory-evolution +``` + +Generated artifacts: + +```text +tmp/real-world-memory/evolution-report.json +tmp/real-world-memory/evolution-report.md +``` + +## Metrics + +The runner reports memory evolution counters at summary, suite, and job levels: + +- `stale_answer_count`: stale negative traps or stale-current forbidden claims used + by produced answers. +- `conflict_detection_count`: current-versus-historical conflicts detected with + both current and historical evidence. +- `update_rationale_available_count`: jobs where the produced answer cites the + update rationale. +- `temporal_validity_not_encoded_count`: jobs that require temporal graph validity + but are deliberately declared `not_encoded`; this should be `0` for the checked-in + evolution fixture set. +- selected lifecycle evidence fields at job level: + `selected_current_evidence`, `selected_historical_evidence`, + `selected_rationale_evidence`, `selected_tombstone_evidence`, and + `selected_invalidation_evidence`. +- `unsupported_claim_count`: existing real-world job unsupported claim counter. + +Runnable jobs should have `stale_answer_count = 0`, nonzero conflict detection, and +an update rationale when the fixture provides one. The relation temporal-validity job +should report temporal validity as encoded and pass only when current and historical +relation evidence are distinguished. Delete/TTL jobs should keep tombstone or +invalidation evidence selected while suppressing the deleted fact as a current answer. diff --git a/docs/guide/competitive_parity_testing.md b/docs/guide/competitive_parity_testing.md new file mode 100644 index 00000000..328bdd91 --- /dev/null +++ b/docs/guide/competitive_parity_testing.md @@ -0,0 +1,80 @@ +# Competitive Parity Testing + +Goal: Run the Docker-only parity gate that decides whether ELF has enough evidence to be considered against external memory systems. +Read this when: You need to prove ELF meets the minimum adoption bar instead of relying on architecture claims. +Preconditions: Docker and Docker Compose are available on the host. +Depends on: `docs/spec/system_competitive_parity_gate_v1.md`, `docs/guide/research/agentmemory_adapter.md`, and `Makefile.toml`. +Verification: `cargo make parity-docker` exits successfully and writes `tmp/parity/competitive-parity-report.json` with `verdict = "pass"`. + +## Run + +Start the gate from the repository root: + +```sh +cargo make parity-docker +``` + +This command invokes Docker Compose on the host. The actual adapter check, +service-backed ELF run, Postgres database, Qdrant vector store, Cargo registry cache, +and Rust build target all run inside Docker-managed containers or volumes. + +The report is written to: + +```text +tmp/parity/competitive-parity-report.json +``` + +## Clean Up + +Remove parity containers and Docker-managed volumes: + +```sh +cargo make clean-parity-docker +``` + +The cleanup command removes Postgres, Qdrant, Cargo cache, and Rust target volumes +for the parity environment. It does not remove the host report directory under +`tmp/parity/`. + +## Current Gate Coverage + +The checked-in gate currently proves this minimum set: + +- the agentmemory fixture adapter maps the sanitized sample into 2 note candidates, + 2 doc candidates, 1 baseline query, and 1 explicit ignored item; +- note candidate source references keep the agentmemory fixture resolver and origin + identifiers; +- unsupported agentmemory memory kinds are rejected with the preserved reason + `unsupported_memory_kind`; +- ELF can run a Postgres/Qdrant-backed retrieval and consolidation harness in Docker; +- consolidation preserves or improves recall while keeping retrieved context size no + larger than the baseline run; +- the local admin viewer route returns 200 during the Docker service run. + +This is not enough for personal production adoption by itself. It is the required +floor that prevents subjective comparisons from being mistaken for evidence. + +## Production Adoption Expansion + +Before using ELF as personal production memory infrastructure, extend the same gate +with private data and live baselines: + +1. Build a sanitized private fixture pack from real personal coding-agent memory + cases. Keep the source fixture out of the repository unless it has been reviewed + for secrets and sensitive content. +2. Run the adapter/import/retrieval path against that private fixture pack inside + Docker. +3. Add at least one live containerized external baseline, starting with agentmemory, + against the same retrieval cases. +4. Keep the acceptance decision strict: ELF is not adopted if it loses on retrieval + quality, migration fidelity, operator inspectability, or failure recovery without + a documented compensating advantage. + +## Failure Handling + +When `cargo make parity-docker` fails: + +- keep `tmp/parity/competitive-parity-report.json` if it was written; +- inspect `tmp/parity/consolidation-harness.log` for service-backed failures; +- fix the failing gate dimension before expanding to broader baselines; +- do not lower thresholds to make a comparison pass. diff --git a/docs/guide/development/dependency_upgrade_workflow.md b/docs/guide/development/dependency_upgrade_workflow.md deleted file mode 100644 index 8b2da3a7..00000000 --- a/docs/guide/development/dependency_upgrade_workflow.md +++ /dev/null @@ -1,23 +0,0 @@ -# Dependency Upgrade Workflow - -This guide standardizes how to upgrade Rust dependencies while keeping version requirements consistent and low-risk. - -## Version format policy - -- Use `major.minor` in version requirements when possible. -- Avoid patch pins unless a specific patch is required for correctness or security. -- For `0.x` dependencies, prefer minor-capped ranges to avoid overly broad upgrades. -- In `Cargo.toml`, normalize dependency entries to inline table form with an explicit `version` key, even when no features are required. -- Do not edit lockfiles by hand. Regenerate them with the appropriate tool. - -Exception: If a minimum patch is required, document the reason and use an explicit range such as `>=X.Y.Z,<X.(Y+1)`. - -## Rust (Cargo) - -1. Normalize dependency entries to inline table form with an explicit `version` key. -2. Keep dependency requirements in `Cargo.toml` at `major.minor` unless a patch pin is required. -3. Run `cargo update -w` from the repository root to refresh `Cargo.lock`. - -## Verification - -- Run `cargo make test` or targeted Rust tests when Rust dependencies change. diff --git a/docs/guide/development/issue_labeling.md b/docs/guide/development/issue_labeling.md new file mode 100644 index 00000000..cbf18466 --- /dev/null +++ b/docs/guide/development/issue_labeling.md @@ -0,0 +1,108 @@ +# Issue Labeling + +Goal: Standardize how Linear issues are labeled in this repository. +Read this when: You are creating, revising, or auditing Linear labels and issue triage. +Inputs: The current Linear workspace labels plus the repository's issue taxonomy needs. +Depends on: Existing label groups and the repository's development workflow. +Verification: Labels remain consistent, searchable, and aligned with the documented taxonomy. + +This guide standardizes how Linear issues are labeled in this repository. + +Tracker policy: + +- Linear is the authoritative issue tracker for planning, triage, and delivery. +- GitHub remains the code hosting, pull request review, release, and CI surface. +- GitHub Issues are not part of the planning, triage, or delivery workflow. + +## Goals + +- Make issues easy to route to the right owner (system area). +- Make the intent of an issue explicit (feature, bug, architecture, spec, research, performance, chore). +- Support cross-cutting workflows by tagging evaluation, reliability, provenance, cost, and governance themes. + +## Label description style + +Label descriptions must be short, clear sentences and must end with terminal punctuation (usually a period). + +## Label taxonomy + +### `kind:*` (required, exactly one) + +Every issue must have exactly one `kind:*` label. + +- `kind:epic`: Umbrella issue that tracks multiple deliverables. +- `kind:feat`: New capability or product behavior that is not primarily a refactor or cleanup. +- `kind:arch`: Architecture and design changes that affect system shape, boundaries, or major flows. +- `kind:spec`: Specification or contract definition (APIs, schemas, invariants, query semantics). +- `kind:research`: Investigation, evaluation, or spike that produces a decision memo or research artifact. +- `kind:perf`: Performance and efficiency improvements (latency, throughput, storage, cost). +- `kind:bug`: Something is not working as intended. +- `kind:chore`: Maintenance work that does not fit other kinds. + +### `area:*` (required, one or more) + +Every issue must have at least one `area:*` label. + +Use `area:*` for ownership and routing. Prefer one primary area and add additional areas only when the change clearly spans multiple subsystems. + +Current areas: + +- `area:api`: HTTP API service and request/response contracts. +- `area:service`: Retrieval logic, ranking, and request orchestration. +- `area:storage`: Postgres schema, SQL queries, and storage correctness. +- `area:providers`: Embedding, rerank, and extractor provider integrations. +- `area:worker`: Background workers, outbox processing, and indexing pipelines. +- `area:mcp`: MCP server and tool routing. +- `area:ui`: Viewer and developer-facing UI work. +- `area:docs`: Documentation and developer experience docs. +- `area:ops`: Local dev, scripts, and operational runbooks. +- `area:security`: Authentication, secrets, and security hygiene. +- `area:observability`: Logging, tracing, and diagnostics. + +### `status:*` (optional, at most one) + +Use `status:*` when an issue is intentionally not progressing. + +- `status:deferred`: Not planned for the near term. +- `status:blocked`: Cannot proceed until dependencies are resolved. The issue body should include a short "Blocked by" section. + +### `theme:*` (optional, any number) + +Use `theme:*` to tag cross-cutting concerns that benefit from consistent closed-loop workflows. + +- `theme:governance`: Approval workflows, review queues, policy, and auditability. +- `theme:evaluation`: Quality measurement, gold sets, regressions, and metrics. +- `theme:provenance`: Evidence, citations, lineage, and explainability. +- `theme:reliability`: Correctness, consistency, failure handling, and operational robustness. +- `theme:cost`: Latency, compute, storage, and cost controls. + +### Reserved labels + +These labels exist for automation and should not be repurposed. + +- `dependencies`: Dependency updates (Dependabot and tooling). +- `bot`: Automated issue or pull request created by a bot. + +## Labeling rules + +1. Add exactly one `kind:*` label. +2. Add at least one `area:*` label. +3. Add `status:*` only when it materially affects planning (deferred or blocked). +4. Add `theme:*` labels when the work is explicitly about governance, evaluation, provenance, reliability, or cost. + +## Examples + +- Postgres schema correctness bug: + - `kind:bug`, `area:storage`, `theme:reliability`. +- Add an optional auth mechanism: + - `kind:feat`, `area:api`, `area:security`, `theme:governance`. +- Retrieval ranking experiment: + - `kind:research`, `area:service`, `theme:evaluation`. +- Performance work postponed: + - `kind:perf`, `area:service`, `status:deferred`, `theme:cost`. + +## Query patterns + +- All epics: `kind:epic`. +- Open feature work: `kind:feat` with non-completed workflow state. +- Reliability issues in storage: `area:storage` + `theme:reliability`. diff --git a/docs/guide/development/languages/index.md b/docs/guide/development/languages/index.md deleted file mode 100644 index d418fee8..00000000 --- a/docs/guide/development/languages/index.md +++ /dev/null @@ -1,7 +0,0 @@ -# Language and Stack Guides - -Purpose: Provide a single entry point for language- or stack-specific development rules. - -## Languages - -- `docs/guide/development/languages/rust.md` — Rust development and style rules. diff --git a/docs/guide/development/languages/rust.md b/docs/guide/development/languages/rust.md deleted file mode 100644 index ed72f446..00000000 --- a/docs/guide/development/languages/rust.md +++ /dev/null @@ -1,225 +0,0 @@ -# Rust Development and LLM-Friendly Style Guide - -This guide defines the Rust rules for this repository. It is optimized for LLM readability, deterministic diffs, and safe execution. All comments and messages must also follow the Global Language Rules in `AGENTS.md`. - -## Scope - -These rules apply to Rust crates, binaries, and tooling in this repository. They do not apply to non-Rust projects. - -## Rule Levels - -- Required: Must be followed. No exceptions without explicit approval. -- Preferred: Strong default. Exceptions are allowed with a brief justification in code comments. -- Optional: Suggestions that can be used when helpful. -- Imperative statements without a label are Required. -- `rustfmt` output is the final authority for formatting. - -## Decision Priorities - -Use this priority order when trade-offs appear: - -1. Correctness and safety. -2. Deterministic behavior and reproducibility. -3. LLM readability and auditability. -4. Simplicity of implementation. -5. Performance. - -## Tooling and Workflow (Required) - -- The Rust toolchain is pinned. Do not modify `rust-toolchain.toml`, `.cargo/config.toml`, or `.rustfmt.toml`. -- Do not install, update, or override toolchains. -- Do not invoke system package managers. -- Use `cargo make` tasks when they are a good fit for formatting, linting, and testing. - -## Runtime Safety (Required) - -- Do not use `unwrap()` in non-test code. -- `expect()` requires a clear, user-actionable message. - -## Time and TLS (Required) - -- Use the `time` crate for all date and time types. Do not add `chrono`. -- Prefer rustls for TLS. Only use native-tls when rustls is not supported. - -## Formatting and Layout (Required) - -- Use tabs (`\t`) for indentation. - -### Module Item Order (Required) - -At module scope, order items as follows: - -``` -mod -use -macro_rules! -type -const -static -trait -enum -struct -impl -fn -``` - -Additional rules: - -- Within each group, place `pub` items before non-`pub` items. -- Within the `fn` group at the same visibility, place non-`async` functions before `async` functions. -- Tests must be declared last, after all other items. -- Inside `#[cfg(test)] mod tests`, you must use `use super::*;`. - -### File Structure (Required) - -- Use a flat module structure. Do not create or keep `mod.rs`. If `mod.rs` exists, flatten it into `a.rs` and `a/xxx.rs` style files. - -## Imports and Paths (Required) - -Use only these import headers: - -- `// std` for `std::`. -- `// crates.io` for third-party crates. -- `// self` for `crate::`, `self::`, `super::`, or workspace member crates. - -Rules: - -- Do not import functions directly. Import the module or type and call `module::function(...)`. -- Calls to functions or macros must use a single module qualifier, such as `parent::function(...)` or `parent::macro!(...)`, unless the function or macro is defined in the same file. -- Standard library macros must be used without a `std::` qualifier, such as `vec!`, `format!`, or `println!`. -- If `crate::prelude::*` is imported, do not add redundant imports. -- Avoid glob imports. In tests, prefer `use super::*;` when it is used. Otherwise, avoid glob imports except an existing prelude. - -## Types and `impl` Blocks (Required) - -- Use `Self` instead of the concrete type name in `impl` method signatures. -- Keep `impl` blocks for a type contiguous in the `impl` section. -- Order `impl` blocks as: inherent, standard library traits, third-party traits, project traits. - -## Generics and Trait Bounds (Required) - -- All trait bounds must be in a `where` clause. -- Inline trait bounds are not allowed. -- You may use `impl Trait` in parameters or return positions. - -## Error Handling (Required) - -- Add context at crate or module boundaries and keep the original error as the source. -- Boundaries include public APIs, entrypoints, and module-level helpers that are consumed outside the module. -- Use `#[error(transparent)]` only for thin wrappers where this crate adds no context and the upstream message is already sufficient for developers. -- Use short, action-oriented error messages that include the source error. -- Use `ok_or_else` to convert `Option` to `Result` with context. - -## Logging (Required) - -- Use fully qualified tracing macros, such as `tracing::info!`. -- Do not import tracing macros. -- Always use structured fields for dynamic values such as identifiers, names, counts, and errors. -- Use short, action-oriented messages as complete sentences. - -## Numeric Literals (Required) - -- Separate numeric literal suffixes with a single underscore, for example `10_f32`. -- Insert underscores every three digits for integers with more than three digits, for example `1_000_000`. - -## Readability Preferences (Preferred) - -- Keep one logical operation per line. -- Prefer functions at or under 100 lines. Extract helpers when a function exceeds 120 lines or the happy path is no longer obvious. -- Limit nesting depth to two levels. Extract helpers if deeper nesting appears. -- Prefer guard clauses and early returns to keep the happy path linear. -- Avoid complex `if let` or `match` guards. Extract a named boolean when logic grows. -- Use descriptive names and avoid single-letter locals except for trivial indices like `i`. -- Prefer explicit type annotations when inference spans multiple steps or reduces clarity. -- Prefer struct literals with named fields over `Default::default()` when fields matter. -- Avoid struct update syntax (`..`) unless the remaining fields are truly irrelevant. -- Keep boolean expressions short; extract them into named variables when they grow. -- Prefer type annotations on `let` bindings or function signatures. Use turbofish only when those locations cannot express the type. - -## Functional Style (Preferred) - -Functional style is allowed and preferred when it stays simple and readable. - -- Limit iterator chains to at most three method calls after the base expression. -- Closures must be single-expression and side-effect free. -- If a closure needs `if`, `match`, or multiple statements, extract a named function. -- Avoid chaining `flat_map`, `filter_map`, `zip`, and `fold` in a single pipeline. -- Use `for` loops when you need multiple mutable state variables, `break`, or `continue`. - -Example (preferred): - -```rust -let filtered: Vec<_> = items.iter().filter(|item| item.is_valid()).collect(); -let mapped: Vec<_> = filtered.into_iter().map(build_item).collect(); -``` - -Example (avoid): - -```rust -let result: Vec<_> = items - .iter() - .filter(|item| item.is_valid()) - .map(|item| build_item(item)) - .filter(|item| item.score > threshold) - .collect(); -``` - -## Borrowing and Ownership (Preferred) - -- Prefer borrowing with `&` over `.as_*()` conversions when both are applicable. -- Avoid `.clone()` unless it is required by ownership or lifetimes, or it clearly improves clarity. -- Use `into_iter()` when intentionally consuming collections. -- Do not use scope blocks solely to end a borrow. -- When an early release is required, use an explicit `drop`. -- When the value is a reference and you need to end a borrow without a drop warning, use `let _ = value;`. - -## Vertical Spacing (Preferred) - -Inside Rust functions: - -- Do not insert blank lines within the same statement type. -- Insert one blank line between different statement types. -- Insert exactly one blank line before the final return or tail expression, unless the body is a single expression. - -Treat statements as the same type when they share the same syntactic form or call target, such as: - -- Multiple `let` statements. -- Multiple `let mut` statements. -- Multiple `if` statements. -- Multiple `if let` statements. -- Multiple `match` statements. -- Multiple `for` loops. -- Multiple `while` loops. -- Multiple `loop` loops. -- Multiple calls to the same macro name (for example, `println!` with `println!`, or `tracing::...` with `tracing::...`). -- Multiple `Type::function(...)` calls. -- Multiple `self.method(...)` calls. -- Multiple assignment statements like `a = b`. - -Additional rules. - -- Treat `let` and `let mut` as different statement types. -- Different macro names are different statement types. -- When both appear together, place `let` statements before `let mut` statements. - -## Comments and Documentation (Required) - -- Comments must be full sentences with proper punctuation. -- Use comments only when intent is not clear from names and types. -- Public items should have doc comments when the intent is not obvious. - -## Tests (Required) - -- Use descriptive test names in `snake_case` that encode the behavior and expected outcome. -- Tests must be deterministic to keep LLM reasoning and CI outcomes stable. -- Integration tests that require external services must be marked `#[ignore]` with a clear message about required dependencies. - -## LLM Readability Checklist (Required) - -Before finalizing a Rust change, ensure the following: - -- Functions are short, flat, and linear. -- Iterator chains are short and clear. -- Error boundaries are explicit. -- Logging uses structured fields. -- Names convey intent without relying on comments. diff --git a/docs/guide/eval-structured-facts-sample.json b/docs/guide/eval-structured-facts-sample.json new file mode 100644 index 00000000..96838d74 --- /dev/null +++ b/docs/guide/eval-structured-facts-sample.json @@ -0,0 +1,29 @@ +{ + "name": "structured-facts-sample", + "defaults": { + "tenant_id": "tenant-1", + "project_id": "project-1", + "agent_id": "agent-1", + "read_profile": "all_scopes", + "top_k": 12, + "candidate_k": 60 + }, + "queries": [ + { + "id": "facts-1", + "query": "what policy do we use for reranking", + "expected_note_ids": ["11111111-1111-1111-1111-111111111111"] + }, + { + "id": "facts-2", + "query": "where are embeddings stored", + "expected_note_ids": ["22222222-2222-2222-2222-222222222222"] + }, + { + "id": "facts-3", + "query": "what is the max evidence quotes per extracted note", + "expected_note_ids": ["33333333-3333-3333-3333-333333333333"] + } + ] +} + diff --git a/docs/guide/evaluation.md b/docs/guide/evaluation.md index a0b54d98..39441ab9 100644 --- a/docs/guide/evaluation.md +++ b/docs/guide/evaluation.md @@ -1,15 +1,34 @@ # Retrieval Evaluation -Purpose: Provide a repeatable way to measure memory retrieval quality and prevent regressions. +Goal: Provide a repeatable way to measure memory retrieval quality and prevent regressions. +Read this when: You need to run retrieval evaluations or compare quality before and after a change. +Inputs: An ELF config file plus an evaluation dataset or saved trace fixture. +Depends on: `elf-eval`, `Makefile.toml`, and the search-related system specs. +Verification: Evaluation commands complete and produce metrics or regression outputs you can compare. ## Tool -Use the `elf-eval` app to run an evaluation against a dataset of queries and expected note IDs. +Use the `elf-eval` app to run an evaluation against a dataset of queries and expected notes. Example: ```bash -cargo run -p elf-eval -- --config ./elf.toml --dataset ./docs/guide/eval-sample.json +cargo run -p elf-eval -- -c ./elf.toml --dataset ./docs/guide/eval-sample.json +``` + +Search-mode selection: + +```bash +# Run the evaluation using the quick_find (faster) search mode. +cargo run -p elf-eval -- -c ./elf.toml --dataset ./docs/guide/eval-sample.json --search-mode quick_find + +# Compare two configs while forcing different modes per side (A vs B). +cargo run -p elf-eval -- \ + -c ./elf.a.toml \ + --config-b ./elf.b.toml \ + --dataset ./docs/guide/eval-sample.json \ + --search-mode planned_search \ + --search-mode-b quick_find ``` ## Dataset format @@ -35,6 +54,11 @@ The dataset is JSON with optional defaults and a list of queries. "11111111-1111-1111-1111-111111111111", "22222222-2222-2222-2222-222222222222" ] + }, + { + "id": "q-2", + "query": "how do we consolidate duplicate incident notes", + "expected_keys": ["incident_merge_protocol"] } ] } @@ -44,9 +68,13 @@ Each query supports these fields: - `id` (optional): A human-friendly identifier for the query. - `query` (required): The search query text. -- `expected_note_ids` (required): One or more note IDs expected in the results. +- `expected_note_ids` (optional): One or more note IDs expected in the results. +- `expected_keys` (optional): One or more semantic note keys expected in the results. +- Exactly one of `expected_note_ids` or `expected_keys` must be set per query. - `tenant_id`, `project_id`, `agent_id`, `read_profile` (optional): Override defaults. - `top_k`, `candidate_k` (optional): Override defaults. +- `ranking` (optional): A request-scoped ranking override (for example, `ranking.blend.enabled`, + `ranking.blend.segments`, or normalization settings). Resolution order for `top_k` and `candidate_k` is: @@ -64,8 +92,305 @@ The command prints a JSON report containing summary metrics and per-query detail - `mean_rr` - `mean_ndcg` - `latency_ms_p50` and `latency_ms_p95` +- `queries[].trace_id` (and `queries[].trace_ids` when `runs_per_query > 1`) for trace-based replay. ## Notes - The evaluation tool uses the configured embedding and rerank providers. +- The evaluation tool can run in either search mode: + - `--search-mode quick_find` (lower latency) + - `--search-mode planned_search` (planning-enabled path; useful when you need query plans and staged trajectory metadata) + - When running a config comparison with `--config-b`, you can set `--search-mode-b` to override the mode for the B side. +- To compare against sanitized agentmemory session fixtures without running an agentmemory server, use + `docs/guide/research/agentmemory_adapter.md`. - The dataset should avoid secrets and sensitive data. +- To persist traces for later replay without running `elf-worker`, set `search.explain.write_mode = "inline"` + in the config used by `elf-eval`. +- To compare ranking policies on a fixed candidate set without re-running Qdrant, use trace compare mode: + - Run: `cargo run -p elf-eval -- -c ./elf.a.toml --config-b ./elf.b.toml --trace-id <uuid1> <uuid2>` + - Requirements: `search.explain.capture_candidates = true` when generating traces, and candidates must not be + expired by `search.explain.candidate_retention_days`. + +## CI Trace Regression Gate + +CI runs a trace regression gate to catch unintended ranking changes on a fixed candidate set. + +What it checks: + +- Replays ranking from stored `search_trace_candidates` for each `trace_id` (no Qdrant or external providers). +- Compares the replayed top-k `note_id`s against the baseline `search_trace_items` for the same trace. +- Enforces thresholds from a gate JSON file: + - `max_positional_churn_at_k` and `max_set_churn_at_k`. + - `min_retrieval_top_rank_retention` (retention over candidates with `retrieval_rank <= retrieval_retention_rank`). +- Fails if the baseline or replay returns fewer than `top_k` items. + +Run locally: + +```bash +# Load the CI fixture into a local Postgres database. +psql "postgres://postgres:postgres@127.0.0.1:5432/elf" -v ON_ERROR_STOP=1 -f sql/init.sql +psql "postgres://postgres:postgres@127.0.0.1:5432/elf" -v ON_ERROR_STOP=1 -f .github/fixtures/trace_gate/fixture.sql + +# Run the gate (reads Postgres DSN from the config). +cargo run -p elf-eval --bin trace_regression_gate -- \ + -c .github/fixtures/trace_gate/config.toml \ + -g .github/fixtures/trace_gate/gate.json \ + --out tmp/trace-regression-gate.report.json +``` + +Update baseline: + +- Re-record the baseline trace items/candidates with the intended baseline build/config, regenerate the fixture, + then update the gate JSON (trace IDs and thresholds) used by CI. + +Export fixtures: + +- Use `elf-eval` to export one or more trace IDs into a deterministic SQL fixture (assumes an empty database): + +```bash +cargo run -p elf-eval --bin trace_gate_export -- \ + -c ./elf.toml \ + --trace-id <uuid1> --trace-id <uuid2> \ + --out tmp/trace-gate.fixture.sql +``` + +- If you also want stage data for trace compare mode, add `--include-stages`. + +Notes: + +- Keep fixtures sanitized (no secrets, no customer data, no proprietary content). +- Treat fixture updates like snapshot updates: update only when a ranking change is intentional, and review the + diff in Git. + +Artifacts: + +- The gate outputs a JSON report (stdout, or the `--out` file) with per-trace metrics and any breached thresholds. + +## Context Misranking Harness + +To measure cross-scope misranking before and after enabling context boosting, use the harness +script: + +```bash +cargo make test-e2e +``` + +Or run the script directly: + +```bash +scripts/context-misranking-harness.sh +``` + +What it does: + +- Creates a dedicated database (default: `elf_e2e`). +- Creates a dedicated Qdrant collection for the run (default: `elf_harness_<run_id>`). +- Starts `elf-worker` and `elf-api` with deterministic local providers: + - `providers.embedding.provider_id = "local"` (token-hash embedding). + - `providers.rerank.provider_id = "local"` (token overlap rerank). +- Inserts two notes with identical text in different scopes (`org_shared` and `project_shared`), + with importance configured to intentionally produce baseline misranking. +- Runs `elf-eval` twice: + - Baseline: no `[context]`. + - Context: `context.scope_descriptions` + `context.scope_boost_weight`. +- Prints `recall@1` and the top-ranked note ID for both runs, then deletes the notes. +- Deletes the dedicated database and collection unless `ELF_HARNESS_KEEP_DB=1` or + `ELF_HARNESS_KEEP_COLLECTION=1` is set. + +Prerequisites: + +- Postgres is running and reachable. +- Qdrant is running and reachable. +- Environment variables are set: + - `ELF_PG_DSN` (base DSN, typically ending in `/postgres`) + - `ELF_QDRANT_GRPC_URL` (Qdrant gRPC URL, commonly `http://127.0.0.1:51890` in this repository) + - `ELF_QDRANT_HTTP_URL` (Qdrant REST URL, commonly `http://127.0.0.1:51889` in this repository) + +Operational notes: + +- The harness builds once and then starts `elf-worker` and `elf-api` by executing `target/debug/...`. + If you are running the services manually, prefer `cargo build` plus direct binary execution over + running multiple `cargo run` processes concurrently, which can lead to Cargo lock contention and + slow startup. +- If the health check does not become ready, inspect `tmp/elf.harness.api.log` and + `tmp/elf.harness.worker.log` for the first startup error. +- `psql`, `curl`, `taplo`, and `jaq` (or `jq`) are installed. + +## Search Modes Latency Benchmark + +To validate the search-modes acceptance criterion that `quick_find` has **lower p95 latency** than +`planned_search`, run a small benchmark using `elf-eval` search-mode selection. + +This procedure uses the ranking-stability harness to seed a deterministic dataset (local providers), +then runs `elf-eval` twice on the same queries. + +### 1) Seed a benchmark dataset (kept for follow-up eval runs) + +```bash +ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ +ELF_HARNESS_DB_NAME="elf_search_mode_bench" \ +ELF_HARNESS_COLLECTION="elf_search_mode_bench_$(date +%s)" \ +ELF_HARNESS_VECTOR_DIM=256 \ +ELF_HARNESS_KEEP_DB=1 \ +ELF_HARNESS_KEEP_COLLECTION=1 \ +scripts/ranking-stability-harness.sh +``` + +Notes: + +- The harness writes `tmp/elf.stability.base.toml` and `tmp/elf.stability.dataset.json`. +- With `ELF_HARNESS_KEEP_DB=1` and `ELF_HARNESS_KEEP_COLLECTION=1`, you must clean up manually (see + cleanup section below). + +### 2) Create a multi-query dataset (for meaningful percentiles) + +`elf-eval` reports p50/p95 over per-query latencies. Duplicate the seeded query into N entries: + +```bash +python - <<'PY' +import json +from pathlib import Path + +src = Path("tmp/elf.stability.dataset.json") +data = json.loads(src.read_text()) +base_query = data["queries"][0]["query"] +expected = data["queries"][0].get("expected_note_ids") or [] + +N = 50 +data["name"] = "search-modes-latency-bench" +data["queries"] = [ + {"id": f"mode-lat-{i+1:02d}", "query": base_query, "expected_note_ids": expected} + for i in range(N) +] + +out = Path("tmp/elf.search_modes_latency.dataset.json") +out.write_text(json.dumps(data, indent=2) + "\n") +print(out) +PY +``` + +### 3) Run `elf-eval` in each mode and compare p95 + +Quick: + +```bash +(cargo run -q -p elf-eval -- -c tmp/elf.stability.base.toml --dataset tmp/elf.search_modes_latency.dataset.json --search-mode quick_find) \ + | awk 'BEGIN{started=0} /^\{/{started=1} {if(started) print}' \ + > tmp/elf.search_modes_latency.quick.json + +jq -r '.summary.latency_ms_p50, .summary.latency_ms_p95' tmp/elf.search_modes_latency.quick.json +``` + +Planned: + +```bash +(cargo run -q -p elf-eval -- -c tmp/elf.stability.base.toml --dataset tmp/elf.search_modes_latency.dataset.json --search-mode planned_search) \ + | awk 'BEGIN{started=0} /^\{/{started=1} {if(started) print}' \ + > tmp/elf.search_modes_latency.planned.json + +jq -r '.summary.latency_ms_p50, .summary.latency_ms_p95' tmp/elf.search_modes_latency.planned.json +``` + +Acceptance check: + +- `quick_find.summary.latency_ms_p95 < planned_search.summary.latency_ms_p95` on the same dataset. + +Reference run (2026-03-04, macOS local Postgres/Qdrant, local providers, `vector_dim=256`, N=50, +`top_k=10`, `candidate_k=60`): + +- `quick_find`: p50 ≈ 9.82ms, p95 ≈ 12.55ms +- `planned_search`: p50 ≈ 9.45ms, p95 ≈ 22.76ms + +### 4) Cleanup + +Drop the benchmark database and delete the benchmark collection (replace the collection name with +the one you used in `ELF_HARNESS_COLLECTION`): + +```bash +psql "postgres://postgres:postgres@127.0.0.1:51888/postgres" -tAc \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'elf_search_mode_bench' AND pid <> pg_backend_pid();" >/dev/null +psql "postgres://postgres:postgres@127.0.0.1:51888/postgres" -v ON_ERROR_STOP=1 -c \ + "DROP DATABASE IF EXISTS elf_search_mode_bench;" >/dev/null +curl -sS -X DELETE "http://127.0.0.1:51889/collections/<collection>?wait=true" >/dev/null +``` + +## Ranking Stability Harness + +To empirically measure rank churn reduction from deterministic ranking terms, use the harness +script: + +```bash +ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ +scripts/ranking-stability-harness.sh +``` + +What it does: + +- Creates a dedicated database and Qdrant collection for the run. +- Ingests a synthetic dataset with many near-tied candidates. +- Enables a local noisy rerank model to simulate reranker instability. +- Compares `elf-eval` stability metrics with deterministic ranking disabled vs enabled. + +## Consolidation Harness + +To validate the reflection/consolidation loop with stable query assertions, use the harness: + +```bash +scripts/consolidation-harness.sh +``` + +What it does: + +- Creates a dedicated database (default: `elf_consolidation`) and Qdrant collection. +- Ingests notes using a shared `key` (`incident_merge_protocol`) to create duplicate legacy notes, then ingests a + consolidated canonical note with the same key. +- Waits for ingestion/deindexing with the worker outbox lifecycle. +- Runs `elf-eval` twice with `expected_keys`: + - `tmp/elf.consolidation.out.base.json` (before consolidation). + - `tmp/elf.consolidation.out.after.json` (after consolidation). +- Prints baseline/after recall and retrieved key signals to stdout. +- Cleans up database and collection by default. + +Prerequisites: + +- Postgres and Qdrant are reachable, and local service binds are available. +- Environment variables are set (or `.env` loaded): + - `ELF_PG_DSN` (base DSN, typically ending in `/postgres`) + - `ELF_QDRANT_HTTP_URL` (for example `http://127.0.0.1:51889`) + - `ELF_QDRANT_GRPC_URL` (for example `http://127.0.0.1:51890`) + +Optional controls: + +- `ELF_HARNESS_KEEP_DB=1`: keep the created database after run. +- `ELF_HARNESS_KEEP_COLLECTION=1`: keep the created Qdrant collection after run. +- `ELF_HARNESS_DB_NAME`, `ELF_HARNESS_COLLECTION`, `ELF_HARNESS_RUN_ID`: override generated values. +- `ELF_HARNESS_TOP_K`, `ELF_HARNESS_CANDIDATE_K`: override retrieval cutoffs. +- `ELF_HARNESS_VECTOR_DIM`: override vector dimension used by generated config. + +## Nightly Harness Signals + +CI also runs the harness scripts on a schedule and uploads the JSON outputs and logs as artifacts. + +Rationale: + +- The trace regression gate is a deterministic merge gate for ranking-policy changes. +- The harness scripts cover integration surfaces (Postgres + Qdrant + worker/api orchestration) and are better + suited to a scheduled job than a per-PR gate. + +Configuration: + +- Control rerank noise with `ELF_HARNESS_NOISE_STD`. +- Control stability sampling with `ELF_HARNESS_RUNS_PER_QUERY`. +- Control ranking cutoffs with `ELF_HARNESS_TOP_K` and `ELF_HARNESS_CANDIDATE_K`. + +Configuration: + +- Override the database name with `ELF_HARNESS_DB_NAME`. +- Override the run identifier with `ELF_HARNESS_RUN_ID`. +- Override the collection name with `ELF_HARNESS_COLLECTION` (must start with `elf_harness_`). +- Override the API binds with `ELF_HARNESS_HTTP_BIND`, `ELF_HARNESS_ADMIN_BIND`, + and `ELF_HARNESS_MCP_BIND`. diff --git a/docs/guide/getting_started.md b/docs/guide/getting_started.md new file mode 100644 index 00000000..f5ede104 --- /dev/null +++ b/docs/guide/getting_started.md @@ -0,0 +1,174 @@ +# Getting Started + +Goal: Provide the canonical setup and local run flow for ELF. +Read this when: You are bootstrapping a local ELF environment or resetting a broken one. +Inputs: This repository checkout, Docker Compose for local dependencies, and optional provider credentials. +Depends on: `Makefile.toml`, `docker-compose.yml`, `config/local/elf.docker.toml`, `elf.example.toml`, and the relevant service binaries. +Verification: Configuration is in place and the local ELF stack can start successfully. + +## Prerequisites + +- Docker Compose for the local dependency stack, or separately managed Postgres with `pgvector` and Qdrant. +- Rust toolchain from `rust-toolchain.toml`. +- Provider endpoints only when you are testing provider-backed embeddings, rerank, query expansion, or `add_event`. + +## 1. Start local dependencies + +Validate and start the local Postgres and Qdrant services. +The checked-in Compose file is local-development-only: + +- Postgres: `127.0.0.1:51888`, database `elf_local`, user `elf_dev`, password `elf_dev_password`. +- Qdrant REST: `127.0.0.1:51889`. +- Qdrant gRPC: `127.0.0.1:51890`. +- Data lives in Docker volumes `elf-postgres-data` and `elf-qdrant-data`. + +```sh +docker compose -f docker-compose.yml config >/dev/null +docker compose -f docker-compose.yml up -d postgres qdrant +docker compose -f docker-compose.yml ps +``` + +## 2. Choose config + +For local dependency smoke tests, use the checked-in Docker config directly: + +```sh +config/local/elf.docker.toml +``` + +This config is strict-valid, binds only to loopback, uses the local deterministic embedding and rerank providers, disables LLM query expansion, and contains only placeholder provider keys. Do not use `add_event` with this config until you replace `[providers.llm_extractor]` with a real local or external extractor. + +For provider-backed development, copy `elf.example.toml` to `elf.toml`, then set provider and storage values. + +```sh +cp elf.example.toml elf.toml +``` + +Reference: + +- Full configuration contract: `docs/spec/system_elf_memory_service_v2.md`. + +## 3. Start services + +Run each service in its own terminal from the repository root. +`elf-api` and `elf-worker` auto-create the Postgres schema, the Qdrant memory/docs collections, and docs payload indexes during startup. + +```sh +cargo run -p elf-api -- -c config/local/elf.docker.toml +``` + +```sh +cargo run -p elf-worker -- -c config/local/elf.docker.toml +``` + +Optional MCP server: + +```sh +cargo run -p elf-mcp -- -c config/local/elf.docker.toml +``` + +If you are using `elf.toml` instead, replace `config/local/elf.docker.toml` with `elf.toml`. + +## 4. Inspect API contract + +After `elf-api` starts, the API process serves: + +- `GET /openapi.json` for the generated OpenAPI contract. +- `GET /docs` for the Scalar API reference UI. +- `GET /viewer` on the admin bind for the local read-only search, note, and trace viewer. + +Use the host and port from `service.http_bind` in your config. +For example: + +```sh +curl -fsS http://127.0.0.1:51892/openapi.json +open http://127.0.0.1:51892/docs +``` + +Use the host and port from `service.admin_bind` for the viewer. +For the checked-in local config: + +```sh +open http://127.0.0.1:51891/viewer +``` + +## 5. Smoke the local stack + +```sh +curl -fsS http://127.0.0.1:51892/health +``` + +Run a deterministic `add_note` smoke that does not call any LLM provider: + +```sh +curl -fsS -X POST http://127.0.0.1:51892/v2/notes/ingest \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -d '{ + "scope": "agent_private", + "notes": [ + { + "type": "fact", + "key": "local_compose_stack", + "text": "The local ELF development stack runs Postgres with pgvector and Qdrant through Docker Compose.", + "importance": 0.7, + "confidence": 0.9, + "ttl_days": 14, + "source_ref": {"schema": "local_smoke/v1", "ref": {"command": "docs/guide/getting_started.md"}} + } + ] + }' +``` + +## 6. Run retrieval evaluation + +Use `elf-eval` with your dataset. + +```sh +cargo run -p elf-eval -- -c elf.toml -i path/to/eval.json +``` + +For dataset format and metric details, see `docs/guide/evaluation.md`. + +## 7. Run local checks + +With the Compose dependencies running, the context misranking harness can use the same local dependency ports: + +```sh +ELF_PG_DSN="postgres://elf_dev:elf_dev_password@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ +ELF_HARNESS_VECTOR_DIM=256 \ +cargo make test-e2e +``` + +## 8. Development workflow + +Use `cargo make` tasks from repository root. + +```sh +cargo make fmt +cargo make check +cargo make test-rust +cargo make test-rust-integration +cargo make test-e2e +``` + +Notes: + +- `cargo make test-rust-integration` runs ignored tests that require external Postgres and Qdrant. + Set `ELF_PG_DSN` and `ELF_QDRANT_GRPC_URL`. +- `cargo make test-e2e` runs the context misranking harness. + Set `ELF_PG_DSN`, `ELF_QDRANT_GRPC_URL`, and `ELF_QDRANT_HTTP_URL`. +- Stop local dependencies with `docker compose -f docker-compose.yml down`. + Add `-v` only when you intentionally want to delete the local development volumes. + +## Related guides + +- Evaluation: `docs/guide/evaluation.md` +- Integration testing: `docs/guide/integration-testing.md` +- Single-user production: `docs/guide/single_user_production.md` +- Test taxonomy: `docs/guide/testing.md` +- Agent setup: `docs/guide/agent-setup.md` diff --git a/docs/guide/index.md b/docs/guide/index.md index 20f3e227..bbeeec91 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -1,22 +1,73 @@ # Guide Index -Purpose: Provide the entry point for operational guidance and runbooks. +Goal: Route agents to procedural documents that tell them how to execute work safely and +repeatably. +Read this when: You know the question is operational and need the best execution path. +Inputs: The current task shape, subsystem, and whether you need background research. +Depends on: `docs/index.md` and `docs/governance.md`. +Outputs: The smallest guide or guide subfolder needed to continue execution. -## Start here +Question this index answers: "what should I do?" -- `AGENTS.md` for automated agent rules and tooling constraints. -- `docs/spec/index.md` for the normative system specifications and contracts. -- `docs/governance.md` for documentation structure, ownership, and update rules. +## Use this index when -## Guide sections +- You need a runbook, how-to, migration sequence, validation flow, troubleshooting + path, or maintenance procedure. +- You already know the relevant spec and need the operational steps. +- You need a bounded sequence with prerequisites and verification. +- You need external comparisons or research notes that inform an implementation choice. -### Development +## Do not use this index when -- `docs/guide/development/languages/index.md` — Language- and stack-specific development rules. -- `docs/guide/development/languages/rust.md` — Rust development and style rules for this repository. -- `docs/guide/development/dependency_upgrade_workflow.md` — Dependency upgrade workflow and versioning policy. +- You need the authoritative contract, schema, or invariant. +- You need a planning-tool artifact or a saved execution plan under `docs/plans/`. +- You need broad documentation policy or repo task-entrypoint rules; read + `docs/governance.md` or `Makefile.toml` instead. -### Evaluation +## What belongs in `docs/guide/` -- `docs/guide/evaluation.md` — Retrieval evaluation harness and dataset format. -- `docs/guide/integration-testing.md` — End-to-end memory retrieval integration testing. +- Task-oriented runbooks. +- Validation and test procedures. +- Migration, rollout, rollback, and recovery sequences. +- Troubleshooting flows and operator checklists. +- Short implementation recipes that depend on a governing spec. +- Decision-support research and external comparisons that inform implementation choices. + +## Guide document contract + +Start each guide with a compact routing header: + +- `Goal` +- `Read this when` +- `Inputs` or `Preconditions` +- `Depends on` +- `Outputs` or `Verification` + +Then structure the body for execution: + +- Write steps in the order an agent should perform them. +- Keep commands, checks, and rollback points explicit. +- Link to specs for normative truth instead of restating contracts. +- Include failure branches only when they change the next action. +- End with verification so an agent can tell whether the guide succeeded. + +## Structure policy + +- Group guides by workflow or subsystem only when multiple guides exist and the grouping + improves retrieval. +- Do not create empty category folders or placeholder section headings. +- Prefer titles that encode the task or outcome, such as `validate_release.md` or + `rerun_ingest_job.md`. +- Keep the guide index as a router, not a dumping ground for long explanations. + +## Guide subfolders + +- `docs/guide/single_user_production.md` for the single-user production runbook, + backup/restore path, migration checks, and Qdrant rebuild proof. +- `docs/guide/benchmarking/` for live benchmark runbooks, report publication steps, + and checked-in benchmark evidence. +- `docs/guide/competitive_parity_testing.md` for running the Docker-only adoption + gate against external memory-system baselines. +- `docs/guide/development/` for repository-development workflows. +- `docs/guide/research/` for external comparisons and decision-support materials that are + non-normative. diff --git a/docs/guide/integration-testing.md b/docs/guide/integration-testing.md index d81efac2..336715f9 100644 --- a/docs/guide/integration-testing.md +++ b/docs/guide/integration-testing.md @@ -1,19 +1,53 @@ # Integration Testing (Memory Retrieval) -Purpose: Provide a repeatable end-to-end test for memory ingestion, indexing, and retrieval. +Goal: Provide a repeatable E2E test for memory ingestion, indexing, and retrieval. +Read this when: You need to validate retrieval behavior after changing ingestion, ranking, or storage logic. +Inputs: External Postgres and Qdrant services plus the repository test commands. +Depends on: `docs/guide/testing.md` and `Makefile.toml`. +Verification: The integration or E2E commands complete without regressions. + +Name: This flow is the E2E test in `docs/guide/testing.md`. ## When to use - After adding or changing memory ingestion, ranking, or storage behavior. - Before shipping changes that affect retrieval quality or service wiring. +## Fast path (automated) + +Run the ignored integration suite (requires external Postgres and Qdrant): + +```bash +ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +cargo make test-rust-integration +``` + +Run the context misranking harness (creates and drops a dedicated database and collection): + +```bash +ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +ELF_QDRANT_HTTP_URL="http://127.0.0.1:51889" \ +cargo make test-e2e +``` + +CI also runs this harness as a required check for code changes (see `.github/workflows/e2e.yml`). + +Note: The harness builds binaries first and then starts `elf-worker` and `elf-api` by executing the +compiled artifacts under `target/debug/`. This avoids slow startup and Cargo lock contention that can +happen when running multiple `cargo run` processes concurrently. + ## Preconditions - Postgres is running and reachable. - Qdrant is running and reachable. - You have a config file with valid storage and provider settings. +- You can create and drop databases on your Postgres instance. -Note: Use the existing collection configured in your `elf.toml`. Do not create a new collection for this flow. Keep test data isolated by tenant, project, and agent identifiers, then clean it up after the run. +Note: The automated harness creates a dedicated Qdrant collection per run and deletes it on exit. The ignored integration suite uses per-test collections and cleans them up during teardown. +Note: Qdrant exposes a REST API (default: 6333) and a gRPC API (default: 6334). The `storage.qdrant.url` field is the gRPC base URL. In this repository's local setup, REST is commonly mapped to port 51889 and gRPC to port 51890. +Note: The local Postgres instance in this repository typically runs on port `51888`. Adjust the DSN if your setup differs. ## Step 1: Prepare a dedicated integration config @@ -21,21 +55,22 @@ Create a dedicated config file for integration tests (for example, `tmp/elf.inte ```toml [service] -admin_bind = "127.0.0.1:8090" -http_bind = "127.0.0.1:8089" +admin_bind = "127.0.0.1:51891" +http_bind = "127.0.0.1:51892" +mcp_bind = "127.0.0.1:51893" log_level = "info" [storage.postgres] -dsn = "postgres://postgres:postgres@127.0.0.1:5432/elf" +dsn = "postgres://postgres:postgres@127.0.0.1:51888/elf_e2e" pool_max_conns = 10 [storage.qdrant] -collection = "mem_notes_v1" -url = "http://127.0.0.1:6334" +collection = "mem_notes_v2" +url = "http://127.0.0.1:51890" vector_dim = 4096 [providers.embedding] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" model = "embedding-model" path = "/embeddings" @@ -46,7 +81,7 @@ timeout_ms = 20000 default_headers = {} [providers.rerank] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" model = "rerank-model" path = "/rerank" @@ -56,7 +91,7 @@ timeout_ms = 20000 default_headers = {} [providers.llm_extractor] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" model = "llm-model" path = "/chat/completions" @@ -133,7 +168,7 @@ evidence_max_quote_chars = 320 evidence_max_quotes = 2 evidence_min_quotes = 1 redact_secrets_on_write = true -reject_cjk = true +reject_non_english = true ``` ## Step 2: Start the worker and API @@ -141,26 +176,31 @@ reject_cjk = true From the repository root: ```bash -cargo run -p elf-worker -- --config tmp/elf.integration.toml +cargo run -p elf-worker -- -c tmp/elf.integration.toml ``` In a second terminal: ```bash -cargo run -p elf-api -- --config tmp/elf.integration.toml +cargo run -p elf-api -- -c tmp/elf.integration.toml ``` +Note: If you see long "waiting for file lock" messages or slow startup, build once and run the +binaries directly: +`cargo build -p elf-worker -p elf-api`, then `target/debug/elf-worker -c ...` and +`target/debug/elf-api -c ...`. + ## Step 3: Add test notes Use a dedicated tenant, project, and agent to isolate test data. ```bash -curl -sS http://127.0.0.1:8089/v1/memory/add_note \ +curl -sS http://127.0.0.1:51892/v2/notes/ingest \ -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: it-tenant' \ + -H 'X-ELF-Project-Id: it-project' \ + -H 'X-ELF-Agent-Id: it-agent' \ -d '{ - "tenant_id": "it-tenant", - "project_id": "it-project", - "agent_id": "it-agent", "scope": "project_shared", "notes": [ { @@ -187,7 +227,7 @@ curl -sS http://127.0.0.1:8089/v1/memory/add_note \ Record the returned `note_id` values from `results[].note_id`. These are required for the evaluation dataset and cleanup. -Note: Requests reject CJK content. Use English-only text and keys. +Note: Requests reject non-English content. Use English-only text and keys. ## Step 4: Create the evaluation dataset @@ -222,7 +262,7 @@ Create `tmp/eval.json` with expected note IDs from the add-note call. ## Step 5: Run the evaluation ```bash -cargo run -p elf-eval -- --config tmp/elf.integration.toml --dataset tmp/eval.json +cargo run -p elf-eval -- -c tmp/elf.integration.toml --dataset tmp/eval.json ``` Review the JSON output for recall, precision, and latency metrics. @@ -244,26 +284,31 @@ Recommended (quality signal): Use the returned note IDs from Step 3. ```bash -curl -sS http://127.0.0.1:8089/v1/memory/delete \ - -H 'content-type: application/json' \ - -d '{ - "tenant_id": "it-tenant", - "project_id": "it-project", - "agent_id": "it-agent", - "note_id": "NOTE_ID_1" - }' - -curl -sS http://127.0.0.1:8089/v1/memory/delete \ - -H 'content-type: application/json' \ - -d '{ - "tenant_id": "it-tenant", - "project_id": "it-project", - "agent_id": "it-agent", - "note_id": "NOTE_ID_2" - }' +curl -sS -X DELETE http://127.0.0.1:51892/v2/notes/NOTE_ID_1 \ + -H 'X-ELF-Tenant-Id: it-tenant' \ + -H 'X-ELF-Project-Id: it-project' \ + -H 'X-ELF-Agent-Id: it-agent' + +curl -sS -X DELETE http://127.0.0.1:51892/v2/notes/NOTE_ID_2 \ + -H 'X-ELF-Tenant-Id: it-tenant' \ + -H 'X-ELF-Project-Id: it-project' \ + -H 'X-ELF-Agent-Id: it-agent' ``` ## Troubleshooting - If results do not appear immediately, wait a few seconds for the outbox worker to index, then re-run the evaluation. - If Qdrant connectivity warnings appear, verify the configured `storage.qdrant.url` and that the service is reachable. + +## Integration test scheduling decision for Doc v1 acceptance checks + +The Doc v1 acceptance coverage in `packages/elf-service/tests/acceptance/docs_extension_v1.rs` +(filter behavior, source_ref non-English boundary, and Qdrant payload-index assertions) remains +`#[ignore]` by design and is not enabled in default CI because it requires external PostgreSQL/Qdrant +services and acceptance-style provisioning. Run it intentionally with: + +```bash +ELF_PG_DSN="postgres://postgres:postgres@127.0.0.1:51888/postgres" \ +ELF_QDRANT_GRPC_URL="http://127.0.0.1:51890" \ +cargo test -p elf-service --test acceptance -- docs_extension_v1 --ignored +``` diff --git a/docs/guide/observability.md b/docs/guide/observability.md new file mode 100644 index 00000000..d0bfccfb --- /dev/null +++ b/docs/guide/observability.md @@ -0,0 +1,79 @@ +# Observability and Correlation (MCP + Admin API) + +Goal: Provide a practical traceability workflow for agents and operators. +Read this when: You need to correlate requests, traces, tool calls, and admin inspection surfaces. +Inputs: Running `elf-mcp` and `elf-api` instances plus request identifiers or trace IDs. +Depends on: Admin API support and the relevant trace/provenance contracts. +Outputs: A correlated request trail that links surface-level behavior back to stored trace data. + +## 1) Request correlation + +Every ELF response returns: + +- `X-ELF-Request-Id` response header. +- `request_id` field in JSON responses. + +In `elf-mcp`, each tool call carries `X-ELF-Request-Id` automatically: + +- `X-ELF-Request-Id` is generated per call. +- The same value is available in the tool response body as `request_id` (when JSON). + +Correlation workflow: + +1. Capture `request_id` from the JSON response (or header if present). +2. Use the same identifier for logs, incident notes, and trace lookups. + +## 2) Admin provenance lookup + +For a note-level traceability trail: + +- MCP tool: `elf_admin_note_provenance_get` + - `{"note_id": "<uuid>"}` +- Equivalent HTTP endpoint: + - `GET /v2/admin/notes/{note_id}/provenance` + - Schema: `elf.note_provenance_bundle/v1` +- Memory history readback: + - MCP tool: `elf_admin_memory_history_get` + - `GET /v2/admin/notes/{note_id}/history` + - Schema: `elf.memory_history/v1` + +Returned bundle sections: + +- `note` +- `ingest_decisions` +- `note_versions` +- `history` +- `indexing_outbox` +- `recent_traces` + +Use this bundle to answer: + +- Why a note exists or changed. +- Whether indexing/outbox is stalled. +- Which recent searches touched the note. + +## 3) Worker traceability fields + +For background job diagnostics, filter worker logs with these fields: + +- `outbox_id` (indexing/doc indexing/trace outbox jobs). +- `note_id` (note indexing jobs). +- `doc_id`, `chunk_id` (doc indexing jobs). +- `trace_id` (search trace outbox jobs). + +Recommended loop: + +1. Start from a user-facing error `trace_id` or note `note_id`. +2. Query `elf_admin_trace_*` family to inspect trajectory and trace items. +3. Use `elf_admin_note_provenance_get` to connect trace history with ingest and indexing state. +4. Use `elf_admin_memory_history_get` when you only need chronological memory evolution events. + +## 4) MCP admin/debug surface map + +- `elf_admin_traces_recent_list` -> `GET /v2/admin/traces/recent` +- `elf_admin_trace_get` -> `GET /v2/admin/traces/{trace_id}` +- `elf_admin_trajectory_get` -> `GET /v2/admin/trajectories/{trace_id}` +- `elf_admin_trace_item_get` -> `GET /v2/admin/trace-items/{item_id}` +- `elf_admin_trace_bundle_get` -> `GET /v2/admin/traces/{trace_id}/bundle` +- `elf_admin_note_provenance_get` -> `GET /v2/admin/notes/{note_id}/provenance` +- `elf_admin_memory_history_get` -> `GET /v2/admin/notes/{note_id}/history` diff --git a/docs/guide/research/agentmemory_adapter.md b/docs/guide/research/agentmemory_adapter.md new file mode 100644 index 00000000..65d51662 --- /dev/null +++ b/docs/guide/research/agentmemory_adapter.md @@ -0,0 +1,175 @@ +# Agentmemory Fixture Adapter + +Goal: Convert sanitized agentmemory-style session exports into ELF-owned note/doc +candidates and retrieval baseline records. +Read this when: You need to compare coding-agent memory capture against ELF without +running an agentmemory server or bypassing ELF ingestion. +Inputs: A local JSON fixture with agentmemory-style sessions, observations, memories, +and retrieval cases. +Depends on: `elf-eval`, `docs/research/2026-06-08-agent-memory-selection.json`, +`docs/spec/system_elf_memory_service_v2.md`, `docs/spec/system_doc_source_ref_v1.md`, +and `docs/spec/system_source_ref_doc_pointer_v1.md`. +Outputs: A deterministic `elf.agentmemory_adapter/v1` JSON bundle with note candidates, +doc candidates, baseline queries, and ignored-item reasons. + +## Boundary + +The adapter is an offline comparison/import boundary, not an ingestion path. +It does not call agentmemory, ELF HTTP APIs, providers, Postgres, Qdrant, or any LLM. +It only rewrites a sanitized fixture into records that can later be reviewed, grouped, +and submitted through normal ELF endpoints. + +Use this boundary when the question is: + +- Which agentmemory memories are plausible ELF note candidates? +- Which raw observations should be retained as document evidence? +- Which retrieval cases can become ELF evaluation datasets after candidate notes are + ingested through `/v2/notes/ingest`? + +Do not use it to claim that ELF reproduces agentmemory benchmark numbers. Fixture +retrieval cases preserve agentmemory result ranks and scores as external baseline +metadata only. + +## Command + +Run the adapter through `cargo run`: + +```sh +cargo run -p elf-eval --bin agentmemory_fixture_adapter -- \ + --fixture apps/elf-eval/fixtures/agentmemory/sample_session.json \ + --out tmp/agentmemory-adapter.json +``` + +Optional flags: + +- `--scope`: ELF write scope attached to emitted note and doc candidates. Defaults to + `agent_private`. +- `--max-note-chars`: maximum accepted note length before a memory is reported as + ignored. Defaults to `240`, matching the canonical local config limit. + +## Fixture Shape + +The fixture is intentionally small and producer-owned. It should use this shape: + +```json +{ + "schema": "agentmemory.fixture/v1", + "fixture_id": "agentmemory-sample-2026-06-08", + "source": { + "system": "agentmemory", + "version": "v0.9.27", + "export_id": "agentmemory-export-sample", + "exported_at": "2026-06-08T06:30:00Z" + }, + "sessions": [ + { + "session_id": "am-session-2026-06-08", + "agent": "codex", + "project": "ELF", + "started_at": "2026-06-08T05:45:00Z", + "observations": [], + "memories": [], + "retrieval_cases": [] + } + ] +} +``` + +The checked-in sample fixture is sanitized and exists only to exercise the mapping. +External exports must be reviewed for secrets and sensitive content before being +committed or shared. + +## Mapping + +Agentmemory memories become `note_candidates` only when all of these are true: + +- `kind` maps directly to one ELF note type: `preference`, `constraint`, `decision`, + `profile`, `fact`, or `plan`. +- `text` is non-empty and does not exceed `--max-note-chars`. +- `importance` and `confidence`, when present, are finite values in `0.0..=1.0`. + +The emitted `notes_ingest_item` is shaped like a single `/v2/notes/ingest` note item. +It includes a `source_ref/v1` envelope with `resolver = "agentmemory_fixture/v1"` and +stable origin fields: + +- fixture id +- session id +- memory id +- source observation ids +- source system/version +- export, session, and memory timestamps + +The adapter does not infer missing ELF note types, does not truncate text, and does not +rewrite memory text into a canonical note sentence. + +Agentmemory observations become `doc_candidates` when they have non-empty text and an +RFC3339 timestamp from the observation, session, or export. The emitted `docs_put` +payload uses: + +- `doc_type = "chat"` +- `source_ref.schema = "doc_source_ref/v1"` +- `thread_id = session_id` +- `message_id = observation_id` +- `role` from the observation role, observation kind, or `observation` + +This keeps raw session evidence separate from authoritative ELF notes. If operators +later ingest docs and want hydrated note evidence, they should attach normal +`elf_doc_ext/v1` doc pointers after `docs_put` returns concrete `doc_id` values. + +Retrieval cases become `baseline_queries` when at least one expected memory id maps to +a note candidate. The baseline record preserves: + +- query id and query text +- expected agentmemory memory ids +- deterministic note candidate ids +- expected note keys, when available +- agentmemory result ranks/scores, when present + +These records are suitable for building an ELF eval dataset after candidate notes are +ingested through ELF policy. They are not benchmark proof on their own. + +## Ignored Items + +The adapter reports ignored items instead of repairing them. Current reasons include: + +- `empty_text` +- `missing_or_invalid_timestamp` +- `note_text_too_long` +- `unsupported_memory_kind` +- `invalid_importance` +- `invalid_confidence` +- `no_mapped_expected_memories` + +Ignored items can still be reviewed manually. Do not force them into ELF notes by +loosening the adapter; either fix the fixture upstream or store long/ambiguous evidence +as docs and use normal ELF extraction/review workflows. + +## Comparing Retrieval Quality + +Use a two-step comparison: + +1. Review the adapter output and ingest selected `notes_ingest_item` records through + `/v2/notes/ingest`, grouped by scope. ELF write policy, English gate, provenance + validation, duplicate/update resolution, and indexing still run normally. +2. Convert selected `baseline_queries` into the `elf-eval` dataset format. Prefer + `expected_keys` when keys were emitted; otherwise resolve ingested note IDs and use + `expected_note_ids`. + +Then run `elf-eval` as usual: + +```sh +cargo run -p elf-eval -- -c ./elf.toml --dataset tmp/agentmemory-eval.json +``` + +For config-to-config comparisons or trace replay, follow `docs/guide/evaluation.md`. + +## Verification + +Run the adapter fixture test without network services: + +```sh +cargo test -p elf-eval --test agentmemory_fixture_adapter +``` + +Before review handoff for changes to this boundary, run the repository gate from +`Makefile.toml`. diff --git a/docs/guide/research/comparison_external_projects.md b/docs/guide/research/comparison_external_projects.md new file mode 100644 index 00000000..42a861f8 --- /dev/null +++ b/docs/guide/research/comparison_external_projects.md @@ -0,0 +1,590 @@ +# External Memory Project Comparison + +Goal: Provide a detailed, evidence-backed comparison between ELF and adjacent memory projects. +Read this when: You are evaluating architecture directions, positioning claims, or adoption trade-offs. +Inputs: Current ELF docs/code and public documentation for the compared external projects. +Depends on: `docs/spec/system_elf_memory_service_v2.md` and `docs/guide/research/research_projects_inventory.md`. +Outputs: A comparison matrix and trade-off summary suitable for follow-up design decisions. + +Scope note: This document is intentionally detailed and source-heavy. Keep `README.md` concise and link here for full analysis. +For a full list of reviewed and pending projects, see `docs/guide/research/research_projects_inventory.md`. +For the June 2026 agentmemory and dreaming decision run, see +`docs/research/2026-06-08-agent-memory-selection.json`. +For the June 2026 real-world benchmark-dimension refresh, see +`docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`. + +Comparison focuses on shared capabilities, ELF distinctives, and objective trade-offs. These projects solve adjacent problems, but their primary storage units and default workflows differ. + +Legend: + +- `✅`: Built-in and explicitly documented. +- `⚠️`: Partial, optional, transport-specific, or plugin-level support. +- `—`: Not explicitly documented in public docs/readme (as of February 17, 2026). + +## Research Method And Confidence + +- This comparison is documentation-grounded, not benchmark-grounded. +- ELF claims are code-grounded against this repository; peer claims are documentation-grounded. +- Primary evidence is limited to official public READMEs and official docs from each project. +- A capability is marked `✅` only when explicitly documented as first-class behavior. +- A capability is marked `⚠️` when it exists but is optional, transport-specific, plugin-scoped, or requires extra configuration. +- A capability is marked `—` when no explicit public documentation was found during this review window. +- Snapshot date for all claims in this section: February 17, 2026. + +Note: In this section, mem0 refers to the Mem0 ecosystem, including OpenMemory (an MCP memory server with a built-in UI). +OpenViking is included as a newly reviewed project with mechanism-level analysis. + +## June 2026 Real-World Benchmark-Dimension Map + +Snapshot date for this subsection: June 9, 2026. + +This map translates the existing external-project research into benchmark dimensions +for the real-world agent memory suite. It does not add new adapter pass/fail evidence. +Use the evidence class before making claims: + +- `benchmark-grounded`: ELF's Docker benchmark has runnable adapter evidence for this + project and dimension. Read the exact report before quoting a pass/fail result. +- `docs-grounded`: official docs or READMEs indicate a likely strength, but ELF has not + reproduced the behavior in the benchmark runner. +- `watch`: the project remains D0 or otherwise pending; do not assign strength claims + until a deep dive or adapter run exists. + +Current benchmark-grounded scope is narrow. The June 9, 2026 all-project smoke run +proved encoded same-corpus/lifecycle behavior only for the then-current adapters: ELF +and qmd passed their encoded smoke checks; agentmemory passed same-corpus retrieval but +failed or could not prove durable lifecycle behavior; memsearch, mem0, OpenViking, and +claude-mem retained `incomplete`, wrong-result, or not-encoded states. Later June 11 +follow-ups promote scoped local mem0/OpenMemory and memsearch baseline paths, while +OpenMemory UI/export, hosted Platform behavior, optional graph memory, broader +memsearch prompt/TTL coverage, OpenViking staged trajectory, and claude-mem hook/viewer +capture remain blocked, unsupported, not encoded, or wrong-result. All broader suite +fit below is research guidance, not a benchmark result. + +The real-world job runner now carries a separate external adapter coverage manifest: +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. +That manifest is a contract and evidence ledger, not a leaderboard. It records which +projects only have `live_baseline_only` Docker retrieval/lifecycle evidence, which +capabilities are `mocked`, `blocked`, `unsupported`, `incomplete`, `wrong_result`, or +`lifecycle_fail`, and which real-world suites remain `not_encoded`. The manifest now +includes full-suite `live_real_world` sweep records for ELF and qmd through +`cargo make real-world-memory-live-adapters`; both retain targeted live pass evidence +for `work_resume`, `retrieval`, and `project_decisions`, but neither is a full-suite +live pass. It also includes `research_gate` records for RAGFlow, LightRAG, GraphRAG, +Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, graphify, and deeper +qmd/OpenViking profiles. Research gates carry source/setup/runtime/resource/retry +metadata for future adapter work, but they are not fixture-backed, live-baseline-only, +or live-real-world evidence. Other external projects remain live-baseline-only, +incomplete, blocked, or not encoded until their own `real_world_job` adapters run. + +XY-882 adds D1/D2 feasibility verdicts for the RAG and graph-memory research gates. +`adapter_candidate` means an implementation follow-up is justified because a scoped +Docker boundary and evidence-linked output contract exist. It does not mean a Docker +adapter has run, and it does not change the `research_gate` evidence class. + +Benchmark suite labels: + +| Suite | Real-world job shape | +| ----- | -------------------- | +| `rw.resume-evidence` | Resume a stalled agent task, recover the right prior decision, cite required evidence, and avoid negative traps. | +| `rw.lifecycle-staleness` | Update, delete, expire, cold-start, and contradiction cases where stale facts must stop winning. | +| `rw.operator-continuity` | Capture session observations, inspect memory state, and support day-to-day agent continuity with low friction. | +| `rw.retrieval-debug` | Explain query expansion, hybrid retrieval, fusion, rerank, and wrong-result causes. | +| `rw.context-trajectory` | Navigate multi-stage or hierarchical context before selecting final evidence. | +| `rw.knowledge-synthesis` | Compile durable project/entity/concept pages from memory and keep them lintable or repairable. | +| `rw.consolidation-review` | Run background consolidation while keeping derived output reviewable and evidence-linked. | +| `rw.graph-temporal` | Track facts, entities, relations, validity windows, and current-versus-historical answers. | +| `rw.core-archival` | Separate always-loaded operating memory from retrieval-only archival memory. | +| `rw.replay-regression` | Replay, fork, or checkpoint agent state to debug memory-assisted work and regression failures. | +| `rw.graph-navigation` | Use graph-compressed corpus structure to guide agents before raw retrieval or file inspection. | + +Project-to-suite map: + +| Project | Best-fit real-world suites | Why this project matters for that suite | Fair adapter evidence before claims | Evidence class and confidence | Current ELF position | +| ------- | -------------------------- | -------------------------------------- | ---------------------------------- | ----------------------------- | -------------------- | +| agentmemory | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent hooks, MCP/REST packaging, viewer, lifecycle/consolidation claims, and coding-agent continuity focus make it the right reference for daily agent memory ergonomics. | Use durable upstream storage rather than the current in-memory mock; ingest realistic agent sessions through the public hook/API path; prove restart, update/supersede, delete, and viewer/trace readback. | Mixed: benchmark-grounded only for current same-corpus retrieval; current lifecycle evidence is a failure/blocker, while hooks/viewer/consolidation are docs-grounded. Confidence: medium for suite fit, low for durable adapter quality. | ELF is stronger on evidence-bound writes and source-of-truth discipline; agentmemory remains the reference for capture breadth and agent-continuity UX. | +| qmd | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Its local CLI, structured JSON query output, expansion modes, hybrid routing, weighted fusion, rerank, update, delete, and cold-start path make it the strongest local retrieval-debug baseline. | Run `qmd` over the real-world corpus, capture query JSON, then rewrite/delete corpus files and rerun update/embed/query in fresh processes. | Benchmark-grounded for current smoke retrieval/update/delete/cold-start pass; docs-grounded for deeper query planning ergonomics. Confidence: high for local adapter baseline. | ELF is not yet stronger on local CLI debug ergonomics; treat qmd as the retrieval-debug reference while keeping ELF's service/provenance model. | +| claude-mem | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive-disclosure search, auto-capture hooks, local viewer, and observation/timeline workflows are directly aligned with real agent resumption jobs. | Exercise a real local repository with hook-driven capture, then evaluate `search -> timeline -> observations` behavior after restart; do not rely on mocked storage. | Docs-grounded for progressive disclosure/viewer; current benchmark adapter evidence is incomplete/wrong-result and mostly not encoded for lifecycle. Confidence: medium for product reference, low for current adapter claims. | ELF has stronger provenance and service boundaries, but claude-mem remains a reference for operator workflow and progressive disclosure UX. | +| mem0 / OpenMemory | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity`, `rw.resume-evidence` | Entity-scoped memory, memory history, expiration, hosted/OSS surfaces, OpenMemory UI, and optional graph memory make it the broadest lifecycle and ecosystem comparison target. | Separate OSS local FastEmbed/Qdrant evidence from hosted Platform claims; prove add/update/delete/history, entity-scoped retrieval, expiration exclusion, OpenMemory UI readback, and optional graph context on the same corpus. | Benchmark-grounded for scoped local OSS same-corpus retrieval, update/delete/reload, history, entity filters, local `get_all` readback, and deletion audit; OpenMemory product UI/export remains blocked, hosted Platform is a non-goal, and optional graph plus broader prompt coverage remain not encoded. Confidence: medium for suite fit and scoped local adapter quality, low for product UI/hosted/graph claims. | ELF is stronger on deterministic evidence-bound writes; mem0/OpenMemory remains the reference for ecosystem reach, entity-scoped history, hosted option, and optional graph UX, with local preference-correction history currently measured as an ELF loss. | +| memsearch | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown as canonical memory plus incremental/content-addressed reindexing is a useful model for source transparency and rebuildable derived indexes. | Index a real-world Markdown corpus, mutate/delete files, rerun index/search from fresh processes, and record Milvus mode so Lite/Server/Cloud behavior is not conflated. | Benchmark-grounded for local same-corpus retrieval, reindex/update/delete, and cold-start reload smoke; no real-world prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence. Confidence: medium for design pattern and scoped local adapter evidence, low for broad real-world adapter coverage. | ELF already owns source-of-truth plus rebuildable index at service level; memsearch remains a reference for simple local canonical-store ergonomics and transparent local reindexing. | +| OpenViking | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | `viking://` context organization, intent analysis, hierarchical retrieval, staged find/search behavior, and session compression are relevant to multi-hop agent context jobs. | Use the pinned Docker local embedding path, then evaluate `add_resource`/`find`/`search` over multi-stage jobs with stage output, hierarchy, and session memory evidence. | Docs-grounded for mechanism; current benchmark adapter reaches local embedding setup and `add_resource`/`find`, but remains `wrong_result` because same-corpus evidence terms are missed. Confidence: medium for architecture reference, low for runnable adapter quality. | ELF has first-class traces and evidence-bound notes, but OpenViking is the reference for hierarchical context trajectory and filesystem-like organization. | +| llm-wiki | `rw.knowledge-synthesis`, `rw.resume-evidence` | Query/save/lint flows and topic-scoped wiki pages are a useful reference for turning retrieved memory into maintained project knowledge. | Run a corpus-to-wiki job, ask resume/decision questions, require page citations back to source memory, then mutate a stale source and prove lint/repair catches it. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for derived-knowledge fit. | ELF is not yet stronger on derived knowledge pages; llm-wiki should inform rebuildable, evidence-cited dossiers rather than core storage. | +| gbrain | `rw.knowledge-synthesis`, `rw.operator-continuity` | `compiled_truth`, timeline sections, backlinks, primary-home routing, and enrichment workflows model a living operational brain for project work. | Build or update pages from the real-world corpus, require current-truth plus timeline answers, and prove enrichment/backlink maintenance does not hide unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for operator knowledge UX. | ELF should keep source notes authoritative; gbrain is a reference for presentation, enrichment, and maintenance loops. | +| Always-On Memory Agent | `rw.consolidation-review`, `rw.operator-continuity` | The file/API/dashboard ingest loop and timer-based consolidation show how background memory formation becomes a user-visible product surface. | Run scheduled consolidation on a fixed corpus, record source rows and output insights, then score whether consolidation is reviewable, repeatable, and bounded against unsupported claims. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for consolidation workflow reference. | ELF should borrow scheduling and operator controls while keeping deterministic writes and reviewable derived outputs. | +| graphify | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Deterministic code extraction, LLM-assisted graph building, honesty tags, graph reports, and assistant hooks are strong references for graph-compressed navigation over large corpora. | Generate graph/report artifacts from the benchmark corpus, require answers to use graph structure plus source evidence, and prove rebuild behavior after corpus edits. | Scored tiny `live_real_world` smoke: `cargo make smoke-graphify-docker-graph-report` records a Docker-only generated-corpus graph/report artifact and currently scores `wrong_result`; the checked-in manifest does not claim broad graph quality, rebuild strength, or production-quality graph navigation. Confidence: medium for adapter feasibility, low for production-quality graph navigation. | ELF is stronger as a memory service; graphify is now a runnable reference for derived graph reports and pre-search guidance, but not yet a stronger end-to-end memory system. | +| Letta | `rw.core-archival`, `rw.operator-continuity` | Core memory blocks, archival memory, and shared/read-only memory blocks map directly to always-loaded operating context versus retrievable memory. | Build a multi-agent job where core blocks must be attached/detached/shared read-only, while archival memory is retrieved separately and audited. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for memory-semantics reference. | ELF has scoped notes but not first-class core/archival block ergonomics; Letta is the reference dimension. | +| LangGraph | `rw.replay-regression`, `rw.resume-evidence` | Thread checkpoints, durable execution, replay, fork, and time travel define a strong model for debugging agent-state and memory-regression behavior. | Run an agent job with memory reads across checkpoints, replay/fork the thread after a stale-memory failure, and verify side-effect boundaries. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for replay workflow reference. | ELF traces are useful but do not replace full agent checkpoint replay; LangGraph is the reference for replay-regression jobs. | +| Graphiti / Zep | `rw.graph-temporal`, `rw.resume-evidence` | Temporal entities, relations, fact triples, validity windows, and graph search directly target stale/contradictory factual memory. | Add fact triples with validity changes, query current and historical answers, and score invalidation/append behavior under contradiction traps. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium-high for temporal-graph dimension. | ELF graph-lite covers evidence-linked validity windows and current/historical relation context; Graphiti/Zep remains the reference for broader temporal graph workflows. | +| nanograph | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema and typed query ergonomics are relevant to making ELF graph-lite interactions inspectable and hard to misuse. | Define typed graph schemas and queries for the same fact set, then score developer-visible validation, query shape, and explainability rather than retrieval quality alone. | Docs-grounded D1; no benchmark adapter evidence. Confidence: medium for DX reference, low for memory-system comparison. | ELF should borrow typed graph ergonomics without treating nanograph as a full memory backend. | + +XY-882 feasibility verdicts for RAG and graph-memory gates: + +| Project | Verdict | Docker boundary | Evidence-linked output contract | Follow-up | +| ------- | ------- | --------------- | ------------------------------- | --------- | +| RAGFlow | `adapter_candidate` | Official Docker Compose path, but the first adapter must use a tiny CPU corpus and record the 4 CPU / 16 GB RAM / 50 GB disk envelope, image size, `vm.max_map_count`, provider needs, and retry behavior. | OpenAI-compatible and agent completion responses can include `reference.chunks` with chunk id, document id/name, metadata, dataset id, positions, and similarity fields. | [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter); no live pass claim. | +| LightRAG | `adapter_candidate` | Docker Compose server with explicit LLM, embedding, rerank, storage, workspace, and data-volume configuration. | Context-only query modes can return the context prepared for the LLM; core APIs can insert documents with ids and source file paths. | [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter); no live pass claim. | +| GraphRAG | `adapter_candidate` | Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts. | Output tables contain generated UUIDs, human-readable ids, source documents, text units, community reports, and text-unit links for graph summaries and relationships. | [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter); no live pass claim. | +| Graphiti / Zep | `adapter_candidate` | Docker-local FalkorDB or Neo4j plus Python SDK runner with provider config captured under benchmark artifacts. | Search results and fact triples expose UUIDs, fact text, and validity windows (`valid_at` / `invalid_at`) that map to memory-evolution scoring. | [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter); no live pass claim. | +| graphify | `adapter_candidate` | Docker-only CLI/materializer using `pip install graphifyy` over a mounted corpus; host-global assistant hooks are out of scope. | `graph.json`, `GRAPH_REPORT.md`, and graph query output include edge types, confidence tags, source files, and source locations. | [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter) adds `cargo make smoke-graphify-docker-graph-report`; XY-900 promotes the tiny generated smoke to scored `live_real_world` `wrong_result` evidence while still avoiding broad quality claims. | +| Letta | `research_only` | Docker server exists, but current docs require explicit embedding configuration and steer Letta Code evaluation toward non-Docker local/frontier-model exploration. | Core/archival memory and shared blocks remain useful semantics, but no contained evidence export is selected for this adapter batch. | No implementation issue. | +| LangGraph | `research_only` | A Docker harness is possible, but the project is an agent-state/checkpoint framework rather than a standalone memory adapter. | Store search and checkpoints are references for replay-regression jobs, not a direct external memory output contract here. | No implementation issue. | +| nanograph | `research_only` | Official positioning is one CLI / one folder / no server / no Docker. | Typed schema, query, CDC, and search ergonomics remain graph-lite DX inspiration. | No implementation issue. | +| llm-wiki | `research_only` | Plugin or instruction-file workflow would require a contained harness before scoring; host-global plugin installs are not proof. | Wiki compile/query/lint/audit workflows are derived-knowledge references, not current adapter outputs. | No implementation issue. | +| gbrain | `blocked` | A Docker-local brain repo and database setup path was not proven in this lane. | Compiled truth, timeline, and source attribution are strong, but not enough for implementation without contained setup proof. | No implementation issue until Docker setup is proven. | + +## Where ELF Is Not Yet The Reference + +| Benchmark dimension | Current reference project(s) | ELF gap to test before claiming strength | +| ------------------- | ---------------------------- | ---------------------------------------- | +| Local retrieval debugging and CLI transparency | qmd | ELF needs equally fast local knobs/readback for expansion, hybrid fusion, rerank, and wrong-result diagnosis. | +| Turn-by-turn agent capture and daily continuity | agentmemory, claude-mem, OpenMemory | ELF has service and viewer surfaces, but not the same turnkey hook breadth or session-continuity product ergonomics. | +| Progressive disclosure UX | claude-mem, OpenViking | ELF has L0/L1/L2 shaping and traces, but the operator workflow still needs better search-session navigation. | +| Entity-scoped history and managed ecosystem reach | mem0/OpenMemory | ELF has ingest decisions and versions, but not the same hosted option, SDK reach, or first-class memory history surface. | +| Core memory versus archival memory | Letta | ELF scopes notes well, but lacks attachable/read-only core memory blocks as a distinct user-facing layer. | +| Temporal graph validity | Graphiti/Zep | ELF graph-lite now persists validity windows and labels current versus historical relation context, while Graphiti/Zep remains the broader reference for temporal graph workflows. | +| Agent replay and forkable regression debugging | LangGraph | ELF traces are replay evidence for retrieval, not full persisted agent-state replay with side-effect boundaries. | +| Derived knowledge pages and lint/repair loops | llm-wiki, gbrain | ELF does not yet ship rebuildable entity/project pages with unsupported-claim lint as a first-class workflow. | +| Scheduled consolidation as a product surface | Always-On Memory Agent | ELF's target should be reviewable derived consolidation, but the scheduling/operator-control workflow is not implemented. | +| Graph-compressed navigation over large corpora | graphify, GraphRAG/LightRAG adapter candidates | ELF relation context is bounded and evidence-linked, but broader graph report/navigation workflows remain future work. | + +## June 2026 Agentmemory And Dreaming Refresh + +Snapshot date for this subsection: June 8, 2026. + +This refresh re-evaluates ELF after the June 2026 hardening work and after the +appearance of [agentmemory](https://github.com/rohitg00/agentmemory) as a high-velocity +coding-agent memory project. It also records the current vendor direction around +dreaming-style background memory consolidation. + +### Current ELF Position + +ELF remains strongest as a high-trust memory service rather than a turnkey coding-agent +continuity plugin. The current main branch has: + +- evidence-linked fact writes and quote-bound provenance; +- deterministic `add_note` separated from LLM-driven `add_event`; +- Postgres as source of truth and Qdrant as a rebuildable derived index; +- scoped HTTP/MCP service semantics, TTL/lifecycle policy, graph-lite relation context, + and retrieval evaluation tooling; +- recently restored local gates, stricter config presence, generated OpenAPI/Scalar docs, + and Docker Compose service dependencies. + +### agentmemory + +agentmemory is now important enough to track as a first-class comparison target. Its +public README advertises cross-agent support for Claude Code, Codex CLI, Cursor, Gemini +CLI, OpenCode, and generic MCP clients; MCP/REST access; hook-based capture; hybrid +BM25/vector/graph retrieval; consolidation/lifecycle behavior; a local viewer on `:3113`; +and iii console observability for traces, KV state, triggers, queues, and streams. Its +roadmap still lists benchmark CI, session replay UI, governance baseline, enterprise trust +features, and a v1.0 stability freeze as future work. + +ELF implication: do not replace ELF with agentmemory. Treat it as: + +- an optional capture/import adapter for coding-agent session observations; +- a benchmark and UX baseline for local continuity workflows; +- a source of product ideas around hooks, viewer, replay, audit, and tool breadth. + +### Dreaming And Background Consolidation + +OpenAI frames dreaming as background curation that synthesizes memory state, applies +preferences, and keeps memory current over time. Anthropic Claude Dreams is the strongest +safety reference: a dream reads an input memory store plus 1-100 sessions, produces a +separate output memory store, never modifies the input store, and leaves the output +reviewable, attachable, discardable, archivable, or deletable. Google examples add two +operator patterns: Always-On Memory Agent runs scheduled consolidation, while Gemini CLI +Auto Memory mines idle transcripts but writes reviewable patches and skill drafts to an +inbox before anything is applied. + +ELF implication: dreaming should be a reviewed derived layer over authoritative evidence, +not a destructive rewrite path. The target shape is: + +- immutable observations, notes, events, traces, and source pointers as input; +- asynchronous consolidation jobs that produce candidate derived memories, pages, graph + views, or skills; +- explicit lineage, diff, confidence, contradiction/staleness markers, and review/apply + controls; +- rebuildable outputs that can be discarded without corrupting source-of-truth memory. + +### Current Recommendation + +Continue building ELF. Do not directly adopt agentmemory or managed dreaming as the core +backend. The next work should prioritize: + +1. a reviewable derived consolidation pipeline; +2. read-only viewer plus retrieval/consolidation observability; +3. optional agentmemory import/baseline adapter; +4. graph-lite typed query and derived knowledge pages with provenance/lint. + +This ordering reuses the existing vNext planning surface instead of starting a parallel +roadmap: [XY-286](https://linear.app/hack-ink/issue/XY-286/knowledge-memory-derived-entityconceptproject-pages-with-provenance), +[XY-19](https://linear.app/hack-ink/issue/XY-19/add-a-read-only-web-viewer-for-sessions-and-traces), +[XY-27](https://linear.app/hack-ink/issue/XY-27/viewer-add-retrieval-observability-panels-on-top-of-the-read-only), +and [XY-70](https://linear.app/hack-ink/issue/XY-70/graph-lite-dx-typed-schema-typed-query-nanograph-inspired) +remain the right backbone. + +Primary sources for this refresh: + +- https://github.com/rohitg00/agentmemory +- https://raw.githubusercontent.com/rohitg00/agentmemory/main/ROADMAP.md +- https://openai.com/index/chatgpt-memory-dreaming/ +- https://platform.claude.com/docs/en/managed-agents/dreams +- https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent +- https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md + +## Scope And Intended Use + +| Aspect | ELF | [memsearch](https://github.com/zilliztech/memsearch) | [qmd](https://github.com/tobi/qmd) | [claude-mem](https://github.com/thedotmack/claude-mem) | [mem0](https://github.com/mem0ai/mem0) | +| ------------------ | ----------------------------------------------------- | ---------------------------------------------------- | ---------------------------------- | ------------------------------------------------------ | -------------------------------------- | +| Primary artifact | Evidence-bound notes | Markdown memory files + Milvus index | Local Markdown index (chunks) | Session observations and summaries | User, session, and agent memories | +| Default write path | HTTP `POST /v2/notes/ingest` / `POST /v2/events/ingest` | CLI hooks + Python API (Markdown-first) | CLI index + search | Auto-capture via Claude Code plugin hooks | SDK/API (LLM-assisted) | +| Default deployment | API + worker + MCP server | Local package + Milvus (Lite/Server/Cloud) + plugin | Local CLI + MCP server | Local plugin + worker + UI + MCP tools | SDK + hosted option; OpenMemory MCP server + UI | + +## Interfaces And Integration + +| Capability | ELF | memsearch | qmd | claude-mem | mem0 | +| ------------------------------- | --- | --------- | --- | ---------- | ---- | +| Local-first, self-hosted memory | ✅ | ✅ | ✅ | ✅ | ✅ (OpenMemory) | +| MCP integration | ✅ | ⚠️ | ✅ | ✅ | ✅ (OpenMemory) | +| HTTP API service | ✅ | — | ⚠️ | ✅ | ✅ (SDK/API) | +| CLI-first workflow | — | ✅ | ✅ | ⚠️ | — | +| Web UI viewer | — | — | — | ✅ | ✅ (OpenMemory) | +| Hosted option | — | — | — | — | ✅ | + +## Retrieval Pipeline + +| Capability | ELF | memsearch | qmd | claude-mem | mem0 | +| ------------------------------------------- | --- | --------- | --- | ---------- | ---- | +| Full-text search (BM25/FTS/keyword modes) | ✅ | ✅ | ✅ | ✅ | ⚠️ | +| Vector semantic search | ✅ | ✅ | ✅ | ✅ | ✅ | +| Hybrid dense + sparse fusion | ✅ | ✅ | ✅ | ✅ | ⚠️ | +| LLM reranking stage | ✅ | — | ✅ | — | ⚠️ | +| Query expansion or query rewriting | ✅ | — | ✅ | — | ⚠️ | +| Progressive disclosure workflow | ✅ | ⚠️ | — | ✅ | — | + +## Quality, Safety, And Memory Semantics + +| Capability | ELF | memsearch | qmd | claude-mem | mem0 | +| --------------------------------------------- | --- | --------- | --- | ---------- | ---- | +| Evidence-bound notes (verbatim quotes) | ✅ | — | — | — | — | +| Deterministic vs LLM ingestion separation | ✅ | — | — | — | — | +| Source-of-truth storage with rebuildable index | ✅ | ✅ | — | — | — | +| Multi-tenant scoping | ✅ | — | — | — | ✅ | +| TTL and lifecycle policies | ✅ | — | — | — | ✅ | +| First-class graph memory mode | ⚠️ (graph-lite via `POST /v2/graph/query`) | — | — | — | ✅ (optional) | +| Redaction or write-time exclusion controls | ✅ | — | — | ⚠️ | ⚠️ | + +## Operations And Evaluation + +| Capability | ELF | memsearch | qmd | claude-mem | mem0 | +| ------------------------ | --- | --------- | --- | ---------- | ---- | +| Retrieval evaluation CLI | ✅ | — | — | — | — | +| Structured JSON outputs | ✅ | ⚠️ | ✅ | ✅ | ✅ | + +Capability notes: + +- qmd HTTP support is MCP Streamable HTTP (`POST /mcp`) rather than a separate REST memory API ([source](https://github.com/tobi/qmd?tab=readme-ov-file#streamable-http)). +- memsearch integration is currently plugin/CLI-centric; no standalone MCP server is documented ([source](https://github.com/zilliztech/memsearch)). +- memsearch progressive disclosure is described in the Claude plugin workflow docs, not as a generic service contract ([source](https://github.com/zilliztech/memsearch/tree/main/ccplugin)). +- ELF graph mode is intentionally graph-lite: scoped temporal facts are queried through `POST /v2/graph/query`, with optional explain payload `elf.graph_query/v1` and evidence-linked fact rows. +- mem0 graph memory is optional and requires an OpenAI-compatible LLM setup ([source](https://docs.mem0.ai/platform/features/graph-memory)). +- mem0 search docs describe optional reranking, query optimization, and keyword-search toggles ([source](https://docs.mem0.ai/platform/features/search-filters)). +- mem0 lifecycle docs describe `expiration_date` and automatic exclusion of expired memories from retrieval ([source](https://docs.mem0.ai/cookbooks/essentials/memory-expiration-short-and-long-term)). +- claude-mem supports `<private>` tags to exclude selected content from storage ([source](https://github.com/thedotmack/claude-mem?tab=readme-ov-file#memory-privacy-controls)). + +## Project Strengths And Trade-offs + +- [memsearch](https://github.com/zilliztech/memsearch): Strong Markdown-first transparency, smart dedup, and live file-watch sync. Trade-off: integration is centered on plugin/CLI workflows rather than a general MCP + HTTP service surface. +- [qmd](https://github.com/tobi/qmd): Strong local-first retrieval quality (BM25 + vector + rerank + query expansion) with practical CLI and MCP tooling. Trade-off: focused on document retrieval workflows more than memory-specific safety/lifecycle semantics. +- [claude-mem](https://github.com/thedotmack/claude-mem): Strong automatic capture and progressive disclosure UX, plus a practical local web viewer for inspection. Trade-off: optimized for Claude session continuity, with fewer explicit deterministic ingestion boundaries. +- [mem0](https://github.com/mem0ai/mem0): Strong ecosystem reach (SDK + hosted + OpenMemory), multi-entity scoping, and lifecycle controls like `expiration_date`. Trade-off: ingestion and retrieval behavior depends heavily on configurable LLM-assisted flows, which can be less deterministic by default. +- [OpenViking](https://github.com/volcengine/OpenViking): Strong context filesystem paradigm (`viking://`), hierarchical retrieval, and session-centric context iteration. Trade-off: relation model is URI-link based (not property graph), and adoption still requires adapting patterns into ELF's evidence-bound note contract. +- [llm-wiki](https://github.com/nvk/llm-wiki): Strong LLM-maintained wiki pattern, topic-scoped knowledge bases, and explicit query/save/lint flows. Trade-off: wiki pages are the primary interface, so ELF-grade provenance and trust boundaries must remain layered above it. +- [gbrain](https://github.com/garrytan/gbrain): Strong operational knowledge-brain shape with primary-home routing, `compiled_truth` + timeline pages, and explicit maintenance/enrichment workflows. Trade-off: page-first ontology and personal-brain workflow assumptions would over-couple ELF core to one UI/content model if copied directly. +- [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent): Strong always-on ingest/consolidate/query loop with multimodal inbox, timer-driven consolidation, simple SQLite persistence, and a lightweight dashboard/API. Trade-off: memory formation is LLM-first, so it does not preserve ELF-style deterministic write boundaries or evidence-bound fact contracts. +- [graphify](https://github.com/safishamsi/graphify): Strong multimodal graph compression with deterministic AST extraction for code, explicit `EXTRACTED`/`INFERRED`/`AMBIGUOUS` relation tagging, and always-on assistant hooks. Trade-off: it is closer to a graph-guided corpus understanding skill than a multi-tenant memory service, so its graph artifact should be treated as a derived operator surface rather than a source-of-truth memory backend. +- [nanograph](https://github.com/nanograph/nanograph): Strong typed schema + typed query developer ergonomics. Trade-off: focuses on graph-first DX patterns rather than ELF's evidence-bound notes + multi-tenant service contract. + +## nanograph Snapshot (New) + +Snapshot date for this subsection: March 4, 2026. + +- nanograph's docs emphasize typed schema and typed query surfaces for working with structured graph data. +- Relevance for ELF: a concrete reference for making graph-lite interaction feel like a first-class API (schema + query + explain), while ELF remains evidence-bound and scope-governed. + +Primary references: + +- [nanograph](https://github.com/nanograph/nanograph) +- [Schema docs](https://github.com/nanograph/nanograph/blob/main/docs/user/schema.md) +- [Query docs](https://github.com/nanograph/nanograph/blob/main/docs/user/queries.md) + +## LLM Wiki And Operational Brain Snapshot (New) + +Snapshot date for this subsection: April 16, 2026. + +| Project | Primary knowledge unit | Relevant mechanism | Implication for ELF | +| ------- | ---------------------- | ------------------ | ------------------- | +| [llm-wiki](https://github.com/nvk/llm-wiki) | Topic-scoped wiki pages maintained as the working knowledge base | Query-answer-save loop, lint/repair workflow, and explicit inspiration from Karpathy's LLM Wiki framing | Strong reference for a derived knowledge-memory layer and operator-friendly compiled knowledge workflow; should sit above ELF core facts and evidence rather than replace them | +| [gbrain](https://github.com/garrytan/gbrain) | Slugged brain pages with one primary home, `compiled_truth`, timeline, and backlinks | Resolver-based routing, schema-guided page types, enrichment as a shared service, hybrid search with compiled-truth boost, and explicit maintenance commands | Strong reference for turning memory into an operational knowledge base; should inform ELF knowledge-memory UX and maintenance loops, not its source-of-truth contract | + +Key takeaways for ELF from this snapshot: + +- Both projects reinforce a useful framing: knowledge is maintained memory, not a separate system. +- Both are more valuable as references for ELF's future knowledge-memory layer than for ELF core ingestion semantics. +- Both treat maintenance as first-class product surface area through lint, enrich, backlink, query-save, or repair flows rather than as a side task. + +## Always-On Memory And Graphify Snapshot (New) + +Snapshot date for this subsection: April 17, 2026. + +| Project | Primary artifact | Relevant mechanism | Implication for ELF | +| ------- | ---------------- | ------------------ | ------------------- | +| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | SQLite-backed memories plus timer-generated consolidation insights | Multimodal inbox/file-watcher ingest, scheduled consolidation pass, simple query API, and lightweight dashboard | Strong reference for productizing background memory formation and manual/automatic consolidation triggers, but ELF should keep evidence-bound facts and deterministic note paths instead of making every write LLM-first | +| [graphify](https://github.com/safishamsi/graphify) | Persistent `graph.json` + `GRAPH_REPORT.md` + optional wiki derived from a multimodal corpus | Deterministic AST extraction for code, LLM extraction for docs/media, graph-topology clustering, explicit honesty tags, and always-on assistant hooks | Strong reference for derived graph/wiki operator surfaces and graph-guided navigation over large corpora, but the graph should remain a rebuildable derived view over ELF notes/docs rather than the authoritative store | + +Key takeaways for ELF from this snapshot: + +- Always-on consolidation is a product surface, not just an agent prompt pattern. +- A compressed graph/report layer can materially improve how assistants navigate large corpora before they touch raw files. +- Both projects are strongest when treated as derived layers above a trustworthy base store, not as replacements for ELF core memory semantics. + +## OpenViking Deep Dive (New) + +Snapshot date for this subsection: February 17, 2026. + +| Aspect | OpenViking observation | Implication for ELF | +| ------ | ---------------------- | ------------------- | +| Core paradigm | Filesystem-oriented context model (`viking://`) unifying resource, memory, and skill directories | Useful for retrieval organization and payload shaping; does not require graph database adoption | +| Storage design | Dual-layer storage: AGFS as content source-of-truth + vector index for semantic retrieval | Aligns with ELF's current SoT + derived index principle | +| Retrieval flow | Intent analysis -> hierarchical recursive retrieval -> rerank -> structured result | High-value blueprint for improving complex-query quality in ELF | +| Relation model | Explicit URI relation table via `.relations.json` and link/unlink APIs | Indicates graph-like utility can be achieved without Neo4j-first architecture | +| Session iteration | Session commit/compress + memory extraction loop | Useful reference for memory evolution and operational observability | +| Neo4j signal | No first-class Neo4j dependency or property-graph backend in published architecture | Does not support prioritizing Neo4j for ELF at current stage | + +## Mechanism-Level Deep Dive (Beyond README) + +Snapshot date for this subsection: February 17, 2026. + +| Project | Ingestion and update semantics | Retrieval internals | Consistency and reliability model | Operational profile | +| ------- | ------------------------------ | ------------------- | --------------------------------- | ------------------- | +| [OpenViking](https://github.com/volcengine/OpenViking) | Session-centric commit/compress and memory extraction; relation writes are explicit URI links | Intent analyzer + hierarchical recursive retrieval + optional rerank | Clear stage decomposition and traceable retrieval trajectory concept | Strong context-organization patterns; requires adaptation to ELF evidence-bound semantics | +| [mem0](https://github.com/mem0ai/mem0) | `add()` can run LLM-guided `ADD/UPDATE/DELETE/NONE`; history events are persisted; optional graph extraction runs alongside vector memory | Dense retrieval is core; rerank/filter are optional; graph mode adds relation retrieval as an extra context channel | OSS sync mode waits for processing completion; Platform API is async-by-default with event queue semantics | Rich hosted + OSS surface; stronger built-in feedback/events, but more tuning knobs and potential latency/cost variance | +| [memsearch](https://github.com/zilliztech/memsearch) | Markdown is canonical; reindex is incremental/content-addressed; stale chunks are removed by hash-based reconciliation | Milvus hybrid search (dense + BM25 sparse) with RRF fusion | Plugin hook workflow favors practical continuity; failures are mostly handled operationally rather than through strict policy contracts | Very pragmatic local workflow; Milvus Lite/Server/Cloud flexibility, but capability envelope depends on Milvus mode | +| [qmd](https://github.com/tobi/qmd) | Content-addressed SQLite model; `qmd update` reactivates/upserts and deactivates missing documents | Typed query expansion (`lex/vec/hyde`), hybrid routing, weighted RRF, then rerank blend by rank bands | Strong deterministic local index behavior with schema self-healing for vector tables | Excellent local-first control and explainability; less focused on multi-tenant memory governance semantics | +| [claude-mem](https://github.com/thedotmack/claude-mem) | Hook-driven capture tied to Claude Code lifecycle; queue-backed worker persists pending tasks | Progressive-disclosure retrieval is explicit (`search -> timeline -> get_observations`); hybrid local stack (SQLite + Chroma) | Deliberate fail-open handler behavior reduces workflow interruption but may accept occasional capture gaps | Best-in-class local operator ergonomics (viewer/SSE/logs), centered on Claude-centric usage patterns | +| [llm-wiki](https://github.com/nvk/llm-wiki) | Topic-specific wiki artifacts persisted as the working knowledge base | Query-answer-save loop over wiki state, lint/repair workflow, and an explicit LLM Wiki model | Strong practical workflow for compiled knowledge, but the wiki itself is the primary artifact rather than a strictly derived view | Useful model for ELF-derived dossiers/concept pages and memory linting, not for replacing evidence-bound facts as authoritative state | +| [gbrain](https://github.com/garrytan/gbrain) | Page-first brain with schema-guided slugs/types/tiering and `compiled_truth` + timeline sections | Hybrid search with compiled-truth boosting, resolver-based primary-home routing, and shared enrichment service callable from multiple ingest paths | Strong operator workflow for maintaining a living knowledge base, but trust/provenance depends on page upkeep discipline | Useful model for ELF knowledge-memory presentation and enrichment loops if pages remain derived and pointer-backed | +| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | Always-on memory loop over local SQLite rows and consolidation insights | File watcher/dashboard/API ingest, timer-based consolidation, and lightweight local query surface over multimodal inputs | Operationally simple and product-legible, but memory formation is LLM-first and does not separate deterministic note writes from derived synthesis | Useful model for adding first-class consolidation scheduling and operator controls without relaxing ELF write-path invariants | +| [graphify](https://github.com/safishamsi/graphify) | Derived knowledge graph plus graph report/wiki built from code and multimodal corpus inputs | Deterministic AST extraction, LLM-assisted relation extraction, topology-based clustering, and hook-driven assistant guidance | Excellent for graph-guided corpus navigation, but not a general memory contract and not scoped around multi-tenant storage semantics | Useful model for ELF-derived graph reports, graph-guided query surfaces, and assistant hooks over rebuildable derived artifacts | + +Key takeaways for ELF from this deeper pass: + +- mem0 demonstrates that graph context can be additive instead of replacing vector retrieval. +- qmd shows retrieval quality gains from explicit routing heuristics and transparent score fusion. +- memsearch validates a strong pattern: canonical primary store + rebuildable derived index. +- claude-mem demonstrates how much adoption improves when operator inspection is first-class. +- OpenViking reinforces that context organization and retrieval trajectory can deliver large gains without Neo4j-first architecture. +- llm-wiki reinforces the value of a query/save/lint workflow around compiled knowledge artifacts rather than treating every answer as ephemeral. +- gbrain reinforces that a useful knowledge base often looks like maintained entity/project pages with current truth plus timeline, not just a bag of retrieved chunks. +- Always-On Memory Agent reinforces that scheduled consolidation and manual consolidation triggers are product-level features, not just internal implementation details. +- graphify reinforces that graph-compressed corpus views and pre-search graph guidance can meaningfully reduce raw-file thrash for assistants. + +## Where ELF Is Currently Weaker (Objective Gaps) + +- ELF now has a local admin viewer and retrieval observability surfaces, but + claude-mem, OpenMemory, and agentmemory remain stronger references for turnkey + memory-inspection and session-continuity ergonomics. +- No hosted/cloud product option (mem0 provides managed deployment). +- Graph support is currently graph-lite (`POST /v2/graph/query`) and does not yet include multi-hop/global graph reasoning patterns used by GraphRAG-focused projects. +- Less turnkey for zero-config local plugin workflows than memsearch/claude-mem defaults. +- Supports explicit `quick_find` vs `planned_search` split through `POST /v2/searches` mode. +- Stage-level retrieval trajectory summary is now first-class on `/v2/searches` responses (`search_retrieval_trajectory/v1`), but operator-facing trajectory inspection ergonomics are still evolving. + +## Extended Deep-Dive Comparison (Reference Only) + +Snapshot date for this subsection: February 17, 2026. + +| Project | Distinct memory model | High-value mechanism | Known trade-off | Optional takeaway for ELF | +| ------- | --------------------- | -------------------- | --------------- | -------------------------- | +| [mem0](https://github.com/mem0ai/mem0) | Entity-scoped memories (`user_id`/`agent_id`/`app_id`/`run_id`) with optional graph augmentation | Async ingestion + webhooks, explicit memory history events, optional graph relations context | Async default introduces read-after-write complexity; graph path adds cost and provider coupling | Add first-class memory update events and stronger entity-scoped query semantics; keep graph context additive first | +| [Letta](https://github.com/letta-ai/letta) | Explicit split between core memory blocks and archival memory | Attachable/detachable blocks with `read_only` sharing for multi-agent coordination | Requires clear policy boundaries between always-loaded context and retrieval-only context | Add `core` vs `archival` memory layers in ELF without replacing note storage | +| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | Threaded checkpoints + replay/fork over persisted state | Deterministic replay model (`thread_id` + checkpoint lineage) for debugging and regression analysis | Replay safety requires idempotent side-effect boundaries | Elevate trace replay and ranking compare to hard regression gates in CI | +| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | Temporal knowledge graph (entities/relations/facts) with explicit validity windows | Invalidate-and-append fact updates (`valid_at`/`invalid_at`) instead of destructive overwrite | Full graph backends add operational complexity and traversal cost | Implement Postgres-first graph-lite with temporal fact validity before introducing graph infra | +| [qmd](https://github.com/tobi/qmd) + [claude-mem](https://github.com/thedotmack/claude-mem) | Retrieval UX and operator workflow focus | Progressive-disclosure search + local inspection/debug loops | Less emphasis on strict deterministic ingestion contracts | Productize ELF debug loop (viewer, status, explain-first inspection) | +| [llm-wiki](https://github.com/nvk/llm-wiki) + [gbrain](https://github.com/garrytan/gbrain) | Compiled knowledge artifacts and maintained knowledge pages | Query-save flows, `compiled_truth` + timeline page shape, backlink/enrichment maintenance, and wiki/brain repair loops | Page-first systems can blur source-of-truth boundaries unless provenance is explicit and rebuildable | Add a derived knowledge-memory layer in ELF with note/doc pointers, recompile rules, and lint/repair loops | +| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) + [graphify](https://github.com/safishamsi/graphify) | Background consolidation and graph-compressed operator context | Scheduled consolidation loops, multimodal inbox flow, derived graph/report surfaces, and always-on assistant guidance before raw search | LLM-first consolidation and graph artifacts can drift unless tied back to authoritative evidence and rebuild rules | Add optional consolidation schedulers and derived graph/report surfaces in ELF while keeping Postgres notes/docs authoritative | + +## Extended Source Map + +- RAGFlow: + - https://ragflow.io/docs/ + - https://github.com/infiniflow/ragflow/blob/main/docker/README.md + - https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md +- LightRAG: + - https://github.com/HKUDS/LightRAG + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/DockerDeployment.md + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/LightRAG-API-Server.md + - https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/ProgramingWithCore.md +- GraphRAG: + - https://microsoft.github.io/graphrag/ + - https://microsoft.github.io/graphrag/index/inputs/ + - https://microsoft.github.io/graphrag/index/outputs/ + - https://microsoft.github.io/graphrag/query/local_search/ +- mem0: + - https://docs.mem0.ai/platform/features/entity-scoped-memory + - https://docs.mem0.ai/platform/features/graph-memory + - https://docs.mem0.ai/core-concepts/memory-operations/add + - https://docs.mem0.ai/open-source/features/async-memory + - https://docs.mem0.ai/platform/features/advanced-retrieval + - https://docs.mem0.ai/platform/features/async-mode-default-change + - https://docs.mem0.ai/platform/features/webhooks + - https://docs.mem0.ai/open-source/features/custom-update-memory-prompt + - https://github.com/mem0ai/mem0/blob/main/mem0/memory/main.py + - https://github.com/mem0ai/mem0/blob/main/mem0/memory/graph_memory.py +- Letta: + - https://docs.letta.com/concepts/memory/blocks/ + - https://docs.letta.com/concepts/memory/archival-memory/ + - https://docs.letta.com/concepts/memory/shared-memory/ +- LangGraph: + - https://docs.langchain.com/oss/python/langgraph/persistence + - https://docs.langchain.com/oss/python/langgraph/durable-execution + - https://docs.langchain.com/oss/python/langgraph/use-time-travel +- Graphiti / Zep: + - https://help.getzep.com/graphiti/core-concepts/temporal-awareness + - https://help.getzep.com/graphiti/working-with-data/adding-fact-triples + - https://help.getzep.com/graphiti/working-with-data/searching-the-graph +- memsearch: + - https://github.com/zilliztech/memsearch/blob/main/docs/architecture.md + - https://github.com/zilliztech/memsearch/blob/main/docs/claude-plugin.md + - https://github.com/zilliztech/memsearch/blob/main/src/memsearch/core.py + - https://github.com/zilliztech/memsearch/blob/main/src/memsearch/store.py +- OpenViking: + - https://github.com/volcengine/OpenViking/blob/main/README.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/01-architecture.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/05-storage.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/07-retrieval.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/08-session.md + - https://github.com/volcengine/OpenViking/blob/main/openviking/storage/viking_fs.py + - https://github.com/volcengine/OpenViking/blob/main/openviking/retrieve/hierarchical_retriever.py + - https://github.com/volcengine/OpenViking/blob/main/openviking/service/relation_service.py + - https://github.com/volcengine/OpenViking/blob/main/pyproject.toml +- qmd / claude-mem: + - https://github.com/tobi/qmd + - https://github.com/tobi/qmd/blob/main/src/store.ts + - https://github.com/tobi/qmd/blob/main/src/llm.ts + - https://github.com/tobi/qmd/blob/main/src/mcp.ts + - https://docs.claude-mem.ai/user-guide/progressive-disclosure-search + - https://docs.claude-mem.ai/user-guide/view-memory + - https://github.com/thedotmack/claude-mem/blob/main/src/servers/mcp-server.ts + - https://github.com/thedotmack/claude-mem/blob/main/src/services/worker/http/routes/ViewerRoutes.ts +- llm-wiki: + - https://github.com/nvk/llm-wiki + - https://github.com/nvk/llm-wiki/blob/main/README.md + - https://llm-wiki.net/ + - https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f +- gbrain: + - https://github.com/garrytan/gbrain + - https://github.com/garrytan/gbrain/blob/master/README.md + - https://github.com/garrytan/gbrain/blob/master/docs/ENGINES.md + - https://github.com/garrytan/gbrain/blob/master/docs/GBRAIN_RECOMMENDED_SCHEMA.md + - https://github.com/garrytan/gbrain/blob/master/src/schema.sql + - https://github.com/garrytan/gbrain/blob/master/src/core/search/hybrid.ts + - https://github.com/garrytan/gbrain/blob/master/src/core/enrichment-service.ts +- Always-On Memory Agent: + - https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent + - https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/agents/always-on-memory-agent/README.md +- graphify: + - https://github.com/safishamsi/graphify + - https://github.com/safishamsi/graphify/blob/v3/README.md + - https://github.com/safishamsi/graphify/blob/v3/README.zh-CN.md + +## ELF Distinctives (Code-Verified) + +- Evidence binding with verbatim quote checks. +- Postgres is the source of truth; vector index is fully rebuildable. +- Deterministic `add_note` and LLM-only `add_event` semantics. +- Query expansion modes (`off`, `always`, `dynamic`) for cost/latency control. +- Dedicated evaluation CLI to measure retrieval quality. + +## Potential Directions (Reference, Not Commitments) + +Expanded research snapshot date for this section: February 17, 2026. + +This list is for architectural comparison only. It is not a product commitment and should not be read as a roadmap. + +1. Temporal Graph-Lite facts in Postgres + - Borrow from Graphiti's temporal fact model (`valid_at`/`invalid_at`) and invalidation-overwrite semantics. + - Add `entities` + `facts` as append-only, evidence-linked rows with temporal windows. + - Keep graph storage in Postgres first; avoid introducing a graph database in the first iteration. + +2. Core memory blocks vs archival memory + - Borrow from Letta's memory blocks + archival memory split. + - Add first-class, attachable per-agent memory blocks (for stable identity/instructions) while keeping notes as archival memory. + - Support read-only shared blocks for multi-agent coordination. + +3. First-class memory evolution and history semantics + - Borrow from mem0's explicit `ADD`/`UPDATE`/`DELETE` event model and history APIs. + - Standardize update decisions and reasons in the API contract so behavior is auditable and reproducible. + +4. Replay-first ranking and regression gates + - Borrow from LangGraph's checkpoint/replay mindset. + - Promote trace replay and policy comparison to a CI quality gate to prevent silent retrieval regressions. + +5. Developer observability workflow + - Borrow from qmd/claude-mem operator workflows (viewer + status + logs + troubleshooting loop). + - Add a lightweight inspection surface and stronger local debugging commands to reduce tuning/debug cycle time. + +6. Search mode split and retrieval trajectory + - Borrow from OpenViking's `find()` vs `search()` separation and staged retrieval flow. + - Keep quick/planned split and stage-level trajectory outputs in place on `/v2/searches`, then improve operator visibility (`GET /v2/searches/{search_id}` ergonomics and optional local timeline tooling). + +7. Unified evidence-to-knowledge memory layer + - Borrow from llm-wiki's query/save/lint workflow and gbrain's `compiled_truth` + timeline page shape. + - Add optional derived knowledge-memory pages in ELF (entity pages, concept pages, dossiers, project overviews) that compile from notes/docs and can be rebuilt. + - Keep notes and evidence pointers authoritative so derived knowledge remains inspectable, invalidatable, and lintable instead of becoming a second hidden source of truth. + +8. First-class background consolidation workflow + - Borrow from Always-On Memory Agent's multimodal inbox, scheduled consolidation pass, and explicit manual consolidation trigger. + - Add first-class scheduling and operator control surfaces for consolidation/rebuild jobs, while keeping ELF note writes and provenance rules deterministic where required. + +9. Graph-compressed navigation over rebuildable derived views + - Borrow from graphify's deterministic code extraction, explicit confidence/honesty tagging, graph report, and assistant hook surfaces. + - Add optional graph-derived reports, graph query surfaces, or agent-facing pre-search guidance over ELF notes/docs without treating the graph as a new source of truth. + +Current planning surface for these research-backed directions: + +- Linear project: [ELF vNext: Evidence-to-Knowledge Memory](https://linear.app/hack-ink/project/elf-vnext-evidence-to-knowledge-memory-d7a9dd3f3e86) +- Active workstreams: + - [XY-286](https://linear.app/hack-ink/issue/XY-286/knowledge-memory-derived-entityconceptproject-pages-with-provenance) knowledge-memory layer + - [XY-19](https://linear.app/hack-ink/issue/XY-19/add-a-read-only-web-viewer-for-sessions-and-traces) and [XY-27](https://linear.app/hack-ink/issue/XY-27/viewer-add-retrieval-observability-panels-on-top-of-the-read-only) operator workflow + - [XY-70](https://linear.app/hack-ink/issue/XY-70/graph-lite-dx-typed-schema-typed-query-nanograph-inspired) graph-lite DX + +Research sources for this section: +- Graphiti/Zep: + - https://help.getzep.com/graphiti/core-concepts/temporal-awareness + - https://help.getzep.com/graphiti/working-with-data/adding-fact-triples + - https://help.getzep.com/graphiti/working-with-data/searching-the-graph +- Letta: + - https://docs.letta.com/concepts/memory/blocks/ + - https://docs.letta.com/concepts/memory/archival-memory/ + - https://docs.letta.com/concepts/memory/shared-memory/ +- mem0: + - https://docs.mem0.ai/platform/features/graph-memory + - https://docs.mem0.ai/platform/features/entity-scoped-memory + - https://docs.mem0.ai/open-source/features/custom-update-memory-prompt +- LangGraph: + - https://docs.langchain.com/oss/python/langgraph/persistence + - https://docs.langchain.com/oss/python/langgraph/durable-execution +- qmd / claude-mem: + - https://github.com/tobi/qmd + - https://docs.claude-mem.ai/user-guide/view-memory +- OpenViking: + - https://github.com/volcengine/OpenViking/blob/main/README.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/01-architecture.md + - https://github.com/volcengine/OpenViking/blob/main/docs/en/concepts/07-retrieval.md +- Always-On Memory Agent: + - https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent + - https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/agents/always-on-memory-agent/README.md +- graphify: + - https://github.com/safishamsi/graphify + - https://github.com/safishamsi/graphify/blob/v3/README.md + - https://github.com/safishamsi/graphify/blob/v3/README.zh-CN.md diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/guide/research/external_memory_improvement_plan.md new file mode 100644 index 00000000..6ad45be2 --- /dev/null +++ b/docs/guide/research/external_memory_improvement_plan.md @@ -0,0 +1,581 @@ +# External Memory Improvement Plan - June 9, 2026 + +Goal: Convert the June 2026 live benchmark, external memory-system research, and Dexter radar operating pattern into an issue-ready ELF improvement plan. +Read this when: Deciding what to implement next before using ELF as a personal production memory system. +Inputs: `README.md`, `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`, `docs/guide/research/comparison_external_projects.md`, `docs/guide/research/research_projects_inventory.md`, current Linear readback, and the local Dexter Pattern Radar automation pattern. +Depends on: `docs/governance.md`, `docs/spec/system_elf_memory_service_v2.md`, and the checked-in live baseline runner. +Outputs: Prioritized gaps, issue queue, parallelization plan, acceptance criteria, and follow-up radar model. + +## Summary Judgment + +ELF is currently a credible personal-production candidate for an evidence-bound agent memory service, but it should not be treated as fully proven until the P0 items below land. + +The objective position is: + +- Better than the tested alternatives on evidence-bound writes, deterministic ingestion boundaries, source-of-truth discipline, rebuildable indexing, multi-tenant service shape, and the current encoded Docker benchmark. +- Comparable to the best tested alternative, qmd, on local retrieval quality under the smoke scenario, but ELF has a stronger service/provenance model while qmd has stronger local retrieval-debug ergonomics. +- Behind agentmemory, claude-mem/OpenMemory-style tools, and some managed-memory products on operator UX, visible memory inspection, and turn-by-turn operational comfort. +- Behind Graphiti/Zep, Letta, and mem0-style systems on some broader memory semantics: temporal graph workflows beyond graph-lite relation context, explicit memory history, core-vs-archival blocks, and reviewable memory evolution. +- Not yet proven on large private personal corpus migration, repeated batch backfill, cold-start persistence across every adapter, or long-running unattended production operation. + +So the answer is not "ELF is universally better." The current evidence supports "ELF is the better foundation for this repo's desired high-trust, evidence-linked memory system, and it can become the better personal-production choice if the P0 work lands and is benchmarked." + +## Evidence Base + +### Live Benchmark Evidence + +Checked-in report: `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`. + +Current encoded result: + +- ELF provider stress run: `live-baseline-20260609010854`, `Qwen3-Embedding-8B`, 4096-dimensional provider embeddings, 480 documents, 16 queries, 8 of 8 encoded checks passing, elapsed 1163 seconds. +- All-project smoke run: `live-baseline-20260609022837`. +- ELF and qmd passed every encoded smoke check. +- agentmemory passed same-corpus retrieval but failed or could not complete lifecycle checks. +- mem0, memsearch, and claude-mem returned wrong same-corpus retrieval results in the encoded smoke. +- OpenViking was incomplete in the June 9 run because its local embedding dependency + could not complete inside the Docker runner. XY-881 later pinned the Docker path to + a CPU `llama-cpp-python` wheel and moved the current OpenViking state to + `wrong_result` when `add_resource`/`find` misses expected evidence terms. + +What this proves: + +- ELF's current service path can run real provider embeddings through Docker-isolated benchmark scripts. +- ELF's strict provenance/service model does not prevent it from passing the encoded retrieval checks. +- 4096-dimensional provider embeddings are operationally usable for the tested scale. + +What this does not prove: + +- It does not prove ELF beats every project on all retrieval workloads. +- It does not prove long-running personal production safety. +- It does not prove private-corpus migration quality. +- It does not prove viewer/operator ergonomics are competitive. +- It does not prove every adapter's lifecycle behavior is correctly represented. + +### External Project Activity Snapshot + +Captured from GitHub API on June 9, 2026. Activity is only a refresh signal, not a quality ranking. + +| Project | Stars | Last push | Latest release | Why keep tracking | +| --- | ---: | --- | --- | --- | +| rohitg00/agentmemory | 21969 | 2026-06-08 | v0.9.27 | Coding-agent continuity, packaging, viewer, benchmark claims | +| mem0ai/mem0 | 58095 | 2026-06-09 | cli-node-v0.2.8 | Memory lifecycle, hosted/OpenMemory ecosystem, graph option | +| zilliztech/memsearch | 1948 | 2026-06-01 | v0.4.6 | Markdown-first store and hybrid retrieval ergonomics | +| tobi/qmd | 26294 | 2026-06-08 | v2.5.3 | Strong local retrieval pipeline and transparent debug workflow | +| thedotmack/claude-mem | 81336 | 2026-06-08 | v13.4.1 | Progressive disclosure, auto-capture loop, local viewer | +| volcengine/OpenViking | 25368 | 2026-06-09 | v0.3.24 | Hierarchical context model and staged retrieval trajectory | +| nvk/llm-wiki | 547 | 2026-05-23 | v0.10.2 | Evidence-to-knowledge page compilation | +| garrytan/gbrain | 21723 | 2026-06-08 | none | Human-operable knowledge memory shape | +| GoogleCloudPlatform/generative-ai | 17001 | 2026-06-09 | none | Managed memory/dreaming reference patterns | +| safishamsi/graphify | 63545 | 2026-06-08 | v0.8.36 | Graph-compressed navigation and graph reports | +| nanograph/nanograph | 149 | 2026-05-17 | v1.3.0 | Typed graph ergonomics | +| letta-ai/letta | 23219 | 2026-05-14 | 0.16.8 | Core memory blocks vs archival memory | +| langchain-ai/langgraph | 34219 | 2026-06-07 | 1.2.4 | Replay-first state and regression workflow | +| getzep/graphiti | 27194 | 2026-06-09 | v0.29.2 | Temporal graph memory semantics | +| infiniflow/ragflow | 82243 | 2026-06-09 | v0.25.6 | Full RAG app benchmark reference | +| HKUDS/LightRAG | 36316 | 2026-06-09 | v1.5.0 | Lightweight graph/RAG architecture | +| microsoft/graphrag | 33574 | 2026-06-05 | v3.1.0 | GraphRAG indexing and community reports | +| virattt/dexter | 26927 | 2026-06-03 | v2026.6.3 | Radar operating model and research-worker patterns | + +### Failure Semantics + +Use these terms in future benchmark reports and Linear issues: + +| Term | Meaning | Example | +| --- | --- | --- | +| `pass` | Encoded check completed and returned expected result. | ELF same-corpus retrieval and lifecycle checks pass. | +| `wrong_result` | The system completed but returned an incorrect memory or missed the expected evidence. | mem0/memsearch/claude-mem smoke retrieval mismatch. | +| `lifecycle_fail` | Retrieval may work, but update/delete/cold-start/persistence behavior is wrong or incomplete. | agentmemory adapter passing retrieval but not lifecycle. | +| `incomplete` | The benchmark could not reach the behavioral check due to install/runtime/dependency failure. | A pinned local embedding wheel/import failure before OpenViking `add_resource`/`find`. | +| `not_encoded` | Capability is not currently covered by the benchmark, so no pass/fail claim is allowed. | Viewer quality and batch backfill UX. | +| `blocked` | A safe test cannot run without external credentials, manual setup, or a dependency outside the issue scope. | Private corpus evaluation before sanitized corpus exists. | + +## Priority Program + +### P0 - Personal Production Readiness + +These items decide whether ELF is safe and comfortable enough for single-user production use. + +#### P0.1 Batch Ingest and Backfill Throughput + +Problem: +The current provider stress result is acceptable for 480 documents, but production adoption needs predictable bulk loading and recovery behavior for a larger personal memory corpus. + +Adopt from: + +- qmd and memsearch: practical local indexing ergonomics. +- LangGraph-style replay discipline: rerunnable import paths with explicit progress. +- ELF's own outbox/worker architecture. + +Implementation shape: + +- Add a bulk ingest/backfill command or HTTP job surface that accepts generated or file-backed note batches. +- Use micro-batched embedding requests. +- Add bounded concurrent embedding workers. +- Use durable job rows with checkpointed offsets and retry state. +- Use batch Qdrant upserts. +- Preserve Postgres as source of truth; Qdrant remains rebuildable. +- Expose batch progress and per-stage timing in report artifacts. + +Acceptance: + +- Docker-only benchmark profile for 480, 2k, and 10k document backfills. +- Backfill can be interrupted and resumed without duplicate source notes. +- Search quality after resume equals a clean run for the same manifest. +- Provider credentials stay in `.env`; no host-global install path is required. + +Linear mapping: + +- New issue required: `[ELF prod P0] Add resumable batch ingest and backfill benchmark`. +- Parallelizable with P0.2 and P0.4. + +#### P0.2 Private Production Corpus Benchmark + +Problem: +The generated benchmark is useful but not enough to decide personal production adoption. A sanitized real corpus is needed. + +Adopt from: + +- agentmemory: coding-agent continuity scenarios. +- qmd: local query/debug workflow. +- LangGraph: replayable regression cases. + +Implementation shape: + +- Build a private/sanitized corpus manifest for real project memory: issues, PRs, worktrees, runbooks, decisions, and stalled-lane recovery notes. +- Define task-oriented queries: "resume lane", "find prior decision", "explain stale blocker", "recover exact command", "compare project status". +- Include cold-start, update, delete/expiry, and contradictory-memory cases. +- Keep the actual private corpus out of public docs if needed, but commit the manifest schema and synthetic fixtures. + +Acceptance: + +- Benchmark reports separate public generated corpus from private production corpus. +- Every query has expected evidence ids and allowed alternates. +- Results record precision, wrong-result count, latency, provider, dimensions, and cost proxy. +- Any claim that ELF is production-ready must cite this report. + +Linear mapping: + +- New issue required: `[ELF prod P0] Add private-corpus production adoption benchmark`. +- Blocks a final "use as personal production memory" decision. + +#### P0.3 Single-User Production Runbook and Recovery Contract + +Problem: +Docker compose and strict config now exist, but production use needs backup, restore, upgrade, and disaster-recovery instructions. + +Adopt from: + +- memsearch: simple local store expectations. +- Docker-first deployment discipline from the new live baseline. +- ELF governance: explicit config and source-of-truth boundaries. + +Implementation shape: + +- Document a single-user production profile using Docker Compose for Postgres, Qdrant, API, worker, and MCP if needed. +- Add backup/restore commands for Postgres. +- Add Qdrant rebuild instructions from Postgres. +- Add health checks, migration checks, and rollback notes. +- Document provider `.env` expectations and what must not be committed. + +Acceptance: + +- Fresh machine restore proves notes/search work after Postgres restore and Qdrant rebuild. +- Runbook includes exact commands and fail-closed warnings. +- No host-global service install is required. + +Linear mapping: + +- New issue required: `[ELF prod P0] Add single-user production runbook with backup and restore`. +- Parallelizable with P0.1 after config paths are stable. + +#### P0.4 Retrieval Observability and Viewer Follow-Through + +Problem: +For daily use, API-only debugging is too slow. ELF now has a base read-only viewer path, but retrieval tuning still needs first-class panels. + +Adopt from: + +- claude-mem/OpenMemory-style viewer ergonomics. +- qmd transparent expansion/fusion/rerank controls. +- OpenViking staged retrieval trajectory. + +Implementation shape: + +- Extend the viewer with search session timelines, candidate lists, dense/BM25/fusion/rerank scores, relation context, latency, and provider metadata. +- Add a `GET /v2/searches/{id}` or equivalent trace readback if not already exposed for every panel. +- Keep the viewer read-only for P0. +- Add direct links from benchmark failures to trace ids where possible. + +Acceptance: + +- A benchmark wrong-result can be debugged from viewer panels without raw database queries. +- The viewer shows which stage dropped or reranked the expected memory. +- Read-only authorization and no-mutation behavior are tested. + +Linear mapping: + +- Existing: XY-19 base read-only viewer is done. +- Existing follow-up: XY-27 should be prioritized from Backlog to active after P0.1/P0.2 are queued. + +#### P0.5 Durable External Adapter and Lifecycle Benchmark Coverage + +Problem: +The current all-project smoke found adapter-level ambiguity. It is not enough to say "agentmemory failed" if the adapter uses an in-memory or incomplete lifecycle path. + +Adopt from: + +- agentmemory: actual durable package behavior and benchmark claims. +- ELF benchmark runner: Docker-isolated reproducibility. + +Implementation shape: + +- Replace mock/in-memory external adapters with durable local modes where feasible. +- For every external adapter, mark which behaviors are real, mocked, unsupported, or blocked. +- For expanded RAG and graph-memory systems, use `research_gate` records until D1/D2 + research, resource sizing, and Docker runtime boundaries are proven. +- Add lifecycle checks: update, delete/expire, cold-start reload, and same-corpus retrieval. +- Keep failures typed with the terms in this document. +- Use `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` + as the real-world adapter coverage contract so fixture-only, live-baseline-only, and + future live-real-world evidence stay separate. + +Acceptance: + +- agentmemory adapter either passes durable lifecycle checks or is explicitly marked blocked with evidence. +- OpenViking records a pinned Docker local embedding retry path; install/import + failure remains `incomplete`, while evidence misses after `add_resource`/`find` + are `wrong_result`. +- qmd smoke pass remains covered and gains scale/stress profiles. +- Real-world reports include adapter coverage counters before any external adapter is + allowed to claim a real-world suite pass. + +Linear mapping: + +- Existing: XY-801 created the initial agentmemory import/baseline boundary and is done. +- New issue required: `[ELF benchmark P0] Make external adapters lifecycle-durable and fail-typed`. + +### P1 - Memory Quality and Product Differentiation + +These items make ELF not merely usable, but materially better than adjacent memory products for high-trust agent work. + +#### P1.1 Reviewable Consolidation Worker + +Problem: +ELF has the right evidence-bound source model, but long-term memory quality needs consolidation without hidden mutation. + +Adopt from: + +- Gemini/managed memory "dreaming" direction, but with explicit review. +- Always-On Memory Agent: background consolidation loop. +- Dexter: proposal-only memo/readback artifacts. + +Implementation shape: + +- Implement consolidation jobs over immutable notes/events/traces. +- Write derived proposals, not source-note rewrites. +- Include source ids, confidence, unsupported-claim flags, conflicts, and review state. +- Add apply/discard/defer transitions. + +Acceptance: + +- Every proposed derived memory is traceable to source evidence. +- No derived proposal can silently replace source truth. +- Consolidation output appears in viewer/readback. + +Linear mapping: + +- Existing foundation: XY-800 is done. +- New follow-up required: `[ELF vNext P1] Implement reviewable consolidation worker and proposal review flow`. + +#### P1.2 Knowledge Memory Pages + +Problem: +Many compact memories remain hard to navigate unless compiled into stable, provenance-linked entity/project/concept pages. + +Adopt from: + +- llm-wiki and gbrain: maintained knowledge pages. +- ELF provenance model: every page section cites notes/events. + +Implementation shape: + +- Build derived pages for entities, concepts, projects, issues, and decisions. +- Add backlinks, source coverage, stale/unsupported-claim lint, and rebuild commands. +- Keep pages derived and rebuildable, not authoritative source truth. + +Acceptance: + +- A project page can be rebuilt from notes and preserves citations. +- Lint catches unsupported claims and stale source references. +- Viewer/search can surface page snippets with provenance. + +Linear mapping: + +- Existing: XY-286 is the right epic and should be expanded with smaller implementation issues. + +#### P1.3 Temporal Graph-Lite Validity + +Problem: +ELF already persists structured relations, but production memory needs time-aware facts: what was true when, what superseded it, and why. + +Adopt from: + +- Graphiti/Zep: temporal graph memory semantics. +- nanograph: typed graph/query ergonomics, without replacing Postgres. + +Implementation shape: + +- Use `valid_from` and `valid_to` semantics for relation facts. +- Keep append-only relation history and supersession evidence. +- Expose current versus historical temporal status in graph query and search relation context. +- Keep broader typed graph query ergonomics scoped to XY-70. + +Acceptance: + +- Contradictory facts do not overwrite silently. +- Search relation context labels current and historical facts. +- Tests cover invalidation, current readback, and old-state replay. + +Linear mapping: + +- Existing related: XY-70 covers graph-lite typed schema/query. +- Focused implementation issue: XY-863 `[ELF graph P1] Add temporal validity to graph-lite relation context`. + +#### P1.4 Memory History and Evolution API + +Problem: +Users and agents need to inspect how a memory changed over time, especially when an LLM proposed an update. + +Adopt from: + +- mem0: lifecycle/event history. +- ELF ingest decision table: existing audit direction. + +Implementation shape: + +- Add memory event history for add, update, ignore, reject, expire, derived, applied, and invalidated transitions. +- Expose history readbacks via HTTP/MCP. +- Link ingest decisions to note/relation versions. + +Acceptance: + +- A user can explain why a memory currently exists and what earlier evidence changed it. +- History survives restart and migration. +- Benchmark lifecycle checks include history expectations. + +Linear mapping: + +- New issue required: `[ELF memory P1] Add memory history and evolution readback API`. + +#### P1.5 Core Memory Blocks vs Archival Memory + +Problem: +Some memories should be intentionally small, always-attached operating context; most memory should remain retrievable archival context. + +Adopt from: + +- Letta: core memory blocks vs archival memory. +- ELF scope controls: explicit attachment and sharing. + +Implementation shape: + +- Add scoped, read-only memory blocks for stable agent/project instructions. +- Keep block attachment explicit per tenant/project/agent. +- Do not let blocks bypass evidence or policy boundaries. +- Keep blocks inspectable in viewer and MCP readback. + +Acceptance: + +- Agents can request their attached core blocks separately from search. +- Blocks have source/provenance metadata and audit history. +- Archival search remains independent. + +Linear mapping: + +- New issue required: `[ELF memory P1] Add scoped core memory blocks with archival separation`. + +#### P1.6 Search Trajectory and Query Planning + +Problem: +ELF already has expansion, hybrid retrieval, and reranking, but external tools expose the route more clearly. + +Adopt from: + +- qmd: weighted fusion and local debug knobs. +- OpenViking: staged retrieval trajectory and recursive retrieval. +- graphify: graph-compressed navigation hints. + +Implementation shape: + +- Add stable trace schema for query expansion, dense retrieval, BM25 retrieval, fusion, rerank, graph context, and final selection. +- Add optional recursive or staged retrieval profiles. +- Expose search-plan hints without making them hidden authority. + +Acceptance: + +- Every search result can explain its path. +- Tuning can be done through config/profile changes and benchmark replay. +- Wrong-result reports show stage-level cause. + +Linear mapping: + +- Existing related: XY-27 retrieval observability. +- New issue may be needed after XY-27: `[ELF retrieval P1] Add staged search trajectory profiles`. + +### P2 - Ongoing Intelligence and Ecosystem Parity + +These items keep ELF improving after the first production cut. + +#### P2.1 ELF External Memory Pattern Radar + +Problem: +External memory projects are moving quickly. Manual one-off reviews will go stale. + +Adopt from: + +- Local Dexter Pattern Radar automation. +- Decodex radar evidence discipline. + +Implementation shape: + +- Create a weekly Codex automation for ELF memory-system radar. +- Track upstream deltas for agentmemory, mem0, qmd, claude-mem, OpenViking, Graphiti, Letta, LightRAG, GraphRAG, and related projects. +- Maintain a structured cursor file plus prose memory. +- For every candidate pattern, produce an architecture-fit matrix: + - upstream change + - reusable pattern + - ELF verdict: covered, reject, or gap + - product value + - duplicate/coverage evidence + - safety boundary + - issue decision + - acceptance evidence +- Search Linear before creating issues. +- Create issues only when repo evidence shows a real gap. + +Acceptance: + +- A no-issue run records why ELF is already covered or why a pattern is rejected. +- A new issue includes source links, repo evidence, non-goals, and validation criteria. +- The radar never treats external runtime adoption as the default. + +Linear mapping: + +- New issue required: `[ELF ops P2] Add weekly external memory pattern radar automation`. + +#### P2.2 Broaden Benchmark Adapter Coverage + +Problem: +The current smoke covers the first project set, but broader claims need RAGFlow, LightRAG, GraphRAG, and deeper qmd/OpenViking profiles. + +Adopt from: + +- RAGFlow, LightRAG, GraphRAG: graph/RAG baselines. +- Current Docker live benchmark. + +Implementation shape: + +- Add D1/D2 research runs before implementation for large RAG systems. +- Add adapters only when Docker isolation is practical. +- Track install time, resource needs, and failure mode separately from retrieval quality. + +Acceptance: + +- Reports separate unsupported, blocked, incomplete, and wrong-result states. +- No external project is marked worse solely because setup is heavier. +- Claims remain scoped to encoded checks. + +Linear mapping: + +- New issue required: `[ELF benchmark P2] Add expanded RAG and graph-memory baseline adapters`. + +#### P2.3 CLI and SDK Ergonomics + +Problem: +ELF is service-first. External projects often feel easier for a local developer because their CLI path is direct. + +Adopt from: + +- qmd, memsearch, agentmemory: local CLI ergonomics. + +Implementation shape: + +- Add CLI wrappers for add/search/status/backfill/report if they are still missing or scattered. +- Keep commands thin over HTTP/MCP contracts. +- Link commands to benchmark and runbook workflows. + +Acceptance: + +- A local user can add notes, search, view status, run backfill, and generate benchmark report from documented commands. +- CLI output includes trace ids and source ids. + +Linear mapping: + +- New issue required after P0 runbook: `[ELF dx P2] Add local CLI wrappers for production memory workflows`. + +## Issue Queue + +| Order | Priority | Issue | Existing mapping | Parallelizable | Blocks | +| ---: | --- | --- | --- | --- | --- | +| 1 | P0 | Add resumable batch ingest and backfill benchmark | New | yes | production corpus migration | +| 2 | P0 | Add private-corpus production adoption benchmark | New | yes | final adoption claim | +| 3 | P0 | Add single-user production runbook with backup and restore | New | yes | unattended use | +| 4 | P0 | Prioritize retrieval observability panels | XY-27, after XY-19 | yes | efficient tuning | +| 5 | P0 | Make external adapters lifecycle-durable and fail-typed | New, follows XY-801 | yes | fair external comparison | +| 6 | P1 | Implement reviewable consolidation worker and proposal review flow | follows XY-800 | partly | knowledge pages | +| 7 | P1 | Split XY-286 into derived page storage, rebuild, lint, and viewer/search integration | XY-286 | partly | durable knowledge layer | +| 8 | P1 | Add temporal validity to graph-lite relation context | XY-863, follows/relates XY-70 | yes | time-aware relation context | +| 9 | P1 | Add memory history and evolution readback API | New | yes | lifecycle auditability | +| 10 | P1 | Add scoped core memory blocks with archival separation | New | yes | agent operating context | +| 11 | P1 | Add staged search trajectory profiles | New or XY-27 follow-up | after XY-27 | advanced retrieval tuning | +| 12 | P2 | Add weekly external memory pattern radar automation | New | yes | ongoing parity | +| 13 | P2 | Add expanded RAG and graph-memory baseline adapters | New | yes | broader public comparison | +| 14 | P2 | Add local CLI wrappers for production memory workflows | New | after P0.3 | local ergonomics | + +## Parallel Development Plan + +Safe concurrent lanes: + +- Lane A: P0.1 batch ingest/backfill. +- Lane B: P0.2 private-corpus benchmark and manifest schema. +- Lane C: P0.3 production runbook and backup/restore proof. +- Lane D: P0.5 adapter lifecycle benchmark hardening. +- Lane E: XY-27 retrieval observability panels. +- Lane F: P2.1 radar automation, because it is mostly automation/config/docs and should not touch runtime code. + +Avoid running concurrently without coordination: + +- P1.1 consolidation worker and P1.2 knowledge pages, because knowledge pages should build on the reviewed derived proposal model. +- P1.3 temporal graph validity and XY-70 typed graph work, unless ownership is split cleanly between storage semantics and query ergonomics. +- P1.6 staged search trajectory and XY-27 viewer panels, unless the trace schema is agreed first. + +Recommended Decodex queue order: + +1. Queue P0.2 and P0.3 first because they define adoption evidence and recovery expectations. +2. Queue P0.1 and P0.5 in parallel because they exercise different implementation surfaces. +3. Promote XY-27 after the trace data needed by P0.5 is clear. +4. Start P1.1 only after P0.2 has enough corpus scenarios to evaluate consolidation quality. +5. Split XY-286 after P1.1 defines derived proposal semantics. + +## Non-Goals + +- Do not replace ELF core storage with any external memory runtime. +- Do not make Qdrant authoritative. +- Do not treat graph memory as a separate hidden source of truth. +- Do not allow background consolidation to mutate source notes silently. +- Do not benchmark with host-global installs when Docker isolation is feasible. +- Do not claim overall superiority from a benchmark dimension that is not encoded. +- Do not create new Linear issues from radar output without duplicate search and repo evidence. + +## Production Adoption Gate + +For personal production use, the minimum acceptable gate is: + +- P0.1 batch ingest/backfill passes generated scale checks and resume checks. +- P0.2 private corpus benchmark has a passing or explicitly bounded result. +- P0.3 backup/restore runbook is tested on Docker Compose. +- P0.4/XY-27 gives enough viewer traceability to debug bad retrieval without raw SQL. +- P0.5 benchmark reports use typed failure states for external comparisons. + +After that gate, ELF can reasonably be used as the personal production memory system with known limitations. Before that gate, ELF is a strong foundation with promising benchmark evidence, but the adoption risk is still too high to call it production-proven. diff --git a/docs/guide/research/external_memory_pattern_radar.md b/docs/guide/research/external_memory_pattern_radar.md new file mode 100644 index 00000000..06638e2a --- /dev/null +++ b/docs/guide/research/external_memory_pattern_radar.md @@ -0,0 +1,89 @@ +# External Memory Pattern Radar + +Goal: Run ELF's weekly external memory pattern radar and preserve no-issue, rejection, +or issue-ready outcomes for future comparison reports. +Read this when: You are refreshing upstream memory/RAG/agent-continuity watch state or +deciding whether a watched upstream pattern deserves an ELF follow-up issue. +Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository +metadata, current ELF research docs, and Linear duplicate-search readback when creating +issues. +Depends on: `docs/spec/external_memory_pattern_radar_v1.md`, +`docs/guide/research/comparison_external_projects.md`, and +`docs/guide/research/research_projects_inventory.md`. +Outputs: Updated cursor JSON plus `docs/research/external_memory_pattern_radar/latest.md`. + +## Scope + +The radar watches agentmemory, mem0, qmd, claude-mem, OpenViking, Graphiti, Letta, +LightRAG, GraphRAG, RAGFlow, and adjacent projects already represented in ELF's +external comparison research. + +The radar does not adopt external runtimes by default and does not create follow-up +issues from stars, activity, release tags, or push timestamps alone. + +## Commands + +Run a live cursor refresh: + +```sh +cargo make external-memory-radar +``` + +Run the deterministic no-network dry run used by local PR checks and fallback +verification: + +```sh +cargo make external-memory-radar-dry-run +``` + +Run a live read-only artifact refresh under `tmp/` without changing checked-in files: + +```sh +cargo make external-memory-radar-artifact +``` + +Validate the checked-in cursor: + +```sh +cargo make external-memory-radar-validate +``` + +## Issue Decision Rules + +For every candidate pattern, the cursor decision must record: + +- upstream change +- reusable pattern +- ELF verdict: `covered`, `reject`, or `gap` +- product value +- duplicate/coverage evidence +- safety boundary +- issue decision +- acceptance evidence + +`create_issue` is allowed only when the decision also records upstream source links, +repo evidence, non-goals, validation criteria, and Linear duplicate-search evidence. +When the run is no-issue, the cursor still records why the pattern is already covered +or why the observed change is rejected. + +## Weekly Schedule + +`.github/workflows/external-memory-pattern-radar.yml` runs weekly and on manual +dispatch. The scheduled workflow refreshes live GitHub metadata and writes artifacts under +`tmp/external-memory-pattern-radar/` and uploads them for review. + +The workflow is intentionally read-only with respect to Linear and repository contents. +Codex or Decodex automation may consume the artifact, perform source review, search +Linear, and then submit a small PR that updates the cursor and prose summary. + +## Next Comparison Report Input + +The next full comparison report should consume: + +- changed project metadata from `projects[].last_seen` +- no-issue and rejection rationales from `last_run.decisions[]` +- issue-ready `gap` records only when `issue_decision.action = "create_issue"` +- source links, repo evidence, non-goals, and validation criteria from proposed issues + +Do not quote a watched project as an ELF gap or parity win unless the cursor decision +contains source-backed evidence under the radar spec. diff --git a/docs/guide/research/index.md b/docs/guide/research/index.md new file mode 100644 index 00000000..cf11bc56 --- /dev/null +++ b/docs/guide/research/index.md @@ -0,0 +1,22 @@ +# Research Guide Index + +Goal: Route agents to external comparison and decision-support research for ELF memory architecture. +Read this when: You need to compare ELF with adjacent memory, context, RAG, or consolidation systems. +Inputs: Current ELF docs/code, public external project docs, tracker state, and checked-in research run files. +Depends on: `docs/index.md`, `docs/governance.md`, and `docs/research/` for machine-readable research runs. +Outputs: The smallest comparison or inventory document needed for implementation decisions. + +## Documents + +- `research_projects_inventory.md`: audited and pending external projects, research depth, and current planning surface. +- `comparison_external_projects.md`: detailed capability comparison, project trade-offs, source map, and research-backed ELF directions. +- `external_memory_improvement_plan.md`: prioritized June 2026 improvement backlog, issue queue, parallelization plan, and production-adoption gate from benchmark and external-project evidence. +- `agentmemory_adapter.md`: fixture-backed agentmemory import and baseline adapter boundary for `elf-eval`. +- `external_memory_pattern_radar.md`: weekly radar runbook for upstream memory-system + deltas, no-issue decisions, and issue-ready pattern evidence. + +## Machine-Readable Runs + +Machine-authoritative research run JSON files live under `docs/research/`. +Use those files when a research conclusion needs replayable hypotheses, evidence, +trade-offs, challenge records, and terminal decision state. diff --git a/docs/guide/research/research_projects_inventory.md b/docs/guide/research/research_projects_inventory.md new file mode 100644 index 00000000..be322238 --- /dev/null +++ b/docs/guide/research/research_projects_inventory.md @@ -0,0 +1,102 @@ +# External Project Research Inventory + +Goal: Maintain a single, auditable inventory of external memory/context projects reviewed for ELF architecture decisions. +Read this when: You need to know which external projects have already been reviewed or still need a deep dive. +Inputs: Existing research notes, open architecture questions, and tracked adoption threads. +Depends on: `docs/guide/research/comparison_external_projects.md`. +Outputs: A current inventory of reviewed and pending external projects. + +Last updated: June 11, 2026. + +## Legend + +- `D2`: Mechanism-level deep dive (docs + code pointers + operational trade-offs). +- `D1`: Docs-level deep dive (architecture/features/scope compared, limited code inspection). +- `D0`: Mention-level only in discussions; not yet deeply reviewed. + +## Inventory + +| Project | Research depth | Current status | Benchmark dimension role | Why it matters to ELF | Primary reference | +| ------- | -------------- | -------------- | ------------------------ | --------------------- | ----------------- | +| [agentmemory](https://github.com/rohitg00/agentmemory) | D1 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent coding-memory hooks, MCP/REST surface, viewer, consolidation lifecycle, and external benchmark target | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [OpenAI ChatGPT Memory Dreaming](https://openai.com/index/chatgpt-memory-dreaming/) | D1 | Reviewed | `rw.consolidation-review` | Background memory synthesis and staleness repair as a product direction | `docs/research/2026-06-08-agent-memory-selection.json` | +| [Claude Managed Agents Dreams](https://platform.claude.com/docs/en/managed-agents/dreams) | D1 | Reviewed | `rw.consolidation-review` | Reviewable derived memory-store output over past sessions; strong safety shape for ELF consolidation | `docs/research/2026-06-08-agent-memory-selection.json` | +| [Gemini CLI Auto Memory](https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Background session mining with project-local review inbox for memory patches and skills | `docs/research/2026-06-08-agent-memory-selection.json` | +| [mem0](https://github.com/mem0ai/mem0) | D2 | Reviewed | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity` | Graph memory as additive context, memory history and async mode trade-offs | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [memsearch](https://github.com/zilliztech/memsearch) | D2 | Reviewed | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown-first SoT + rebuildable index pattern | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [qmd](https://github.com/tobi/qmd) | D2 | Reviewed | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Retrieval routing, weighted fusion, and local-first explainability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [claude-mem](https://github.com/thedotmack/claude-mem) | D2 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive disclosure and strong operator workflow | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [OpenViking](https://github.com/volcengine/OpenViking) | D2 | Reviewed | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | Filesystem context paradigm, hierarchical retrieval, trajectory observability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [llm-wiki](https://github.com/nvk/llm-wiki) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.knowledge-synthesis`, `rw.resume-evidence` | LLM-maintained wiki pattern, topic-scoped knowledge bases, query-save and lint workflows | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and database proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | +| [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate`; XY-889 adds Docker graph/report smoke | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references; current ELF evidence is a generated-corpus Docker smoke, not broad graph-quality proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only`; XY-927 selects blocked contained export/readback path | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; compare only after a Docker-only benchmark-created agent export returns core block JSON, archival readback JSON, and source ids | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [RAGFlow](https://github.com/infiniflow/ragflow) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no live strength claim | Docker setup is resource-heavy but documented; API references expose document/chunk evidence handles for a tiny-corpus adapter smoke | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [LightRAG](https://github.com/HKUDS/LightRAG) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no live strength claim | Docker compose path, context-only query modes, and source file-path citation shape support an implementation follow-up | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [GraphRAG](https://github.com/microsoft/graphrag) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no live strength claim | Cost-bounded CLI/API path and parquet output tables expose document, text-unit, and graph-summary handles for evidence mapping | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | + +## June 10, 2026 Adapter Feasibility Verdicts + +XY-882 resolved the D1/D2 feasibility gate for the RAG and graph-memory +`research_gate` records. These verdicts do not change any project into live adapter +evidence by themselves; they only decide whether an implementation follow-up is +justified. XY-900 later promotes graphify's generated-corpus Docker smoke into a +scored tiny `live_real_world` non-pass record, but not broad graph-quality proof. + +| Project | Verdict | Follow-up rule | +| ------- | ------- | -------------- | +| RAGFlow | `adapter_candidate` | Follow-up issue: [XY-885](https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter), a tiny Docker evidence-smoke adapter that records the resource envelope and maps `reference.chunks` to benchmark evidence. | +| LightRAG | `adapter_candidate` | Follow-up issue: [XY-886](https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter), a Docker context-export adapter using explicit LLM/embedding config and source file-path citations. | +| GraphRAG | `adapter_candidate` | Follow-up issue: [XY-887](https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter), a cost-bounded Docker CLI/API adapter over a tiny corpus and parquet output tables. | +| Graphiti / Zep | `adapter_candidate` | Follow-up issue: [XY-888](https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter), a Docker-local temporal graph adapter that scores current/historical fact validity. | +| graphify | `adapter_candidate` | Follow-up issue: [XY-889](https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter), a Docker-only CLI/materializer adapter over `graph.json` and `GRAPH_REPORT.md`; host-global assistant hooks remain out of scope. XY-900 promotes the checked-in graphify row to a scored tiny Docker smoke with `wrong_result`; it is still not broad graph-navigation quality proof. | +| Letta | `research_only` | Keep as a core/archival memory reference until a supported contained path can export archival-memory evidence for scoring. | +| LangGraph | `research_only` | Keep as a checkpoint/replay regression reference, not a standalone external memory adapter. | +| nanograph | `research_only` | Keep as typed graph DX inspiration; official shape is no server/no Docker. | +| llm-wiki | `research_only` | Keep as a derived knowledge-page workflow reference; host-global plugin installs are not adapter proof. | +| gbrain | `blocked` | Revisit only after a Docker-local brain repo and database path can be proven without operator-owned state. | + +## June 2026 Activity Snapshot + +GitHub API snapshot time: 2026-06-08T06:01:57Z. + +The monitored project set is still moving quickly. Recent push activity was observed for +agentmemory, mem0, qmd, claude-mem, OpenViking, gbrain, graphify, LangGraph, Graphiti, +RAGFlow, LightRAG, and GraphRAG. Notable current scale signals: + +- agentmemory: 21,783 stars, latest release `v0.9.27`, pushed 2026-06-07. +- mem0: 58,005 stars, latest release `cli-node-v0.2.8`, pushed 2026-06-06. +- claude-mem: 81,157 stars, latest release `v13.4.1`, pushed 2026-06-08. +- graphify: 62,294 stars, latest release `v0.8.35`, pushed 2026-06-07. +- RAGFlow: 82,150 stars, latest release `v0.25.6`, pushed 2026-06-08. +- LightRAG: 36,270 stars, latest release `v1.5.0`, pushed 2026-06-08. +- GraphRAG: 33,545 stars, latest release `v3.1.0`, pushed 2026-06-05. + +Interpretation: this is not a settled market. ELF should keep watching external +implementation velocity, but the current activity signal alone does not justify +replacing ELF's evidence-bound service contract. + +## Current Planning Surface + +- Linear project: [ELF vNext: Evidence-to-Knowledge Memory](https://linear.app/hack-ink/project/elf-vnext-evidence-to-knowledge-memory-d7a9dd3f3e86) +- Active workstreams: + - [XY-286](https://linear.app/hack-ink/issue/XY-286/knowledge-memory-derived-entityconceptproject-pages-with-provenance) knowledge-memory layer + - [XY-19](https://linear.app/hack-ink/issue/XY-19/add-a-read-only-web-viewer-for-sessions-and-traces) and [XY-27](https://linear.app/hack-ink/issue/XY-27/viewer-add-retrieval-observability-panels-on-top-of-the-read-only) operator workflow + - [XY-70](https://linear.app/hack-ink/issue/XY-70/graph-lite-dx-typed-schema-typed-query-nanograph-inspired) graph-lite DX +- Historical research/foundation issues now closed: + - [XY-40](https://linear.app/hack-ink/issue/XY-40/vision-track-elf-as-a-high-trust-memory-system-for-singlemulti-agent) + - [XY-51](https://linear.app/hack-ink/issue/XY-51/agent-memory-ux-mcp-surface-skills-doc-pointers-epic) + - [XY-63](https://linear.app/hack-ink/issue/XY-63/research-openviking-as-optional-doc-backend-integration-sketch) +- Current June 2026 research runs: + - `docs/research/2026-06-08-agent-memory-selection.json` + - `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` + - `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` + +## Notes + +- This inventory tracks research state, not implementation commitment. +- Any architecture change must still pass code-level feasibility and regression validation in ELF. diff --git a/docs/guide/single_user_production.md b/docs/guide/single_user_production.md new file mode 100644 index 00000000..914b0fe7 --- /dev/null +++ b/docs/guide/single_user_production.md @@ -0,0 +1,698 @@ +# Single-User Production Runbook + +Goal: Operate one local ELF instance with Docker Compose managed Postgres and Qdrant, +plus ELF API, worker, and optional MCP processes. +Read this when: You are running ELF as a personal production memory service or proving backup, +restore, migration, and Qdrant rebuild behavior. +Preconditions: Docker Compose, this repository checkout, a Rust toolchain for building ELF +binaries, and provider credentials for production embeddings/rerank/extraction. +Depends on: `docker-compose.yml`, `elf.example.toml`, `docs/spec/system_elf_memory_service_v2.md`, +`docs/guide/getting_started.md`, and `docs/guide/integration-testing.md`. +Verification: Health succeeds, a note can be ingested and found, Postgres backup restores notes, +Qdrant search state can be rebuilt from Postgres, and the clean-volume proof path below can run +without host-global service installs. + +## Operating Boundary + +This runbook is the minimum single-user production path. It does not describe hosted, +cloud-managed, or public internet deployment. + +Postgres is the only source of truth for notes, chunks, embeddings, audit history, and outbox +state. Qdrant is derived state. Back up Postgres, not Qdrant. If Qdrant is lost, recreate its +collections and run the admin rebuild from Postgres. + +The checked-in `docker-compose.yml` owns only the stateful services: + +- `postgres`: Postgres with pgvector. +- `qdrant`: Qdrant REST and gRPC. + +`elf-api`, `elf-worker`, and `elf-mcp` run as local ELF binaries from the checked-out release. +Keep their binds on loopback. The API refuses `http_bind` outside loopback when +`security.bind_localhost_only = true`, refuses `security.auth_mode = "off"` on non-loopback HTTP +binds, and always requires `admin_bind` to be loopback. The MCP server also refuses non-loopback +binds when auth is off. + +## 1. Create Local Secrets + +Create `.env` for Docker Compose storage settings only. Docker Compose loads it automatically; ELF +itself does not read provider credentials or required config fields from environment variables. + +```sh +cat > .env <<'EOF' +ELF_COMPOSE_PROJECT=elf-prod +ELF_POSTGRES_DB=elf_prod +ELF_POSTGRES_USER=elf_prod +ELF_POSTGRES_PASSWORD=replace-with-a-long-random-password +ELF_POSTGRES_PORT=51888 +ELF_POSTGRES_VOLUME=elf-prod-postgres-data +ELF_QDRANT_REST_PORT=51889 +ELF_QDRANT_GRPC_PORT=51890 +ELF_QDRANT_VOLUME=elf-prod-qdrant-data +ELF_QDRANT_COLLECTION=mem_notes_v2 +ELF_QDRANT_DOCS_COLLECTION=doc_chunks_v1 +ELF_QDRANT_VECTOR_DIM=4096 +EOF +chmod 600 .env +``` + +For shell commands below, load the same variables into your shell: + +```sh +set -a +. ./.env +set +a +``` + +Create an untracked production config: + +```sh +cp elf.example.toml elf.production.toml +chmod 600 elf.production.toml +``` + +Edit `elf.production.toml`: + +- Set `storage.postgres.dsn` to + `postgres://elf_prod:<ELF_POSTGRES_PASSWORD>@127.0.0.1:51888/elf_prod`, using the real password. +- Set `storage.qdrant.url` to `http://127.0.0.1:51890`. +- Set `storage.qdrant.collection`, `storage.qdrant.docs_collection`, and + `storage.qdrant.vector_dim` to match `.env`. +- Fill every `[providers.*]` block with real provider endpoints, models, dimensions, and keys. +- Keep `providers.embedding.dimensions` equal to `storage.qdrant.vector_dim`. +- Keep `chunking.enabled = true` and set `chunking.tokenizer_repo` to a non-empty tokenizer. +- Prefer `security.auth_mode = "static_keys"` with non-empty `security.auth_keys`. +- If you run `elf-mcp`, keep `[mcp]` present and ensure exactly one static key matches its + tenant, project, agent, and read profile. + +Do not put provider credentials, bearer tokens, or static-key secrets in the Compose `.env` file. +Production provider settings belong in the untracked ELF config file, or in a local secret-rendering +step that writes that untracked config before startup. ELF fails closed when provider keys are empty, +required provider fields are absent, the embedding dimension does not match the Qdrant vector +dimension, or the config path is missing. + +Do not commit `.env`, `elf.production.toml`, backups, provider keys, bearer tokens, or database +dumps. `.env*`, root ELF config files, and `backups/` are ignored for this reason. + +## 2. Start Postgres And Qdrant + +Validate the Compose file and start storage: + +```sh +docker compose -f docker-compose.yml config >/dev/null +docker compose -f docker-compose.yml up -d postgres qdrant +docker compose -f docker-compose.yml ps +``` + +Check storage health: + +```sh +docker compose -f docker-compose.yml exec -T postgres \ + pg_isready -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" + +curl -fsS "http://127.0.0.1:${ELF_QDRANT_REST_PORT}/collections" >/dev/null +``` + +Stop storage without deleting data: + +```sh +docker compose -f docker-compose.yml stop postgres qdrant +``` + +Start it again: + +```sh +docker compose -f docker-compose.yml up -d postgres qdrant +``` + +Remove stopped containers while keeping volumes: + +```sh +docker compose -f docker-compose.yml down +``` + +Delete all Compose-managed storage only when you have a verified backup or are running the +clean-volume proof below: + +```sh +docker compose -f docker-compose.yml down -v +``` + +## 3. Build And Start ELF Services + +Build once, then run the binaries directly to avoid multiple `cargo run` processes contending for +Cargo locks: + +```sh +cargo build -p elf-api -p elf-worker -p elf-mcp +``` + +Start the worker in one terminal: + +```sh +target/debug/elf-worker -c elf.production.toml +``` + +Start the API in a second terminal: + +```sh +target/debug/elf-api -c elf.production.toml +``` + +Optional: start MCP in a third terminal when a client needs the MCP adapter: + +```sh +target/debug/elf-mcp -c elf.production.toml +``` + +Stop ELF services by sending Ctrl-C in each service terminal. If you started them in the background, +stop those exact processes before backup, restore, upgrade, or rollback: + +```sh +pkill -f "target/debug/elf-api -c elf.production.toml" || true +pkill -f "target/debug/elf-worker -c elf.production.toml" || true +pkill -f "target/debug/elf-mcp -c elf.production.toml" || true +``` + +On startup, `elf-api` and `elf-worker` initialize the Postgres schema and ensure the Qdrant +collections and docs payload indexes exist. Startup fails closed if the config file is missing, +required config is absent, `security.reject_non_english` is false, vector dimensions mismatch, or +loopback/auth rules are violated. + +## 4. Health And Migration Checks + +Check API health: + +```sh +curl -fsS http://127.0.0.1:51892/health +``` + +Check that schema initialization or migration has reached the configured database: + +```sh +docker compose -f docker-compose.yml exec -T postgres \ + psql -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -v ON_ERROR_STOP=1 \ + -c "SELECT COUNT(*) AS active_notes FROM memory_notes WHERE status = 'active';" +``` + +Before upgrading ELF binaries or changing config, take a Postgres backup. There is no reverse +migration command in the minimum runbook; rollback means stopping ELF, restoring the previous +Postgres backup, starting the previous known-good binary/config, and rebuilding Qdrant. + +## 5. Restart, Upgrade, And Roll Back + +For a config-only restart: + +```sh +pkill -f "target/debug/elf-api -c elf.production.toml" || true +pkill -f "target/debug/elf-worker -c elf.production.toml" || true +pkill -f "target/debug/elf-mcp -c elf.production.toml" || true +``` + +Then start the worker and API again in separate terminals: + +```sh +target/debug/elf-worker -c elf.production.toml +``` + +```sh +target/debug/elf-api -c elf.production.toml +``` + +For an ELF binary upgrade: + +```sh +# 1. Run Section 6 and keep the backup path. +# 2. Stop ELF service processes. +pkill -f "target/debug/elf-api -c elf.production.toml" || true +pkill -f "target/debug/elf-worker -c elf.production.toml" || true +pkill -f "target/debug/elf-mcp -c elf.production.toml" || true + +# 3. Move the checkout to the desired release or commit, then rebuild. +cargo build -p elf-api -p elf-worker -p elf-mcp + +# 4. Start worker in one terminal. +target/debug/elf-worker -c elf.production.toml +``` + +```sh +# 5. Start API in another terminal, then run Section 4 health and migration checks. +target/debug/elf-api -c elf.production.toml +``` + +For rollback, restore the pre-upgrade backup and rebuild Qdrant: + +```sh +# 1. Stop ELF service processes. +pkill -f "target/debug/elf-api -c elf.production.toml" || true +pkill -f "target/debug/elf-worker -c elf.production.toml" || true +pkill -f "target/debug/elf-mcp -c elf.production.toml" || true + +# 2. Move the checkout and elf.production.toml back to the previous known-good version. +# 3. Run Section 7 restore. +# 4. Run Section 8 Qdrant rebuild. +# 5. Start the previous known-good worker and API, then run Section 4 health checks. +``` + +## 6. Back Up Postgres + +Stop or pause writers first. For this single-user runbook, that means stop `elf-api`, `elf-worker`, +and `elf-mcp` with Ctrl-C in their terminals. Leave the `postgres` container running. + +Create a custom-format Postgres backup: + +```sh +mkdir -p backups/postgres +BACKUP="backups/postgres/elf-$(date -u +%Y%m%dT%H%M%SZ).dump" + +docker compose -f docker-compose.yml exec -T postgres \ + pg_dump -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -Fc > "${BACKUP}" + +chmod 600 "${BACKUP}" +printf 'Wrote %s\n' "${BACKUP}" +``` + +Copy the backup to your normal encrypted backup location. Do not commit it. + +## 7. Restore Postgres + +Use this path for a fresh machine restore or rollback. Stop `elf-api`, `elf-worker`, and `elf-mcp` +before restoring. Start only storage: + +```sh +docker compose -f docker-compose.yml up -d postgres qdrant +``` + +Restore the selected backup into the configured database: + +```sh +RESTORE="backups/postgres/elf-YYYYMMDDTHHMMSSZ.dump" + +docker compose -f docker-compose.yml exec -T postgres \ + dropdb -U "${ELF_POSTGRES_USER}" --force --if-exists "${ELF_POSTGRES_DB}" + +docker compose -f docker-compose.yml exec -T postgres \ + createdb -U "${ELF_POSTGRES_USER}" "${ELF_POSTGRES_DB}" + +docker compose -f docker-compose.yml exec -T postgres \ + pg_restore -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" \ + --no-owner --role="${ELF_POSTGRES_USER}" < "${RESTORE}" +``` + +Verify the restored source-of-truth rows: + +```sh +docker compose -f docker-compose.yml exec -T postgres \ + psql -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -v ON_ERROR_STOP=1 \ + -c "SELECT COUNT(*) AS notes FROM memory_notes;" +``` + +## 8. Rebuild Qdrant From Postgres + +Qdrant is rebuildable. If the Qdrant volume or memory-note collection is missing, stale, or +restored from the wrong point in time, discard the memory-note collection and rebuild it from +Postgres. + +Delete the derived memory-note collection. A missing collection is acceptable: + +```sh +QDRANT_REST="http://127.0.0.1:${ELF_QDRANT_REST_PORT}" + +curl -fsS -X DELETE "${QDRANT_REST}/collections/${ELF_QDRANT_COLLECTION}?wait=true" || true +``` + +Start or restart `elf-api` after deleting collections so startup recreates them: + +```sh +target/debug/elf-api -c elf.production.toml +``` + +Then call the admin rebuild endpoint from another terminal. If `security.auth_mode = "static_keys"`, +use an admin or super-admin token: + +```sh +curl -fsS -X POST http://127.0.0.1:51891/v2/admin/qdrant/rebuild \ + -H "Authorization: Bearer ${ELF_ADMIN_TOKEN}" +``` + +Expected result: + +```json +{ + "rebuilt_count": 1, + "missing_vector_count": 0, + "error_count": 0 +} +``` + +`rebuilt_count` depends on how many active chunks exist. `missing_vector_count` and `error_count` +must be `0` for a clean production restore. The rebuild uses persisted Postgres vectors and must not +call the embedding provider. + +This endpoint rebuilds memory-note chunks. Do not treat it as a Doc Extension rebuild procedure for +`storage.qdrant.docs_collection`. + +## 9. Smoke And Restore Proof + +With `elf-worker` and `elf-api` running, ingest one deterministic note. If auth is off, omit the +`Authorization` header. If static-key auth is on, use a token whose configured context matches the +tenant, project, agent, and read profile used by the smoke commands. + +```sh +curl -fsS -X POST http://127.0.0.1:51892/v2/notes/ingest \ + -H "Authorization: Bearer ${ELF_USER_TOKEN}" \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -d '{ + "scope": "agent_private", + "notes": [ + { + "type": "fact", + "key": "single_user_restore_probe", + "text": "The single-user production restore probe is stored in Postgres and searchable after Qdrant rebuild.", + "importance": 0.8, + "confidence": 0.95, + "ttl_days": 14, + "source_ref": {"schema": "single_user_runbook/v1", "ref": {"step": "restore_probe"}} + } + ] + }' +``` + +Wait a few seconds for the worker, then search: + +```sh +curl -fsS -X POST http://127.0.0.1:51892/v2/searches \ + -H "Authorization: Bearer ${ELF_USER_TOKEN}" \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -H 'X-ELF-Read-Profile: private_only' \ + -d '{ + "mode": "quick_find", + "query": "Where is the single-user production restore probe stored?", + "top_k": 5, + "candidate_k": 20, + "payload_level": "l0" +}' +``` + +### Clean-Volume Proof Path + +Run this from the repository root when you need a local proof that backup, clean-volume restore, +Qdrant rebuild, and search recovery work without host-global service installs. It uses the +checked-in deterministic local providers, a temporary config under `tmp/`, ports `51988-51993`, +and isolated Docker volume names. + +```sh +bash <<'EOF' +set -euo pipefail + +PROOF_DIR="tmp/single-user-restore-proof" +PROOF_CONFIG="${PROOF_DIR}/elf.restore-proof.toml" +mkdir -p "${PROOF_DIR}/backups" +cp config/local/elf.docker.toml "${PROOF_CONFIG}" +perl -0pi -e 's/127\.0\.0\.1:51888/127.0.0.1:51988/g; s/127\.0\.0\.1:51889/127.0.0.1:51989/g; s/127\.0\.0\.1:51890/127.0.0.1:51990/g; s/127\.0\.0\.1:51891/127.0.0.1:51991/g; s/127\.0\.0\.1:51892/127.0.0.1:51992/g; s/127\.0\.0\.1:51893/127.0.0.1:51993/g; s/elf_local_notes/elf_restore_proof_notes/g; s/elf_local_doc_chunks/elf_restore_proof_doc_chunks/g' "${PROOF_CONFIG}" + +export ELF_COMPOSE_PROJECT=elf-restore-proof +export ELF_POSTGRES_DB=elf_local +export ELF_POSTGRES_USER=elf_dev +export ELF_POSTGRES_PASSWORD=elf_dev_password +export ELF_POSTGRES_PORT=51988 +export ELF_POSTGRES_VOLUME=elf-restore-proof-postgres-data +export ELF_QDRANT_REST_PORT=51989 +export ELF_QDRANT_GRPC_PORT=51990 +export ELF_QDRANT_VOLUME=elf-restore-proof-qdrant-data + +API_PID="" +WORKER_PID="" +cleanup() { + for pid in ${API_PID:-} ${WORKER_PID:-}; do + if [ -n "${pid}" ]; then + kill "${pid}" 2>/dev/null || true + wait "${pid}" 2>/dev/null || true + fi + done + docker compose -f docker-compose.yml down -v --remove-orphans >/dev/null 2>&1 || true +} +trap cleanup EXIT + +docker compose -f docker-compose.yml down -v --remove-orphans +docker compose -f docker-compose.yml config >/dev/null +docker compose -f docker-compose.yml up -d postgres qdrant +for _ in $(seq 1 60); do + docker compose -f docker-compose.yml exec -T postgres \ + pg_isready -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" >/dev/null 2>&1 && break + sleep 1 +done +docker compose -f docker-compose.yml exec -T postgres \ + pg_isready -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" +for _ in $(seq 1 60); do + curl -fsS "http://127.0.0.1:${ELF_QDRANT_REST_PORT}/collections" >/dev/null && break + sleep 1 +done +curl -fsS "http://127.0.0.1:${ELF_QDRANT_REST_PORT}/collections" >/dev/null + +cargo build -p elf-api -p elf-worker + +target/debug/elf-worker -c "${PROOF_CONFIG}" > "${PROOF_DIR}/worker-before.log" 2>&1 & +WORKER_PID="$!" +target/debug/elf-api -c "${PROOF_CONFIG}" > "${PROOF_DIR}/api-before.log" 2>&1 & +API_PID="$!" + +for _ in $(seq 1 60); do + curl -fsS http://127.0.0.1:51992/health >/dev/null && break + sleep 1 +done +curl -fsS http://127.0.0.1:51992/health | tee "${PROOF_DIR}/health-before.json" + +curl -fsS -X POST http://127.0.0.1:51992/v2/notes/ingest \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -d '{ + "scope": "agent_private", + "notes": [ + { + "type": "fact", + "key": "single_user_restore_probe", + "text": "The single-user production restore proof note is stored in Postgres and searchable after Qdrant rebuild.", + "importance": 0.8, + "confidence": 0.95, + "ttl_days": 14, + "source_ref": {"schema": "single_user_runbook/v1", "ref": {"step": "clean_volume_restore_proof"}} + } + ] + }' | tee "${PROOF_DIR}/add-note.json" + +for _ in $(seq 1 60); do + OPEN_OUTBOX="$(docker compose -f docker-compose.yml exec -T postgres \ + psql -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -At \ + -c "SELECT COUNT(*) FROM indexing_outbox WHERE status <> 'DONE';")" + [ "${OPEN_OUTBOX}" = "0" ] && break + sleep 1 +done +test "${OPEN_OUTBOX}" = "0" + +curl -fsS -X POST http://127.0.0.1:51992/v2/searches \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -H 'X-ELF-Read-Profile: private_only' \ + -d '{ + "mode": "quick_find", + "query": "Where is the single-user production restore proof note stored?", + "top_k": 5, + "candidate_k": 20, + "payload_level": "l0" + }' | tee "${PROOF_DIR}/search-before.json" +grep -F "single-user production restore proof note" "${PROOF_DIR}/search-before.json" + +BACKUP="${PROOF_DIR}/backups/elf-proof.dump" +docker compose -f docker-compose.yml exec -T postgres \ + pg_dump -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -Fc > "${BACKUP}" +test -s "${BACKUP}" + +kill "${API_PID}" "${WORKER_PID}" 2>/dev/null || true +wait "${API_PID}" "${WORKER_PID}" 2>/dev/null || true +API_PID="" +WORKER_PID="" + +docker compose -f docker-compose.yml down -v --remove-orphans +docker compose -f docker-compose.yml up -d postgres qdrant +for _ in $(seq 1 60); do + docker compose -f docker-compose.yml exec -T postgres \ + pg_isready -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" >/dev/null 2>&1 && break + sleep 1 +done +docker compose -f docker-compose.yml exec -T postgres \ + pg_isready -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" +for _ in $(seq 1 60); do + curl -fsS "http://127.0.0.1:${ELF_QDRANT_REST_PORT}/collections" >/dev/null && break + sleep 1 +done + +docker compose -f docker-compose.yml exec -T postgres \ + dropdb -U "${ELF_POSTGRES_USER}" --force --if-exists "${ELF_POSTGRES_DB}" +docker compose -f docker-compose.yml exec -T postgres \ + createdb -U "${ELF_POSTGRES_USER}" "${ELF_POSTGRES_DB}" +docker compose -f docker-compose.yml exec -T postgres \ + pg_restore -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" \ + --no-owner --role="${ELF_POSTGRES_USER}" < "${BACKUP}" + +RESTORED_NOTES="$(docker compose -f docker-compose.yml exec -T postgres \ + psql -U "${ELF_POSTGRES_USER}" -d "${ELF_POSTGRES_DB}" -At \ + -c "SELECT COUNT(*) FROM memory_notes WHERE key = 'single_user_restore_probe';")" +test "${RESTORED_NOTES}" = "1" + +target/debug/elf-api -c "${PROOF_CONFIG}" > "${PROOF_DIR}/api-after.log" 2>&1 & +API_PID="$!" +for _ in $(seq 1 60); do + curl -fsS http://127.0.0.1:51992/health >/dev/null && break + sleep 1 +done + +curl -fsS -X POST http://127.0.0.1:51991/v2/admin/qdrant/rebuild \ + | tee "${PROOF_DIR}/qdrant-rebuild.json" +grep -F '"missing_vector_count":0' "${PROOF_DIR}/qdrant-rebuild.json" +grep -F '"error_count":0' "${PROOF_DIR}/qdrant-rebuild.json" + +curl -fsS -X POST http://127.0.0.1:51992/v2/searches \ + -H 'content-type: application/json' \ + -H 'X-ELF-Tenant-Id: local-tenant' \ + -H 'X-ELF-Project-Id: local-project' \ + -H 'X-ELF-Agent-Id: local-agent' \ + -H 'X-ELF-Read-Profile: private_only' \ + -d '{ + "mode": "quick_find", + "query": "Where is the single-user production restore proof note stored?", + "top_k": 5, + "candidate_k": 20, + "payload_level": "l0" + }' | tee "${PROOF_DIR}/search-after.json" +grep -F "single-user production restore proof note" "${PROOF_DIR}/search-after.json" + +printf 'Single-user restore proof passed. Evidence files remain under %s.\n' "${PROOF_DIR}" +EOF +``` + +The proof fails closed on missing Docker services, occupied ports, failed service health, undrained +indexing outbox rows, an empty backup, missing restored source rows, non-zero Qdrant rebuild errors, +or a search response that does not contain the restored note. + +### Recorded Local Proof - June 9, 2026 + +The clean-volume proof path above was executed locally against this worktree after aligning +`docker-compose.yml` with the PostgreSQL 18 volume layout. It used the checked-in local deterministic +providers, isolated Compose volumes, and ports `51988-51993`. + +Recorded evidence: + +- Compose storage started cleanly with Postgres accepting connections. +- `cargo build -p elf-api -p elf-worker` completed. +- `POST /v2/notes/ingest` returned `op = "ADD"` and `policy_decision = "remember"` for + `key = "single_user_restore_probe"`. +- Search before backup returned the note summary: + "The single-user production restore proof note is stored in Postgres and searchable after Qdrant + rebuild." +- The custom-format Postgres backup was non-empty (`88K` in the local proof run). +- The proof destroyed and recreated the isolated Compose volumes, restored Postgres with + `pg_restore`, and verified one restored source row for `single_user_restore_probe`. +- `POST /v2/admin/qdrant/rebuild` returned + `{"error_count":0,"missing_vector_count":0,"rebuilt_count":1}`. +- Search after restore and Qdrant rebuild returned the same restored note. +- Cleanup removed the isolated proof containers and volumes. + +## 10. Local CLI Wrappers + +The `elf` CLI is a thin local wrapper over the same HTTP contracts used above. It does not read or +write storage directly, bypass auth, or change scope/read-profile rules. Build it with the service +binaries: + +```sh +cargo build -p elf --bin elf +``` + +By default the CLI targets the runbook loopback ports and smoke context: + +- `ELF_API_URL` or `--api-url`: default `http://127.0.0.1:51892`. +- `ELF_ADMIN_URL` or `--admin-url`: default `http://127.0.0.1:51891`. +- `ELF_TENANT_ID`, `ELF_PROJECT_ID`, and `ELF_AGENT_ID`: default `local-tenant`, + `local-project`, and `local-agent`. +- `ELF_READ_PROFILE` or `--read-profile`: default `private_only`. +- `ELF_USER_TOKEN` or `--token`: bearer token for public endpoints when static-key auth is enabled. +- `ELF_ADMIN_TOKEN` or `--admin-token`: admin bearer token for admin endpoints. + +Check API health and get machine-readable status: + +```sh +target/debug/elf status --pretty +``` + +Add a deterministic note through `POST /v2/notes/ingest`. `--source-id` is copied into +`source_ref.ref.source_id` and echoed in the CLI output for debugging: + +```sh +target/debug/elf add-note \ + --key single_user_restore_probe_cli \ + --source-id single-user-runbook:restore-probe-cli \ + --text "The single-user production CLI smoke note is stored through the HTTP add-note contract." \ + --importance 0.8 \ + --confidence 0.95 \ + --ttl-days 14 \ + --pretty +``` + +Search through `POST /v2/searches`. The JSON output includes `trace_id`, `search_id`, and note ids: + +```sh +target/debug/elf search \ + --query "Where is the single-user production CLI smoke note stored?" \ + --top-k 5 \ + --candidate-k 20 \ + --payload-level l0 \ + --pretty +``` + +Use admin diagnostics when you need source refs, trace bundles, provenance, or a Qdrant rebuild +readback. These commands require an admin token when `security.auth_mode = "static_keys"`: + +```sh +target/debug/elf diagnostics raw-search \ + --query "Where is the single-user production CLI smoke note stored?" \ + --payload-level l2 \ + --pretty + +target/debug/elf diagnostics recent-traces --limit 10 --pretty +target/debug/elf diagnostics trace-bundle --trace-id TRACE_ID --mode bounded --pretty +target/debug/elf diagnostics note-provenance --note-id NOTE_ID --pretty +target/debug/elf diagnostics qdrant-rebuild --pretty +``` + +For batch backfill and benchmark reports, use the wrappers documented in +`docs/guide/benchmarking/live_baseline_benchmark.md`. Those wrappers delegate to the checked-in +`cargo make` tasks and keep benchmark artifacts under `tmp/live-baseline/`. + +## 11. Failure And Secret Rules + +- Missing or invalid config fails startup. +- `security.reject_non_english = false` fails config validation. +- Non-English API inputs fail with HTTP 422. +- API binds outside loopback fail unless authenticated static-key mode is configured; admin bind is + loopback-only. +- `add_note` is deterministic and does not call an LLM. `add_event` requires the configured LLM + extractor and evidence-bound quotes. +- Secret-like note text is rejected by the write gate. +- Qdrant can be stale, empty, or deleted; Postgres remains authoritative. +- Never commit `.env`, `elf.production.toml`, backups, dumps, API keys, bearer tokens, or provider + credentials. + +## Related Guides + +- Local bootstrap: `docs/guide/getting_started.md` +- Integration testing: `docs/guide/integration-testing.md` +- System contract: `docs/spec/system_elf_memory_service_v2.md` diff --git a/docs/guide/testing.md b/docs/guide/testing.md new file mode 100644 index 00000000..480a8c61 --- /dev/null +++ b/docs/guide/testing.md @@ -0,0 +1,26 @@ +# Test Names and Scope + +Goal: Provide consistent names for test categories and the commands that run them. +Read this when: You need to choose, report, or request the right test lane for a change. +Inputs: The repository test surface and current validation target. +Depends on: `Makefile.toml` and the repository CI/test workflow. +Outputs: A consistent test-category name and the matching command or workflow. + +## Names + +- `unit` — Tests inside `#[cfg(test)]` modules in `src/`. Run with `cargo make test`. +- `integration` — Rust integration tests under `tests/*.rs`. Run with `cargo make test`. +- `integration (ignored)` — Integration tests that require external services and are marked `#[ignore]`. Run with `cargo make test-rust-integration`. +- `acceptance` — The integration suite in `packages/elf-service/tests/acceptance.rs` and `packages/elf-service/tests/acceptance/*.rs`. These are usually `#[ignore]` and require external services. +- `E2E harness` — Deterministic harness scripts for memory retrieval/ranking. Run locally with `cargo make test-e2e` and in CI via `.github/workflows/e2e.yml`. + +Note: Some integration tests require external services such as Postgres or Qdrant and are marked `#[ignore]`. When requesting those, say "integration (ignored)" so the ignored set is included. + +## Database names + +- `elf_e2e` — Dedicated database for the E2E flow. +- `elf_test_*` — Ephemeral databases created by `elf_testkit::TestDatabase` for integration tests. + +## Usage + +When requesting tests, refer to the names above. Example: "Run unit and integration tests," "Run integration (ignored) tests," or "Run the E2E flow." diff --git a/docs/index.md b/docs/index.md index c8236fee..1d364989 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,36 +1,46 @@ # Documentation Index -Purpose: Provide the canonical entry point and reading order for repository documentation. - -## Start here - -- `AGENTS.md` for automated agent rules and tooling constraints. -- `docs/spec/index.md` for normative system specifications and contracts. -- `docs/guide/index.md` for operational guides and runbooks. -- `docs/governance.md` for documentation structure and update rules. -- `docs/plans/` for Claude-generated execution plans (non-normative). - -## Documentation classes - -### Specifications (normative) - -- Location: `docs/spec/` (flat structure). -- Use for: System contracts, data models, pipeline behavior, and required invariants. -- Entry point: `docs/spec/index.md`. -- Core spec: `docs/spec/system_elf_memory_service_v1.md`. - -### Operational and pipeline docs (implementation guides) - -- Location: `docs/guide/` -- Use for: Runbooks, pipeline walkthroughs, operational maintenance, and test procedures. -- Entry point: `docs/guide/index.md`. - -### Working plans and drafts - -- Location: `docs/plans/` -- Use for: Temporary design docs and execution plans that may drift. - -### Repository README - -- Location: `README.md` (the only README in the repository). -- Use for: High-level project overview and entry points into `docs/`. +Purpose: Route agents to the smallest correct document set for the current task. +Read this when: You are starting from repository docs and need to choose the right lane. +Not this document: Detailed subsystem contracts, step-by-step runbooks, research run state, or saved plan artifacts. +Routes to: `docs/governance.md`, `docs/spec/`, `docs/guide/`, `docs/research/`, `docs/plans/`, and `Makefile.toml`. + +Audience: All documentation in this repository is written for AI agents and LLM workflows. +The split below is by question type, not by human-versus-agent audience. + +## Read order + +- Read `docs/governance.md` for document contracts and placement rules. +- Read `Makefile.toml` when the task depends on repo task names or execution entrypoints. +- Then choose one primary lane: + - `docs/spec/index.md` when the question is "what must be true?" + - `docs/guide/index.md` when the question is "what should I do?" +- Use `docs/research/` only when a research workflow explicitly points to a + machine-readable research run file there. +- Use `docs/plans/` only when a planning tool or execution workflow explicitly points to + a saved plan artifact there. + +## Routing matrix + +- Need contracts, invariants, schemas, enums, state machines, or required behavior -> + `docs/spec/` +- Need runbooks, migrations, validation steps, troubleshooting, or operational sequences -> + `docs/guide/` +- Need the single-user production backup, restore, and Qdrant rebuild path -> + `docs/guide/single_user_production.md` +- Need external comparisons or architecture research inputs -> `docs/guide/research/` +- Need machine-readable research run state, evidence, trade-offs, and decision status -> + `docs/research/` +- Need repo task names or automation entrypoints -> `Makefile.toml` +- Need documentation placement or authoring rules -> `docs/governance.md` +- Need a planning-tool artifact or saved execution plan -> `docs/plans/` + +## Retrieval rules + +- Optimize for agent routing and execution, not narrative flow. +- Keep one authoritative document per topic. Link instead of copying. +- Start each document with a short routing header that says what the document is for, + when to read it, and what it does not cover. +- Keep links explicit and stable. +- Let structure emerge from real topics. Do not create empty folders, empty indexes, or + naming schemes that are stricter than the current corpus needs. diff --git a/docs/plans/2026-02-02-project-cleanup-design.md b/docs/plans/2026-02-02-project-cleanup-design.md index 2199e4ba..4f6d6cf4 100644 --- a/docs/plans/2026-02-02-project-cleanup-design.md +++ b/docs/plans/2026-02-02-project-cleanup-design.md @@ -1,6 +1,6 @@ # Project Cleanup Architecture Design -**Goal:** Restructure each app into a library-plus-binary layout, remove `#[path]` test imports, and make `cargo make lint` pass without suppressing lints. +**Goal:** Restructure each app into a library-plus-binary layout, remove `#[path]` test imports, and make `cargo make lint-rust` pass without suppressing lints. **Scope (Option 2):** - Apply the `lib + bin` layout to `elf-api`, `elf-mcp`, and `elf-worker`. @@ -19,5 +19,5 @@ - Any remaining clippy errors will be fixed by small structural adjustments rather than `#[allow]` attributes. **Testing and Verification:** -- Run `cargo make lint` to confirm workspace linting passes. +- Run `cargo make lint-rust` to confirm workspace linting passes. - Do not change test behavior; only update import paths and shared wiring required by the new layout. diff --git a/docs/plans/2026-02-02-project-cleanup.md b/docs/plans/2026-02-02-project-cleanup.md index 536991c7..a0ef40d4 100644 --- a/docs/plans/2026-02-02-project-cleanup.md +++ b/docs/plans/2026-02-02-project-cleanup.md @@ -2,7 +2,7 @@ > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. -**Goal:** Refactor each app into a lib+bin layout, remove `#[path]` test imports, and keep CLI/logging behavior unchanged while ensuring `cargo make lint` passes. +**Goal:** Refactor each app into a lib+bin layout, remove `#[path]` test imports, and keep CLI/logging behavior unchanged while ensuring `cargo make lint-rust` passes. **Architecture:** Each app exposes a small `lib.rs` with its CLI `Args` and `run` entrypoint plus existing modules. `main.rs` becomes a thin wrapper that parses CLI args and calls the library. Tests import the library modules instead of using `#[path]`. @@ -250,7 +250,7 @@ git commit -m "refactor: move elf-mcp entrypoint into lib" - Modify: None **Step 1: Run lint** -Run: `cargo make lint` +Run: `cargo make lint-rust` Expected: PASS. **Step 2: Run targeted app tests** diff --git a/docs/plans/2026-02-03-search-expansion-design.md b/docs/plans/2026-02-03-search-expansion-design.md index 4795acc2..4f8c99e6 100644 --- a/docs/plans/2026-02-03-search-expansion-design.md +++ b/docs/plans/2026-02-03-search-expansion-design.md @@ -36,7 +36,7 @@ A new search configuration block is introduced: ## Failure handling -If the LLM expansion call fails or returns invalid JSON, the system must fall back to the original query only. Any CJK output in expanded queries is dropped. If the expanded set becomes empty after filtering, the system must fall back to the original query. +If the LLM expansion call fails or returns invalid JSON, the system must fall back to the original query only. Any non-English output in expanded queries is dropped. If the expanded set becomes empty after filtering, the system must fall back to the original query. ## Testing diff --git a/docs/plans/2026-02-04-chunked-embeddings-design.md b/docs/plans/2026-02-04-chunked-embeddings-design.md index 48588fc4..c90f1126 100644 --- a/docs/plans/2026-02-04-chunked-embeddings-design.md +++ b/docs/plans/2026-02-04-chunked-embeddings-design.md @@ -110,12 +110,12 @@ Chunk text is not stored in Qdrant payload. ## API Changes Search is chunk-first: -- `POST /v1/memory/search` returns chunk items and snippets. +- `POST /v2/searches` returns chunk items and snippets. - Snippets are stitched from the top chunk plus immediate neighbors. -- A new endpoint returns full notes by ID: `GET /v1/memory/notes/{note_id}`. +- Full notes are fetched separately via `POST /v2/searches/{search_id}/notes` or `GET /v2/notes/{note_id}`. Search explain: -- `GET /v1/memory/search/explain` returns `chunk_id` alongside scores. +- `GET /v2/admin/trace-items/{item_id}` returns per-item explain data, including `chunk_id` alongside scores. ## Rebuild and Indexing @@ -141,7 +141,7 @@ Add tests to cover: ## Spec Updates -Update `docs/spec/system_elf_memory_service_v1.md` to reflect: +Update `docs/spec/system_elf_memory_service_v2.md` to reflect: - Chunk embeddings as the source-of-truth vectors. - `note_embeddings` as derived pooled vectors. - New tables and search explain fields. diff --git a/docs/plans/2026-02-04-chunked-embeddings-implementation.md b/docs/plans/2026-02-04-chunked-embeddings-implementation.md index 21731ed1..87f560b0 100644 --- a/docs/plans/2026-02-04-chunked-embeddings-implementation.md +++ b/docs/plans/2026-02-04-chunked-embeddings-implementation.md @@ -13,6 +13,7 @@ ### Task 1: Add chunking config and validation **Files:** + - Modify: `packages/elf-config/src/types.rs` - Modify: `packages/elf-config/src/lib.rs` - Modify: `elf.example.toml` @@ -118,6 +119,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"config","summary":"Add ### Task 2: Add chunk tables and adjust schema **Files:** + - Create: `sql/tables/009_memory_note_chunks.sql` - Create: `sql/tables/010_note_chunk_embeddings.sql` - Modify: `sql/tables/004_memory_hits.sql` @@ -251,6 +253,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"storage","summary":"Add ### Task 3: Add chunking utilities and dependencies **Files:** + - Modify: `Cargo.toml` - Modify: `apps/elf-worker/Cargo.toml` - Create: `apps/elf-worker/src/chunking.rs` @@ -302,13 +305,13 @@ Create `apps/elf-worker/src/chunking.rs`: use unicode_segmentation::UnicodeSegmentation; use tokenizers::Tokenizer; -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub struct ChunkingConfig { pub max_tokens: u32, pub overlap_tokens: u32, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub struct Chunk { pub chunk_index: i32, pub start_offset: usize, @@ -381,6 +384,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"worker","summary":"Add ### Task 4: Implement chunk-first indexing in worker **Files:** + - Modify: `apps/elf-worker/src/worker.rs` - Modify: `packages/elf-storage/src/models.rs` - Modify: `packages/elf-storage/src/queries.rs` @@ -478,6 +482,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"worker","summary":"Inde ### Task 5: Update rebuild and search traces for chunks **Files:** + - Modify: `packages/elf-service/src/admin.rs` - Modify: `apps/elf-worker/src/worker.rs` - Modify: `sql/tables/006_search_traces.sql` @@ -524,6 +529,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"search","summary":"Rebu ### Task 6: Make search chunk-first and add note fetch endpoint **Files:** + - Modify: `packages/elf-service/src/search.rs` - Modify: `packages/elf-service/src/list.rs` - Create: `packages/elf-service/src/notes.rs` @@ -572,6 +578,7 @@ pub struct SearchItem { ``` Adjust search pipeline: + - Parse Qdrant payload for `chunk_id`, `chunk_index`, `start_offset`, `end_offset`. - Load chunk text from `memory_note_chunks` for snippet stitching. - Rerank chunk snippets (chunk + neighbors). @@ -608,6 +615,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"api","summary":"Return ### Task 7: Update specs and docs **Files:** + - Modify: `docs/spec/system_elf_memory_service_v1.md` - Modify: `docs/guide/integration-testing.md` diff --git a/docs/plans/2026-02-04-llm-cache-implementation-plan.md b/docs/plans/2026-02-04-llm-cache-implementation-plan.md index dd285cd0..5a5bd692 100644 --- a/docs/plans/2026-02-04-llm-cache-implementation-plan.md +++ b/docs/plans/2026-02-04-llm-cache-implementation-plan.md @@ -271,14 +271,14 @@ Expected: FAIL due to missing types and helper. Add payload structs and a validator that returns `Option<Vec<f32>>`. ```rust -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug serde::Serialize, serde::Deserialize)] struct RerankCacheItem { note_id: uuid::Uuid, updated_at: time::OffsetDateTime, score: f32, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug serde::Serialize, serde::Deserialize)] struct RerankCachePayload { items: Vec<RerankCacheItem>, } @@ -439,4 +439,3 @@ Expected: PASS (external integration tests may be ignored without Postgres/Qdran **Step 2: Summarize behavior changes** Document cache defaults, TTLs, and invalidation rules in the PR summary. - diff --git a/docs/plans/2026-02-04-search-explainability-design.md b/docs/plans/2026-02-04-search-explainability-design.md index 8897fcdf..d419303a 100644 --- a/docs/plans/2026-02-04-search-explainability-design.md +++ b/docs/plans/2026-02-04-search-explainability-design.md @@ -22,8 +22,9 @@ This design adds persistent, query-scoped explainability for search results whil - Traces are retained for `search.explain.retention_days` and cleaned by the worker. ## API -- `POST /v1/memory/search` response includes `trace_id`, `result_handle`, and `explain` with component scores and matches. -- `GET /v1/memory/search/explain?result_handle=...` returns the trace metadata plus the item explanation. +- `POST /v2/searches` response includes `trace_id`, per-item `result_handle`, and `explain` with component scores and matches. +- `GET /v2/admin/trace-items/{item_id}` returns the trace metadata plus the item explanation. +- `GET /v2/admin/traces/{trace_id}` returns the full trace metadata and items. ## Data Flow 1. Resolve scopes and expansion mode. diff --git a/docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md b/docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md new file mode 100644 index 00000000..35787537 --- /dev/null +++ b/docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md @@ -0,0 +1,75 @@ +# Trace-Based Ranking Harness: Next Steps + +## Context + +We have laid the groundwork for trace-based ranking evaluation: + +- Traces persist `top_k` items with `final_score` and explain breakdown. +- Optional persistence of the full `candidate_k` set is available via `search_trace_candidates`. +- Trace persistence supports `write_mode = "outbox" | "inline"` for production throughput vs evaluation ergonomics. +- `elf-eval` emits `trace_id` (and `trace_ids` for repeated runs) and supports request-scoped `ranking` overrides. + +This document records the next work to deliver a full, reproducible, policy-comparison loop. + +## Goal + +Provide a fast and reproducible harness that can: + +1. Load the exact candidate set from stored traces. +2. Recompute rankings for multiple policy variants on the same candidates. +3. Produce stable metrics and a machine-readable report for diffing and regression gates. + +## Non-Goals (For V1) + +- No web UI dashboard. +- No ML training (LTR). +- No “live” candidate retrieval re-execution for comparison (the source of truth is the stored candidate set). + +## Work Items + +1. Add a trace-based compare mode to `elf-eval`. + - Input options: + - A list of `trace_id`s. + - A dataset of queries that includes `trace_id` per query. + - Output: + - Stability metrics (top-k overlap, positional churn, set churn). + - Guardrails (retention of baseline retrieval rank 1–3, if available). + - Per-trace policy snapshot and per-item score decomposition. + +2. Implement a pure “re-rank from candidates” function in `elf-service` (library-only). + - Inputs: candidate rows (including retrieval rank and rerank score), config snapshot or override. + - Output: ordered results with the same explain schema (`search_ranking_explain/v2`). + - Constraints: + - Must not touch Qdrant, providers, or caches. + - Must be deterministic for a given input set. + +3. Add a stable `policy_id` derived from the policy snapshot. + - Compute a canonical JSON snapshot of policy parameters. + - Derive `policy_id` as a short hash (for example, `blend_v1:<hash>`). + - Store `policy_id` in trace config snapshot and explain outputs to enable automatic grouping. + +4. Ensure candidate capture is sufficient for planned ranking signals. + - Audit what future policies need (diversity, lexical overlap, hit reinforcement, decay). + - Add only the minimal additional columns required for policy recomputation. + - Avoid large JSON fields unless they are required for correctness. + +5. Define operational defaults for production vs evaluation. + - Production: + - `write_mode = "outbox"`. + - `capture_candidates = false` by default. + - Evaluation: + - `write_mode = "inline"` (no worker dependency). + - `capture_candidates = true` (for policy replay). + - If production capture is desired, add sampling (for example, 1%) and/or allowlist gates. + +## Acceptance Criteria + +- Given a fixed list of `trace_id`s, the harness can compare two policy variants and print stability deltas. +- Policy comparisons are reproducible without running Qdrant or external providers. +- The report includes enough detail to explain regressions (policy snapshot and per-term breakdown). + +## Risks / Open Questions + +- Storage growth if `capture_candidates` is enabled broadly in production. +- Some future signals may require additional inputs that are not currently persisted. +- Inline trace writes increase request latency and should remain evaluation-focused by default. diff --git a/docs/plans/2026-02-10-search-ranking-explain-v2-design.md b/docs/plans/2026-02-10-search-ranking-explain-v2-design.md new file mode 100644 index 00000000..06d27d2b --- /dev/null +++ b/docs/plans/2026-02-10-search-ranking-explain-v2-design.md @@ -0,0 +1,61 @@ +# Search Ranking Explain v2 (Additive Terms, v2-Only) + +## Goal +Replace the ad-hoc map-based ranking explain payload with a structured, versioned schema that is stable under iteration and supports reliable evaluation and replay. This change is intentionally breaking. Existing v1 explain payloads and historical trace items are not preserved. + +## Non-Goals +- Do not preserve backward compatibility with `search_ranking_explain/v1`. +- Provide a stage graph or non-additive scoring model. +- Expand retrieval or reranking behavior beyond the deterministic terms already tracked in issue work. + +## Summary +The ranking explain payload becomes `search_ranking_explain/v2` and is defined as an additive decomposition: + +- Invariant: `final_score == sum(terms[].value)`. +- Each term is a named scalar contribution. +- Term inputs are recorded only for persisted traces and evaluation, not in the hot-path search response. + +The implementation uses a single scoring path for live search and for trace replay to prevent drift. Tie-breaking rules are explicit so repeated runs are stable when floating-point comparisons are equal. + +## Schema +`SearchExplain` remains a two-part object: +- `match`: matched terms and fields. +- `ranking`: ranking breakdown. + +`ranking` (v2): +- `schema`: `"search_ranking_explain/v2"` +- `policy_id`: stable policy identifier used for grouping and comparison. +- `final_score`: final score used for sorting. +- `terms`: ordered list of `{ name, value, inputs? }` + +In search responses, `inputs` is omitted. In trace persistence and evaluation outputs, `inputs` is included for debugging and tuning. + +## Data Persistence +`search_trace_items.explain` stores the v2 explain payload as JSON. + +`search_trace_candidates` persists a `candidate_snapshot` JSON object that contains the minimum candidate fields required to replay ranking and compute deterministic terms without re-querying mutable database state. This supports future ranking signals without repeated schema churn. + +## Terms +The initial v2 term set mirrors the current additive score components: +- `blend.retrieval` +- `blend.rerank` +- `tie_breaker` +- `context.scope_boost` +- `deterministic.lexical_bonus` +- `deterministic.hit_boost` +- `deterministic.decay_penalty` + +Each term may record inputs, for example: weights, normalization kinds, ranks, overlap ratios, and hit statistics. + +## Determinism and Tie-Breaks +Sorting is stable and deterministic: +1. `final_score` (descending) +2. `retrieval_rank` (ascending) +3. `note_id` (ascending) +4. `chunk_id` (ascending) + +This ensures repeated runs and replay are consistent when scores collide. + +## Testing +- Unit tests for additive term bounds and schema stability. +- Trace replay tests ensure the explain schema matches v2 and policy IDs remain stable. diff --git a/docs/plans/2026-02-10-structured-memory-fields-design.md b/docs/plans/2026-02-10-structured-memory-fields-design.md new file mode 100644 index 00000000..ac896740 --- /dev/null +++ b/docs/plans/2026-02-10-structured-memory-fields-design.md @@ -0,0 +1,37 @@ +# Structured Memory Fields With Field-Level Embeddings + +## Goal +Improve semantic precision on fact-like queries by adding optional structured fields to notes (summary, facts, concepts), embedding them separately, and merging field matches back into a single note result with explicit explain output. + +This change is additive to the existing chunk-first retrieval design and does not require a graph database. + +## Data Model +Add a normalized structured-field table and a derived embedding table: + +- `memory_note_fields`: One row per note field item (`summary`, `fact`, `concept`) with `item_index` for ordering. +- `note_field_embeddings`: One embedding vector per field row and embedding version. This table is derived and must be rebuildable from Postgres data. + +The canonical human-readable note remains `memory_notes.text`. + +## Write Semantics +- `add_note` remains deterministic. Structured fields are optional input. When provided: + - `facts` must be evidence-bound deterministically (either a substring of the note text, or a substring of any `source_ref.evidence[].quote` strings when provided). +- `add_event` extractor output may include `structured`. Evidence binding remains strict: + - `facts` must be supported by the extracted evidence quotes. +- Structured field changes enqueue an indexing outbox `UPSERT` so the worker regenerates field embeddings. + +## Indexing +The worker embeds both chunk texts and structured field texts in the same embedding batch, then writes: +- chunk vectors to `note_chunk_embeddings` and pooled vectors to `note_embeddings` (existing behavior), +- field vectors to `note_field_embeddings` (new behavior). + +## Retrieval And Explain +Retrieval remains chunk-first via Qdrant hybrid search. In addition: +- Perform a Postgres vector search over `note_field_embeddings` to retrieve additional note candidates and record which fields matched (`summary`, `facts`, `concepts`). +- For field-only candidates, select a representative chunk via Postgres similarity over chunk embeddings so results remain chunk-shaped. + +Explain output includes `matched_fields` entries for matched structured fields. + +## Testing And Evaluation +- Unit tests cover structured-field validation and evidence binding for facts. +- Add a small evaluation dataset focused on fact-like queries and run `elf-eval` before/after enabling structured-field retrieval to compare precision and false positives. diff --git a/docs/plans/2026-02-22-org-shared-design.md b/docs/plans/2026-02-22-org-shared-design.md new file mode 100644 index 00000000..7b839bf4 --- /dev/null +++ b/docs/plans/2026-02-22-org-shared-design.md @@ -0,0 +1,118 @@ +# Org-Shared (Tenant-Wide) Semantics Design +Date: 2026-02-22 + +## Summary +This design defines `org_shared` as **tenant-wide shared memory** (organization scope) rather than a project-scoped variant of `team_shared`/`project_shared`. + +Because the current storage model and access controls are keyed on `(tenant_id, project_id, scope)`, this design implements tenant-wide `org_shared` by introducing an **org sentinel project** (`project_id="__org__"`) that holds all org-scoped notes and grants. Reads from any project are extended to include org-scoped notes in addition to the current project’s notes, while preserving explicit sharing via `memory_space_grants`. + +Writes to `org_shared` (ingest, publish, and grant management that can affect tenant-wide visibility) are gated behind `SecurityAuthRole::{Admin,SuperAdmin}` when `security.auth_mode="static_keys"`. + +## Goals +- Define `org_shared` semantics that are consistent across projects within a tenant. +- Preserve explicit sharing and auditability (no “implicit readability” without a grant). +- Avoid weakening isolation guarantees between projects. +- Minimize schema changes and blast radius by reusing existing tables and indexes. + +## Non-Goals +- Making `X-ELF-Project-Id` optional across the public HTTP API. +- Introducing agentless tokens for normal endpoints. +- Adding a full organization membership registry. +- Implementing moderation workflows for promoting notes into `org_shared` (can be added later). + +## Definitions +- **Tenant**: The top-level namespace keyed by `tenant_id`. +- **Project**: A sub-namespace keyed by `project_id` within a tenant. +- **Org sentinel project**: `project_id="__org__"`, reserved for tenant-wide (`org_shared`) storage. +- **team_shared**: Public API alias for internal `project_shared` (project-wide sharing). +- **org_shared**: Tenant-wide sharing, stored under the org sentinel project. + +## Current Constraints (Why this change is needed) +- Public HTTP request context requires `tenant_id`, `project_id`, and `agent_id`. +- Storage tables and grant tables require `project_id NOT NULL`. +- Shared grants are currently loaded by `(tenant_id, project_id, grantee_agent_id)` and treat `org_shared` as project-scoped. + +## Data Model +### Notes +- All notes continue to live in `memory_notes`. +- `org_shared` notes are stored with: + - `tenant_id = <tenant>` + - `project_id = "__org__"` + - `scope = "org_shared"` + +### Grants +- Grants continue to live in `memory_space_grants`. +- Grants for `org_shared` are stored with: + - `tenant_id = <tenant>` + - `project_id = "__org__"` + - `scope = "org_shared"` +- `grantee_kind="project"` in the org sentinel project is defined as **tenant-wide read access** (all agents that can make requests within the tenant). + +## API Semantics +### Reads (list/search/details) +When `org_shared` is included in the resolved allowed scopes for the request’s `read_profile`: +- Queries that currently filter by `(tenant_id, project_id)` are extended to include: + - `(tenant_id, project_id = <request project>)` and + - `(tenant_id, project_id = "__org__")` for `org_shared` only. + +This yields a hierarchical view: +- `agent_private`: only the caller’s agent_id, project-scoped. +- `team_shared`/`project_shared`: project-scoped. +- `org_shared`: tenant-wide (org sentinel project). + +### Writes +#### Ingest (add_note / add_event) +- If request scope is `org_shared`, the note is written to `project_id="__org__"` (not the caller’s project). +- If request scope is `project_shared` or `agent_private`, behavior is unchanged. + +#### Publish / Unpublish +- Publishing a note to `org_shared` moves the note to `project_id="__org__"` and sets `scope="org_shared"`. +- Publishing to `team_shared`/`project_shared` remains project-scoped and creates a project-wide grant as today. + +#### Grant management +- `org_shared` grant upsert/revoke/list operate on `project_id="__org__"` regardless of caller project. + +## Authorization +### Static keys (`security.auth_mode="static_keys"`) +- `org_shared` **writes** require `SecurityAuthRole::{Admin,SuperAdmin}`: + - ingest with `scope="org_shared"` + - publish/unpublish to `space="org_shared"` + - org_shared grant upsert/revoke +- `org_shared` reads are allowed for `User` tokens if the requested `read_profile` includes `org_shared` and an applicable grant exists (including org “project” grants). + +### Auth mode off (`security.auth_mode="off"`) +- Treated as a trusted localhost mode; role gating is not enforceable without an auth key. +- The service should remain usable for local testing; operational deployments should use `static_keys`. + +## Data Flow (Org Shared Read) +1. Resolve allowed scopes from `read_profile`. +2. Load shared read grants for the caller project. +3. If `org_shared` is allowed, also load shared read grants from the org sentinel project. +4. Execute list/search with a combined view of: + - project-scoped notes + - org-scoped notes (org sentinel project) +5. Apply `note_read_allowed` based on scope, status/ttl, and grants. + +## Migration +- New semantics require moving existing project-scoped `org_shared` notes and grants into the org sentinel project. +- Provide a one-time SQL migration script and document operational steps: + - Update `memory_notes.project_id` to `"__org__"` where `scope="org_shared"`. + - Update `memory_space_grants.project_id` to `"__org__"` where `scope="org_shared"`. + +## Testing +- Add acceptance tests that demonstrate cross-project visibility: + - Create an `org_shared` note (admin write). + - Verify an agent in a different project can retrieve it via list/search when `org_shared` is allowed. + - Verify revocation removes visibility. +- Add negative tests: + - `User` token cannot ingest/publish/grant for `org_shared` in `static_keys` mode. + +## Risks and Mitigations +- **Accidental tenant-wide publication**: mitigated by Admin/SuperAdmin write gating. +- **Back-compat**: existing `org_shared` data needs migration; include explicit operator runbook and a rollback plan (restore prior project_id values from backups). +- **Confusion over “project” grantee_kind in org scope**: mitigate via explicit spec wording and tests. + +## Open Questions +- Should `org_shared` reads require Admin role (stricter) or remain user-readable when granted? (Current design: user-readable when granted.) +- Should we add an explicit `grantee_kind="tenant"` in the future to avoid overloading `project`? (Deferred.) + diff --git a/docs/plans/2026-02-22-org-shared-implementation-plan.md b/docs/plans/2026-02-22-org-shared-implementation-plan.md new file mode 100644 index 00000000..0bdcaf0f --- /dev/null +++ b/docs/plans/2026-02-22-org-shared-implementation-plan.md @@ -0,0 +1,156 @@ +# Org-Shared (Tenant-Wide) Semantics Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Implement tenant-wide `org_shared` semantics using an org sentinel project (`project_id="__org__"`), including cross-project reads and Admin/SuperAdmin-gated writes in `static_keys` mode. + +**Architecture:** Treat `tenant_id` as the org boundary. Store all `org_shared` notes and grants under a reserved `project_id="__org__"`. Read paths union org + project scopes; write paths route org_shared to the org sentinel and enforce role-based gating at the HTTP layer. + +**Tech Stack:** Rust (axum, sqlx), Postgres, existing ELF config/auth (`SecurityAuthRole`). + +--- + +### Task 1: Introduce the org sentinel constant and reservation rules + +**Files:** +- Modify: `packages/elf-service/src/access.rs` +- Modify: `packages/elf-service/src/sharing.rs` +- (Optional) Modify: `docs/spec/system_elf_memory_service_v2.md` + +**Step 1: Add a single source of truth constant** +- Add `const ORG_PROJECT_ID: &str = "__org__";` in a shared module used by access + sharing (pick the lowest-impact existing module; avoid creating new crates). + +**Step 2: Document reservation** +- Add a short note in the spec that `__org__` is reserved and not a user project id. + +**Step 3: Verify** +- Run: `cargo make test-rust` +- Expected: PASS + +**Step 4: Commit (optional)** +```bash +git add packages/elf-service/src/access.rs packages/elf-service/src/sharing.rs docs/spec/system_elf_memory_service_v2.md +git commit -m '{"schema":"cmsg/1","type":"feat","scope":"sharing","summary":"Define org sentinel project id","intent":"Add a reserved project id for org_shared storage","impact":"Centralizes __org__ constant for later org_shared semantics","breaking":false,"risk":"low","refs":[]}' +``` + +### Task 2: Propagate auth role to request handling (static_keys mode) + +**Files:** +- Modify: `apps/elf-api/src/routes.rs` +- Add tests: `apps/elf-api/tests/http.rs` + +**Step 1: Add role propagation mechanism** +- In `api_auth_middleware`, after resolving `key`, attach `key.role` to the request for downstream handlers. + - Preferred: `req.extensions_mut().insert(key.role);` + - Avoid: new public headers (keep role server-side). + +**Step 2: Add helper to require Admin for org_shared writes** +- Implement `fn require_admin_for_org_shared(role: Option<&SecurityAuthRole>, ...) -> Result<(), ApiError>` +- Call it from endpoints that can write org_shared: + - notes ingest (`/v2/notes/ingest`) when `scope == "org_shared"` + - events ingest (`/v2/events/ingest`) when `scope == Some("org_shared")` + - publish/unpublish when `space == "org_shared"` + - grant upsert/revoke for `space == "org_shared"` + +**Step 3: Tests** +- Add tests that a `User` key cannot org_shared ingest/publish/grant. +- Add tests that an `Admin` key can org_shared ingest/publish/grant. + +**Step 4: Verify** +- Run: `cargo make test-rust` +- Expected: PASS + +### Task 3: Route org_shared writes to the org sentinel project + +**Files:** +- Modify: `apps/elf-api/src/routes.rs` +- Modify: `packages/elf-service/src/add_note.rs` +- Modify: `packages/elf-service/src/add_event.rs` +- Modify: `packages/elf-service/src/sharing.rs` +- Add tests: `packages/elf-service/tests/acceptance/suite.rs` (or a new acceptance test module under `packages/elf-service/tests/acceptance/`) + +**Step 1: Ingest routing** +- When request scope is `org_shared`, replace `project_id` passed to the service with `ORG_PROJECT_ID`. + +**Step 2: Publish routing** +- When publishing/unpublishing `space == "org_shared"`, operate against the org sentinel project id for: + - the note lookup + - the scope update + - the grant creation + +**Step 3: Add a tenant-wide project grant on org publish** +- Ensure that publishing to org_shared creates a grant row with: + - `tenant_id=<tenant>` + - `project_id="__org__"` + - `scope="org_shared"` + - `grantee_kind="project"` + - `space_owner_agent_id=<note owner>` + +**Step 4: Cross-project acceptance test** +- Setup: + - tenant: `t` + - project A: `a` + - project B: `b` + - admin agent: `admin1` + - user agent in B: `user2` +- Flow: + 1) Ingest a private note as `admin1` in project A. + 2) Publish it to `org_shared` (admin role). + 3) Search/list from project B as `user2` with `read_profile` that includes `org_shared`. + 4) Assert the note is visible. + +**Step 5: Verify** +- Run: `cargo make test-rust` +- Expected: PASS + +### Task 4: Extend read paths to include org_shared across projects + +**Files:** +- Modify: `packages/elf-service/src/access.rs` +- Modify: `packages/elf-service/src/list.rs` +- Modify: `packages/elf-service/src/search.rs` +- Modify: `packages/elf-service/src/progressive_search.rs` + +**Step 1: Load org grants in addition to project grants** +- If allowed scopes include `org_shared`, call `load_shared_read_grants(..., project_id="__org__", ...)` and union with project grants. + +**Step 2: Extend note queries** +- For list/search queries that currently filter by `project_id = $project`, extend to: + - include org notes (`project_id="__org__"`) for `scope="org_shared"` only + - avoid accidentally including `agent_private` from org sentinel (should not exist). + +**Step 3: Verify** +- Run: `cargo make test-rust` +- Expected: PASS + +### Task 5: Operational migration script + runbook + +**Files:** +- Add: `sql/migrate_org_shared_to_org_project.sql` +- Modify: `docs/spec/system_elf_memory_service_v2.md` + +**Step 1: Add a migration SQL script** +- Write a safe, explicit script that moves existing `org_shared` rows into the org sentinel project: + - `UPDATE memory_notes SET project_id='__org__' WHERE scope='org_shared' AND project_id <> '__org__';` + - `UPDATE memory_space_grants SET project_id='__org__' WHERE scope='org_shared' AND project_id <> '__org__';` +- Include a `BEGIN; ... COMMIT;` wrapper and a `SELECT count(*)` before/after. + +**Step 2: Document runbook + rollback** +- Document: + - pre-checks (backups, counts) + - how to run the script + - rollback expectations (restore from backup; optional reverse-update if previous mapping recorded) + +**Step 3: Verify** +- Run: `cargo make test-rust` +- Expected: PASS + +--- + +Plan complete and saved to `docs/plans/2026-02-22-org-shared-implementation-plan.md`. + +Two execution options: +1) **Subagent-Driven (this session)** — execute tasks one-by-one with review checkpoints +2) **Parallel Session (separate)** — open a new session and execute with `executing-plans` checkpoints + +Which approach do you want? diff --git a/docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md b/docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md new file mode 100644 index 00000000..8ffbf71a --- /dev/null +++ b/docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md @@ -0,0 +1,129 @@ +# Agent Memory (MCP + Skills) Backlog +Date: 2026-02-23 + +## Summary +This document captures backlog issues for making ELF maximally usable as an AI-agent memory system when the primary integration surface is MCP. + +The key product gap is long-form memory usability: store compact, evidence-linked facts in ELF while referencing long documents via pointers and hydrating relevant excerpts on demand. + +## Goals +- Support long-form memory via doc pointers + on-demand hydration while keeping ELF notes compact. +- Make multi-agent / multi-brain shared memory operable via MCP (not HTTP-only). +- Provide reference “skills” (agent-side workflows) so different agents behave consistently. +- Preserve ELF invariants: explicit scopes, explicit sharing grants, auditability, and rebuildable derived indexes. + +## Non-Goals +- Turning ELF into a general-purpose document warehouse (unless explicitly decided later). +- Removing the English-only boundary in the v2 contract (treat non-English as an upstream canonicalization concern for now). +- Shipping a full hosted managed service offering. + +## Backlog Issues + +### Issue 1: Expose sharing + grants management via MCP +Problem: The HTTP API has publish/unpublish and grant management endpoints, but MCP does not expose them. This prevents “MCP-only” agents from operating shared memory. + +Proposed MCP tools: +- `elf_notes_publish` +- `elf_notes_unpublish` +- `elf_space_grants_list` +- `elf_space_grant_upsert` +- `elf_space_grant_revoke` + +Acceptance criteria: +- Tools forward to the corresponding HTTP endpoints. +- Tools respect server-side auth and context headers (tenant/project/agent/read_profile). +- Add basic end-to-end tests for MCP tool registration + request forwarding. + +### Issue 2: Define a versioned `source_ref` schema for doc pointers +Problem: `source_ref` is required and flexible, but without a standard schema downstream agents cannot reliably hydrate documents. + +Proposed `source_ref` shape (v0): +- `kind`: `"doc_pointer"` +- `schema_version`: `"0"` +- `doc_id`: stable identifier +- `uri`: optional canonical location +- `content_hash`: strong hash of canonical bytes (or normalized text) +- `title`: optional +- `mime_type`: optional +- `locator`: optional section/span pointer (e.g. `{ "section": "...", "start": 123, "end": 456 }`) +- `access`: optional hint for how to fetch (e.g. `"s3" | "http" | "local_fs"`) + +Acceptance criteria: +- Add a spec/guide page describing the schema and forward/backward compatibility rules. +- Provide at least one reference implementation of encoding/decoding in an agent-side “skill”. + +### Issue 3: Add a document hydration component (Doc Store and/or Doc MCP) +Problem: ELF intentionally stores compact notes; long documents need a canonical store that can return excerpts safely and cheaply. + +Options: +- A) Separate “doc store” service with its own MCP (`doc-mcp`) and a small set of tools: + - `doc_put(doc_bytes, metadata) -> {doc_id, content_hash}` + - `doc_get(doc_id) -> bytes` (or streaming) + - `doc_excerpt(doc_id, locator | query) -> excerpt(s)` +- B) Extend ELF HTTP API and MCP to include document endpoints (higher coupling). + +Acceptance criteria: +- Clear ownership of document durability and access control. +- Deterministic excerpting rules (max bytes, max excerpts, stable locators). +- Integration example showing: ingest long doc -> write ELF pointers -> search -> hydrate excerpts. + +### Issue 4: Ship a “skills cookbook” (reference agent workflows) +Problem: Without standardized workflows, different agents will write inconsistent notes, misuse scopes, and fail to hydrate long-form context. + +Proposed skills (agent-side workflows, not server responsibilities): +- `doc_ingest`: long doc -> doc store -> extract compact facts -> write notes with `source_ref`. +- `hydrate_context`: interpret `source_ref` -> fetch excerpt(s) -> progressive disclosure injection. +- `memory_write_policy`: decide add_note vs add_event, keys, scope selection, and update vs ignore. +- `share_workflow`: publish/unpublish + grants management (project/org sharing). +- `reflect_consolidate`: periodic consolidation of episodic events into stable profiles/decisions/constraints. + +Acceptance criteria: +- A small set of runnable examples (or pseudocode + prompt templates) that only require MCP connectivity. +- Guidance on safe defaults (no secrets, evidence rules, TTL expectations). + +### Issue 5: Reflection / consolidation loop (human-like “memory formation”) +Problem: Brain-like memory is not just storage + retrieval; it needs consolidation and conflict resolution over time. + +Proposed approach: +- Implement as an operator- or scheduler-driven job (agent-side), not inside ELF core. +- Inputs: recent events, high-hit notes, conflicting keys, nearing TTL items. +- Outputs: a small number of updated stable notes (decisions/constraints/profile) with explicit provenance and keys. + +Acceptance criteria: +- A deterministic policy surface for what gets consolidated (thresholds, caps, key strategy). +- Evaluation harness scenario(s) that demonstrate reduced context size with preserved correctness. + +### Issue 6: Standardize provenance + observability surfaces +Problem: Auditable memory requires consistent provenance and trace correlation across ingest, retrieval, and hydration. + +Proposed work: +- Define a provenance mapping for `source_ref` and note evolution (versioning, updates, deprecations). +- Add OpenTelemetry-compatible tracing around ingest/search flows (at least span + request IDs). + +Acceptance criteria: +- Operators can answer: “Where did this memory come from?” and “Why was it retrieved?” with stable identifiers. + +### Issue 7: Multi-language strategy (English-only boundary vs product reality) +Problem: The v2 contract is English-only; many real deployments are multi-language. + +Proposed approach (near-term): +- Keep ELF contract unchanged. +- Add upstream canonicalization in skills (translate/summarize to English + preserve original text in doc store). + +Acceptance criteria: +- Clear guidance and examples for non-English (including Chinese) user inputs: how to store original, how to store English facts, how to hydrate both. + +## Open Questions (To Resolve Before Implementation) +1) Doc store choice: S3/object storage vs Postgres large fields vs dedicated document service. +2) Multi-language requirement: is Chinese-first a product requirement, or is English-only acceptable for v2? +3) Can agents connect to multiple MCP servers (e.g., `elf-mcp` + `doc-mcp`), or must everything be behind `elf-mcp`? + +## Research Notes (External References) +- Retrieval-Augmented Generation (RAG): https://arxiv.org/abs/2005.11401 +- MemGPT (tiered “virtual context” memory): https://arxiv.org/abs/2310.08560 +- Generative Agents (memory stream + reflection loop): https://arxiv.org/abs/2304.03442 +- BEIR benchmark (retrieval families + robustness): https://arxiv.org/abs/2104.08663 +- Reciprocal Rank Fusion (RRF): https://dl.acm.org/doi/10.1145/1571941.1572114 +- Transactional outbox pattern: https://microservices.io/patterns/data/transactional-outbox.html +- W3C PROV-DM provenance model: https://www.w3.org/TR/prov-dm/ +- OpenTelemetry tracing spec: https://opentelemetry.io/docs/specs/otel/trace/ diff --git a/docs/plans/2026-02-24-doc-ext-v1-design.md b/docs/plans/2026-02-24-doc-ext-v1-design.md new file mode 100644 index 00000000..6f54e8c7 --- /dev/null +++ b/docs/plans/2026-02-24-doc-ext-v1-design.md @@ -0,0 +1,172 @@ +# Doc Extension v1 (Evidence Store) — Design + +**Status:** Approved (v1 scope locked) + +## Goal + +Provide an ELF Extension for long-form evidence storage and retrieval that: + +- Stores English-only documents in Postgres (source of truth). +- Builds a derived Qdrant index for retrieval (dense + BM25). +- Supports progressive disclosure (L0 discovery; L1/L2 bounded excerpts). +- Returns verifiable excerpts (selectors + hashes + verified flag), enabling facts-first workflows. + +## Non-goals (v1) + +- No public library (tenant_public or cross-tenant global public). Tracked separately (deferred). +- No translation or multilingual retrieval. +- No LLM query expansion for doc search. +- No heavy reranking or “full search platform” feature set (analytics, entity extraction, etc.). + +## Core vs Extension boundary + +- **ELF Core** remains facts-first memory (short notes; advanced retrieval; expansion/fusion/rerank as needed). +- **Doc Extension v1** is an evidence store with minimal retrieval and bounded hydration. + - Search exists only as `docs_search_l0` for discovery/backfill/debug. + - All “real evidence reading” happens via `docs_excerpts_get`. + +## Scope model (tenant-internal only) + +Doc uses the same scope labels as Core memory: + +- `agent_private` +- `project_shared` (aka `team_shared` externally) +- `org_shared` (stored under reserved `project_id = "__org__"`) + +Shared visibility is controlled via explicit grants. The v1 implementation reuses the existing shared-grants semantics (project/agent grants) so that: + +- `project_shared` supports intra-project sharing. +- `org_shared` supports intra-tenant, cross-project sharing. + +## English-only boundary + +All Doc text inputs must satisfy the English gate (Core policy). Doc v1 does not translate. + +## Storage: Postgres (SoT) + +### Entities + +- **Document** + - `doc_id` (uuid) + - `tenant_id`, `project_id`, `agent_id`, `scope` + - `title` (optional) + - `source_ref` (optional json) + - `content` (text) + - `content_hash` (blake3 hex of raw UTF-8 bytes) + - `content_bytes` (bytes length) + - timestamps, status + +- **Chunk** + - `chunk_id` (uuid) + - `doc_id` (fk) + - `chunk_index` (0..) + - `start_offset`, `end_offset` (byte offsets in UTF-8 `content`) + - `chunk_text` (text) + - `chunk_hash` (blake3) + +- **Chunk embedding (SoT for rebuild)** + - `chunk_id`, `embedding_version`, `embedding_dim` + - `vec` (pgvector vector(VECTOR_DIM)) + +### Limits (defaults; configurable) + +- `docs_put.max_doc_bytes = 4 MiB` (2^22) +- Chunking: + - `target_bytes = 2048` + - `overlap_bytes = 256` + - `max_chunks_per_doc = 4096` (2^12) +- Excerpts: + - `L1.max_bytes = 8 KiB` (2^13) + - `L2.max_bytes = 32 KiB` (2^15) +- Search: + - `docs_search_l0.top_k_max = 32` (2^5) + +## Derived index: Qdrant + +Doc Extension v1 uses a dedicated Qdrant collection for doc chunks. + +- Point id = `chunk_id` +- Vectors: + - `dense`: float32 embedding vector + - `bm25`: `Document(text, model="qdrant/bm25")` +- Payload includes: `doc_id`, `chunk_id`, `chunk_index`, offsets, `tenant_id`, `project_id`, `agent_id`, `scope`, `status`, `updated_at`, `embedding_version`, `content_hash`, `chunk_hash` + +This supports deterministic, model-free lexical retrieval (BM25) without storing SPLADE-like sparse vectors. + +## Indexing consistency: transactional outbox + +Doc ingestion enqueues indexing jobs in Postgres (outbox) in the same transaction as document persistence. + +Worker processes doc outbox jobs (at-least-once): + +- `UPSERT`: embed chunk text, store embedding in PG, upsert point to Qdrant doc collection +- `DELETE`: delete points by doc_id or chunk_ids + +All operations must be idempotent. + +## Retrieval & progressive disclosure + +### L0: discovery (`docs_search_l0`) + +Inputs: +- `query` (English-only) +- filters: `scope`, (optional) `status`, `doc_type` (future), time bounds (future) +- `top_k` (<= 32) +- `candidate_k` (<= 1024) + +Behavior: +- Embed query text (dense) +- Run Qdrant fusion query: dense prefetch + bm25 prefetch; final query fusion = RRF +- Return L0 items: pointers + tiny preview snippet + minimal metadata +- Do not return large excerpts + +### L1/L2: hydration (`docs_excerpts_get`) + +Inputs: +- `doc_id` +- selector: + - `chunk_id` and optional local offsets, or + - `TextQuoteSelector` (exact + prefix + suffix), and optional `TextPositionSelector` (start/end) +- `level = L1|L2` + +Behavior: +- Load authoritative `content` (PG) +- Resolve selector: + - Prefer TextQuoteSelector match; fallback to TextPositionSelector when provided +- Extract bounded window: + - L1: <= 8 KiB + - L2: <= 32 KiB +- Return excerpt + verification signals (below) + +## Verification contract (v1) + +Every excerpt response must include: + +- `locator` (the selector used / resolved) +- `hashes` (at least `content_hash` and `excerpt_hash`, blake3 hex) +- `verified: bool` +- `verification_errors: []` + +Rules: +- If selector resolution fails or hashes mismatch: `verified=false`. +- Agents should treat `verified=false` excerpts as best-effort and avoid using them as hard evidence. + +Cryptographic signing may be added later; v1 requires hash+selector verification only. + +## API & MCP surface + +HTTP (Extension endpoints): + +- `POST /v2/docs` → `docs_put` +- `GET /v2/docs/{doc_id}` → `docs_get` (metadata-first) +- `POST /v2/docs/search/l0` → `docs_search_l0` +- `POST /v2/docs/excerpts` → `docs_excerpts_get` + +MCP (single surface via `elf-mcp`): + +- `elf_docs_put` +- `elf_docs_get` +- `elf_docs_search_l0` +- `elf_docs_excerpts_get` + +If Doc Extension is disabled/unconfigured, tools must fail closed with explicit, stable error codes. diff --git a/docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md b/docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md new file mode 100644 index 00000000..15ffebea --- /dev/null +++ b/docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md @@ -0,0 +1,180 @@ +# Doc Extension v1 (Evidence Store) Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Implement Doc Extension v1: PG-backed document store + doc chunk indexing outbox + Qdrant derived index (dense + BM25) + L0/L1/L2 retrieval endpoints exposed via HTTP and MCP. + +**Architecture:** Add new PG tables for docs/chunks/embeddings/outbox, add worker pipeline to index doc chunks into a dedicated Qdrant collection, implement minimal Doc APIs and MCP tools. Reuse existing scope + grants semantics and the existing outbox/worker patterns. + +**Tech Stack:** Rust (axum, sqlx), Postgres (+ pgvector), Qdrant, MCP (elf-mcp). + +--- + +### Task 1: Add Doc schema tables (PG) + +**Files:** +- Create: `sql/tables/025_doc_documents.sql` +- Create: `sql/tables/026_doc_chunks.sql` +- Create: `sql/tables/027_doc_chunk_embeddings.sql` +- Create: `sql/tables/028_doc_indexing_outbox.sql` +- Modify: `sql/init.sql` +- Modify: `packages/elf-storage/src/schema.rs` + +**Step 1: Create SQL tables** +- Define doc/chunk tables with scope checks, hash fields, and indexes for lookup by `(tenant_id, project_id, scope, status)`. +- Add chunk offsets and hashes. +- Add doc outbox table (chunk_id + op + embedding_version + retry fields). + +**Step 2: Wire tables into schema renderer** +- Include `\\ir` entries in `sql/init.sql`. +- Add `include_str!` matches in `packages/elf-storage/src/schema.rs`. + +**Step 3: Run formatting / tests** +- Run: `cargo make fmt` +- Run: `cargo make test` + +**Step 4: Commit** +- Add all new SQL files + schema include changes. + +--- + +### Task 2: Add storage models + queries for Doc (PG) + +**Files:** +- Modify: `packages/elf-storage/src/models.rs` +- Create: `packages/elf-storage/src/docs.rs` +- Modify: `packages/elf-storage/src/lib.rs` + +**Step 1: Add Rust models** +- Add structs for doc document, doc chunk, doc chunk embedding, doc outbox job. + +**Step 2: Add PG queries** +- Insert doc + chunks transactionally. +- Fetch doc metadata and content by id (authoritative for hydrate). +- Fetch chunks by doc_id / chunk_id. +- Outbox: claim next doc job, mark done/failed with backoff. + +**Step 3: Add unit tests (pure logic only)** +- Hash computation and bounds helpers (no DB required). + +**Step 4: Run tests + commit** +- Run: `cargo make test` +- Commit. + +--- + +### Task 3: Extend config for Qdrant docs collection + +**Files:** +- Modify: `packages/elf-config/src/types.rs` +- Modify: `packages/elf-config/src/lib.rs` +- Modify: `elf.example.toml` + +**Step 1: Add `docs_collection`** +- Add `docs_collection: String` to Qdrant config with default `doc_chunks_v1`. + +**Step 2: Validate config** +- Ensure non-empty and printable. + +**Step 3: Run tests + commit** +- Run: `cargo make test` +- Commit. + +--- + +### Task 4: Add worker pipeline for Doc outbox → Qdrant docs collection + +**Files:** +- Modify: `apps/elf-worker/src/worker.rs` +- Modify: `packages/elf-storage/src/qdrant.rs` (construct store for docs collection) + +**Step 1: Add `docs_qdrant` store** +- Instantiate a second QdrantStore using `cfg.storage.qdrant.docs_collection`. + +**Step 2: Process doc outbox jobs** +- `UPSERT`: load chunk text, embed, store embedding in PG, upsert Qdrant doc chunk point (dense+bm25). +- `DELETE`: delete doc chunk points by chunk_id/doc_id. + +**Step 3: Run unit tests** +- Add small tests for payload shape helpers (no external PG/Qdrant). + +**Step 4: Commit** + +--- + +### Task 5: Implement Doc service methods (docs_put/docs_get/docs_search_l0/docs_excerpts_get) + +**Files:** +- Create: `packages/elf-service/src/docs.rs` +- Modify: `packages/elf-service/src/lib.rs` + +**Step 1: docs_put** +- Validate request size (<= 4 MiB) and English gate. +- Deterministically chunk content (2048/256). +- Persist doc+chunks, enqueue doc outbox jobs for chunks. +- If scope is shared, ensure project grant (project_shared) or org grant (org_shared in `__org__`). + +**Step 2: docs_get** +- Return metadata + content_hash + bytes; omit full content by default. + +**Step 3: docs_search_l0** +- Embed query once. +- Run Qdrant fusion query against docs collection with filters for tenant/project scope. +- Return L0 results: doc_id/chunk_id + tiny snippet + metadata handles. + +**Step 4: docs_excerpts_get** +- Resolve selector (quote preferred, position fallback). +- Enforce L1/L2 byte bounds and return excerpt + verification signals. + +**Step 5: Tests** +- Pure logic tests for selector resolution + bounds + hashing. +- Integration tests can be ignored when external PG/Qdrant not configured (mirror existing acceptance style). + +**Step 6: Commit** + +--- + +### Task 6: Wire HTTP endpoints in elf-api + +**Files:** +- Modify: `apps/elf-api/src/routes.rs` + +**Step 1: Add routes** +- `POST /v2/docs` +- `GET /v2/docs/{doc_id}` +- `POST /v2/docs/search/l0` +- `POST /v2/docs/excerpts` + +**Step 2: Validate request bytes and headers** +- Reuse existing request size guards and context headers. + +**Step 3: Commit** + +--- + +### Task 7: Expose MCP tools via elf-mcp (single surface, per decision A) + +**Files:** +- Modify: `apps/elf-mcp/src/server.rs` + +**Step 1: Add tools** +- `docs_put`, `docs_get`, `docs_search_l0`, `docs_excerpts_get` + +**Step 2: Ensure fail-closed behavior when disabled** +- If extension is disabled by config, return explicit error payload. + +**Step 3: Commit** + +--- + +### Task 8: Verification + regression checks + +**Step 1: Run full test suite** +- Run: `cargo make test` + +**Step 2: Manual smoke (optional)** +- Start services and run a minimal docs_put → docs_search_l0 → docs_excerpts_get flow. + +**Step 3: Push** +- Push directly to `main` (per user preference). + diff --git a/docs/plans/2026-02-25-agent-skills-cookbook-design.md b/docs/plans/2026-02-25-agent-skills-cookbook-design.md new file mode 100644 index 00000000..29b73aaf --- /dev/null +++ b/docs/plans/2026-02-25-agent-skills-cookbook-design.md @@ -0,0 +1,68 @@ +# Agent Skills Cookbook (MCP-first) — Design + +Status: Proposed +Date: 2026-02-25 + +## Problem + +ELF is used primarily via MCP, but without reference agent-side workflows, different agents: + +- Write inconsistent note shapes, keys, scopes, and TTLs. +- Fail to use facts-first + evidence hydration correctly (either storing long text in notes or failing to hydrate supporting evidence). +- Drift on sharing/grants workflows, reducing multi-agent interoperability. + +## Goal + +Ship a non-normative "skills cookbook" that standardizes how an agent should use ELF via MCP: + +- Facts-first memory in Core (short notes). +- Long-form evidence via Doc Extension v1 (store documents; hydrate bounded excerpts on demand). +- Multi-agent sharing through explicit scopes and grants. + +This cookbook is a guide/playbook, not a system contract. It must not change ELF Core semantics. + +## Core vs Skills contract + +### MCP (capability + invariants) + +MCP tools must remain a thin forwarding layer to ELF HTTP endpoints and must not contain policy. +All hard guarantees are enforced server-side (elf-api/elf-service), including: + +- English-only boundary enforcement. +- ACL/tenancy/scope access. +- Size limits and caps. +- Idempotency and safe retry behavior (where supported). +- Auditability and provenance surfaces exposed by the API. + +### Skills (policy + workflow) + +Skills define agent-side workflows and policies, such as: + +- What to remember vs ignore, and how to normalize content into compact facts. +- When to store long evidence in Doc and attach pointers in note `source_ref`. +- When to hydrate evidence and how to progressively disclose (L0 -> L1 -> L2). +- How to choose scope, keys, TTLs, and how to consolidate/refresh memories over time. + +## Deliverable + +Add a single guide document: + +- `docs/guide/agent_skills_cookbook.md` + +It should include: + +1. A short "MCP vs Skills" contract and failure modes. +2. Reference workflows: + - doc_ingest + - hydrate_context + - memory_write_policy + - share_workflow + - reflect_consolidate +3. Copy-pastable MCP tool-call JSON examples (English-only). + +## Non-goals + +- No new server features or new endpoints (this is documentation only). +- No changes to normative specs. +- No attempt to ship a general-purpose doc/search platform in Core. + diff --git a/docs/plans/2026-02-25-ci-services-checks-design.md b/docs/plans/2026-02-25-ci-services-checks-design.md new file mode 100644 index 00000000..92b8765d --- /dev/null +++ b/docs/plans/2026-02-25-ci-services-checks-design.md @@ -0,0 +1,78 @@ +# CI Service-Backed Checks Design + +**Date:** 2026-02-25 + +## Goal + +Make service-backed verification (Postgres + Qdrant) a first-class, always-on check for changes that can affect retrieval correctness, while keeping the heavier harness signals as nightly-only trend indicators. + +## Context + +Today the repository already runs: + +- Fast checks on PR/push/merge queue: `.github/workflows/language.yml` and `.github/workflows/quality.yml`. +- Service-backed integration tests on a schedule: `.github/workflows/integration.yml` (daily). +- Service-backed harness scripts on a schedule: `.github/workflows/nightly-harness-signals.yml` (nightly). + +Local developer guidance for service-backed testing lives in: + +- `docs/guide/integration-testing.md` +- `docs/guide/testing.md` + +## Requirements + +- Do not rely on external providers or secrets for correctness checks. +- Run service-backed checks on both: + - `pull_request` (fast feedback for contributors) + - `merge_group` (merge queue parity) +- Avoid running on docs-only changes. +- Ensure we can run the full Rust test surface, including ignored tests that require services, without leaving coverage gaps. +- Keep the heavier harness scripts (trend/signal) separate from gating checks. + +## Non-goals + +- Do not build a full “retrieval quality platform” in CI. +- Do not add provider-backed LLM/embedding calls to required checks. +- Do not change ranking logic or memory semantics as part of this work. + +## Design + +### 1) Always-on integration tests with services + +Update `.github/workflows/integration.yml` to run on PR and merge queue (in addition to schedule + manual). + +In this workflow, run the full workspace test suite including ignored tests: + +- `cargo make test-rust-all` + +Rationale: + +- This makes “ignored tests” a convention for “requires services”, not “unexecuted”. +- It keeps the “no skipped tests” expectation enforceable in CI. + +### 2) Always-on E2E harness (lightweight) + +Add a new workflow to run the lightweight, deterministic E2E harness: + +- `cargo make test-e2e` (which runs `scripts/context-misranking-harness.sh`) + +Key properties: + +- Uses local deterministic providers (`local-hash`, `local-token-overlap`). +- Uses Postgres + Qdrant services only. +- Produces clear pass/fail semantics and can upload logs on failure. + +### 3) Keep “harness signals” nightly-only + +Do not change `.github/workflows/nightly-harness-signals.yml` scope: it remains nightly + manual and continues to upload artifacts. This job can evolve independently without becoming a hard merge gate. + +## Acceptance criteria + +- `Integration Tests` runs on: + - `pull_request`, `merge_group`, `schedule`, `workflow_dispatch` +- `Integration Tests` runs with `--run-ignored all` and succeeds on `main`. +- A new E2E workflow runs on: + - `pull_request`, `merge_group`, `workflow_dispatch` +- E2E job starts Postgres + Qdrant via GitHub Actions services and successfully runs `cargo make test-e2e` without external secrets. +- Both workflows use `paths-ignore` for docs-only changes (`docs/**`, `**/*.md`, `.gitignore`). +- Local docs reflect the updated meaning of “E2E harness” vs “nightly harness signals”. diff --git a/docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md b/docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md new file mode 100644 index 00000000..b2b84bce --- /dev/null +++ b/docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md @@ -0,0 +1,48 @@ +# Reflection & Consolidation Loop: Evaluation Scenarios + +## Decision + +For the reflection/consolidation loop track we define consolidation as an **agent-side policy** and keep **scoring and API behavior as server-side capability**. + +The agent decides when to consolidate (`query + merge policy`), while `elf-api`/`elf-worker` only provide: + +- append and update semantics, +- duplicate de-duplication rules when configured by service config, +- query retrieval/search behavior, +- and deterministic evaluation primitives for measuring outcomes. + +This keeps consolidation policies under LLM-agent control (and easy to evolve) without introducing a separate long-lived service. + +## Tradeoff + +- **Pros** + - Faster product iteration: policy thresholds, scoring windows, and trigger conditions can change per-agent workflow without backend deployment. + - Better portability: consolidation behavior can be reused by different local agents with minimal API changes. + - Smaller server surface: only stable capabilities and guarantees stay in the shared API. +- **Cons** +- Additional policy logic in clients increases implementation variance across agents. +- Requires explicit evaluation to prevent silent regressions when policies change. + +## Evaluation Scenario + +### Consolidation stability scenario + +Problem: a single logical key has multiple noisy legacy notes. Before consolidation, query results are spread; after deduplication and creation of one canonical note, retrieval should become both more stable and more deterministic. + +Harness behavior: + +- ingest 3 duplicate notes with key `incident_merge_protocol` and distractor notes, +- run `elf-eval` with dataset query expectation by `expected_keys`, +- perform a consolidation action (delete duplicates, ingest canonical stable note), +- run the same query again. + +Success signal: + +- baseline and post-consolidation recall remain healthy, +- post-consolidation `retrieved_keys` is focused and stable, +- change in `avg_retrieved_summary_chars` is visible to detect summary-quality drift. + +## Why `expected_keys` is required + +Consolidation changes note IDs; `expected_note_ids` assertions are brittle under those flows. +`expected_keys` allows intent-based assertions that survive ID churn and still validates semantic coverage through the new `expected_keys` mode in `elf-eval`. diff --git a/docs/plans/2026-03-04-search-modes-design.md b/docs/plans/2026-03-04-search-modes-design.md new file mode 100644 index 00000000..f83a06d4 --- /dev/null +++ b/docs/plans/2026-03-04-search-modes-design.md @@ -0,0 +1,83 @@ +# Search Modes: `quick_find` vs `planned_search` (Design) + +Date: 2026-03-04 + +## Goal + +Expose an explicit **latency-vs-quality** choice at search-creation time, while keeping the response contract deterministic and inspectable: + +- `quick_find`: low-latency path for straightforward lookups. +- `planned_search`: higher-quality path that returns a machine-readable `query_plan`. + +## Public API (v2) + +### Create a search session + +`POST /v2/searches` + +Body: + +```json +{ + "query": "English-only", + "mode": "quick_find|planned_search", + "top_k": 12, + "candidate_k": 60, + "filter": { "schema": "search_filter_expr/v1", "expr": { "op": "and", "args": [] } }, + "payload_level": "l0|l1|l2|null" +} +``` + +Response (single shape; `query_plan` present only for `planned_search`): + +```json +{ + "trace_id": "uuid", + "search_id": "uuid", + "expires_at": "...", + "mode": "quick_find|planned_search", + "items": [ { "note_id": "uuid", "summary": "...", "final_score": 0.0 } ], + "query_plan": { "schema": "elf.search.query_plan", "version": "v1" } +} +``` + +### Read a search session + +`GET /v2/searches/{search_id}?top_k=12&touch=true` + +- Returns the same response shape as create. +- `query_plan` is returned when present in the stored session (planned searches). + +## Semantics + +### `quick_find` + +- Query expansion: **off**. +- Rerank provider call: **skipped** (deterministic placeholder scores), to keep latency predictable. +- Returns a compact index view; no `query_plan` field. + +### `planned_search` + +- Query expansion: follows configured expansion policy (`off|always|dynamic`). +- Rerank provider call: **on**. +- Returns `query_plan` (machine-readable retrieval plan + policy snapshot). + +## Storage + +Search sessions persist enough context to make `GET /v2/searches/{search_id}` reflect the creation response: + +- `mode` (text, required) +- `query_plan` (jsonb, nullable; present for `planned_search`) + +## MCP surface + +The MCP server maps 1:1 to v2 endpoints and exposes a single creation tool: + +- `elf_searches_create` → `POST /v2/searches` (requires `mode`) + +## Evaluation / Acceptance + +Latency can be benchmarked by running `elf-eval` in mode A vs mode B on the same dataset/config and comparing `latency_ms_p95`: + +- Expectation: `quick_find` p95 < `planned_search` p95 on the same queries/environment. + diff --git a/docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md b/docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md new file mode 100644 index 00000000..77e0d95a --- /dev/null +++ b/docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md @@ -0,0 +1,120 @@ +# ELF Hardening Evaluation Decisions + +**Date:** 2026-06-08 + +## Goal + +Record the system evaluation decisions that drove the June 2026 ELF reliability +hardening work, so the rationale lives in the repository instead of only in chat +or tracker history. + +## Context + +The evaluation found several gaps that made local operation and API contract +review harder than necessary: + +- Required runtime gates and service-backed checks needed to be restored and + made easy to run. +- The MCP default ingestion profile update path needed an explicit PUT-backed + contract. +- New operators needed a concrete getting-started path with local service setup. +- The HTTP API contract needed a generated, inspectable surface. +- Configuration should reject missing required fields instead of silently + accepting ambiguous defaults. +- Local development needed a Docker Compose stack for the service dependencies. + +## Selected Decisions + +### 1) Restore gates, MCP default-set PUT, and getting-started docs + +Decision: implement the gate restoration, MCP default-set PUT forwarding, and +operator getting-started documentation as one bounded reliability lane. + +Tracking: + +- Linear: [XY-789](https://linear.app/hack-ink/issue/XY-789/elf-hardening-14-restore-gates-mcp-default-set-put-and-getting-started) +- GitHub: [PR #109](https://github.com/hack-ink/ELF/pull/109) + +Verification expectation: + +- Service-backed integration coverage must be runnable through the repository + checks. +- MCP default ingestion profile updates must use the API contract path rather + than a parallel local-only behavior. +- Setup documentation must be enough for an operator to start the local system + without relying on chat context. + +### 2) Use utoipa and Scalar for the API contract surface + +Decision: use `utoipa` for OpenAPI generation and Scalar for the browsable API +reference. + +Tracking: + +- Linear: [XY-790](https://linear.app/hack-ink/issue/XY-790/elf-hardening-24-add-utoipa-and-scalar-api-contract-surface) +- GitHub: [PR #111](https://github.com/hack-ink/ELF/pull/111) + +Verification expectation: + +- The generated OpenAPI document must cover the v2 HTTP routes needed by + operators and tests. +- The Scalar UI must be served by the API app without requiring a separate docs + process. +- Contract tests should assert the key route and schema names so the docs + surface cannot drift silently. + +### 3) Enforce stricter configuration field presence + +Decision: make required configuration fields explicit and reject missing required +fields instead of accepting implicit defaults for operator-critical behavior. + +Tracking: + +- Linear: [XY-791](https://linear.app/hack-ink/issue/XY-791/elf-hardening-34-enforce-strict-config-field-presence) +- GitHub: [PR #110](https://github.com/hack-ink/ELF/pull/110) + +Verification expectation: + +- Config validation tests must cover required-field failures. +- Existing valid fixtures must keep passing after the stricter read path. +- Error messages should identify the missing field clearly enough for operator + remediation. + +### 4) Use Docker Compose for local service setup + +Decision: use Docker Compose as the repo-owned local development stack for +Postgres, Qdrant, and the API/MCP-facing runtime dependencies. + +Tracking: + +- Linear: [XY-792](https://linear.app/hack-ink/issue/XY-792/elf-hardening-44-add-docker-compose-local-dev-stack) +- GitHub: [PR #112](https://github.com/hack-ink/ELF/pull/112) + +Verification expectation: + +- The compose stack must avoid colliding with unrelated local services. +- The documented environment should map directly to the repo-native checks and + getting-started flow. +- Compose configuration should remain development-only and not introduce a new + production deployment contract. + +## Deferred / Non-goals + +- Item 7 from the evaluation was explicitly ignored for this hardening pass. +- This plan does not introduce live provider calls, new hosted infrastructure, + or a replacement runtime architecture. +- This plan does not make Docker Compose the production deployment surface. + +## Delivery Order + +The implementation order is: + +1. Restore gates, MCP default-set PUT, and getting-started docs. +2. Add the utoipa + Scalar API contract surface. +3. Enforce stricter configuration field presence. +4. Add the Docker Compose local dev stack. +5. Land this decision record so future maintenance can trace the work back to + the evaluated system gaps. + +Each implementation lane should land only after repo-native verification passes, +with service-backed checks used where behavior depends on Postgres or Qdrant. diff --git a/docs/research/2026-06-08-agent-memory-selection.json b/docs/research/2026-06-08-agent-memory-selection.json new file mode 100644 index 00000000..0e4c6899 --- /dev/null +++ b/docs/research/2026-06-08-agent-memory-selection.json @@ -0,0 +1,221 @@ +{ + "schema": "research-run/2", + "run_id": "2026-06-08-agent-memory-selection", + "question": "Given agentmemory, current monitored memory projects, and OpenAI/Anthropic/Google dreaming-style memory consolidation, should ELF continue building its own memory system or adopt an external system?", + "success_criteria": [ + "Use current ELF main-branch evidence, current Decodex/Linear state, and current external sources.", + "Compare continue-build, adopt-agentmemory, and adopt-managed-dreaming options.", + "Return guidance that can shape the next ELF Linear issues without relaxing evidence/provenance requirements." + ], + "constraints": [ + "Do not treat external benchmark or README claims as independently verified unless ELF has reproduced them.", + "Do not recommend destructive memory rewriting without reviewable derived output and provenance.", + "Keep ELF source-of-truth semantics separate from optional adapters and derived views." + ], + "stop_rule": "Stop once the recommendation is decision-ready for issue shaping or the remaining uncertainty would require implementation benchmarks beyond this research pass.", + "primary_hypothesis": "ELF should continue as the evidence-bound core memory service and borrow or integrate external systems only at the capture, evaluation, viewer, and derived-consolidation layers.", + "rival_hypotheses": [ + "Replace ELF with agentmemory because it already packages cross-agent hooks, MCP tools, benchmarks, viewer, and consolidation.", + "Replace ELF's roadmap with managed dreaming APIs because large vendors are converging on background memory curation.", + "Pause ELF core development until the agent-memory market stabilizes." + ], + "falsifiers": [ + "If agentmemory or another external project exposes ELF-equivalent evidence-bound deterministic write contracts, multi-tenant service semantics, and rebuildable source-of-truth storage with lower integration risk, replacement becomes viable.", + "If managed dreaming APIs provide portable, self-hostable, reviewable, evidence-linked memory stores that can satisfy ELF governance boundaries, adopting them as core becomes viable.", + "If ELF's own hardening and validation surface is not operational after the June 2026 work, continuing core development should be deferred until reliability is restored." + ], + "coverage": { + "mode": "broad_external", + "min_source_families": 4 + }, + "continuation": { + "mode": "auto_if_not_decision_ready", + "attempt": 1, + "max_attempts": 2, + "session_id": "2026-06-08-agent-memory-selection" + }, + "events": [ + { + "seq": 1, + "type": "probe_completed", + "remaining_option_count": 3, + "independent_option_questions": [ + "Should ELF continue as the core memory service or be replaced by agentmemory?", + "Should dreaming-style consolidation become authoritative or derived/reviewed?", + "Which current ELF backlog items become higher priority after the refresh?" + ], + "external_slices": [] + }, + { + "seq": 2, + "type": "evidence_recorded", + "evidence": [ + { + "id": "E1", + "kind": "observation", + "summary": "Current ELF main presents itself as evidence-linked fact memory with deterministic add_note and LLM-driven add_event separation, Postgres source-of-truth, rebuildable Qdrant index, multi-tenant scoped APIs, HTTP/MCP surfaces, graph-lite relation context, and evaluation tooling.", + "source_family": "repo_docs", + "source_locator": "README.md; config/local/elf.docker.toml; docker-compose.yml; Makefile.toml" + }, + { + "id": "E2", + "kind": "observation", + "summary": "The June 2026 ELF hardening sequence landed local service gates, MCP default-set PUT forwarding, getting-started docs, utoipa/Scalar API docs, strict config field presence, Docker Compose dependencies, and a checked-in decision record.", + "source_family": "repo_docs", + "source_locator": "docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md" + }, + { + "id": "E3", + "kind": "observation", + "summary": "GitHub and Linear current-state checks show PRs #109-#113 merged and XY-789, XY-790, XY-791, XY-792, and XY-798 completed; Decodex top-level live status has zero active, running, queued, waiting, and attention lanes, although old attempt history still includes a stale XY-790 needs_attention ledger.", + "source_family": "tracker_runtime", + "source_locator": "gh pr view 109-113; Linear issue(id) query; decodex status --live --json --config /Users/x/.codex/decodex/projects/elf" + }, + { + "id": "E4", + "kind": "observation", + "summary": "agentmemory is a fast-moving Apache-2.0 coding-agent memory project with cross-agent MCP/REST/hook integration, advertised hybrid BM25/vector/graph retrieval, lifecycle/consolidation claims, a local viewer, iii console observability, v0.9.27 release, and recent push activity. Its own roadmap still lists governance, benchmark CI, session replay UI, enterprise trust, and v1.0 stability as future work.", + "source_family": "external_project", + "source_locator": "https://github.com/rohitg00/agentmemory; https://raw.githubusercontent.com/rohitg00/agentmemory/main/ROADMAP.md; GitHub API snapshot 2026-06-08T06:01:57Z" + }, + { + "id": "E5", + "kind": "observation", + "summary": "OpenAI describes dreaming as a background memory curation process that synthesizes memory state from conversations, improves preference use, and keeps memory current over time rather than treating old memories as static facts.", + "source_family": "vendor_docs", + "source_locator": "https://openai.com/index/chatgpt-memory-dreaming/" + }, + { + "id": "E6", + "kind": "observation", + "summary": "Anthropic Claude Dreams treats dreaming as an asynchronous research-preview job over a memory store plus 1-100 past sessions. It produces a separate output memory store, never modifies the input store, exposes progress/session events, and expects review, attach, discard, archive, or delete decisions after completion.", + "source_family": "vendor_docs", + "source_locator": "https://platform.claude.com/docs/en/managed-agents/dreams" + }, + { + "id": "E7", + "kind": "observation", + "summary": "Google examples split into two useful patterns: Always-On Memory Agent productizes file/API/dashboard ingest plus timer-based consolidation, while Gemini CLI Auto Memory keeps background extraction review-gated by writing patches and skill drafts to a project-local inbox before any approval.", + "source_family": "vendor_docs", + "source_locator": "https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent; https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md" + }, + { + "id": "E8", + "kind": "observation", + "summary": "The monitored project set remains active as of 2026-06-08. GitHub API snapshots showed recent pushes for agentmemory, mem0, qmd, claude-mem, OpenViking, gbrain, graphify, LangGraph, Graphiti, RAGFlow, LightRAG, and GraphRAG, with agentmemory at 21,783 stars and v0.9.27, mem0 at 58,005 stars, claude-mem at 81,157 stars, graphify at 62,294 stars, and RAGFlow at 82,150 stars.", + "source_family": "external_project", + "source_locator": "GitHub API repository metadata snapshot 2026-06-08T06:01:57Z" + }, + { + "id": "E9", + "kind": "observation", + "summary": "The existing ELF vNext backlog already has directly relevant Backlog issues for knowledge memory pages with provenance and lint (XY-286), read-only viewer (XY-19), retrieval observability panels (XY-27), and graph-lite typed query/DX (XY-70).", + "source_family": "tracker_runtime", + "source_locator": "Linear issue(id) query for XY-286, XY-19, XY-27, XY-70" + } + ] + }, + { + "seq": 3, + "type": "tradeoffs_recorded", + "tradeoffs": [ + { + "id": "T1", + "summary": "Continuing ELF preserves the evidence-bound, deterministic, scoped service contract that external coding-agent products do not clearly replace; the trade-off is slower product UX unless viewer and capture adapters are prioritized.", + "supporting_evidence_ids": [ + "E1", + "E4", + "E8" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T2", + "summary": "Dreaming-style consolidation is now validated by major vendors as a product direction, but the safest shared pattern is separate or review-gated output rather than destructive authoritative rewriting.", + "supporting_evidence_ids": [ + "E5", + "E6", + "E7" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T3", + "summary": "agentmemory should be treated as an integration and benchmark target for coding-agent session capture, not as a core replacement, because its strongest value is hooks, viewer, tool breadth, and packaged local UX while ELF's strongest value is provenance and service governance.", + "supporting_evidence_ids": [ + "E1", + "E4" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T4", + "summary": "The refreshed evidence reorders ELF priorities toward viewer/observability and derived consolidation before more automatic memory authority, because operators need to inspect what was remembered, why, and how consolidation proposals were formed.", + "supporting_evidence_ids": [ + "E4", + "E6", + "E7", + "E9" + ], + "disconfirming_evidence_ids": [] + } + ] + }, + { + "seq": 4, + "type": "judgment_candidate_created", + "judgment_payload": { + "decision_claim": "Continue ELF as the evidence-bound memory core. Do not replace it with agentmemory or managed dreaming. Use agentmemory and managed dreaming systems as comparison baselines and optional adapters while prioritizing reviewable derived consolidation, operator viewer/observability, and graph-lite/knowledge-memory work in ELF.", + "implementation_order": [ + "Persist the research refresh and use it as the source for issue shaping.", + "Build a reviewed, derived consolidation pipeline over immutable evidence-bound notes and traces.", + "Ship the read-only viewer and retrieval observability panels before expanding automatic consolidation authority.", + "Add an optional agentmemory import/baseline adapter for coding-agent session observations.", + "Advance graph-lite typed query and derived knowledge pages with provenance and lint." + ], + "judgment_type": "recommend", + "key_evidence_ids": [ + "E1", + "E2", + "E3", + "E4", + "E5", + "E6", + "E7", + "E8" + ], + "key_tradeoff_ids": [ + "T1", + "T2", + "T3", + "T4" + ], + "preferred_option": "continue-elf-core-with-dreaming-inspired-derived-consolidation-and-agentmemory-baseline-integration", + "rejected_options": [ + "replace-elf-with-agentmemory", + "replace-elf-with-managed-dreaming", + "pause-elf-core-development-until-the-market-settles" + ] + }, + "judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9" + }, + { + "seq": 5, + "type": "worker_completed", + "worker": "skeptic", + "target_judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9", + "summary": "The strongest objection is that agentmemory's product surface is already ahead of ELF for coding-agent continuity. That does not defeat the judgment because it supports an adapter/baseline and viewer priority, not replacement of ELF's stricter source-of-truth and evidence contract.", + "objections": [] + }, + { + "seq": 6, + "type": "finalized_decision_ready", + "judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9", + "confidence": "medium", + "missing_evidence": [ + "ELF has not independently reproduced agentmemory's benchmark claims.", + "The next implementation pass still needs issue-local design for the consolidation data model and adapter boundaries." + ] + } + ] +} diff --git a/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json b/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json new file mode 100644 index 00000000..198df1af --- /dev/null +++ b/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json @@ -0,0 +1,136 @@ +{ + "schema": "research-run/2", + "run_id": "2026-06-09-xy-841-external-memory-benchmark-dimensions", + "question": "How should ELF map reviewed external memory projects to real-world benchmark dimensions without overstating docs-only evidence as benchmark proof?", + "success_criteria": [ + "Map every reviewed external project in the issue scope to one or more real-world benchmark suites.", + "Separate benchmark-grounded adapter evidence from docs-grounded research claims.", + "Identify dimensions where ELF should not be treated as the reference yet.", + "Keep pending D0 projects as watch items unless current evidence is gathered in scope." + ], + "constraints": [ + "Do not implement benchmark adapters or change ELF runtime behavior.", + "Do not make benchmark pass/fail claims without runnable evidence from checked-in reports.", + "Use existing reviewed docs and benchmark reports as the authority for this docs-only refresh." + ], + "stop_rule": "Stop once the comparison and inventory can route future real_world_job benchmark design without implying unproven external quality claims.", + "primary_hypothesis": "The capability map should treat qmd, claude-mem, agentmemory, mem0/OpenMemory, OpenViking, memsearch, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph as dimension references only where docs or benchmark evidence supports the fit; D0 RAG projects should remain watch items.", + "rival_hypotheses": [ + "Use the current smoke benchmark status alone to rank external projects.", + "Treat official external README claims as sufficient benchmark-quality evidence.", + "Drop pending RAGFlow, LightRAG, and GraphRAG from the map until adapters exist." + ], + "falsifiers": [ + "If a current runnable adapter report exists for a broader dimension, docs-only confidence would be too conservative.", + "If a listed project lacks any documented mechanism matching the assigned suite, the suite map would overstate its reference role.", + "If D0 watch items are assigned strengths, the map would violate the no-current-evidence boundary." + ], + "coverage": { + "mode": "repo_docs_and_existing_external_research", + "min_source_families": 3 + }, + "events": [ + { + "seq": 1, + "type": "probe_completed", + "remaining_option_count": 3, + "independent_option_questions": [ + "Which benchmark dimensions are already proven by ELF's checked-in adapter evidence?", + "Which projects should be treated as docs-grounded references for unencoded dimensions?", + "Which pending projects must stay as watch items?" + ], + "external_slices": [] + }, + { + "seq": 2, + "type": "evidence_recorded", + "evidence": [ + { + "id": "E1", + "kind": "observation", + "summary": "README states that the June 9 Docker live baseline and production adoption gate prove a bounded ELF production-provider path, while the all-project smoke has ELF and qmd passing encoded checks and other external projects retaining typed failure or incomplete states.", + "source_family": "repo_docs", + "source_locator": "README.md" + }, + { + "id": "E2", + "kind": "observation", + "summary": "The production adoption gate explicitly bounds external comparison as an objective adapter matrix, not an overall superiority claim, and records qmd pass, agentmemory lifecycle_fail, and memsearch/mem0/OpenViking/claude-mem incomplete or wrong-result states.", + "source_family": "benchmark_report", + "source_locator": "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md" + }, + { + "id": "E3", + "kind": "observation", + "summary": "The live baseline runbook defines pass, wrong_result, lifecycle_fail, incomplete, blocked, and not_encoded semantics, and warns that incomplete, blocked, and not_encoded are not passes.", + "source_family": "repo_runbook", + "source_locator": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + { + "id": "E4", + "kind": "observation", + "summary": "The existing comparison contains D1/D2 docs-grounded mechanism research for agentmemory, qmd, claude-mem, mem0/OpenMemory, memsearch, OpenViking, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph.", + "source_family": "repo_research_docs", + "source_locator": "docs/guide/research/comparison_external_projects.md" + }, + { + "id": "E5", + "kind": "observation", + "summary": "The inventory marks RAGFlow, LightRAG, and GraphRAG as D0 pending deep dives, so they can only be watch items in this lane.", + "source_family": "repo_research_docs", + "source_locator": "docs/guide/research/research_projects_inventory.md" + } + ] + }, + { + "seq": 3, + "type": "tradeoffs_recorded", + "tradeoffs": [ + { + "id": "T1", + "summary": "Using only current smoke results would hide useful future benchmark dimensions such as operator continuity, temporal graph validity, core/archival memory, and knowledge synthesis.", + "supporting_evidence_ids": [ + "E2", + "E4" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T2", + "summary": "Using docs-grounded references without labels would overstate external project quality because the benchmark runner has not reproduced most broader claims.", + "supporting_evidence_ids": [ + "E2", + "E3" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T3", + "summary": "Keeping D0 RAG projects as watch items preserves future coverage without pretending that adapter feasibility, resource envelope, or evidence quality has been audited.", + "supporting_evidence_ids": [ + "E3", + "E5" + ], + "disconfirming_evidence_ids": [] + } + ] + }, + { + "seq": 4, + "type": "challenge_recorded", + "summary": "The main risk is that a broad suite map could read like a quality ranking. The mitigation is to label evidence class per project, repeat that only current adapter reports can support pass/fail claims, and call out ELF gaps by reference dimension instead of claiming overall superiority.", + "resolved": true + }, + { + "seq": 5, + "type": "finalized_decision_ready", + "confidence": "medium", + "decision": "Update the comparison and inventory with a real-world benchmark-dimension map. Treat qmd, claude-mem, agentmemory, mem0/OpenMemory, memsearch, OpenViking, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph as reference projects for specific dimensions, but separate benchmark-grounded evidence from docs-grounded suite fit. Keep RAGFlow, LightRAG, and GraphRAG as D0 watch items.", + "missing_evidence": [ + "No new upstream source refresh was performed in this lane.", + "No new benchmark adapter or real_world_job suite was executed.", + "Most non-smoke dimensions remain docs-grounded until future adapter evidence exists." + ] + } + ] +} diff --git a/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json b/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json new file mode 100644 index 00000000..9f42812b --- /dev/null +++ b/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json @@ -0,0 +1,348 @@ +{ + "schema": "research-run/2", + "run_id": "2026-06-10-xy-882-rag-graph-adapter-feasibility", + "question": "Which RAG and graph-memory research gates should become Docker-bounded adapter implementation candidates for ELF real-world benchmarks?", + "success_criteria": [ + "Give RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and graphify one explicit verdict: adapter_candidate, research_only, blocked, or reject.", + "Separate setup/resource feasibility from product quality; heavy setup is not treated as a quality failure.", + "Require adapter_candidate projects to have both a Docker-contained path and an evidence-linked output contract.", + "Keep all researched projects in the research_gate evidence class until a Docker adapter executes real_world_job scoring." + ], + "constraints": [ + "Do not implement adapters in this issue.", + "Do not use host-global installs as proof.", + "Do not claim live adapter pass evidence from source or docs review.", + "Create implementation follow-ups only for adapter candidates with a scoped Docker boundary and evidence-linked output." + ], + "stop_rule": "Stop when every target project has a verdict, adapter candidates have scoped follow-up issue titles, and the docs/manifest still label these records as research gates rather than live evidence.", + "primary_hypothesis": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify have enough Docker-bounded setup and evidence-output shape to justify implementation follow-ups; Letta, LangGraph, nanograph, and llm-wiki remain research-only references; gbrain remains blocked until a Docker-local brain repo/database path is proven.", + "rival_hypotheses": [ + "All projects should remain research-only because none has executed in the benchmark runner.", + "All projects with official Docker or CLI instructions should become adapter candidates.", + "RAGFlow should be rejected because its official resource envelope is large." + ], + "falsifiers": [ + "If a candidate cannot run without host-global state, it is not an adapter implementation candidate for this benchmark lane.", + "If a candidate cannot emit source IDs, document IDs, file locations, citations, or equivalent evidence handles, it cannot support real_world_job scoring.", + "If a project is a useful architecture reference but not a standalone memory/retrieval output path, it should remain research_only." + ], + "coverage": { + "mode": "primary_source_docs_and_existing_repo_contracts", + "min_source_families": 4 + }, + "events": [ + { + "seq": 1, + "type": "probe_completed", + "remaining_option_count": 4, + "independent_option_questions": [ + "Does the project expose a Docker-contained setup path?", + "Does the project expose corpus ingest and query output that can map back to source evidence?", + "Is the project a direct adapter candidate, a reference-only design input, blocked by missing Docker proof, or rejected?" + ], + "external_slices": [ + "RAGFlow", + "LightRAG", + "GraphRAG", + "Graphiti/Zep", + "Letta", + "LangGraph", + "nanograph", + "llm-wiki", + "gbrain", + "graphify" + ] + }, + { + "seq": 2, + "type": "evidence_recorded", + "evidence": [ + { + "id": "E1", + "kind": "contract", + "summary": "The real-world benchmark spec defines research_gate records as source/setup/runtime/resource/retry metadata for future implementation; research gates must not count as fixture-backed, live-baseline, or live-real-world evidence.", + "source_family": "repo_spec", + "source_locator": "docs/spec/real_world_agent_memory_benchmark_v1.md" + }, + { + "id": "E2", + "kind": "setup", + "summary": "RAGFlow official quickstart documents Docker startup, 4 CPU / 16 GB RAM / 50 GB disk prerequisites, x86/Nvidia support, image-size caveats, dataset creation, chunk visibility, and citation-backed retrieval testing.", + "source_family": "upstream_docs", + "source_locator": "https://ragflow.io/docs/" + }, + { + "id": "E3", + "kind": "output_contract", + "summary": "RAGFlow HTTP API can include reference metadata and returns reference chunks containing chunk id, content, document id, document name, document metadata, dataset id, positions, and similarity scores.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md" + }, + { + "id": "E4", + "kind": "setup", + "summary": "LightRAG Docker docs describe docker compose startup, generated compose files, persistent data paths, environment-driven LLM and embedding configuration, and optional Docker-local vLLM embedding/rerank services.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/DockerDeployment.md" + }, + { + "id": "E5", + "kind": "output_contract", + "summary": "LightRAG supports query prefixes including context-only modes, can return the context prepared for the LLM, supports inserting documents with stable ids, and traces sources through file_paths.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/LightRAG-API-Server.md" + }, + { + "id": "E6", + "kind": "output_contract", + "summary": "GraphRAG writes parquet output tables with UUIDs and human-readable ids; communities and reports carry text_unit_ids, and text_units carry raw text plus document ids and relationship/entity ids.", + "source_family": "upstream_docs", + "source_locator": "https://microsoft.github.io/graphrag/index/outputs/" + }, + { + "id": "E7", + "kind": "setup", + "summary": "GraphRAG input and query docs describe a CLI/API indexing and local-search path over structured documents, raw text chunks, graph data, and query context builders.", + "source_family": "upstream_docs", + "source_locator": "https://microsoft.github.io/graphrag/" + }, + { + "id": "E8", + "kind": "output_contract", + "summary": "Graphiti/Zep requires Python plus Neo4j or FalkorDB, supports Docker-local FalkorDB, adds episodes or fact triples, and search results include UUID, fact text, valid_at, and invalid_at fields.", + "source_family": "upstream_docs", + "source_locator": "https://help.getzep.com/graphiti/getting-started/quick-start" + }, + { + "id": "E9", + "kind": "boundary", + "summary": "Letta remains a strong core/archival memory reference, but Docker use needs explicit embedding configuration and the current docs steer new Letta Code users away from Docker-first evaluation.", + "source_family": "upstream_docs", + "source_locator": "https://docs.letta.com/guides/docker/" + }, + { + "id": "E10", + "kind": "boundary", + "summary": "LangGraph persistence provides checkpoints, replay, stores, and semantic memory search, but it is an agent-state framework rather than a standalone external memory service adapter.", + "source_family": "upstream_docs", + "source_locator": "https://docs.langchain.com/oss/python/langgraph/persistence" + }, + { + "id": "E11", + "kind": "boundary", + "summary": "nanograph documents one CLI, one folder, schema-as-code, no server, no cloud, and no Docker; this makes it a graph-lite DX reference rather than a Docker adapter candidate for this lane.", + "source_family": "upstream_docs", + "source_locator": "https://www.nanograph.io/" + }, + { + "id": "E12", + "kind": "boundary", + "summary": "llm-wiki ships as agent plugins or portable instructions with wiki query, compile, lint, audit, and output workflows; it is a derived knowledge workflow reference, not a service adapter candidate without a contained plugin harness.", + "source_family": "upstream_docs", + "source_locator": "https://github.com/nvk/llm-wiki" + }, + { + "id": "E13", + "kind": "boundary", + "summary": "gbrain has strong compiled-truth, append-only timeline, and source attribution contracts, but this lane did not prove a Docker-local brain repository and database setup path.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/garrytan/gbrain/master/docs/guides/compiled-truth.md" + }, + { + "id": "E14", + "kind": "output_contract", + "summary": "graphify can run over a folder, produces graph.html, GRAPH_REPORT.md, graph.json, and cache artifacts, and query output includes node labels, edge types, confidence tags, source files, and source locations.", + "source_family": "upstream_docs", + "source_locator": "https://raw.githubusercontent.com/safishamsi/graphify/v3/README.md" + } + ] + }, + { + "seq": 3, + "type": "project_verdicts_recorded", + "verdicts": [ + { + "project": "RAGFlow", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E2", + "E3" + ], + "docker_boundary": "Nested Docker service profile or baseline compose service using official RAGFlow Docker Compose, capped to a tiny corpus and CPU mode first.", + "output_contract": "Map RAGFlow reference.chunks fields to real_world_job expected evidence ids.", + "follow_up_title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "follow_up_issue": "XY-885", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter" + }, + { + "project": "LightRAG", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E4", + "E5" + ], + "docker_boundary": "Docker Compose LightRAG server with explicit LLM, embedding, rerank, and data-volume configuration.", + "output_contract": "Use context-only query modes and file_paths-backed citations for evidence scoring.", + "follow_up_title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "follow_up_issue": "XY-886", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter" + }, + { + "project": "GraphRAG", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E6", + "E7" + ], + "docker_boundary": "Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts.", + "output_contract": "Map documents, text_units, communities, and community_reports output tables back to source evidence ids.", + "follow_up_title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "follow_up_issue": "XY-887", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter" + }, + { + "project": "Graphiti/Zep", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E8" + ], + "docker_boundary": "Docker-local FalkorDB or Neo4j plus Python SDK runner with provider configuration explicit in benchmark artifacts.", + "output_contract": "Score UUID, fact, valid_at, and invalid_at search output against memory_evolution current/historical evidence.", + "follow_up_title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "follow_up_issue": "XY-888", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter" + }, + { + "project": "Letta", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E9" + ], + "reason": "Keep as core/archival memory semantics reference; do not create an implementation issue until a supported, contained server path can export archival evidence for scoring." + }, + { + "project": "LangGraph", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E10" + ], + "reason": "Keep as checkpoint/replay regression reference; it is not a standalone external memory adapter candidate in this benchmark lane." + }, + { + "project": "nanograph", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E11" + ], + "reason": "Keep as typed graph DX inspiration; official positioning is no server/no Docker and no real_world_job evidence contract is proven." + }, + { + "project": "llm-wiki", + "verdict": "research_only", + "supporting_evidence_ids": [ + "E12" + ], + "reason": "Keep as derived knowledge-page workflow inspiration; no host-global plugin install may be used as adapter proof." + }, + { + "project": "gbrain", + "verdict": "blocked", + "supporting_evidence_ids": [ + "E13" + ], + "reason": "The evidence contract is strong, but a Docker-local brain repo and database path must be proven before an implementation issue is safe." + }, + { + "project": "graphify", + "verdict": "adapter_candidate", + "supporting_evidence_ids": [ + "E14" + ], + "docker_boundary": "Docker-only CLI/materializer run using pip-installed graphifyy over mounted benchmark corpus, with no assistant global hook install.", + "output_contract": "Score graph.json query output and GRAPH_REPORT.md source-file/source-location references against expected evidence.", + "follow_up_title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "follow_up_issue": "XY-889", + "follow_up_url": "https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter" + } + ] + }, + { + "seq": 4, + "type": "tradeoffs_recorded", + "tradeoffs": [ + { + "id": "T1", + "summary": "RAGFlow is resource-heavy, but the official Docker and reference chunk output make it an adapter candidate as long as the follow-up starts with a tiny corpus and records resource bounds instead of making a quality claim.", + "supporting_evidence_ids": [ + "E2", + "E3" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T2", + "summary": "LightRAG and GraphRAG can become adapter candidates because both expose bounded ingest/query paths and source mapping, but their first adapter issues must remain cost-bounded.", + "supporting_evidence_ids": [ + "E4", + "E5", + "E6", + "E7" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T3", + "summary": "Graphiti/Zep is a stronger adapter candidate than generic graph-memory references because it can emit temporal facts with validity windows and run against Docker-local graph stores.", + "supporting_evidence_ids": [ + "E8" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T4", + "summary": "Letta, LangGraph, nanograph, and llm-wiki should still inform ELF design, but creating adapter implementation issues now would blur reference workflows with executable memory-service evidence.", + "supporting_evidence_ids": [ + "E9", + "E10", + "E11", + "E12" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T5", + "summary": "gbrain has a good citation and current-truth/timeline contract, but the missing Docker-local brain repo/database setup keeps it blocked rather than adapter_candidate.", + "supporting_evidence_ids": [ + "E13" + ], + "disconfirming_evidence_ids": [] + }, + { + "id": "T6", + "summary": "graphify is an adapter candidate only if implemented as an isolated CLI/materializer over generated corpus artifacts, not as a host-global assistant hook install.", + "supporting_evidence_ids": [ + "E14" + ], + "disconfirming_evidence_ids": [] + } + ] + }, + { + "seq": 5, + "type": "challenge_recorded", + "summary": "The main risk is that adapter_candidate could be read as benchmark evidence. The mitigation is to keep evidence_class=research_gate, keep overall status non-pass, and state that follow-up implementation issues must still run Docker and real_world_job scoring before any live evidence claim.", + "resolved": true + }, + { + "seq": 6, + "type": "finalized_decision_ready", + "confidence": "medium", + "decision": "Create implementation follow-ups only for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify. Keep Letta, LangGraph, nanograph, and llm-wiki as research_only references. Keep gbrain blocked pending a Docker-local brain repo/database proof. Do not change any research_gate record into live evidence until an adapter executes inside Docker and emits evidence-linked outputs.", + "missing_evidence": [ + "No Docker adapter was implemented or executed in this lane.", + "No host-global install was used as proof.", + "Provider credentials and private corpora remain out of scope." + ] + } + ] +} diff --git a/docs/research/2026-06-11-capture-write-policy-live-report.json b/docs/research/2026-06-11-capture-write-policy-live-report.json new file mode 100644 index 00000000..574e1cc1 --- /dev/null +++ b/docs/research/2026-06-11-capture-write-policy-live-report.json @@ -0,0 +1,220 @@ +{ + "schema": "elf.capture_write_policy_live_report/v1", + "report_id": "xy-933-capture-write-policy-live-report-2026-06-11", + "authority": "XY-933", + "created_at": "2026-06-11T14:31:00Z", + "commands": [ + { + "command": "cargo make real-world-memory", + "status": "pass", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "artifact": "tmp/real-world-memory/live-adapters/summary.json" + } + ], + "fixture_aggregate": { + "job_count": 40, + "pass": 38, + "blocked": 2, + "capture_integration": { + "encoded_job_count": 4, + "status": "pass", + "score_mean": 1.0, + "redaction_leak_count": 0, + "evidence_required_count": 10, + "evidence_covered_count": 10, + "source_ref_required_count": 10, + "source_ref_covered_count": 10 + } + }, + "live_capture_results": { + "elf_live_real_world": { + "suite_status": "pass", + "encoded_job_count": 4, + "redaction_leak_count": 0, + "expected_evidence_recall": 1.0, + "evidence_required_count": 10, + "evidence_covered_count": 10, + "source_ref_required_count": 10, + "source_ref_covered_count": 10, + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json", + "materialization_artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "qmd_live_real_world": { + "suite_status": "not_encoded", + "encoded_job_count": 4, + "redaction_leak_count": 0, + "expected_evidence_recall": 0.0, + "evidence_required_count": 10, + "evidence_covered_count": 0, + "source_ref_required_count": 10, + "source_ref_covered_count": 0, + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + } + }, + "jobs": [ + { + "job_id": "capture-redaction-exclusion-001", + "status": "pass", + "stored_evidence_ids": [ + "public-captured-decision", + "write-policy-audit" + ], + "excluded_evidence_ids": [ + "private-excluded-text" + ], + "source_ids": [ + "capture:linear-comment-933", + "capture:write-policy-audit-933" + ], + "runtime_source_refs": [ + { + "evidence_id": "public-captured-decision", + "source_id": "capture:linear-comment-933", + "evidence_binding": "source_ref", + "write_policy_applied": false + }, + { + "evidence_id": "write-policy-audit", + "source_id": "capture:write-policy-audit-933", + "evidence_binding": "source_ref", + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-source-id-binding-001", + "status": "pass", + "stored_evidence_ids": [ + "source-id-release-summary", + "source-id-command-log" + ], + "excluded_evidence_ids": [], + "source_ids": [ + "capture:issue-comment-42", + "capture:command-log-7" + ], + "runtime_source_refs": [ + { + "evidence_id": "source-id-release-summary", + "source_id": "capture:issue-comment-42", + "evidence_binding": "source_ref", + "write_policy_applied": false + }, + { + "evidence_id": "source-id-command-log", + "source_id": "capture:command-log-7", + "evidence_binding": "source_ref", + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-write-policy-redaction-001", + "status": "pass", + "stored_evidence_ids": [ + "redacted-source-message" + ], + "excluded_evidence_ids": [ + "redacted-private-token-trap" + ], + "source_ids": [ + "capture:terminal-log-17" + ], + "runtime_source_refs": [ + { + "evidence_id": "redacted-source-message", + "source_id": "capture:terminal-log-17", + "evidence_binding": "source_ref", + "write_policy_applied": true + } + ], + "write_policy_audit_count": 1, + "write_policy_redaction_count": 1, + "redaction_leak_count": 0 + }, + { + "job_id": "capture-integration-boundaries-001", + "status": "pass", + "stored_evidence_ids": [ + "xy844-capture-log", + "agentmemory-hook-reference", + "claude-mem-viewer-reference", + "live-adapter-follow-up" + ], + "excluded_evidence_ids": [ + "private-span-trap" + ], + "source_ids": [], + "runtime_source_refs": [ + { + "evidence_id": "live-adapter-follow-up", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "agentmemory-hook-reference", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "xy844-capture-log", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + }, + { + "evidence_id": "claude-mem-viewer-reference", + "source_id": null, + "evidence_binding": null, + "write_policy_applied": false + } + ], + "write_policy_audit_count": 0, + "write_policy_redaction_count": 0, + "redaction_leak_count": 0 + } + ], + "competitor_positions": [ + { + "project": "qmd", + "position": "untested", + "reason": "ELF executes and passes 4/4 live capture jobs; qmd keeps capture_integration typed not_encoded in the same live sweep, so this is an ELF self-check rather than a qmd comparison result." + }, + { + "project": "agentmemory", + "position": "blocked", + "reason": "The current Docker baseline uses a process-local StateKV Map and in-memory index; no durable local session/capture path stores source ids, exclusions, write-policy audit, or evidence-bound output." + }, + { + "project": "claude-mem", + "position": "blocked", + "reason": "Repository storage, lifecycle, progressive disclosure, and same-corpus retrieval are checked; hooks, timeline, observations, viewer capture, and automatic capture review need a Docker-contained hook/viewer runner before scoring." + } + ], + "claim_boundary": { + "allowed": [ + "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage.", + "qmd remains not_encoded for capture/write-policy jobs in the full live sweep.", + "agentmemory capture comparison is blocked by mocked/in-memory storage and lack of a durable local capture artifact.", + "claude-mem capture breadth is blocked until a Docker-contained hook/viewer capture runner exists." + ], + "not_allowed": [ + "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth.", + "Do not use host-global hooks as benchmark evidence.", + "Do not weaken ELF write-policy, redaction, or evidence-binding constraints for benchmark convenience.", + "Do not convert fixture-backed or live-baseline-only capture references into a live real-world competitor pass." + ] + } +} diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json new file mode 100644 index 00000000..6404bc35 --- /dev/null +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -0,0 +1,542 @@ +{ + "schema": "elf.competitor_strength_adoption_report/v1", + "report_id": "xy-901-competitor-strength-adoption-report-2026-06-11", + "authority": "XY-901", + "created_at": "2026-06-11T00:00:00Z", + "adoption_decision": { + "personal_production_adoptable": true, + "verdict": "adopt_with_bounded_caveats", + "summary": "ELF is currently adoptable for bounded personal production use because source-of-truth, evidence-bound writes, rebuild/backfill/restore, and typed benchmark evidence are stronger than the measured alternatives. It is not a broad competitor-superiority claim.", + "remaining_caveats": [ + "Full-suite live real-world pass parity is not proven.", + "Live temporal reconciliation remains wrong_result for five of six memory_evolution jobs.", + "Private-corpus production quality is blocked until an operator-owned manifest exists.", + "Credentialed provider production-ops gates are blocked until explicit provider setup exists.", + "Several competitor strengths remain not_tested or blocked: OpenMemory UI/export is blocked by the XY-931 export-helper setup probe, hosted mem0 Platform behavior remains a non-goal, and OpenViking trajectory, Letta core-vs-archival memory, and broad graph/RAG navigation remain unproven. XY-929 adds a representative graph/RAG fixture slice with typed blockers, one incomplete LightRAG job, and one graphify wrong_result job, but it does not create any broad graph/RAG win, tie, or loss claim. XY-928 encodes OpenViking staged trajectory, hierarchy selection, and recursive/context expansion as blocked fixtures behind same-corpus evidence output and missing staged artifacts. XY-927 adds six ELF fixture-backed core_archival_memory jobs, but Letta scenario rows remain blocked or not_tested until the selected contained export/readback path exists. XY-925 adds fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory durable continuity, memsearch Markdown source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, hook, and viewer/operator surfaces without creating live external real-world suite passes. mem0 local OSS preference history is measured separately and is an ELF loss on the current correction-history scenario. The XY-923 follow-up scores qmd immediate top-10/replay artifact ergonomics as stronger than ELF's default stress report, while expansion, fusion, and rerank remain untested. XY-932 adds a narrow live operator-debug slice where ELF beats qmd on trace hydration and candidate-drop visibility, but OpenMemory UI/export remains blocked and claude-mem viewer workflows remain blocked until Docker-contained hook/viewer evidence exists. XY-933 adds an ELF live capture/write-policy self-check, but agentmemory and claude-mem hook-capture breadth remain blocked until Docker-contained hook/viewer evidence exists." + ] + }, + "evidence_class_terms": [ + "fixture_backed", + "live_baseline_only", + "live_real_world", + "smoke_only", + "research_gate", + "blocked", + "incomplete", + "unsupported", + "not_encoded", + "wrong_result", + "lifecycle_fail" + ], + "outcome_terms": [ + "win", + "tie", + "loss", + "not_tested", + "blocked", + "non_goal" + ], + "source_artifacts": [ + { + "command": "cargo make real-world-memory", + "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "claim": "ELF fixture aggregate covers 60 jobs across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs, 1 passing memory_summary source-trace job, 4 passing proactive_brief suggestion jobs plus 1 private-corpus blocker, and 4 passing scheduled_memory task-readback jobs plus 1 private/provider scheduler blocker." + }, + { + "command": "cargo make real-world-memory-summary", + "artifact": "tmp/real-world-memory/memory-summary/report.json", + "claim": "The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity." + }, + { + "command": "cargo make real-world-memory-proactive-brief", + "artifact": "tmp/real-world-memory/proactive-brief/report.json", + "claim": "The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity." + }, + { + "command": "cargo make real-world-memory-scheduled", + "artifact": "tmp/real-world-memory/scheduled/report.json", + "claim": "The scheduled-memory fixture scores weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and private/provider scheduler blocker scenarios with evidence refs, freshness/currentness markers, action rationale, execution trace/readback, source-mutation guards, and stale/tombstone guards; this is fixture-backed contract evidence, not hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus parity." + }, + { + "command": "cargo make real-world-memory-core-archival", + "artifact": "tmp/real-world-memory/core-archival/report.json", + "claim": "ELF core_archival_memory fixture coverage scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "claim": "ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", + "claim": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists." + }, + { + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json", + "claim": "The narrow live operator-debug slice scores ELF as pass and qmd as wrong_result: ELF wins trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence; both systems expose replay commands and repair-action guidance." + }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "claim": "mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result on same-corpus retrieval." + }, + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "claim": "First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes." + }, + { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER, and hosted Platform export remains non-goal." + }, + { + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "claim": "Graphiti/Zep temporal smoke remains blocked by provider_api_key_missing when live provider execution is explicitly enabled without credentials." + }, + { + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", + "claim": "graphify reaches tiny Docker graph/report scoring but remains wrong_result; broad graph/RAG quality is not tested." + }, + { + "command": "cargo make real-world-memory-graph-rag", + "artifact": "tmp/real-world-memory/graph-rag/report.json", + "claim": "Representative graph/RAG fixtures produce typed non-pass reports: RAGFlow, GraphRAG, and Graphiti/Zep blocked; LightRAG incomplete with comparison blocked; graphify wrong_result; llm-wiki not_tested; gbrain blocked; private and hosted profiles non_goal." + }, + { + "command": "cargo make baseline-production-synthetic, cargo make baseline-backfill-docker, backup/restore plus Qdrant rebuild proof", + "artifact": "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", + "claim": "ELF has provider synthetic, stress, backfill, restore, and rebuild evidence, while private-corpus proof remains blocked by missing operator-owned manifest." + }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker plus ELF trace-bundle and qmd CLI replay commands", + "artifact": "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "claim": "Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact." + } + ], + "scenario_outcomes": [ + { + "scenario_id": "source_of_truth_rebuild_evidence_writes", + "title": "Source-of-truth rebuild and evidence-bound writes", + "outcome": "win", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only" + ], + "measured_claim": "ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust_source_of_truth passes in fixture and live sweeps, and production restore/rebuild proof exists.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + ], + "follow_up_issues": [], + "caveat": "XY-925 encodes fixture-backed memsearch canonical Markdown source-store prompts, but no live memsearch real_world_job runtime adapter pass is claimed." + }, + { + "scenario_id": "work_resume_coding_agent_continuity", + "title": "Work resume and coding-agent continuity", + "outcome": "tie", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. XY-925 selects agentmemory's durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains not_encoded, and OpenViking continuity trajectory remains blocked.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + ], + "follow_up_issues": [ + "XY-928" + ], + "caveat": "The tie is only for encoded live work_resume behavior, not for broad capture hooks or staged context." + }, + { + "scenario_id": "project_decisions_reversals", + "title": "Project decisions and reversals", + "outcome": "tie", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "research_gate", + "not_encoded" + ], + "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. The new ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + ], + "follow_up_issues": [ + "XY-927" + ], + "caveat": "No Letta comparison exists until the selected contained export/readback path produces source-id-mapped evidence." + }, + { + "scenario_id": "retrieval_quality", + "title": "Retrieval quality", + "outcome": "tie", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only" + ], + "measured_claim": "ELF and qmd both pass the encoded live retrieval suite and both pass stress/same-corpus retrieval evidence.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + ], + "follow_up_issues": [ + "XY-923" + ], + "caveat": "Retrieval correctness is separate from debug/replay ergonomics." + }, + { + "scenario_id": "local_debug_replay_ux", + "title": "Retrieval quality and local debug UX", + "outcome": "loss", + "evidence_classes": [ + "live_baseline_only", + "research_gate", + "wrong_result", + "not_encoded" + ], + "measured_claim": "The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + ], + "follow_up_issues": [ + "XY-923" + ], + "caveat": "The loss is a local-debug artifact loss only; retrieval correctness remains tied and no broad qmd-over-ELF memory-system claim is allowed." + }, + { + "scenario_id": "memory_evolution_temporal_history", + "title": "Memory evolution and temporal history", + "outcome": "loss", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "wrong_result", + "blocked" + ], + "measured_claim": "ELF fixture memory_evolution passes, but live ELF passes only the delete/TTL job and reports five wrong_result jobs where evidence is retrieved but current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" + ], + "follow_up_issues": [ + "XY-905" + ], + "caveat": "Graphiti/Zep remains a temporal-validity reference, but its local provider-backed smoke is blocked by provider_api_key_missing." + }, + { + "scenario_id": "consolidation_proposal_review", + "title": "Consolidation/proposal review", + "outcome": "not_tested", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "research_gate", + "not_encoded" + ], + "measured_claim": "ELF fixture consolidation passes, and XY-934 adds live service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json" + ], + "follow_up_issues": [ + "XY-934" + ], + "caveat": "The live evidence is an ELF self-check for deterministic fixture/manual proposal materialization; no direct managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki live competitor runner is claimed." + }, + { + "scenario_id": "knowledge_page_compilation", + "title": "Knowledge page compilation", + "outcome": "not_tested", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "wrong_result", + "research_gate", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" + ], + "follow_up_issues": [ + "XY-926", + "XY-929" + ], + "caveat": "GraphRAG, graphify, llm-wiki, and gbrain remain references until contained citation, graph-report, and lint jobs produce passable evidence-linked output." + }, + { + "scenario_id": "operator_debugging_viewer_ux", + "title": "Operator debugging/viewer UX", + "outcome": "win", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim.", + "command_artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/summary.json", + "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + ], + "follow_up_issues": [ + "XY-926" + ], + "caveat": "The live slice compares ELF and qmd only; OpenMemory UI/export and claude-mem viewer workflows remain typed blocked until a bounded local runner exists." + }, + { + "scenario_id": "capture_write_policy_redaction", + "title": "Capture/write policy and redaction", + "outcome": "not_tested", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains not_encoded; XY-925 records agentmemory and claude-mem hook capture as typed blockers until Docker-contained hook observations and write-policy/viewer readback artifacts exist.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + ], + "follow_up_issues": [ + "XY-933", + "XY-925", + "XY-926" + ], + "caveat": "This is an ELF self-check and qmd not_encoded delta, not a broad capture-breadth win over agentmemory or claude-mem." + }, + { + "scenario_id": "production_ops_restore_backfill", + "title": "Production ops, restore, backfill, and rebuild", + "outcome": "win", + "evidence_classes": [ + "live_baseline_only", + "blocked" + ], + "measured_claim": "ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence are checked in.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", + "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + ], + "follow_up_issues": [ + "XY-930" + ], + "caveat": "Private-corpus and credentialed provider gates remain blocked, so this is not private production quality proof." + }, + { + "scenario_id": "private_corpus_provider_boundaries", + "title": "Private corpus and provider boundaries", + "outcome": "blocked", + "evidence_classes": [ + "blocked" + ], + "measured_claim": "The private production profile fails closed without an operator-owned manifest, and provider-backed production-ops gates require explicit credentials.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", + "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + ], + "follow_up_issues": [ + "XY-930" + ], + "caveat": "The blocker is an input boundary, not a hidden benchmark pass or loss." + }, + { + "scenario_id": "personalization_scoped_preferences", + "title": "Personalization and scoped preferences", + "outcome": "tie", + "evidence_classes": [ + "fixture_backed", + "live_real_world", + "live_baseline_only", + "not_encoded" + ], + "measured_claim": "ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" + ], + "follow_up_issues": [ + "XY-927" + ], + "caveat": "The tie is scoped to encoded personalization and local OSS entity filters; OpenMemory UI readback and long-term preference evolution remain separate surfaces." + }, + { + "scenario_id": "context_trajectory_hierarchical_retrieval", + "title": "Context trajectory and hierarchical retrieval", + "outcome": "not_tested", + "evidence_classes": [ + "fixture_backed", + "live_baseline_only", + "research_gate", + "wrong_result", + "blocked" + ], + "measured_claim": "OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + ], + "follow_up_issues": [ + "XY-928" + ], + "caveat": "ELF only has a narrow precondition win over OpenViking, not a trajectory win." + }, + { + "scenario_id": "core_vs_archival_memory", + "title": "Core-vs-archival memory", + "outcome": "blocked", + "evidence_classes": [ + "fixture_backed", + "research_gate", + "blocked", + "not_encoded" + ], + "measured_claim": "ELF now has 6 fixture-backed core_archival_memory jobs that score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search. Letta remains blocked or not_tested until its contained export/readback artifact maps core and archival source ids.", + "command_artifacts": [ + "docs/spec/system_elf_memory_service_v2.md", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "tmp/real-world-memory/core-archival/report.json" + ], + "follow_up_issues": [ + "XY-927" + ], + "caveat": "No ELF-over-Letta claim is allowed; the selected Letta path must export core block JSON, archival search/readback JSON, and source ids before scoring." + }, + { + "scenario_id": "graph_rag_navigation_citations", + "title": "Graph/RAG navigation and citations", + "outcome": "not_tested", + "evidence_classes": [ + "smoke_only", + "research_gate", + "blocked", + "incomplete", + "wrong_result", + "not_encoded" + ], + "measured_claim": "cargo make real-world-memory-graph-rag adds representative citation, graph-summary, temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures. The slice is typed non-pass: RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with comparison blocked; graphify is wrong_result; llm-wiki is not_tested; gbrain is blocked. Broad graph/RAG navigation and citation quality remain not_tested.", + "command_artifacts": [ + "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" + ], + "follow_up_issues": [ + "XY-929" + ], + "caveat": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, llm-wiki, gbrain, and graphify have no broad quality proof; private, hosted, and large-corpus graph/RAG behavior remains non_goal unless explicitly authorized." + } + ], + "follow_up_queue": [ + { + "issue": "XY-905", + "priority": "P0", + "state": "Backlog", + "gap": "Live temporal reconciliation answer and trace contract." + }, + { + "issue": "XY-923", + "priority": "P0", + "state": "Backlog", + "gap": "qmd trace-level replay and wrong-result diagnostics." + }, + { + "issue": "XY-924/XY-931", + "priority": "P0", + "state": "Encoded local OSS history; UI/export setup blocker measured", + "gap": "mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison." + }, + { + "issue": "XY-925", + "priority": "P1", + "state": "Fixture slice encoded; runtime paths still blocked", + "gap": "First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters." + }, + { + "issue": "XY-926", + "priority": "P1", + "state": "Backlog", + "gap": "Live consolidation and knowledge-page suites; broad operator-debugging remains dependent on OpenMemory and claude-mem UI runners." + }, + { + "issue": "XY-933", + "priority": "P1", + "state": "Live ELF self-check encoded", + "gap": "Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked until Docker-contained hook/viewer evidence exists." + }, + { + "issue": "XY-927", + "priority": "P1", + "state": "Fixture encoded; Letta export blocked", + "gap": "ELF core_archival_memory fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims." + }, + { + "issue": "XY-928", + "priority": "P1", + "state": "Backlog", + "gap": "OpenViking context-trajectory and hierarchy benchmark is encoded but blocked until evidence-bearing same-corpus and staged artifacts exist." + }, + { + "issue": "XY-929", + "priority": "P2", + "state": "Representative fixture slice encoded; live contracts still blocked or typed non-pass", + "gap": "Graph/RAG adapters now have representative citation/navigation/lint fixtures, but live evidence-linked output contracts are still blocked, incomplete, wrong_result, not_tested, or non_goal." + }, + { + "issue": "XY-930", + "priority": "P1", + "state": "Backlog", + "gap": "Private-corpus and credentialed production gates after operator inputs exist." + }, + { + "issue": "XY-906", + "priority": "ops", + "state": "Todo", + "gap": "Decodex registered-project review-config schema drift blocks Decodex loading of elf." + } + ], + "claim_boundaries": { + "allowed": [ + "ELF is adoptable for bounded personal production use with caveats.", + "ELF has the strongest measured source-of-truth, rebuild, restore, and backfill evidence among the tracked systems.", + "ELF ties qmd on encoded live retrieval, work_resume, project_decisions, and personalization slices.", + "ELF fixture-backed core_archival_memory coverage passes attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery jobs separately from archival search.", + "ELF has a live temporal reconciliation loss against the benchmark expectation: five memory_evolution jobs remain wrong_result.", + "Most competitor strengths outside qmd retrieval are not_tested, blocked, incomplete, smoke_only, or research_gate.", + "ELF has a narrow live operator-debug win over qmd for trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence, with replay-command availability and repair-action clarity tied.", + "ELF live capture/write-policy self-checks pass for redaction, exclusions, source ids, evidence binding, and no secret leakage." + ], + "not_allowed": [ + "Do not claim ELF broadly beats qmd.", + "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win.", + "Do not claim ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory. The local OSS correction-history scenario is currently an ELF loss, while OpenMemory UI/export is a measured setup blocker and hosted behavior plus graph memory remain outside measured local OSS evidence.", + "Do not claim ELF beats OpenViking on staged context trajectory.", + "Do not claim ELF beats Letta on core-vs-archival memory.", + "Do not claim graph/RAG parity from smoke-only or typed non-pass representative evidence.", + "Do not promote fixture-backed, live_baseline_only, smoke_only, research_gate, blocked, incomplete, wrong_result, lifecycle_fail, unsupported, or not_encoded states into a generic pass/fail score.", + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice.", + "Do not claim ELF broadly beats agentmemory or claude-mem on capture breadth; the current comparison is blocked for their hook/viewer capture paths." + ] + } +} diff --git a/docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json b/docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json new file mode 100644 index 00000000..f7a639ae --- /dev/null +++ b/docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json @@ -0,0 +1,197 @@ +{ + "schema": "elf.memory_evolution_diagnostic_report/v1", + "run_id": "2026-06-11-elf-qmd-memory-evolution-diagnostic", + "commit": "87a388b6f33ff0142359876e5d9632fc096ee956", + "created_at": "2026-06-11", + "scope": "ELF versus qmd live memory-evolution behavior, current-vs-historical conflict diagnosis, and optimization directions", + "commands": [ + { + "command": "cargo make real-world-memory-evolution", + "status": "pass", + "runtime_seconds": 50.34, + "artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 112.26, + "artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "fixture_memory_evolution": { + "job_count": 5, + "pass": 5, + "wrong_result": 0, + "mean_score": 1.0, + "expected_evidence_total": 11, + "expected_evidence_matched": 11, + "conflict_detection_count": 5, + "update_rationale_available_count": 5, + "history_readback_encoded_count": 1 + }, + "live_full_sweep_context": { + "elf": { + "job_count": 38, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.525, + "mean_latency_ms": 8.62, + "expected_evidence_total": 77, + "expected_evidence_matched": 41, + "evidence_required_count": 84, + "evidence_covered_count": 48 + }, + "qmd": { + "job_count": 38, + "pass": 17, + "wrong_result": 6, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.486, + "mean_latency_ms": 691.163, + "expected_evidence_total": 77, + "expected_evidence_matched": 38, + "evidence_required_count": 84, + "evidence_covered_count": 45 + } + }, + "live_memory_evolution": { + "elf": { + "jobs": 6, + "pass": 1, + "wrong_result": 5, + "mean_score": 0.4916666666666667, + "expected_evidence_total": 13, + "expected_evidence_matched": 13, + "produced_evidence_total": 13, + "diagnosis": "ELF retrieved all required evidence but failed supersession jobs because conflict detection and lifecycle-aware current-vs-historical answer behavior were not emitted." + }, + "qmd": { + "jobs": 6, + "pass": 0, + "wrong_result": 6, + "mean_score": 0.325, + "expected_evidence_total": 13, + "expected_evidence_matched": 10, + "produced_evidence_total": 10, + "diagnosis": "qmd had the same missing conflict-detection pattern and additionally missed three required evidence links, including the delete tombstone." + } + }, + "job_diagnosis": [ + { + "job_id": "memory-evolution-benchmark-verdict-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "qmd_status": "wrong_result", + "qmd_score": 0.15, + "diagnosis": "ELF retrieved current verdict, caveat, and rationale but did not cite the old not-ready verdict as historical; qmd also missed private-corpus caveat evidence." + }, + { + "job_id": "memory-evolution-deploy-method-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "qmd_status": "wrong_result", + "qmd_score": 0.4, + "diagnosis": "Both retrieved the current runbook and supersession rationale but did not preserve the old quickstart path as historical conflict evidence." + }, + { + "job_id": "memory-evolution-issue-state-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "qmd_status": "wrong_result", + "qmd_score": 0.4, + "diagnosis": "Both answered the current done state and rationale but did not surface the earlier blocked state as superseded history." + }, + { + "job_id": "memory-evolution-preference-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "qmd_status": "wrong_result", + "qmd_score": 0.15, + "diagnosis": "ELF retrieved current preference and rationale but did not preserve the old terse preference as historical; qmd only returned rationale evidence." + }, + { + "job_id": "memory-evolution-relation-temporal-001", + "elf_status": "wrong_result", + "elf_score": 0.35, + "qmd_status": "wrong_result", + "qmd_score": 0.35, + "diagnosis": "Both retrieved current and historical owners but did not emit scored temporal-validity explanation or update rationale." + }, + { + "job_id": "memory-evolution-delete-ttl-001", + "elf_status": "pass", + "elf_score": 1.0, + "qmd_status": "wrong_result", + "qmd_score": 0.5, + "diagnosis": "ELF retrieved tombstone and current plan evidence; qmd retrieved only the current plan and missed the tombstone." + } + ], + "elf_failure_pattern": { + "wrong_result_jobs": 5, + "answer_correctness_score": 0.0, + "evidence_grounding_score": 1.0, + "lifecycle_behavior_score": 0.0, + "trap_avoidance_score": 1.0, + "interpretation": "The issue is lifecycle-aware reconciliation and narration, not basic evidence retrieval." + }, + "claim_boundary": { + "fixture_claim": "fixture_memory_evolution_passes", + "live_claim": "elf_narrowly_outscores_qmd_on_this_fresh_slice_but_does_not_solve_memory_evolution", + "not_allowed": [ + "ELF broadly beats qmd as a memory system", + "ELF has solved temporal memory evolution", + "fixture pass is production proof", + "Graphiti/Zep, mem0/OpenMemory, or Letta are beaten" + ] + }, + "optimization_directions": [ + { + "direction": "temporal_reconciliation_layer", + "description": "Detect current and historical evidence for the same claim, choose the current winner, preserve the historical loser, and cite update rationale." + }, + { + "direction": "history_readback_and_note_version_links", + "description": "Expose add/update/delete/ignore history and version links for user preference and entity memory changes." + }, + { + "direction": "tombstone_and_invalidation_evidence", + "description": "Treat deletion and TTL tombstones as answerable evidence instead of only suppressing stale retrieval." + }, + { + "direction": "trace_conflict_candidates", + "description": "Hydrate trace artifacts with conflict candidates, current winners, historical losers, dropped candidates, and replay commands." + } + ], + "borrow_from": [ + { + "project": "Graphiti/Zep", + "borrow": "temporal fact windows, invalidation, supersession, and graph fact provenance", + "benchmark_gate": "Graphiti/Zep temporal graph adapter for current, historical, and future-valid facts" + }, + { + "project": "mem0/OpenMemory", + "borrow": "entity-scoped history, lifecycle inspection, and memory UI/readback", + "benchmark_gate": "entity and preference history readback with correction and deletion evidence" + }, + { + "project": "Letta", + "borrow": "core memory blocks versus archival memory", + "benchmark_gate": "core-vs-archival jobs for operating context and historical retrieval" + }, + { + "project": "qmd", + "borrow": "local replay and candidate inspection ergonomics", + "benchmark_gate": "ELF trace hydration with conflict candidates and replay commands" + } + ], + "next_reports": [ + "Live temporal reconciliation report", + "Graphiti/Zep temporal graph comparison", + "mem0/OpenMemory history comparison", + "qmd tombstone/delete diagnostic", + "ELF trace-candidate conflict profile" + ] +} diff --git a/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json b/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json new file mode 100644 index 00000000..72f22936 --- /dev/null +++ b/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json @@ -0,0 +1,154 @@ +{ + "schema": "elf.retrieval_debug_profile_report/v1", + "run_id": "2026-06-11-elf-qmd-retrieval-debug-profile", + "commit": "38c586d", + "created_at": "2026-06-11", + "scope": "ELF versus qmd retrieval correctness, stress same-corpus behavior, and retrieval-debug artifact comparison", + "commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 116.76, + "artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "runtime_seconds": 149.41, + "artifact": "tmp/live-baseline/live-baseline-report.json" + } + ], + "live_real_world_retrieval": { + "elf": { + "jobs": 5, + "pass": 5, + "expected_evidence": 6, + "matched_evidence": 6, + "produced_evidence": 6, + "mean_score": 1.0 + }, + "qmd": { + "jobs": 5, + "pass": 5, + "expected_evidence": 6, + "matched_evidence": 6, + "produced_evidence": 6, + "mean_score": 1.0 + } + }, + "live_real_world_full_sweep_context": { + "elf": { + "job_count": 38, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.525, + "mean_latency_ms": 5.823 + }, + "qmd": { + "job_count": 38, + "pass": 17, + "wrong_result": 6, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.486, + "mean_latency_ms": 691.163 + } + }, + "stress_baseline": { + "profile": "stress", + "document_count": 480, + "query_count": 16, + "verdict": "pass", + "summary": { + "projects": 2, + "pass": 2, + "fail": 0, + "full_checks": 13, + "full_checks_pass": 13 + }, + "elf": { + "head": "38c586d49167d2e4118c921765c11fbec0a60af9", + "status": "pass", + "retrieval_status": "retrieval_pass", + "elapsed_seconds": 81, + "query_pass": 16, + "query_total": 16, + "expected_top1": 16, + "latency_ms_mean": 29.80780025, + "latency_ms_p95": 31.298164, + "backfill_source_count": 480, + "backfill_completed_count": 480, + "resume_attempts": 2, + "duplicate_source_notes": 0, + "resource_elapsed_seconds": 71.303126711, + "rss_kb": 54724, + "estimated_input_tokens": 27023, + "checks": [ + "resumable_backfill_no_duplicates", + "same_corpus_retrieval", + "async_worker_indexing_e2e", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search", + "concurrent_write_search_e2e", + "soak_stability_e2e", + "resource_envelope" + ] + }, + "qmd": { + "head": "636602409c862db077f38d9006df7f0bdca17ff3", + "status": "pass", + "retrieval_status": "retrieval_pass", + "elapsed_seconds": 66, + "query_pass": 16, + "query_total": 16, + "expected_top1": 16, + "mean_expected_rank": 1.0, + "mean_distractors_in_top10": 7.9375, + "checks": [ + "same_corpus_retrieval", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search" + ] + }, + "per_query": [ + {"id": "q-auth", "elf_matched_top_evidence": true, "elf_latency_ms": 30.57141, "qmd_expected_rank": 1, "qmd_top10_distractors": 6}, + {"id": "q-auth-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 30.500951, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-database", "elf_matched_top_evidence": true, "elf_latency_ms": 30.533742, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-database-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 31.280581, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-deploy", "elf_matched_top_evidence": true, "elf_latency_ms": 29.958447, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-deploy-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 31.298164, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-retention", "elf_matched_top_evidence": true, "elf_latency_ms": 30.433992, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-retention-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 29.1944, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-incident", "elf_matched_top_evidence": true, "elf_latency_ms": 30.838953, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-incident-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.700106, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-billing", "elf_matched_top_evidence": true, "elf_latency_ms": 30.092115, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-billing-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.855273, "qmd_expected_rank": 1, "qmd_top10_distractors": 9}, + {"id": "q-search", "elf_matched_top_evidence": true, "elf_latency_ms": 29.479694, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-search-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.641688, "qmd_expected_rank": 1, "qmd_top10_distractors": 7}, + {"id": "q-recovery", "elf_matched_top_evidence": true, "elf_latency_ms": 28.357061, "qmd_expected_rank": 1, "qmd_top10_distractors": 8}, + {"id": "q-recovery-alt", "elf_matched_top_evidence": true, "elf_latency_ms": 28.188227, "qmd_expected_rank": 1, "qmd_top10_distractors": 9} + ] + }, + "debug_artifact_judgment": { + "retrieval_correctness": "tie_on_encoded_surfaces", + "qmd_advantage": "direct_top10_json_results_with_file_line_score_snippet_and_distractor_visibility", + "elf_advantage": "service_lifecycle_backfill_qdrant_rebuild_resource_envelope_source_of_truth_and_trace_ids", + "unmeasured": [ + "qmd_rerank_quality", + "elf_rerank_quality", + "expansion_fusion_stage_quality", + "operator_debugging_ux_live_suite" + ] + }, + "next_measurement_work": [ + "hydrate ELF trace candidates into stress reports", + "add qmd query latency and candidate-density aggregates", + "add rerank-on qmd profile or keep rerank as unmeasured", + "add scored operator-debugging retrieval jobs for both systems", + "add expansion/fusion trace profile" + ] +} diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json new file mode 100644 index 00000000..84a38938 --- /dev/null +++ b/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json @@ -0,0 +1,369 @@ +{ + "schema": "elf.trace_replay_diagnostics_report/v1", + "run_id": "2026-06-11-elf-qmd-trace-replay-diagnostics", + "authority": "XY-923", + "created_at": "2026-06-11", + "scope": "ELF versus qmd trace-level replay and wrong-result diagnostics, with retrieval correctness kept as a separate guardrail.", + "inputs": [ + "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json", + "scripts/live-baseline-benchmark.sh", + "apps/elf-eval/src/app.rs", + "docs/spec/system_elf_memory_service_v2.md" + ], + "outcome_terms": [ + "win", + "tie", + "loss", + "not_tested", + "blocked", + "non_goal" + ], + "result_type_terms": [ + "pass", + "wrong_result", + "blocked", + "not_encoded", + "non_goal" + ], + "summary": { + "retrieval_correctness": "tie", + "debug_ergonomics": "qmd wins the current default top-10 candidate artifact and short replay-command surfaces.", + "elf_trace_position": "ELF has service trace, admin bundle, and trace replay surfaces, but they are not hydrated into the default stress report as qmd-like candidate artifacts.", + "outcome_counts": { + "win": 4, + "tie": 5, + "loss": 2, + "not_tested": 4, + "blocked": 0, + "non_goal": 1 + }, + "operator_debug_live_slice": "XY-932 adds a narrow live_real_world operator-debug slice: ELF passes trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, and repair-action clarity; qmd ties replay-command and repair-action clarity but remains wrong_result for trace hydration and candidate-drop stage visibility." + }, + "commands": [ + { + "system": "ELF", + "purpose": "stress retrieval guardrail with trace ids", + "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "system": "ELF", + "purpose": "admin trace bundle hydration", + "command": "curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant>' -H 'X-ELF-Project-Id: <project>' -H 'X-ELF-Agent-Id: <agent>'", + "status": "available_not_hydrated_in_default_stress_report", + "artifact": "elf.trace_bundle/v1 admin response" + }, + { + "system": "ELF", + "purpose": "trace ranking replay from persisted candidates", + "command": "cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>", + "status": "available_not_run_for_the_checked_in_stress_report", + "artifact": "elf-eval trace compare JSON" + }, + { + "system": "qmd", + "purpose": "stress retrieval guardrail plus top-10 rows", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "status": "pass", + "artifact": "tmp/live-baseline/qmd-query.json" + }, + { + "system": "qmd", + "purpose": "per-query replay", + "command": "npx tsx src/cli/qmd.ts query 'lex: <query>\\nvec: <query>' -c elfbench --json --no-rerank --min-score 0 -n 10", + "status": "pass_in_baseline_driver", + "artifact": "tmp/live-baseline/qmd-query.json" + }, + { + "system": "qmd", + "purpose": "lifecycle replay", + "command": "npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank", + "status": "pass_for_update_delete_cold_start_checks", + "artifact": "tmp/live-baseline/qmd-query.json" + } + ], + "scenario_outcomes": [ + { + "scenario_id": "retrieval_correctness_guardrail", + "surface": "retrieval correctness", + "evidence_class": "live_real_world_and_live_baseline_only", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics.", + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", + "tmp/live-baseline/live-baseline-report.json" + ] + }, + { + "scenario_id": "default_top10_candidate_artifact", + "surface": "default top-10 candidate artifact", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "elf_status": "not_encoded", + "qmd_status": "pass", + "outcome": "loss", + "diagnostic_judgment": "qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report.", + "artifacts": [ + "tmp/live-baseline/qmd-query.json", + "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json" + ] + }, + { + "scenario_id": "replay_command_locality", + "surface": "replay command locality", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "elf_status": "not_encoded", + "qmd_status": "pass", + "outcome": "loss", + "diagnostic_judgment": "qmd replay is a short local CLI query/update/embed path; ELF replay requires a live service config, persisted traces, headers, and trace ids.", + "artifacts": [ + "scripts/live-baseline-benchmark.sh", + "apps/elf-eval/src/app.rs", + "docs/spec/system_elf_memory_service_v2.md" + ] + }, + { + "scenario_id": "trace_admin_replay_surface_availability", + "surface": "trace/admin replay surface availability", + "evidence_class": "implementation_reference", + "result_type": "not_encoded", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "ELF has admin trace bundles and elf-eval trace replay; qmd has direct CLI replay. They are different useful surfaces and are not scored as equivalent quality.", + "artifacts": [ + "docs/spec/system_elf_memory_service_v2.md", + "apps/elf-eval/src/app.rs", + "scripts/live-baseline-benchmark.sh" + ] + }, + { + "scenario_id": "operator_debug_trace_hydration", + "surface": "operator-debug trace hydration", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF live operator-debug jobs generate trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs; qmd generates local replay commands but no service trace hydration surface.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + ] + }, + { + "scenario_id": "operator_debug_replay_command_availability", + "surface": "operator-debug replay command availability", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "ELF emits admin trace-bundle curl commands and qmd emits local CLI query replay commands for the same operator-debugging scenarios; this scores command availability, not equivalent UI quality.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + ] + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "surface": "operator-debug candidate-drop visibility", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF exposes dropped-candidate visibility through generated operator_debug metadata without direct SQL assumptions; qmd exposes top-k replay rows but no intermediate candidate-drop stages in this slice.", + "typed_non_pass_states": [ + "retrieved_but_dropped" + ], + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json", + "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + ] + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "surface": "operator-debug repair-action clarity", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "outcome": "tie", + "diagnostic_judgment": "Both live operator-debug adapters emit concrete next steps for replay or trace-bundle inspection; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "artifacts": [ + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + ] + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "surface": "operator-debug selected-but-not-narrated evidence", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "The operator-debug slice now scores selected-but-not-narrated evidence as a trace/answer-composition repair surface without direct database inspection.", + "typed_non_pass_states": [ + "selected_but_not_narrated" + ], + "artifacts": [ + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/selected_but_not_narrated.json" + ] + }, + { + "scenario_id": "query_expansion_attribution", + "surface": "query expansion attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "No comparable artifact shows expansion variants or dynamic expansion decisions for both systems.", + "artifacts": [ + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "dense_sparse_channel_attribution", + "surface": "dense/sparse channel attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "ELF uses dense plus BM25 and qmd uses structured lex plus vec, but the scored artifacts do not expose comparable per-channel contribution.", + "artifacts": [ + "docs/spec/system_elf_memory_service_v2.md", + "scripts/live-baseline-benchmark.sh" + ] + }, + { + "scenario_id": "fusion_attribution", + "surface": "fusion attribution", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "No comparable artifact shows fusion inputs, RRF or weighted-fusion contribution, or fusion-stage candidate drops.", + "artifacts": [ + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "rerank_attribution", + "surface": "rerank attribution", + "evidence_class": "live_baseline_only", + "result_type": "non_goal", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "non_goal", + "diagnostic_judgment": "The current qmd stress and materializer paths use --no-rerank; no rerank-on comparison is claimed.", + "artifacts": [ + "scripts/live-baseline-benchmark.sh", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "candidate_drop_diagnostics", + "surface": "candidate-drop diagnostics", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "outcome": "not_tested", + "diagnostic_judgment": "retrieved_but_dropped is defined but not observed because current qmd artifacts lack intermediate candidate traces and the ELF stress report does not hydrate candidate bundles.", + "typed_non_pass_states": [ + "retrieved_but_dropped" + ], + "artifacts": [ + "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + }, + { + "scenario_id": "selected_but_not_narrated_wrong_results", + "surface": "selected-but-not-narrated wrong-result diagnosis", + "evidence_class": "live_real_world", + "result_type": "wrong_result", + "elf_status": "wrong_result", + "qmd_status": "wrong_result", + "outcome": "tie", + "diagnostic_judgment": "Both live paths produce memory-evolution wrong results where evidence is present but current-vs-historical or lifecycle narration is missing.", + "typed_non_pass_states": [ + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + }, + { + "scenario_id": "evidence_absent_tombstone_diagnostics", + "surface": "evidence-absent and tombstone diagnosis", + "evidence_class": "live_real_world", + "result_type": "wrong_result", + "elf_status": "pass", + "qmd_status": "wrong_result", + "outcome": "win", + "diagnostic_judgment": "ELF retrieved all required memory-evolution evidence and passed delete/TTL; qmd missed three required evidence links including the delete tombstone.", + "typed_non_pass_states": [ + "evidence_absent", + "contradicted_by_lifecycle_evidence" + ], + "artifacts": [ + "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + ] + } + ], + "wrong_result_diagnostics": { + "typed_non_pass_states": [ + { + "class": "evidence_absent", + "coverage": "observed_for_qmd", + "meaning": "Required evidence is absent from produced evidence ids." + }, + { + "class": "retrieved_but_dropped", + "coverage": "not_tested", + "meaning": "Required evidence appears in an intermediate candidate set but is absent from the final selected or narrated answer." + }, + { + "class": "selected_but_not_narrated", + "coverage": "observed_for_elf_and_qmd", + "meaning": "Evidence is selected or available, but the answer does not narrate the required lifecycle relationship." + }, + { + "class": "contradicted_by_lifecycle_evidence", + "coverage": "observed_for_elf_and_qmd", + "meaning": "The answer is contradicted or made incomplete by current, historical, supersession, or tombstone evidence." + } + ], + "qmd_missing_evidence": [ + "verdict-bounded-private-caveat", + "pref-current-concise-rationale", + "delete-tombstone" + ] + }, + "claim_boundaries": [ + "ELF and qmd remain tied on encoded retrieval correctness.", + "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay.", + "ELF trace/admin endpoint availability is not proof that the default benchmark report has qmd-level candidate visibility.", + "Rerank superiority is not scored from a qmd --no-rerank run.", + "Do not claim qmd beats ELF as a memory system overall.", + "Do not collapse not_tested, non_goal, or wrong_result into pass evidence.", + "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity.", + "Expansion, dense/sparse contribution, fusion, rerank-on quality, and broad retrieved-but-dropped diagnosis outside the operator-debug slice remain unproven.", + "Do not convert the XY-932 operator-debug trace slice into a broad viewer-product win over OpenMemory or claude-mem; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists." + ] +} diff --git a/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json new file mode 100644 index 00000000..f5d38617 --- /dev/null +++ b/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json @@ -0,0 +1,140 @@ +{ + "schema": "elf.first_generation_oss_continuity_source_store_report/v1", + "report_id": "xy-925-first-generation-oss-continuity-source-store-2026-06-11", + "authority": "XY-925", + "created_at": "2026-06-11T00:00:00Z", + "scope": "Fixture-backed first-generation OSS prompt coverage and typed blockers for agentmemory, memsearch, and claude-mem without promoting smoke evidence into real-world suite pass evidence.", + "validation": { + "command": "cargo make real-world-first-generation-oss", + "status": "pass", + "json_artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "markdown_artifact": "tmp/real-world-memory/first-generation-oss/report.md", + "summary": { + "job_count": 6, + "encoded_suite_count": 4, + "pass": 4, + "blocked": 2, + "evidence_coverage": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0, + "operator_debug_job_count": 2, + "raw_sql_needed_count": 0 + } + }, + "manifest": { + "path": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "scenario_outcome_counts": { + "win": 9, + "tie": 9, + "loss": 1, + "not_tested": 8, + "blocked": 6, + "non_goal": 3 + }, + "scenario_status_counts": { + "unsupported": 2, + "blocked": 6, + "wrong_result": 5, + "lifecycle_fail": 1, + "pass": 20, + "not_encoded": 2 + } + }, + "scenario_judgments": [ + { + "project": "agentmemory", + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "The selected local path is a Docker-contained session directory that persists the SDK KV/index and observation log across a fresh process.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "project": "agentmemory", + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "Live agentmemory hook observations and persisted write-policy audit evidence are required before capture/write-policy scoring.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "project": "memsearch", + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers canonical Markdown files as source of truth and memsearch index as derived rebuild/reload behavior.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "project": "memsearch", + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers CLI replay, Markdown source inspection, and reindexing while keeping staged trace bundles not encoded.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "project": "claude-mem", + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "comparison_outcome": "win", + "evidence": "The prompt fixture preserves claude-mem same-corpus retrieval as wrong_result and names rerun/inspection targets tmp/live-baseline/claude-mem.log plus tmp/live-baseline/claude-mem-checks.json.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "project": "claude-mem", + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "comparison_outcome": "not_tested", + "evidence": "The prompt fixture covers repository search-to-detail/source hydration on durable SQLite and separates it from hook/viewer claims.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "project": "claude-mem", + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "The current Docker baseline uses repository classes only and does not execute hooks, timeline capture, or viewer workflows.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "project": "claude-mem", + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "comparison_outcome": "blocked", + "evidence": "A fair viewer/operator comparison needs Docker-contained readback over the same durable SQLite corpus.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "claim_boundaries": { + "allowed": [ + "agentmemory has a selected durable local path for future work-resume and capture/write-policy scoring.", + "memsearch has checked-in source-store and retrieval-debug prompt coverage over the canonical Markdown store.", + "claude-mem has checked-in progressive-disclosure and retrieval-repair prompt coverage for the Docker-contained repository path.", + "claude-mem hook capture and viewer/operator workflows remain typed blockers." + ], + "not_allowed": [ + "Do not claim agentmemory durable continuity from the in-memory same-corpus smoke.", + "Do not claim memsearch full real-world suite parity from Markdown reindex/reload smoke or fixture-backed prompt coverage.", + "Do not claim claude-mem retrieval passed; same-corpus retrieval remains wrong_result.", + "Do not claim claude-mem hooks or viewer workflows pass from repository class-level hydration evidence." + ] + } +} diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json new file mode 100644 index 00000000..ff2405b1 --- /dev/null +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -0,0 +1,237 @@ +{ + "schema": "elf.benchmark_measurement_coverage_audit/v2", + "run_id": "2026-06-11-measurement-coverage-audit", + "source_revision": "current benchmark lane after XY-927 core-vs-archival fixture coverage, XY-928 context-trajectory blocked fixtures, and XY-933 live capture/write-policy scoring", + "created_at": "2026-06-11", + "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", + "commands": [ + { + "command": "cargo make real-world-memory-core-archival", + "status": "pass", + "runtime_seconds": 12.14, + "artifact": "tmp/real-world-memory/core-archival/report.json" + }, + { + "command": "cargo make real-world-memory", + "status": "pass", + "runtime_seconds": 11.09, + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 137.66, + "artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "fixture_aggregate": { + "job_count": 49, + "encoded_suite_count": 13, + "pass": 44, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 5, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_score": 0.898, + "mean_latency_ms": 3.94, + "expected_evidence_total": 100, + "expected_evidence_matched": 100, + "evidence_required_count": 111, + "evidence_covered_count": 111 + }, + "live_real_world_adapters": [ + { + "adapter": "ELF live service adapter", + "job_count": 40, + "encoded_suite_count": 11, + "pass": 22, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 11, + "mean_score": 0.599, + "mean_latency_ms": 6.98, + "expected_evidence_total": 80, + "expected_evidence_matched": 50, + "evidence_required_count": 88, + "evidence_covered_count": 58 + }, + { + "adapter": "qmd live CLI adapter", + "job_count": 40, + "encoded_suite_count": 11, + "pass": 17, + "wrong_result": 6, + "blocked": 2, + "not_encoded": 15, + "mean_score": 0.461, + "mean_latency_ms": 792.543, + "expected_evidence_total": 80, + "expected_evidence_matched": 38, + "evidence_required_count": 88, + "evidence_covered_count": 45 + } + ], + "live_suite_delta": "ELF passes memory-evolution-delete-ttl-001 while qmd reports wrong_result; ELF also passes the live capture/write-policy suite while qmd remains not_encoded for capture_integration.", + "live_suite_breakdown": [ + { + "suite": "trust_source_of_truth", + "jobs": 1, + "elf_status_counts": { + "pass": 1 + }, + "qmd_status_counts": { + "pass": 1 + } + }, + { + "suite": "work_resume", + "jobs": 5, + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } + }, + { + "suite": "retrieval", + "jobs": 5, + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } + }, + { + "suite": "project_decisions", + "jobs": 5, + "elf_status_counts": { + "pass": 5 + }, + "qmd_status_counts": { + "pass": 5 + } + }, + { + "suite": "personalization", + "jobs": 1, + "elf_status_counts": { + "pass": 1 + }, + "qmd_status_counts": { + "pass": 1 + } + }, + { + "suite": "memory_evolution", + "jobs": 6, + "elf_status_counts": { + "pass": 1, + "wrong_result": 5 + }, + "qmd_status_counts": { + "wrong_result": 6 + } + }, + { + "suite": "capture_integration", + "jobs": 4, + "elf_status_counts": { + "pass": 4 + }, + "qmd_status_counts": { + "not_encoded": 4 + } + }, + { + "suite": "consolidation", + "jobs": 4, + "elf_status_counts": { + "not_encoded": 4 + }, + "qmd_status_counts": { + "not_encoded": 4 + } + }, + { + "suite": "knowledge_compilation", + "jobs": 2, + "elf_status_counts": { + "not_encoded": 2 + }, + "qmd_status_counts": { + "not_encoded": 2 + } + }, + { + "suite": "operator_debugging_ux", + "jobs": 1, + "elf_status_counts": { + "not_encoded": 1 + }, + "qmd_status_counts": { + "not_encoded": 1 + } + }, + { + "suite": "production_ops", + "jobs": 6, + "elf_status_counts": { + "blocked": 2, + "not_encoded": 4 + }, + "qmd_status_counts": { + "blocked": 2, + "not_encoded": 4 + } + } + ], + "adapter_ledger": { + "adapter_records": 23, + "unique_project_names": 17, + "external_project_count_note": "The generated report field external_project_count reports unique non-ELF project names after the XY-900 runner repair; the manifest has 16 external projects and 17 total project names including ELF.", + "evidence_class_counts": { + "fixture_backed": 1, + "live_baseline_only": 6, + "live_real_world": 5, + "research_gate": 11 + }, + "overall_status_counts": { + "pass": 4, + "wrong_result": 6, + "lifecycle_fail": 1, + "blocked": 7, + "not_encoded": 5 + }, + "xy900_update_note": "XY-900 promotes graphify from research_gate/blocked to a tiny scored live_real_world wrong_result smoke; broad graph/RAG quality remains unproven.", + "xy932_update_note": "XY-932 adds narrow ELF/qmd operator-debug live_real_world records: ELF pass and qmd wrong_result for trace hydration/candidate-drop visibility, with OpenMemory and claude-mem UI still unmeasured.", + "xy933_update_note": "XY-933 adds live ELF capture/write-policy scoring: ELF passes 4/4 capture_integration jobs with zero redaction leaks, qmd remains not_encoded, agentmemory comparison is blocked by mocked/in-memory storage, and claude-mem capture hooks remain not_encoded.", + "xy928_update_note": "XY-928 adds three blocked context_trajectory fixtures for OpenViking staged retrieval, hierarchy selection, and recursive/context expansion; no trajectory win/tie/loss is claimed." + }, + "claim_boundary": { + "elf_vs_qmd": "near_tie_with_narrow_delete_ttl_elf_lead_not_overall_win", + "elf_personal_production": "credible_with_bounded_caveats", + "broad_competitor_superiority": "not_proven", + "major_unmeasured_strengths": [ + "qmd_deep_retrieval_debug", + "OpenViking_context_trajectory", + "mem0_OpenMemory_entity_history_ui", + "agentmemory_claude_mem_capture_breadth", + "Letta_core_vs_archival_export_path", + "Graphiti_Zep_temporal_graph", + "RAG_graph_navigation", + "llm_wiki_gbrain_graphify_knowledge_workflows" + ] + }, + "next_reports": [ + "ELF/qmd retrieval-debug deep profile", + "ELF/qmd live memory-evolution diagnostic", + "External capture-hook report for agentmemory and claude-mem", + "Continuity and context-trajectory report", + "Personalization and core-memory report", + "Knowledge and graph/RAG report pack" + ] +} diff --git a/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json b/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json new file mode 100644 index 00000000..decee8e7 --- /dev/null +++ b/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json @@ -0,0 +1,376 @@ +{ + "schema": "elf.competitor_strength_profile_report/v1", + "run_id": "2026-06-11-qmd-openviking-strength-profile", + "created_at": "2026-06-11", + "scope": "Scenario-level qmd retrieval-debug and OpenViking context-trajectory strength profile outcomes for XY-899.", + "inputs": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/spec/real_world_agent_memory_benchmark_v1.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "scripts/real-world-live-adapters.sh" + ], + "outcome_terms": [ + "elf_win", + "tie", + "elf_loss", + "not_tested" + ], + "result_type_terms": [ + "pass", + "wrong_result", + "blocked", + "incomplete", + "lifecycle_fail", + "not_encoded", + "unsupported_claim" + ], + "coverage_status_terms": [ + "pass", + "wrong_result", + "blocked", + "incomplete", + "lifecycle_fail", + "not_encoded", + "unsupported", + "unsupported_claim" + ], + "evidence_class_terms": [ + "fixture_backed", + "live_baseline_only", + "live_real_world", + "research_gate" + ], + "summary": { + "qmd": { + "overall_outcome": "not_tested", + "overall_rationale": "ELF ties qmd on encoded retrieval and lifecycle surfaces; qmd query-transparency, replayability, and expansion/fusion/rerank strengths remain not_tested for comparative scoring because equivalent scored ELF surfaces are not encoded.", + "retrieval_quality": "tie", + "local_query_transparency": "not_tested", + "local_replayability": "not_tested", + "expansion_fusion_rerank": "not_tested", + "claim": "ELF ties qmd on encoded retrieval correctness and equivalent update/delete/cold-start behavior. qmd remains the local retrieval-debug UX reference, but ELF has no scored loss on query-transparency, replayability, expansion, fusion, or rerank controls until equivalent comparative surfaces are encoded." + }, + "openviking": { + "overall_outcome": "not_tested", + "overall_rationale": "OpenViking context-trajectory strengths remain blocked/not_tested; ELF has only one same-corpus retrieval precondition win.", + "claim": "ELF has one measured win on the same-corpus evidence-bearing precondition where OpenViking currently returns wrong_result. ELF does not have a measured win, tie, or loss against OpenViking context-trajectory strengths because staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures until scored staged output exists." + } + }, + "qmd_strength_profile": { + "scenario_outcomes": [ + { + "scenario_id": "qmd-retrieval-quality", + "surface": "retrieval quality", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "elf_outcome": "tie", + "retrieval_quality": "ELF and qmd each pass 5/5 live real-world retrieval jobs with 6/6 expected evidence matched.", + "debug_replay_ergonomics": "not scored by this scenario", + "source_artifacts": [ + "tmp/real-world-memory/live-adapters/elf-report.json", + "tmp/real-world-memory/live-adapters/qmd-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-local-query-transparency", + "surface": "local query transparency", + "evidence_class": "live_baseline_only", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "pass", + "elf_outcome": "not_tested", + "retrieval_quality": "not a correctness scenario", + "debug_replay_ergonomics": "qmd stress artifacts expose per-query top-10 files, line numbers, snippets, scores, and distractor density; ELF stress artifacts expose trace ids and top evidence but do not hydrate an equivalent candidate list in the checked-in report, so this surface is not scored as a comparative ELF loss.", + "source_artifacts": [ + "scripts/live-baseline-benchmark.sh", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-expansion-fusion-rerank-controls", + "surface": "expansion, fusion, and rerank controls", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "elf_outcome": "not_tested", + "retrieval_quality": "not scored", + "debug_replay_ergonomics": "The qmd materializer and stress baseline use structured lex/vec query input with --no-rerank; no scenario scores expansion, fusion, or rerank superiority for either system.", + "source_artifacts": [ + "scripts/real-world-live-adapters.sh", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-stale-context-isolation", + "surface": "stale context isolation", + "evidence_class": "live_real_world", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "elf_outcome": "tie", + "retrieval_quality": "Both adapters pass the encoded retrieval current-vs-obsolete and distractor-heavy jobs.", + "debug_replay_ergonomics": "The debug explanation of stale-candidate rejection is not scored beyond the job answer and evidence match.", + "source_artifacts": [ + "apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json", + "apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-update-delete-cold-start", + "surface": "update, delete, and cold-start behavior", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "elf_status": "pass", + "qmd_status": "pass", + "elf_outcome": "tie", + "retrieval_quality": "Equivalent qmd and ELF stress-baseline lifecycle checks pass for update replacement, delete suppression, and cold-start recovery.", + "debug_replay_ergonomics": "ELF has additional service lifecycle, backfill, rebuild, and resource evidence, but the equivalent qmd strength surface is a tie.", + "source_artifacts": [ + "tmp/live-baseline/live-baseline-report.json", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-operator-debug-evidence", + "surface": "operator-debug evidence", + "evidence_class": "live_real_world", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "elf_outcome": "not_tested", + "retrieval_quality": "not scored", + "debug_replay_ergonomics": "The live real-world sweep marks operator_debugging_ux not_encoded for both ELF and qmd. ELF fixture-backed operator-debug jobs pass, but they are not live adapter evidence.", + "source_artifacts": [ + "tmp/real-world-memory/live-adapters/elf-report.json", + "tmp/real-world-memory/live-adapters/qmd-report.json", + "apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json" + ] + }, + { + "scenario_id": "qmd-local-replayability", + "surface": "local replayability", + "evidence_class": "live_baseline_only", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "pass", + "elf_outcome": "not_tested", + "retrieval_quality": "not a correctness scenario", + "debug_replay_ergonomics": "qmd's observed replay path is collection add, update, embed -f, and query --json in a fresh CLI process; ELF has service traces and admin bundle endpoints, but no scored replayability rule compares the two surfaces yet.", + "source_artifacts": [ + "scripts/live-baseline-benchmark.sh", + "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + ] + }, + { + "scenario_id": "qmd-wrong-result-diagnosis", + "surface": "wrong-result diagnosis", + "evidence_class": "research_gate", + "result_type": "not_encoded", + "elf_status": "not_encoded", + "qmd_status": "not_encoded", + "elf_outcome": "not_tested", + "retrieval_quality": "The memory-evolution diagnostic classifies qmd misses and selected-but-not-narrated lifecycle failures from produced evidence; candidate-drop classification remains untested because qmd live job artifacts do not expose candidate-stage traces.", + "debug_replay_ergonomics": "The report taxonomy supports absent evidence, retrieved-but-dropped evidence, selected-but-not-narrated evidence, and lifecycle-contradicted evidence. Current qmd data exercises absent and selected-but-not-narrated classes; retrieved-but-dropped remains not observed.", + "source_artifacts": [ + "docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" + ] + } + ], + "win_tie_loss_summary": { + "elf_win": 0, + "tie": 3, + "elf_loss": 0, + "not_tested": 5 + }, + "wrong_result_diagnosis": { + "taxonomy": [ + { + "class": "evidence_absent", + "meaning": "Required evidence is absent from the adapter-produced evidence ids.", + "coverage": "observed" + }, + { + "class": "retrieved_but_dropped", + "meaning": "Required evidence appears in an intermediate candidate set but is absent from the final selected/narrated answer.", + "coverage": "not_observed_candidate_trace_missing" + }, + { + "class": "selected_but_not_narrated", + "meaning": "Evidence is selected or available, but the answer does not narrate the required current-vs-historical or lifecycle relationship.", + "coverage": "observed" + }, + { + "class": "contradicted_by_lifecycle_evidence", + "meaning": "The answer is contradicted or made incomplete by available current, historical, supersession, or tombstone evidence.", + "coverage": "observed" + } + ], + "jobs": [ + { + "job_id": "memory-evolution-benchmark-verdict-001", + "qmd_status": "wrong_result", + "score": 0.15, + "classifications": [ + "evidence_absent", + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [ + "verdict-bounded-private-caveat" + ], + "diagnosis": "qmd missed the caveat evidence and did not represent the superseded not-ready verdict as historical." + }, + { + "job_id": "memory-evolution-deploy-method-001", + "qmd_status": "wrong_result", + "score": 0.4, + "classifications": [ + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [], + "diagnosis": "qmd retrieved current runbook and rationale evidence, but did not preserve the old quickstart path as historical." + }, + { + "job_id": "memory-evolution-issue-state-001", + "qmd_status": "wrong_result", + "score": 0.4, + "classifications": [ + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [], + "diagnosis": "qmd found current done state and rationale evidence, but did not surface the earlier blocked state as superseded history." + }, + { + "job_id": "memory-evolution-preference-001", + "qmd_status": "wrong_result", + "score": 0.15, + "classifications": [ + "evidence_absent", + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [ + "pref-current-concise-rationale" + ], + "diagnosis": "qmd only returned rationale evidence and did not preserve the old terse preference as historical." + }, + { + "job_id": "memory-evolution-relation-temporal-001", + "qmd_status": "wrong_result", + "score": 0.35, + "classifications": [ + "selected_but_not_narrated", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [], + "diagnosis": "qmd retrieved current and historical owners, but did not produce temporal-validity explanation or update rationale." + }, + { + "job_id": "memory-evolution-delete-ttl-001", + "qmd_status": "wrong_result", + "score": 0.5, + "classifications": [ + "evidence_absent", + "contradicted_by_lifecycle_evidence" + ], + "missing_evidence": [ + "delete-tombstone" + ], + "diagnosis": "qmd retrieved the current plan but missed the tombstone evidence, so the delete/TTL lifecycle answer remains a typed wrong_result." + } + ] + } + }, + "openviking_context_trajectory_profile": { + "scenario_outcomes": [ + { + "scenario_id": "openviking-local-embed-setup", + "surface": "Docker local embedding setup", + "evidence_class": "live_baseline_only", + "result_type": "pass", + "openviking_status": "pass", + "elf_equivalent_status": "unsupported", + "elf_outcome": "not_tested", + "typed_blocker": null, + "evidence": "The pinned llama-cpp-python==0.3.28 CPU wheel path installed and OpenViking reached add_resource/find in Docker." + }, + { + "scenario_id": "openviking-evidence-bearing-retrieval-precondition", + "surface": "same-corpus evidence-bearing retrieval precondition", + "evidence_class": "live_baseline_only", + "result_type": "wrong_result", + "openviking_status": "wrong_result", + "elf_equivalent_status": "pass", + "elf_outcome": "elf_win", + "typed_blocker": "output_missed_expected_terms", + "evidence": "OpenViking add_resource/find returned resources but matched 0/3 expected evidence-term checks; this is a wrong_result smoke output, not a trajectory comparison." + }, + { + "scenario_id": "openviking-staged-retrieval-trajectory", + "surface": "staged retrieval trajectory", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", + "elf_equivalent_status": "not_encoded", + "elf_outcome": "not_tested", + "typed_blocker": "needs_evidence_bearing_same_corpus_output", + "evidence": "The context_trajectory fixture context-trajectory-openviking-staged-retrieval-001 is encoded as blocked until OpenViking returns evidence-bearing same-corpus output and comparable staged artifacts." + }, + { + "scenario_id": "openviking-hierarchy-selection", + "surface": "hierarchy selection", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", + "elf_equivalent_status": "unsupported", + "elf_outcome": "not_tested", + "typed_blocker": "hierarchy_output_not_scored", + "evidence": "The context_trajectory fixture context-trajectory-openviking-hierarchy-selection-001 is encoded as blocked until selected hierarchy nodes and evidence ids are materialized." + }, + { + "scenario_id": "openviking-recursive-context-expansion", + "surface": "recursive/context expansion", + "evidence_class": "fixture_backed", + "result_type": "blocked", + "openviking_status": "blocked", + "elf_equivalent_status": "not_encoded", + "elf_outcome": "not_tested", + "typed_blocker": "recursive_expansion_not_materialized", + "evidence": "The context_trajectory fixture context-trajectory-openviking-recursive-expansion-001 is encoded as blocked until expansion paths and expected evidence ids are materialized." + }, + { + "scenario_id": "openviking-missed-expected-terms-evidence", + "surface": "typed failure evidence when expected terms are missed", + "evidence_class": "live_baseline_only", + "result_type": "wrong_result", + "openviking_status": "wrong_result", + "elf_equivalent_status": "pass", + "elf_outcome": "not_tested", + "typed_blocker": "retrieval_wrong_result", + "evidence": "The baseline report preserves the same missed expected terms as wrong_result instead of loosening evidence expectations or reporting setup failure; this row documents typed failure evidence and is not counted as a second comparative win." + } + ], + "win_tie_loss_summary": { + "elf_win": 1, + "tie": 0, + "elf_loss": 0, + "not_tested": 5 + } + }, + "claim_boundaries": [ + "ELF does not broadly beat qmd; it ties encoded retrieval and lifecycle correctness, keeps qmd query transparency as not_tested for comparative scoring, and leaves replayability not_tested.", + "qmd expansion, fusion, and rerank superiority remains not_tested because the current qmd paths use --no-rerank and do not score internals.", + "ELF does not beat OpenViking on context trajectory; OpenViking trajectory strengths remain blocked/not_tested behind a wrong_result same-corpus output precondition and missing staged artifacts.", + "Research_gate and blocked fixture records are follow-up gates, not pass evidence.", + "Missing equivalent surfaces are encoded as unsupported, blocked, or not_encoded rather than fake losses." + ] +} diff --git a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json new file mode 100644 index 00000000..8bfcffd6 --- /dev/null +++ b/docs/research/2026-06-11-temporal-history-competitor-gap-report.json @@ -0,0 +1,356 @@ +{ + "schema": "elf.temporal_history_competitor_gap_report/v1", + "run_id": "2026-06-11-temporal-history-competitor-gap-report", + "commit": "d6d9051f9e28384410308ac952936fcdb021dbc2", + "created_at": "2026-06-11", + "scope": "Report-only competitor gap assessment for temporal/history memory, lifecycle smoke, and future ELF optimization direction", + "role_boundary": "No ELF optimization implementation is included; this report records evidence, claim boundaries, and future optimization directions.", + "commands": [ + { + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "status": "blocked", + "typed_status": "provider_api_key_missing", + "runtime_seconds": 3.5, + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + { + "command": "ELF_BASELINE_PROJECTS=ELF,mem0 cargo make baseline-live-docker", + "status": "pass", + "runtime_seconds": 50.14, + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "command": "cargo make openmemory-ui-export-readback", + "status": "pass", + "runtime_seconds": 35.14, + "artifact": "tmp/live-baseline/mem0-checks.json; tmp/live-baseline/mem0-openmemory-ui-export.json", + "claim": "XY-924 local OSS mem0 history run passes preference correction history, entity-scoped personalization, local get_all readback, and deletion audit history; XY-931 records OpenMemory export-helper setup as blocked with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER." + }, + { + "command": "cargo make real-world-memory-evolution", + "status": "pass", + "runtime_seconds": 59.65, + "artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 166.61, + "artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "executive_judgment": { + "goal_complete": false, + "summary": "ELF is a credible personal-production foundation, but the current evidence does not prove broad superiority across all tracked memory projects or all user-important scenarios.", + "highest_priority_gap": "temporal_reconciliation_and_lifecycle_readback", + "main_reason": "In live memory-evolution jobs, ELF retrieves the required evidence but does not represent current, historical, superseded, and deleted facts as explicit answer and trace state." + }, + "basic_local_lifecycle": { + "run_id": "live-baseline-20260611010431", + "project_filter": "ELF,mem0", + "verdict": "pass", + "summary": { + "total": 2, + "pass": 2, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0 + }, + "same_corpus_summary": { + "total": 2, + "pass": 2, + "fail": 0 + }, + "full_check_summary": { + "total": 12, + "pass": 12, + "fail": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0 + }, + "projects": [ + { + "project": "ELF", + "status": "pass", + "elapsed_seconds": 11, + "checks": 8, + "checks_passed": 8, + "passed_capabilities": [ + "resumable_backfill_no_duplicates", + "same_corpus_retrieval", + "async_worker_indexing_e2e", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search", + "concurrent_write_search_e2e", + "resource_envelope" + ] + }, + { + "project": "mem0", + "status": "pass", + "elapsed_seconds": 36, + "checks": 4, + "checks_passed": 4, + "passed_capabilities": [ + "same_corpus_retrieval", + "update_replaces_note_text", + "delete_suppresses_retrieval", + "cold_start_recovery_search" + ], + "not_measured": [ + "OpenMemory UI", + "hosted ecosystem behavior", + "OpenMemory UI/export quality", + "optional graph memory", + "real-world memory_evolution jobs" + ] + } + ], + "claim": "ELF and mem0 both pass the encoded local Docker lifecycle smoke; this does not prove ELF beats mem0/OpenMemory on its strongest product surfaces." + }, + "fixture_memory_evolution": { + "job_count": 5, + "pass": 5, + "wrong_result": 0, + "mean_score": 1.0, + "expected_evidence_total": 11, + "expected_evidence_matched": 11, + "conflict_detection_count": 5, + "update_rationale_available_count": 5, + "history_readback_encoded_count": 1 + }, + "live_real_world_context": { + "elf": { + "job_count": 38, + "encoded_suite_count": 11, + "pass": 18, + "wrong_result": 5, + "wrong_result_signal_count": 6, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.525, + "mean_latency_ms": 9.888, + "expected_evidence_total": 77, + "expected_evidence_matched": 41, + "evidence_required_count": 84, + "evidence_covered_count": 48 + }, + "qmd": { + "job_count": 38, + "encoded_suite_count": 11, + "pass": 17, + "wrong_result": 6, + "wrong_result_signal_count": 11, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.486, + "mean_latency_ms": 1132.646, + "expected_evidence_total": 77, + "expected_evidence_matched": 38, + "evidence_required_count": 84, + "evidence_covered_count": 45 + } + }, + "live_memory_evolution": { + "elf": { + "encoded_jobs": 6, + "pass": 1, + "wrong_result_jobs": 5, + "score_mean": 0.492, + "expected_evidence_recall": 1.0, + "diagnosis": "ELF retrieved all required memory-evolution evidence but did not emit lifecycle-aware current-vs-historical answer behavior on five jobs." + }, + "qmd": { + "encoded_jobs": 6, + "pass": 0, + "wrong_result_jobs": 6, + "score_mean": 0.325, + "expected_evidence_recall": 0.769, + "diagnosis": "qmd had the same missing temporal-conflict pattern and additionally missed evidence, including the delete tombstone." + }, + "job_matrix": [ + { + "job_id": "memory-evolution-benchmark-verdict-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "elf_evidence": "3/3", + "qmd_status": "wrong_result", + "qmd_score": 0.15, + "qmd_evidence": "2/3", + "diagnosis": "ELF found current verdict, caveat, and rationale but did not represent the superseded verdict as historical." + }, + { + "job_id": "memory-evolution-deploy-method-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "elf_evidence": "2/2", + "qmd_status": "wrong_result", + "qmd_score": 0.4, + "qmd_evidence": "2/2", + "diagnosis": "Both found current runbook and rationale, but neither preserved the old quickstart path as historical." + }, + { + "job_id": "memory-evolution-issue-state-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "elf_evidence": "2/2", + "qmd_status": "wrong_result", + "qmd_score": 0.4, + "qmd_evidence": "2/2", + "diagnosis": "Both found current done state and rationale, but neither surfaced the earlier blocked state as history." + }, + { + "job_id": "memory-evolution-preference-001", + "elf_status": "wrong_result", + "elf_score": 0.4, + "elf_evidence": "2/2", + "qmd_status": "wrong_result", + "qmd_score": 0.15, + "qmd_evidence": "1/2", + "diagnosis": "ELF found current preference and rationale, but did not preserve old preference history." + }, + { + "job_id": "memory-evolution-relation-temporal-001", + "elf_status": "wrong_result", + "elf_score": 0.35, + "elf_evidence": "2/2", + "qmd_status": "wrong_result", + "qmd_score": 0.35, + "qmd_evidence": "2/2", + "diagnosis": "Both found current and old owners, but did not emit temporal-validity explanation." + }, + { + "job_id": "memory-evolution-delete-ttl-001", + "elf_status": "pass", + "elf_score": 1.0, + "elf_evidence": "2/2", + "qmd_status": "wrong_result", + "qmd_score": 0.5, + "qmd_evidence": "1/2", + "diagnosis": "ELF found tombstone and current plan; qmd missed tombstone." + } + ] + }, + "graphiti_zep_temporal_smoke": { + "run_id": "graphiti-zep-docker-smoke-20260611010309", + "evidence_class": "research_gate", + "status": "blocked", + "failure_class": "provider_api_key_missing", + "failure_reason": "Graphiti/Zep live temporal search requires an explicit provider API key; no hosted Zep service or unrecorded provider credentials were used.", + "expected_evidence_ids": [ + "graphiti-zep-old-owner", + "graphiti-zep-current-owner", + "graphiti-zep-owner-rationale" + ], + "claim": "Graphiti/Zep remains a temporal-validity reference, but no live pass or ELF superiority claim is supported." + }, + "scenario_judgments": [ + { + "scenario": "basic_local_lifecycle", + "current_judgment": "elf_and_mem0_both_pass_encoded_smoke", + "claim_strength": "limited_tie_or_elf_broader_smoke_surface", + "next_gate": "OpenMemory compose/import path that loads the same corpus into the product app database; hosted Platform export and optional graph memory remain non-goals for the local OSS lane" + }, + { + "scenario": "retrieval_debug", + "current_judgment": "qmd_remains_debug_ux_reference", + "claim_strength": "no_elf_win_claim", + "next_gate": "ELF/qmd trace-level replay and wrong-result diagnosis" + }, + { + "scenario": "current_vs_historical_memory", + "current_judgment": "elf_narrowly_beats_qmd_but_still_fails_temporal_product_quality", + "claim_strength": "narrow_job_slice_only", + "next_gate": "ELF live memory_evolution pass for all six jobs" + }, + { + "scenario": "temporal_graph_validity", + "current_judgment": "graphiti_zep_blocked_reference", + "claim_strength": "no_comparable_claim", + "next_gate": "provider-backed Graphiti/Zep Docker temporal smoke" + }, + { + "scenario": "core_vs_archival_memory", + "current_judgment": "letta_research_only_reference", + "claim_strength": "no_comparable_claim", + "next_gate": "contained Letta export path and core-vs-archival jobs" + }, + { + "scenario": "production_operation_discipline", + "current_judgment": "elf_strongest_measured_local_story", + "claim_strength": "bounded_by_private_and_provider_gates", + "next_gate": "private-corpus and credentialed production-ops evidence only when operator inputs exist" + } + ], + "optimization_direction_order": [ + { + "priority": "P0", + "direction": "temporal_reconciliation_contract", + "description": "Add answer and trace semantics for current winner, historical loser, update rationale, tombstone, and supersession state.", + "benchmark_gate": "ELF live memory_evolution pass for all six jobs." + }, + { + "priority": "P0", + "direction": "mem0_openmemory_history_comparison", + "description": "Local OSS comparison has moved past basic update/delete smoke into preference history, entity memory, lifecycle inspection, deletion audit, and SDK export-style readback.", + "benchmark_gate": "Local OSS history jobs are encoded with per-scenario claims; OpenMemory UI/export has a bounded probe but remains blocked until a Docker-contained product app import/export path exists." + }, + { + "priority": "P0", + "direction": "qmd_level_debugging_and_replay", + "description": "Expose query expansion, sparse/dense retrieval, fusion, rerank, dropped candidates, conflict candidates, and replay commands.", + "benchmark_gate": "Every wrong result has a replayable trace that localizes absent, dropped, selected-but-not-narrated, or contradicted evidence." + }, + { + "priority": "P1", + "direction": "core_memory_blocks", + "description": "Evaluate Letta-style core memory blocks with provenance, attachment rules, stale-core detection, and archival fallback.", + "benchmark_gate": "Core-vs-archival jobs prove correct attachment, sharing, update visibility, and stale-core avoidance." + }, + { + "priority": "P1", + "direction": "capture_consolidation_knowledge_pages", + "description": "Score safe capture, reviewable consolidation, cited knowledge pages, timelines, and operator UX as live surfaces.", + "benchmark_gate": "Live capture, consolidation, knowledge, and operator-debugging suites move from not_encoded or fixture-only to comparable evidence." + }, + { + "priority": "P2", + "direction": "graph_rag_and_context_trajectory_adapters", + "description": "Measure Graphiti/Zep, RAGFlow, LightRAG, GraphRAG, graphify, OpenViking, llm-wiki, and gbrain with evidence-linked output contracts.", + "benchmark_gate": "Docker-contained or explicitly typed provider-backed adapters emit scored evidence outputs." + } + ], + "claim_boundaries": { + "allowed": [ + "ELF+mem0 basic local lifecycle smoke passed in the fresh Docker baseline.", + "mem0 local OSS history, entity-scoped personalization, deletion audit, and SDK get_all readback are measured by the XY-924 report.", + "OpenMemory UI/export readback is measured as a setup blocker by the XY-931 export-helper setup probe.", + "ELF narrowly outperformed qmd on the fresh memory-evolution slice because ELF passed delete/TTL and qmd did not.", + "ELF still failed five of six live memory-evolution jobs.", + "Graphiti/Zep temporal smoke is typed blocked due missing explicit provider key.", + "Letta is a design reference, not a measured comparable competitor in this report." + ], + "not_allowed": [ + "All goals are complete.", + "ELF beats all tracked memory projects.", + "ELF beats mem0/OpenMemory on preference history, UI/export, hosted behavior, or graph memory.", + "ELF beats Graphiti/Zep on temporal validity.", + "ELF beats Letta on core-vs-archival memory.", + "Fixture pass, baseline smoke pass, and live real-world pass are interchangeable evidence classes." + ] + }, + "next_issue_directions": [ + "P0 ELF live temporal reconciliation and trace contract", + "P0 OpenMemory Docker compose/import path after the XY-931 UI/export setup blocker", + "P0 ELF/qmd trace-level replay and wrong-result diagnosis", + "P1 Letta-style core-vs-archival memory benchmark", + "P2 Graphiti/Zep provider-backed temporal smoke after explicit provider credentials exist", + "P2 graph/RAG and knowledge-page Docker-contained evidence adapters" + ] +} diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json new file mode 100644 index 00000000..f74e0d45 --- /dev/null +++ b/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json @@ -0,0 +1,660 @@ +{ + "schema": "elf.competitor_strength_evidence_matrix/v1", + "matrix_id": "xy-897-competitor-strength-evidence-matrix-2026-06-11", + "date": "2026-06-11", + "authority": "XY-897", + "purpose": "Keep competitor-strength claims tied to measured evidence classes, typed blockers, and next benchmark gates.", + "source_inputs": [ + "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", + "docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md", + "docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/guide/research/external_memory_improvement_plan.md", + "docs/guide/research/research_projects_inventory.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "Makefile.toml" + ], + "claim_boundary": { + "summary": "Do not claim ELF beats, ties, or loses to a project unless the named scenario is encoded and run at a comparable evidence class.", + "current_live_real_world_boundary": "ELF and qmd have full-suite live_real_world sweeps, but both are typed non-pass sweeps, not full-suite live passes.", + "research_gate_boundary": "Research-gate records are routing evidence for future adapters and must not be counted as fixture-backed, live-baseline, or live-real-world pass evidence.", + "operator_boundary": "Private corpus and credentialed production-ops checks remain blocked until operator-owned inputs are supplied." + }, + "manifest_summary": { + "adapter_records": 23, + "project_count": 17, + "evidence_class_counts": { + "fixture_backed": 1, + "live_baseline_only": 6, + "live_real_world": 5, + "research_gate": 11 + }, + "overall_status_counts": { + "lifecycle_fail": 1, + "blocked": 7, + "not_encoded": 5, + "pass": 4, + "wrong_result": 6 + } + }, + "state_taxonomy": [ + { + "state": "fixture_backed", + "meaning": "A checked-in fixture or generated fixture response is scored by the real-world job runner. This is evidence for the benchmark contract, not live runtime behavior." + }, + { + "state": "live_baseline_only", + "meaning": "A Docker live-baseline adapter ran same-corpus or lifecycle checks, but no real-world job suite was scored through that project." + }, + { + "state": "live_real_world", + "meaning": "A project adapter materialized and scored real-world job records through a runtime or CLI path." + }, + { + "state": "research_gate", + "meaning": "Source, setup, resource, retry, and output-contract metadata exists, but the project has not produced live adapter pass evidence." + }, + { + "state": "blocked", + "meaning": "A safe measurement cannot run without operator-owned credentials, private data, setup proof, or a dependency outside the lane." + }, + { + "state": "unsupported", + "meaning": "The capability is out of scope for the project shape or would require a non-comparable path such as host-global state." + }, + { + "state": "wrong_result", + "meaning": "The system ran but missed expected memory, evidence, or answer terms." + }, + { + "state": "lifecycle_fail", + "meaning": "Basic retrieval may work, but update, delete, reload, persistence, or cold-start behavior is wrong or incomplete." + }, + { + "state": "incomplete", + "meaning": "The run did not reach the behavioral check because setup, install, dependency, or runtime execution failed." + }, + { + "state": "not_encoded", + "meaning": "The scenario is not currently encoded for that project or evidence class, so no pass or fail claim is allowed." + } + ], + "project_matrix": [ + { + "project": "ELF", + "strongest_user_facing_scenario": "Evidence-linked source-of-truth memory service with real-world fixtures and live service retrieval sweeps.", + "current_evidence_class": "live_real_world", + "supporting_evidence_classes": [ + "fixture_backed", + "live_real_world" + ], + "measured_status": "wrong_result", + "proof": { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md; tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "private_manifest_and_provider_credentials", + "details": "Fixture production-ops keeps private corpus and provider credential gates blocked; the full live sweep keeps broader non-retrieval suites typed non-pass, while the narrow operator-debug and live capture/write-policy slices now pass." + }, + "benchmark_before_claim": "A full-suite live_real_world pass plus separate private-corpus, credentialed production-ops, and durable external capture-hook evidence is required before broad live parity, production, or capture-breadth claims.", + "borrow_if_stronger": "Keep borrowing qmd debug knobs, OpenViking staged trajectory, mem0 history, Letta core memory, agentmemory/claude-mem capture breadth, and graph/RAG navigation patterns where they remain stronger." + }, + { + "project": "qmd", + "strongest_user_facing_scenario": "Local retrieval-debug workflow with transparent CLI indexing, querying, expansion, fusion, and rerank ergonomics.", + "current_evidence_class": "live_real_world", + "supporting_evidence_classes": [ + "live_baseline_only", + "live_real_world", + "research_gate" + ], + "measured_status": "wrong_result", + "proof": { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md; tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "unsupported_or_blocked_status": { + "state": "not_encoded", + "typed_reason": "deep_profile_and_non_retrieval_suites_not_encoded", + "details": "The full live sweep passes targeted retrieval suites but keeps memory_evolution wrong_result and several broader suites not_encoded or blocked; the narrow operator-debug slice ties replay commands but is wrong_result for trace hydration and candidate-drop visibility." + }, + "benchmark_before_claim": "Keep qmd deep retrieval/debug profiling separate from the narrow operator-debug live slice; no broad ELF-over-qmd or qmd-over-ELF claim is allowed until comparable stage artifacts exist.", + "borrow_if_stronger": "Borrow transparent local knobs for query rewriting, weighted fusion, rerank explanation, and command-line replay." + }, + { + "project": "agentmemory", + "strongest_user_facing_scenario": "Coding-agent continuity, MCP/REST packaging, viewer workflow, and durable cross-agent memory lifecycle.", + "current_evidence_class": "live_baseline_only", + "supporting_evidence_classes": [ + "live_baseline_only" + ], + "measured_status": "lifecycle_fail", + "proof": { + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "durable_lifecycle_and_capture_adapter_missing", + "details": "Same-corpus retrieval can run, but durable cold-start, capture-hook persistence, and real-world job adapter coverage are blocked by the current process-local StateKV Map and in-memory index path." + }, + "benchmark_before_claim": "Add a durable local adapter that covers update, delete, cold-start reload, work resume, capture/write policy, and lifecycle-staleness jobs.", + "borrow_if_stronger": "Borrow cross-agent hooks, packaging, continuity scenarios, and operator-visible viewer affordances." + }, + { + "project": "mem0/OpenMemory", + "strongest_user_facing_scenario": "Memory lifecycle, personalization, hosted/OpenMemory UI ergonomics, and optional graph memory.", + "current_evidence_class": "live_baseline_only", + "supporting_evidence_classes": [ + "live_baseline_only" + ], + "measured_status": "pass", + "proof": { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "openmemory_export_helper_setup_blocked", + "details": "Local OSS same-corpus/update/delete/reload, entity/preference history, deletion-audit readback, and SDK get_all readback now pass. OpenMemory UI/export remains blocked by the XY-931 export-helper setup probe until a product app import/export path can load the same corpus. Hosted Platform export is unsupported in the local OSS lane, and optional graph memory plus real-world prompt adapter coverage remain not_encoded." + }, + "benchmark_before_claim": "Add a Docker-contained OpenMemory product app import/export path, then score browser/API readback separately from SDK get_all; keep hosted Platform and graph memory opt-in or non-goal unless explicitly enabled.", + "borrow_if_stronger": "Borrow entity-scoped memory history, lifecycle surfaces, async update ergonomics, and OpenMemory-style inspection UX." + }, + { + "project": "memsearch", + "strongest_user_facing_scenario": "Markdown-first canonical store with rebuildable local index and practical hybrid retrieval.", + "current_evidence_class": "live_baseline_only", + "supporting_evidence_classes": [ + "live_baseline_only", + "fixture_backed" + ], + "measured_status": "pass", + "proof": { + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker; cargo make real-world-first-generation-oss", + "artifact": "tmp/live-baseline/live-baseline-report.json; tmp/real-world-memory/first-generation-oss/report.json" + }, + "unsupported_or_blocked_status": { + "state": "not_encoded", + "typed_reason": "live_prompt_runtime_adapter_not_encoded", + "details": "Basic canonical Markdown same-corpus/reindex/update/delete/reload smoke passes, and XY-925 adds fixture-backed source-store and retrieval-debug prompts. No live memsearch runtime adapter executes prompt scoring yet; memory-evolution prompt adapters remain not encoded and TTL/expiry is unsupported by the current CLI path." + }, + "benchmark_before_claim": "Promote the fixture-backed source-store and retrieval-debug prompts into a live memsearch real-world adapter before any suite-level win/loss claim; keep TTL/expiry unsupported unless a comparable path exists.", + "borrow_if_stronger": "Borrow the canonical markdown-store ergonomics, local reindex clarity, and user-inspectable source files." + }, + { + "project": "OpenViking", + "strongest_user_facing_scenario": "Filesystem-like context trajectory, hierarchical retrieval, and staged context loading.", + "current_evidence_class": "live_baseline_only", + "supporting_evidence_classes": [ + "live_baseline_only", + "fixture_backed", + "research_gate" + ], + "measured_status": "wrong_result", + "proof": { + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "hierarchical_context_trajectory_blocked", + "details": "Pinned Docker local embedding setup reaches add_resource/find, but same-corpus output misses expected evidence; staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + "benchmark_before_claim": "First make evidence-bearing same-corpus output pass, then run a context-trajectory suite that scores staged retrieval paths and hierarchy expansion.", + "borrow_if_stronger": "Borrow the viking-style filesystem context model, trajectory readback, and staged retrieval planning." + }, + { + "project": "claude-mem", + "strongest_user_facing_scenario": "Progressive disclosure, automatic capture loop, repository-local lifecycle, and practical local viewer workflow.", + "current_evidence_class": "live_baseline_only", + "supporting_evidence_classes": [ + "live_baseline_only", + "fixture_backed" + ], + "measured_status": "wrong_result", + "proof": { + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker; cargo make real-world-first-generation-oss", + "artifact": "tmp/live-baseline/live-baseline-report.json; tmp/real-world-memory/first-generation-oss/report.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "hook_viewer_runtime_paths_blocked", + "details": "Same-corpus retrieval remains wrong_result; XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompts. Hook capture and viewer/operator workflows still lack a Docker-contained runner, and the repair prompt lists rerun/inspection targets tmp/live-baseline/claude-mem.log plus tmp/live-baseline/claude-mem-checks.json." + }, + "benchmark_before_claim": "Promote durable repository-backed work_resume, operator_debugging_ux, capture/write-policy, viewer/operator, and progressive-disclosure prompts into a live claude-mem adapter before any broader UX claim.", + "borrow_if_stronger": "Borrow progressive disclosure, automatic capture review loops, and local viewer/operator comfort." + }, + { + "project": "RAGFlow", + "strongest_user_facing_scenario": "Full RAG application workflow with document, chunk, and reference evidence handles.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "blocked", + "proof": { + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "docker_service_resource_envelope_and_adapter_output_mapping", + "details": "Research says adapter candidate, but Docker runtime proof and reference.chunks to benchmark evidence mapping must still run." + }, + "benchmark_before_claim": "Run XY-885 tiny Docker evidence-smoke adapter and map RAGFlow reference chunks to scored retrieval/debug evidence.", + "borrow_if_stronger": "Borrow document/chunk reference surfaces, resource-envelope reporting, and RAG app evidence handles." + }, + { + "project": "LightRAG", + "strongest_user_facing_scenario": "Lightweight graph/RAG context export with source file-path citation shape.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "blocked", + "proof": { + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "docker_service_setup_and_context_export_not_proven", + "details": "The project is an adapter candidate, but retrieved-context export and real-world adapter scoring remain blocked." + }, + "benchmark_before_claim": "Run XY-886 Docker context-export adapter with explicit LLM and embedding config plus source citation mapping.", + "borrow_if_stronger": "Borrow context-only query modes, graph-aware retrieval layout, and file-path citation readback." + }, + { + "project": "GraphRAG", + "strongest_user_facing_scenario": "GraphRAG indexing, graph summaries, and document/text-unit evidence tables.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "blocked", + "proof": { + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "indexing_resource_envelope_and_source_citation_mapping", + "details": "Cost-bounded Docker CLI/API and parquet outputs are identified, but indexing and evidence mapping have not passed." + }, + "benchmark_before_claim": "Run XY-887 cost-bounded Docker adapter over a tiny corpus and score output tables against retrieval and knowledge-synthesis evidence.", + "borrow_if_stronger": "Borrow graph summary artifacts, local/global search separation, and source table evidence mapping." + }, + { + "project": "Graphiti/Zep", + "strongest_user_facing_scenario": "Temporal graph memory with current, historical, and future fact validity windows.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "blocked", + "proof": { + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "docker_graph_store_and_temporal_adapter_not_proven", + "details": "Temporal graph memory is an adapter candidate, but Docker graph-store setup and real-world job scoring are blocked." + }, + "benchmark_before_claim": "Run XY-888 Docker-local temporal graph adapter and score current versus historical fact validity with evidence ids.", + "borrow_if_stronger": "Borrow temporal fact windows, invalidation/supersession semantics, and graph fact provenance." + }, + { + "project": "Letta", + "strongest_user_facing_scenario": "Core memory blocks versus archival memory with explicit operating-context surfaces.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "blocked", + "proof": { + "command": "blocked until a Docker-only benchmark-created agent export is implemented", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "contained_export_readback_artifact_missing", + "details": "The selected contract requires a benchmark-created Letta agent export with core block JSON, archival search/readback JSON, and source ids before any scenario claim can be scored." + }, + "benchmark_before_claim": "Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists.", + "borrow_if_stronger": "Borrow explicit core memory block ergonomics, archival separation, and shared operating context readback." + }, + { + "project": "LangGraph", + "strongest_user_facing_scenario": "Checkpoint/replay regression workflow and durable state replay for agent runs.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "not_encoded", + "proof": { + "command": null, + "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + }, + "unsupported_or_blocked_status": { + "state": "unsupported", + "typed_reason": "not_a_standalone_memory_backend_adapter", + "details": "Keep as a checkpoint/replay reference, not as a direct memory backend competitor until a comparable memory output contract exists." + }, + "benchmark_before_claim": "Non-goal for direct win/loss until a standalone memory adapter contract exists; use replay regression jobs as a benchmark infrastructure reference.", + "borrow_if_stronger": "Borrow checkpoint replay, deterministic regression, and state-diff evaluation patterns." + }, + { + "project": "nanograph", + "strongest_user_facing_scenario": "Typed graph schema and query ergonomics for graph-lite developer experience.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "not_encoded", + "proof": { + "command": null, + "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + }, + "unsupported_or_blocked_status": { + "state": "unsupported", + "typed_reason": "not_a_memory_backend_comparison_target", + "details": "Official shape is no server and no Docker path; use as graph-lite DX reference rather than adapter proof." + }, + "benchmark_before_claim": "Non-goal for direct win/loss unless a contained memory-backed comparison target emerges; measure ELF graph-lite DX against typed schema/query acceptance instead.", + "borrow_if_stronger": "Borrow typed relation schema, query ergonomics, and small graph developer experience." + }, + { + "project": "llm-wiki", + "strongest_user_facing_scenario": "LLM-maintained wiki or knowledge-page workflow with query-save and lint loops.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "not_encoded", + "proof": { + "command": null, + "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + }, + "unsupported_or_blocked_status": { + "state": "unsupported", + "typed_reason": "live_service_runtime_not_available_for_adapter_proof", + "details": "Research-only until a contained plugin or instruction harness can emit scored knowledge-page evidence." + }, + "benchmark_before_claim": "Select a contained plugin or instruction harness, then score knowledge pages for citation coverage, unsupported claims, rebuild, and stale-source lint.", + "borrow_if_stronger": "Borrow maintained wiki workflows, page lint, query-save loops, and topic-scoped knowledge navigation." + }, + { + "project": "gbrain", + "strongest_user_facing_scenario": "Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops.", + "current_evidence_class": "research_gate", + "supporting_evidence_classes": [ + "research_gate" + ], + "measured_status": "not_encoded", + "proof": { + "command": null, + "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + }, + "unsupported_or_blocked_status": { + "state": "blocked", + "typed_reason": "docker_local_brain_repo_and_database_path_missing", + "details": "Research remains blocked until a Docker-local brain repo and database path can be proven without operator-owned state." + }, + "benchmark_before_claim": "First prove Docker-local repository and database setup, then encode compiled_truth/timeline page scoring and operator-continuity jobs.", + "borrow_if_stronger": "Borrow compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation." + }, + { + "project": "graphify", + "strongest_user_facing_scenario": "Graph-compressed navigation with graph.json and GRAPH_REPORT evidence outputs.", + "current_evidence_class": "live_real_world", + "supporting_evidence_classes": [ + "live_real_world" + ], + "measured_status": "wrong_result", + "proof": { + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "unsupported_or_blocked_status": { + "state": "not_encoded", + "typed_reason": "broad_graph_navigation_not_encoded", + "details": "The tiny generated graph/report smoke scores wrong_result; broad graph navigation, rebuild behavior, private-corpus, and large-corpus quality remain not encoded." + }, + "benchmark_before_claim": "Expand beyond the tiny generated smoke and score representative graph/RAG navigation jobs before any broad graphify quality or ELF comparison claim.", + "borrow_if_stronger": "Borrow graph compression, source-location graph reports, and navigation hints for large code or document spaces." + } + ], + "scenario_matrix": [ + { + "scenario_id": "retrieval_debug", + "scenario": "retrieval/debug", + "current_elf_evidence": "ELF fixture-backed retrieval passes and ELF live_real_world retrieval passes in the full sweep.", + "strongest_competitor_or_reference": "qmd", + "current_competitor_evidence": "qmd live_real_world retrieval passes and qmd live_baseline_only checks pass, but qmd full-suite live status is wrong_result.", + "current_state": "Measured tie on encoded retrieval answers; qmd remains stronger on local debug ergonomics not fully scored.", + "next_measurement": "Run qmd deep retrieval/debug profile and ELF/qmd trace-level wrong-result replay with expansion, fusion, rerank, and candidate-drop diagnostics." + }, + { + "scenario_id": "work_resume", + "scenario": "work resume", + "current_elf_evidence": "ELF fixture-backed work_resume passes and ELF live_real_world work_resume passes.", + "strongest_competitor_or_reference": "agentmemory, claude-mem, OpenViking", + "current_competitor_evidence": "agentmemory is live_baseline_only with lifecycle_fail; claude-mem work_resume remains not_encoded pending a durable repository-backed adapter; OpenViking work_resume is not_encoded.", + "current_state": "ELF and qmd have current encoded live pass evidence, but continuity-oriented competitors remain undermeasured.", + "next_measurement": "Encode durable agentmemory, claude-mem, and OpenViking work_resume adapters or declare each blocked with lifecycle/setup evidence." + }, + { + "scenario_id": "project_decisions", + "scenario": "project decisions", + "current_elf_evidence": "ELF fixture-backed and live_real_world project_decisions suites pass; the ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale.", + "strongest_competitor_or_reference": "qmd, Letta", + "current_competitor_evidence": "qmd live_real_world project_decisions passes; Letta project-decision recovery is research_gate not_tested or blocked until the contained export path exists.", + "current_state": "ELF and qmd are the only measured live competitors for this scenario; Letta remains a product-reference comparison target.", + "next_measurement": "Run the Letta core/archival export/readback contract before treating project-decision recovery as a comparable scenario." + }, + { + "scenario_id": "source_of_truth", + "scenario": "source-of-truth", + "current_elf_evidence": "ELF fixture-backed trust_source_of_truth passes and ELF live_real_world trust_source_of_truth passes.", + "strongest_competitor_or_reference": "memsearch", + "current_competitor_evidence": "memsearch canonical-store, reindex, delete, and reload smoke passes; XY-925 fixture-backed source-of-truth prompts now cover the canonical Markdown rebuild/reload boundary, but no live memsearch prompt adapter pass is claimed.", + "current_state": "ELF has stronger measured live real-world source-of-truth evidence; memsearch now ties the local canonical-store reindex/reload smoke and has fixture-backed prompt coverage as a local-store ergonomics reference.", + "next_measurement": "Promote memsearch source-of-truth rebuild/reload prompts into a live adapter before any suite-level win/loss claim." + }, + { + "scenario_id": "temporal_current_historical", + "scenario": "temporal/current-vs-historical memory", + "current_elf_evidence": "ELF fixture-backed memory_evolution passes, but ELF live_real_world memory_evolution is wrong_result.", + "strongest_competitor_or_reference": "Graphiti/Zep, mem0/OpenMemory", + "current_competitor_evidence": "Graphiti/Zep is research_gate blocked; mem0/OpenMemory local OSS preference history, entity scope, deletion audit, and SDK get_all now pass; OpenMemory UI/export is blocked by the export-helper setup probe; graph-memory scenarios are not_encoded.", + "current_state": "No project has a comparable live pass for current-vs-historical evidence; ELF cannot claim live superiority yet.", + "next_measurement": "Fix ELF/qmd live memory_evolution evidence links, add OpenMemory product app import/export readback, and run XY-888 Graphiti/Zep temporal graph adapter." + }, + { + "scenario_id": "consolidation", + "scenario": "consolidation", + "current_elf_evidence": "ELF fixture-backed consolidation passes, and XY-934 adds live_real_world service-backed proposal scoring with source lineage, confidence/usefulness, unsupported-claim flags, apply/defer/discard audit, and zero source mutations.", + "strongest_competitor_or_reference": "managed dreaming, Always-On Memory Agent patterns, agentmemory, llm-wiki", + "current_competitor_evidence": "No direct live competitor runner emits comparable consolidation artifacts; qmd remains not_encoded and managed dreaming plus Always-On Memory Agent patterns are product references only.", + "current_state": "ELF has live consolidation self-check evidence, but no broad consolidation superiority or direct competitor parity claim is allowed without contained external runners.", + "next_measurement": "Add contained competitor/reference runners only if they can emit source ids, confidence, unsupported-claim flags, and review-action audit artifacts." + }, + { + "scenario_id": "knowledge_pages", + "scenario": "knowledge pages", + "current_elf_evidence": "ELF fixture-backed knowledge_compilation passes, but live_real_world knowledge_compilation is not_encoded.", + "strongest_competitor_or_reference": "llm-wiki, gbrain, GraphRAG, graphify", + "current_competitor_evidence": "llm-wiki and gbrain are research_gate not_encoded or blocked; GraphRAG remains research_gate blocked; graphify has a tiny live_real_world wrong_result smoke.", + "current_state": "No live knowledge-page competitor pass exists; graphify has only bounded non-pass tiny-smoke evidence and ELF has fixture-backed derived-page evidence.", + "next_measurement": "Encode live knowledge-page rebuild/lint scoring for ELF and run larger contained llm-wiki, gbrain, GraphRAG, or graphify adapters only after setup proof exists." + }, + { + "scenario_id": "operator_debugging", + "scenario": "operator debugging", + "current_elf_evidence": "ELF fixture-backed operator_debugging_ux passes, and the narrow live_real_world operator-debug slice passes for trace hydration, candidate-drop visibility, selected-but-not-narrated evidence, replay-command availability, and repair-action clarity.", + "strongest_competitor_or_reference": "qmd, claude-mem, OpenMemory", + "current_competitor_evidence": "qmd now has a narrow live_real_world operator-debug slice: replay-command availability and repair-action clarity pass, but trace hydration, candidate-drop stage visibility, and selected-but-not-narrated evidence are wrong_result. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, while claude-mem viewer/operator and OpenMemory UI/export remain blocked.", + "current_state": "ELF has a narrow comparable live win over qmd for trace hydration and candidate-drop visibility, while OpenMemory and claude-mem viewer/operator workflows remain blocked for broad UX claims.", + "next_measurement": "Add bounded OpenMemory and claude-mem UI/export or viewer runners before any broader operator-UX claim." + }, + { + "scenario_id": "capture_write_policy", + "scenario": "capture/write policy", + "current_elf_evidence": "ELF fixture-backed capture_integration passes, and ELF live_real_world capture_integration passes 4/4 with zero redaction leaks, source ids, write-policy audit, and evidence binding.", + "strongest_competitor_or_reference": "agentmemory, claude-mem", + "current_competitor_evidence": "agentmemory and claude-mem hook capture remain blocked until Docker-contained hook observations and write-policy/viewer readback artifacts exist.", + "current_state": "ELF has live capture/write-policy self-check evidence, but agentmemory and claude-mem capture-breadth comparisons remain blocked.", + "next_measurement": "Run durable agentmemory and claude-mem capture-hook jobs that prove redaction, exclusion, evidence binding, source ids, and no secret leakage." + }, + { + "scenario_id": "production_ops", + "scenario": "production ops", + "current_elf_evidence": "ELF production runbooks and fixture production_ops cover restore, Qdrant rebuild, backfill resume, resource envelope, and typed private/credential blockers; live_real_world production_ops is blocked.", + "strongest_competitor_or_reference": "ELF production gate, qmd, RAG/RAGFlow resource gates", + "current_competitor_evidence": "qmd live production_ops is blocked; RAGFlow/GraphRAG/LightRAG resource gates are research_gate blocked.", + "current_state": "ELF has the strongest checked-in production evidence, but private corpus and credentialed gates remain blocked.", + "next_measurement": "Rerun private-corpus and credentialed production-ops gates only when operator-owned manifest and credentials are supplied." + }, + { + "scenario_id": "personalization", + "scenario": "personalization", + "current_elf_evidence": "ELF fixture-backed personalization passes and ELF live_real_world personalization passes.", + "strongest_competitor_or_reference": "mem0/OpenMemory, Letta", + "current_competitor_evidence": "mem0/OpenMemory local OSS entity-scoped personalization now passes; OpenMemory UI/export remains blocked, hosted Platform export is non-goal, optional graph memory remains outside local OSS scoring, and Letta personalization is research_gate not_encoded.", + "current_state": "ELF, qmd, and mem0 local OSS have measured scoped-preference evidence, so scoped personalization is a tie on the current surface; mem0 preference-correction history remains a separate ELF loss.", + "next_measurement": "Add OpenMemory product app import/export and contained Letta scoped-preference readback before making broader personalization superiority claims." + }, + { + "scenario_id": "context_trajectory", + "scenario": "context trajectory", + "current_elf_evidence": "ELF has trace and trajectory directions, but staged context trajectory is not yet a comparable live scenario.", + "strongest_competitor_or_reference": "OpenViking", + "current_competitor_evidence": "OpenViking Docker setup is pinned, same-corpus retrieval is wrong_result, and hierarchical trajectory jobs are fixture-backed blocked gates.", + "current_state": "OpenViking remains the strongest design reference, but not a measured live winner.", + "next_measurement": "Make OpenViking same-corpus evidence-bearing retrieval pass, then score hierarchical expansion and staged context trajectory outputs." + }, + { + "scenario_id": "core_vs_archival_memory", + "scenario": "core-vs-archival memory", + "current_elf_evidence": "ELF fixture core_archival_memory passes 6/6 and scores core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search.", + "strongest_competitor_or_reference": "Letta", + "current_competitor_evidence": "Letta is research_gate blocked/not_tested until the selected contained export/readback artifact exists.", + "current_state": "ELF has fixture-only core-block evidence; Letta remains unscored, so no win, tie, or loss claim is allowed.", + "next_measurement": "Implement the Letta export/readback adapter, then compare only scenarios whose core block JSON, archival search/readback JSON, and source ids are present." + }, + { + "scenario_id": "graph_rag_navigation", + "scenario": "graph/RAG navigation", + "current_elf_evidence": "ELF relation context and graph-lite work are not enough to claim graph/RAG navigation parity.", + "strongest_competitor_or_reference": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify", + "current_competitor_evidence": "RAGFlow, LightRAG, GraphRAG, and Graphiti/Zep remain research_gate blocked or incomplete; graphify has a tiny live_real_world wrong_result smoke.", + "current_state": "No RAG/graph project has live_real_world pass evidence; graphify supplies only bounded non-pass tiny-smoke evidence.", + "next_measurement": "Run larger Docker-contained adapters and require evidence-linked outputs before any graph/RAG navigation claim." + } + ], + "parallelizable_followups": [ + { + "workstream": "qmd deep retrieval/debug profile", + "issue_or_candidate": "new benchmark issue", + "parallelizable": true, + "blocked_by": "None after this matrix lands.", + "measurement": "Stress profile plus trace-level retrieval-debug artifacts for qmd and ELF." + }, + { + "workstream": "agentmemory durable lifecycle adapter", + "issue_or_candidate": "[ELF benchmark P0] Make external adapters lifecycle-durable and fail-typed", + "parallelizable": true, + "blocked_by": "Durable local adapter path selection.", + "measurement": "Update, delete, cold-start reload, work_resume, and capture/write-policy jobs." + }, + { + "workstream": "agentmemory/claude-mem capture-hook breadth", + "issue_or_candidate": "follow-up after XY-933", + "parallelizable": true, + "blocked_by": "Docker-contained hook/viewer capture path with durable artifacts.", + "measurement": "Source ids, redaction/exclusion audit, evidence-bound output, and typed blocker reporting." + }, + { + "workstream": "mem0/OpenMemory history and UI coverage", + "issue_or_candidate": "new adapter repair issue", + "parallelizable": true, + "blocked_by": "Comparable local OSS path for history/UI/readback evidence.", + "measurement": "Preference/entity history, deletion audit readback, personalization, OpenMemory inspection/export, and optional graph-context jobs." + }, + { + "workstream": "memsearch source-of-truth live adapter coverage", + "issue_or_candidate": "new adapter repair issue", + "parallelizable": true, + "blocked_by": "Fixture-backed source-store and retrieval-debug prompts are encoded by XY-925; live prompt execution remains missing.", + "measurement": "Runtime adapter execution for the existing source-of-truth rebuild/reload and retrieval-debug prompt jobs without converting baseline smoke into suite pass claims." + }, + { + "workstream": "OpenViking context trajectory", + "issue_or_candidate": "new benchmark issue after evidence output fix", + "parallelizable": true, + "blocked_by": "Evidence-bearing same-corpus retrieval output.", + "measurement": "Hierarchical expansion, staged trajectory, and resume/retrieval evidence jobs." + }, + { + "workstream": "claude-mem hook/viewer runtime coverage", + "issue_or_candidate": "new adapter issue", + "parallelizable": true, + "blocked_by": "Fixture-backed progressive-disclosure and retrieval-repair prompts are encoded by XY-925; hook capture and viewer/operator workflows remain blocked.", + "measurement": "Work resume, operator debugging, capture/write-policy, viewer/operator, and live progressive-disclosure adapter execution." + }, + { + "workstream": "RAGFlow evidence smoke", + "issue_or_candidate": "XY-885", + "parallelizable": true, + "blocked_by": "Resource envelope accepted for tiny Docker smoke.", + "measurement": "reference.chunks to benchmark evidence mapping." + }, + { + "workstream": "LightRAG context export", + "issue_or_candidate": "XY-886", + "parallelizable": true, + "blocked_by": "Docker service setup and explicit provider config.", + "measurement": "Retrieved context export and source file-path citations." + }, + { + "workstream": "GraphRAG cost-bounded adapter", + "issue_or_candidate": "XY-887", + "parallelizable": true, + "blocked_by": "Tiny corpus cost/resource envelope.", + "measurement": "Document, text-unit, graph-summary, and citation output tables." + }, + { + "workstream": "Graphiti/Zep temporal graph adapter", + "issue_or_candidate": "XY-888", + "parallelizable": true, + "blocked_by": "Docker-local graph store setup.", + "measurement": "Current/historical/future fact validity and evidence ids." + }, + { + "workstream": "graphify graph report adapter", + "issue_or_candidate": "XY-889 plus post-XY-900 expansion", + "parallelizable": true, + "blocked_by": "Representative graph/RAG navigation and quality proof beyond the tiny generated smoke.", + "measurement": "Graph/report evidence over representative graph/RAG jobs, with graph.json and GRAPH_REPORT outputs mapped to scored evidence ids." + }, + { + "workstream": "Private corpus and credentialed production ops", + "issue_or_candidate": "operator-owned benchmark gates", + "parallelizable": false, + "blocked_by": "Sanitized private manifest and routed provider credentials.", + "measurement": "Private-corpus retrieval quality and credentialed production-ops pass/fail evidence." + }, + { + "workstream": "Letta, LangGraph, nanograph, llm-wiki direct adapters", + "issue_or_candidate": "Letta export artifact blocked; others research-only until output contract", + "parallelizable": false, + "blocked_by": "Letta needs the selected contained export/readback artifact; the others need a non-memory-backend comparability contract.", + "measurement": "Only run after comparable output exists; otherwise treat as product-reference evidence." + } + ] +} diff --git a/docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json b/docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json new file mode 100644 index 00000000..81e9179c --- /dev/null +++ b/docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json @@ -0,0 +1,207 @@ +{ + "schema": "elf.first_generation_oss_adapter_promotion_report/v1", + "report_id": "xy-898-first-generation-oss-adapter-promotion-2026-06-11", + "authority": "XY-898", + "date": "2026-06-11", + "scope": "Scenario-level adapter evidence for agentmemory, mem0/OpenMemory, memsearch, and claude-mem without ELF optimization changes.", + "source_inputs": [ + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json", + "docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md", + "docs/spec/real_world_agent_memory_benchmark_v1.md", + "docs/guide/benchmarking/live_baseline_benchmark.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "tmp/live-baseline/live-baseline-report.json" + ], + "fresh_run": { + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "run_id": "live-baseline-20260611061612", + "status": "fail", + "runtime_seconds": 295.74, + "artifact": "tmp/live-baseline/live-baseline-report.json", + "summary": { + "total": 5, + "pass": 3, + "wrong_result": 1, + "lifecycle_fail": 1, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0 + }, + "full_check_summary": { + "total": 25, + "pass": 22, + "wrong_result": 1, + "lifecycle_fail": 1, + "blocked": 1, + "incomplete": 0, + "not_encoded": 0 + } + }, + "projects": [ + { + "project": "ELF", + "status": "pass", + "retrieval_status": "retrieval_pass", + "check_summary": { + "total": 8, + "pass": 8 + }, + "evidence": "ELF passed 8/8 encoded local lifecycle and retrieval checks in the comparable baseline run and serves only as the reference row for same-class scenario positions." + }, + { + "project": "agentmemory", + "status": "lifecycle_fail", + "retrieval_status": "retrieval_pass", + "check_summary": { + "total": 4, + "pass": 2, + "lifecycle_fail": 1, + "blocked": 1 + }, + "evidence": "Same-corpus retrieval and delete suppression pass through the in-memory mock, but update supersession fails and cold-start recovery is blocked." + }, + { + "project": "mem0/OpenMemory", + "status": "pass", + "retrieval_status": "retrieval_pass", + "check_summary": { + "total": 4, + "pass": 4 + }, + "evidence": "Local OSS mem0 passes same-corpus retrieval, update, delete, and cold-start reload. This does not include OpenMemory UI, hosted Platform, entity history, or graph memory." + }, + { + "project": "memsearch", + "status": "pass", + "retrieval_status": "retrieval_pass", + "check_summary": { + "total": 4, + "pass": 4 + }, + "evidence": "memsearch passes same-corpus retrieval, update reindex, delete suppression, and cold-start reload over the canonical Markdown corpus." + }, + { + "project": "claude-mem", + "status": "wrong_result", + "retrieval_status": "retrieval_wrong_result", + "check_summary": { + "total": 5, + "pass": 4, + "wrong_result": 1 + }, + "evidence": "claude-mem passes update, delete, progressive detail/source hydration, and cold-start reload over Docker-local SQLite, but same-corpus retrieval misses expected evidence." + } + ], + "scenario_summary": { + "count": 13, + "status_counts": { + "pass": 5, + "wrong_result": 1, + "lifecycle_fail": 1, + "blocked": 1, + "unsupported": 1, + "not_encoded": 4 + }, + "elf_position_counts": { + "wins": 2, + "ties": 2, + "loses": 0, + "untested": 9 + } + }, + "scenario_judgments": [ + { + "project": "agentmemory", + "scenario_id": "basic_same_corpus_retrieval", + "status": "pass", + "elf_position": "untested" + }, + { + "project": "agentmemory", + "scenario_id": "durable_update_reload_lifecycle", + "status": "lifecycle_fail", + "elf_position": "wins" + }, + { + "project": "agentmemory", + "scenario_id": "work_resume_capture_continuity", + "status": "blocked", + "elf_position": "untested" + }, + { + "project": "mem0/OpenMemory", + "scenario_id": "basic_local_lifecycle", + "status": "pass", + "elf_position": "ties" + }, + { + "project": "mem0/OpenMemory", + "scenario_id": "preference_entity_history", + "status": "not_encoded", + "elf_position": "untested" + }, + { + "project": "mem0/OpenMemory", + "scenario_id": "openmemory_ui_export_readback", + "status": "not_encoded", + "elf_position": "untested" + }, + { + "project": "memsearch", + "scenario_id": "canonical_markdown_reindex_reload", + "status": "pass", + "elf_position": "untested" + }, + { + "project": "memsearch", + "scenario_id": "ttl_expiry_lifecycle", + "status": "unsupported", + "elf_position": "untested" + }, + { + "project": "memsearch", + "scenario_id": "real_world_prompt_adapter", + "status": "not_encoded", + "elf_position": "untested" + }, + { + "project": "claude-mem", + "scenario_id": "same_corpus_retrieval", + "status": "wrong_result", + "elf_position": "wins" + }, + { + "project": "claude-mem", + "scenario_id": "repository_lifecycle_reload", + "status": "pass", + "elf_position": "ties" + }, + { + "project": "claude-mem", + "scenario_id": "progressive_disclosure_detail_hydration", + "status": "pass", + "elf_position": "untested" + }, + { + "project": "claude-mem", + "scenario_id": "hook_capture_viewer_workflow", + "status": "not_encoded", + "elf_position": "untested" + } + ], + "claim_boundaries": { + "allowed": [ + "mem0/OpenMemory passes the current basic local OSS lifecycle smoke.", + "memsearch passes the current canonical Markdown reindex/reload smoke.", + "agentmemory remains lifecycle_fail for durable update/reload because the current adapter is in-memory.", + "claude-mem remains wrong_result for same-corpus retrieval while preserving passed repository lifecycle and detail hydration evidence." + ], + "not_allowed": [ + "Do not claim hosted OpenMemory behavior from local OSS evidence.", + "Do not claim mem0/OpenMemory history, UI/export, hosted, or graph-memory parity.", + "Do not claim memsearch source-of-truth real-world suite parity from baseline smoke.", + "Do not claim claude-mem hook/viewer/capture parity from repository-only checks." + ] + } +} diff --git a/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json b/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json new file mode 100644 index 00000000..8caaa5dd --- /dev/null +++ b/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json @@ -0,0 +1,60 @@ +{ + "schema": "elf.openmemory_ui_export_readback_report/v1", + "report_id": "xy-931-openmemory-ui-export-readback-2026-06-11", + "authority": "XY-931", + "created_at": "2026-06-11T12:24:49Z", + "goal": "Measure OpenMemory UI/export readback separately from local mem0 SDK get_all, or record a typed setup blocker with concrete evidence. This run records an export-helper setup blocker before browser/dashboard readback is reached.", + "command": { + "command": "cargo make openmemory-ui-export-readback", + "status": "pass", + "runtime_seconds": 35.14, + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + "run": { + "run_id": "live-baseline-20260611122416", + "project_filter": "mem0", + "sdk_baseline_status": "pass", + "sdk_check_summary": { + "total": 8, + "pass": 8, + "fail": 0, + "blocked": 0 + }, + "ui_export_status": "blocked", + "ui_export_reason_code": "DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER" + }, + "same_corpus_boundary": { + "sdk_result_artifact": "tmp/live-baseline/mem0-search.json", + "sdk_get_all_check_status": "pass", + "sdk_get_all_is_ui_export_evidence": false, + "openmemory_ui_export_is_separate_product_ux_scenario": true + }, + "openmemory_probe": { + "tree_present": true, + "ui_package_present": true, + "compose_file_present": true, + "export_script_present": true, + "sunsetting_notice_present": true, + "requires_openai_api_key": true, + "requires_docker_compose": true, + "export_requires_running_container": true, + "attempt": { + "command": "timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id elf-history-user --container openmemory-openmemory-mcp-1", + "exit_code": 1, + "log_artifact": "tmp/live-baseline/mem0-openmemory-export-attempt.log", + "output_excerpt": "openmemory/backup-scripts/export_openmemory.sh: line 52: docker: command not found\nERROR: Container 'openmemory-openmemory-mcp-1' not found/running. Pass --container <NAME_OR_ID> if different." + } + }, + "classification": { + "status": "blocked", + "reason_code": "DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER", + "reason": "The OpenMemory export helper requires Docker access, but Docker is not available inside the baseline-runner container; browser/dashboard readback is not reached.", + "next_action": "Add a dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus into the OpenMemory app database, starts the API/UI with explicit local or provider configuration, then rerun the export helper and validate the exported memories." + }, + "claim_boundary": { + "elf_can_compare_against_openmemory_ui_export_after_this_run": false, + "hosted_platform_claim": false, + "optional_graph_memory_enabled": false, + "sdk_get_all_is_ui_export_evidence": false + } +} diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json new file mode 100644 index 00000000..ea5d1bcf --- /dev/null +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -0,0 +1,556 @@ +{ + "schema": "elf.dreaming_readiness_stage_ledger/v1", + "ledger_id": "xy-951-dreaming-readiness-stage-ledger-2026-06-16", + "authority": "XY-951", + "created_at": "2026-06-16T00:00:00Z", + "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, XY-952 fixture-backed memory summary/source-trace contract, XY-953 fixture-backed proactive brief scoring, and XY-954 fixture-backed scheduled-memory task scoring on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", + "typed_status_terms": [ + "pass", + "wrong_result", + "blocked", + "not_tested", + "not_encoded", + "incomplete", + "lifecycle_fail", + "unsupported", + "non_goal" + ], + "judgment_terms": [ + "improved", + "regressed", + "unchanged", + "blocked", + "not_tested" + ], + "count_fields": [ + "pass", + "wrong_result", + "blocked", + "not_tested", + "not_encoded" + ], + "gate_rules": [ + "Every downstream Dreaming or competitor-improvement stage must write a post-stage JSON report and Markdown summary before claiming phase completion.", + "Post-stage reports must compare against this ledger's baseline counts and set exactly one comparison_judgment: improved, regressed, unchanged, blocked, or not_tested.", + "Typed non-pass states must remain typed; blocked, not_tested, not_encoded, incomplete, lifecycle_fail, unsupported, and wrong_result must not be collapsed into a generic fail or hidden under pass.", + "Fixture-backed evidence may prove benchmark shape but must not be promoted into live_real_world product quality.", + "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.", + "The XY-905 post-stage live memory_evolution result is a narrow temporal reconciliation improvement only; it must not be converted into private-corpus, hosted memory, or broad competitor superiority claims.", + "The XY-934 live consolidation result is a narrow ELF self-check only; it must not be converted into broad managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki superiority claims without comparable contained runners.", + "The XY-953 proactive brief result is fixture-backed benchmark-shape evidence only; it must not be converted into OpenAI Pulse, hosted managed-memory, scheduler, or private-corpus parity claims.", + "The XY-954 scheduled-memory result is fixture-backed benchmark-shape evidence only; it must not be converted into hosted scheduler, ChatGPT Tasks, Pulse, provider-backed private-corpus, notification, or silent source-mutation claims." + ], + "summary": { + "improved": [ + "current_vs_historical_correctness", + "preference_evolution", + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness", + "scheduled_memory_task_readiness" + ], + "regressed": [], + "unchanged": [ + "deletion_ttl_tombstone_behavior", + "final_competitor_retest_status" + ], + "blocked": [], + "not_tested": [] + }, + "stage_gates": [ + { + "stage_id": "current_vs_historical_correctness", + "stage_name": "Current-vs-historical correctness", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "artifact": "tmp/real-world-memory/evolution-report.json", + "purpose": "Fixture gate for current facts, historical facts, conflicts, and update rationales." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live ELF/qmd real-world adapter gate for the memory_evolution suite." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "required_artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/research/2026-06-16-live-temporal-reconciliation-report.json", + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + ], + "baseline_counts": { + "pass": 1, + "wrong_result": 5, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live service adapter memory_evolution suite: one delete/TTL job passes and five current-vs-historical jobs are wrong_result.", + "post_stage_counts": { + "pass": 6, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-905 live real-world adapter sweep: ELF memory_evolution suite passes all six encoded jobs with current, historical, rationale, tombstone, and temporal-validity evidence selected where present.", + "comparison_judgment": "improved", + "regression_rule": "Any new wrong_result, missed evidence, or loss of the delete/TTL pass is a regression.", + "improvement_rule": "An improvement requires fewer live ELF wrong_result jobs without increasing blocked/not_tested counts.", + "next_optimization_direction": "Move from benchmark materialization into service-native temporal reconciliation APIs and compare against mem0/OpenMemory history and Graphiti/Zep temporal graph evidence without broad superiority claims." + }, + { + "stage_id": "preference_evolution", + "stage_name": "Preference evolution and correction history", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "artifact": "tmp/real-world-memory/evolution-report.json", + "purpose": "Fixture gate for the preference-change job." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live adapter gate for memory-evolution-preference-001." + }, + { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "purpose": "External comparison boundary for mem0/OpenMemory preference correction and export-style history." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-evolution", + "required_artifact": "tmp/real-world-memory/evolution-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "cargo make openmemory-ui-export-readback", + "required_artifact": "tmp/live-baseline/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/research/2026-06-16-live-temporal-reconciliation-report.json", + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 1, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live memory-evolution-preference-001 is wrong_result; mem0 local OSS preference correction history is measured as an ELF loss.", + "post_stage_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-905 live real-world adapter sweep: ELF memory-evolution-preference-001 passes with current preference, historical preference, and rationale evidence selected and narrated.", + "comparison_judgment": "improved", + "regression_rule": "Any loss of fixture preference correctness or any new blocked/not_tested live preference gate is a regression.", + "improvement_rule": "An improvement requires live preference correction history to pass while preserving old preference history as historical evidence.", + "next_optimization_direction": "Measure preference correction against mem0/OpenMemory history and UI/export surfaces before making any broader history-quality claim." + }, + { + "stage_id": "deletion_ttl_tombstone_behavior", + "stage_name": "Deletion, TTL, and tombstone behavior", + "dependent_issue": "XY-905", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json", + "purpose": "Aggregate fixture gate containing memory-evolution-delete-ttl-001." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Live adapter gate for tombstone behavior." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/research/2026-06-16-live-temporal-reconciliation-report.json", + "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + ], + "baseline_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "ELF live memory-evolution-delete-ttl-001 passes with tombstone and current-plan evidence; qmd misses the tombstone.", + "post_stage_counts": { + "pass": 1, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-905 live real-world adapter sweep preserved the delete/TTL pass and now reports tombstone and invalidation evidence in the memory_evolution readback fields.", + "comparison_judgment": "unchanged", + "regression_rule": "Losing tombstone evidence, returning stale deleted content, or failing the aggregate fixture is a regression.", + "improvement_rule": "This stage is already pass for ELF; improvement requires preserving the pass while reducing adjacent memory_evolution wrong_result counts.", + "next_optimization_direction": "Extend tombstone and TTL readback beyond the single encoded job into update/delete/recreate history cases." + }, + { + "stage_id": "reviewable_consolidation", + "stage_name": "Reviewable consolidation", + "dependent_issue": "XY-934", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "artifact": "tmp/real-world-memory/consolidation/report.json", + "purpose": "Fixture gate for review actions, lineage, unsupported claims, contradiction, and source immutability." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "required_artifact": "tmp/real-world-memory/consolidation/report.json" + }, + { + "command": "cargo make real-world-memory-live-consolidation", + "required_artifact": "tmp/real-world-memory/live-consolidation/summary.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/spec/system_consolidation_proposals_v1.md", + "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json", + "apps/elf-eval/fixtures/real_world_memory/consolidation/" + ], + "baseline_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "Before XY-934, consolidation fixtures passed but live consolidation proposal generation and review-action scoring were not encoded.", + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-934 adds ELF live service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, apply/defer/discard audit, and zero source mutations for 4 consolidation jobs.", + "comparison_judgment": "improved", + "regression_rule": "Any source mutation, missing lineage, or collapse of review actions into an automatic rewrite is a regression.", + "improvement_rule": "The stage is improved when live or service-backed consolidation scoring exists without provider hidden state and without mutating authoritative sources.", + "next_optimization_direction": "Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts." + }, + { + "stage_id": "memory_summary_top_of_mind_behavior", + "stage_name": "Memory summary and top-of-mind behavior", + "dependent_issue": "XY-952", + "evidence_class": "fixture_backed", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-knowledge", + "artifact": "tmp/real-world-memory/knowledge-report.json", + "purpose": "Fixture gate for derived knowledge pages, citations, stale-source lint, and repair guidance." + }, + { + "command": "cargo make real-world-memory-core-archival", + "artifact": "tmp/real-world-memory/core-archival/report.json", + "purpose": "Fixture gate for always-attached core block attachment, scope, provenance, stale-core detection, and archival fallback." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-summary", + "required_artifact": "tmp/real-world-memory/memory-summary/report.json" + }, + { + "command": "cargo make real-world-memory-knowledge", + "required_artifact": "tmp/real-world-memory/knowledge-report.json" + }, + { + "command": "cargo make real-world-memory-core-archival", + "required_artifact": "tmp/real-world-memory/core-archival/report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "evidence_files": [ + "docs/spec/system_memory_summary_v1.md", + "apps/elf-eval/fixtures/real_world_memory/memory_summary/", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "apps/elf-eval/fixtures/real_world_memory/knowledge/", + "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/" + ], + "baseline_counts": { + "pass": 8, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "Knowledge and core/archival fixtures pass, but live knowledge compilation and top-of-mind product behavior are not encoded.", + "post_stage_counts": { + "pass": 9, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "post_stage_basis": "XY-952 adds one fixture-backed memory_summary job with top-of-mind, background, stale, superseded, tombstone, and derived project-profile entries, source refs, freshness metadata, rationale, and unsupported-claim flags.", + "comparison_judgment": "improved", + "regression_rule": "Any stale summary, unsupported section, missing source id, or stale core block presented as current is a regression.", + "improvement_rule": "An improvement requires top-of-mind or summary readback that remains source-linked, exposes freshness and rationale, and fails stale-current or unsupported-derived claims.", + "next_optimization_direction": "Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior without replacing authoritative notes with hidden summaries." + }, + { + "stage_id": "proactive_brief_readiness", + "stage_name": "Proactive brief readiness", + "dependent_issue": "XY-953", + "evidence_class": "fixture_backed", + "baseline_commands": [ + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "purpose": "Regression guard for claude-mem progressive-disclosure and retrieval-repair reference behavior." + }, + { + "command": "cargo make real-world-job-operator-ux", + "artifact": "tmp/real-world-job/real-world-job-operator-ux-report.json", + "purpose": "Regression guard for operator-facing trace and repair-action clarity." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-proactive-brief", + "required_artifact": "tmp/real-world-memory/proactive-brief/report.json" + }, + { + "command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1", + "required_artifact": "test output" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md", + "docs/research/2026-06-16-proactive-brief-scoring-report.json", + "apps/elf-eval/fixtures/real_world_memory/proactive_brief/", + "docs/research/2026-06-08-agent-memory-selection.json", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "baseline_basis": "No direct proactive-brief real_world_job suite exists; adjacent progressive-disclosure and operator-debug fixtures are reference guards only.", + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0, + "suggestions": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "invalid_current_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0 + }, + "post_stage_basis": "XY-953 adds five proactive_brief fixture jobs: daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and a typed private-corpus refresh blocker tied to XY-930. The four runnable jobs pass with five evidence-linked suggestions, freshness/currentness markers, action rationale, stale/superseded/tombstone source traces, and no unsupported-current or tombstone violations.", + "comparison_judgment": "improved", + "regression_rule": "A proactive brief that is uncited, lacks freshness/currentness metadata, omits reject/defer rationale, presents stale or tombstoned facts as current, ignores TTL invalidations, leaks excluded content, or claims Pulse/private-corpus parity is a regression.", + "improvement_rule": "An improvement requires direct proactive-brief fixture or live adapter evidence with cited source ids, freshness/currentness markers, reject/defer rationale, and typed non-pass handling for unavailable private inputs.", + "next_optimization_direction": "Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind their owned lanes and operator inputs." + }, + { + "stage_id": "scheduled_memory_task_readiness", + "stage_name": "Scheduled memory task readiness", + "dependent_issue": "XY-954", + "evidence_class": "fixture_backed", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "artifact": "tmp/real-world-memory/consolidation/report.json", + "purpose": "Current closest fixture gate for deterministic fixture/manual consolidation runs." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-scheduled", + "required_artifact": "tmp/real-world-memory/scheduled/report.json" + }, + { + "command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1", + "required_artifact": "target/debug/deps/real_world_job_benchmark-*" + } + ], + "evidence_files": [ + "apps/elf-eval/fixtures/real_world_memory/scheduled_memory/", + "docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md", + "docs/research/2026-06-16-scheduled-memory-task-scoring-report.json", + "docs/spec/system_consolidation_proposals_v1.md", + "docs/research/2026-06-08-agent-memory-selection.json" + ], + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "baseline_basis": "The consolidation spec permits fixture and manual job_kind only; scheduled is explicitly future work and no scheduled-memory-task benchmark is encoded.", + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0, + "task_runs": 4, + "outputs": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "trace_coverage": 1.0, + "invalid_current_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_mutation_count": 0 + }, + "post_stage_basis": "XY-954 adds five scheduled_memory fixture jobs: weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and a typed private/provider scheduler blocker tied to XY-930. The four runnable jobs pass with five evidence-linked outputs, freshness/currentness metadata, action rationale, completed execution trace readback, stale/superseded/tombstone source traces, and zero source mutations.", + "comparison_judgment": "improved", + "regression_rule": "A scheduled-memory task that omits source refs, freshness/currentness markers, execution trace/readback, reviewable action rationale, or silently mutates source memory is a regression.", + "improvement_rule": "An improvement requires direct scheduled-memory fixture or live adapter evidence with source refs, freshness/currentness markers, execution trace/readback, source immutability, and typed blockers for unavailable private/provider scheduler prerequisites.", + "next_optimization_direction": "Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs." + }, + { + "stage_id": "final_competitor_retest_status", + "stage_name": "Final competitor retest status", + "dependent_issue": "XY-951", + "evidence_class": "live_real_world", + "baseline_commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/", + "purpose": "Full encoded ELF/qmd live real-world sweep." + }, + { + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json", + "purpose": "First-generation OSS prompt fixture and typed blocker slice." + }, + { + "command": "cargo make real-world-memory-graph-rag", + "artifact": "tmp/real-world-memory/graph-rag/report.json", + "purpose": "Representative graph/RAG typed non-pass fixture slice." + }, + { + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/", + "purpose": "mem0/OpenMemory local OSS history and export-readback boundary." + }, + { + "command": "cargo make baseline-production-private-addendum", + "artifact": "tmp/live-baseline/private-production-addendum.md", + "purpose": "Private-corpus addendum; remains blocked unless an operator-owned manifest is supplied." + } + ], + "post_stage_commands": [ + { + "command": "cargo make real-world-memory-live-adapters", + "required_artifact": "tmp/real-world-memory/live-adapters/" + }, + { + "command": "cargo make real-world-first-generation-oss", + "required_artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "command": "cargo make real-world-memory-graph-rag", + "required_artifact": "tmp/real-world-memory/graph-rag/report.json" + }, + { + "command": "cargo make openmemory-ui-export-readback", + "required_artifact": "tmp/live-baseline/" + }, + { + "command": "cargo make baseline-production-private-addendum", + "required_artifact": "tmp/live-baseline/private-production-addendum.md" + } + ], + "evidence_files": [ + "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/research/2026-06-11-competitor-strength-adoption-report.json", + "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", + "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + ], + "baseline_counts": { + "pass": 22, + "wrong_result": 5, + "blocked": 2, + "not_tested": 11, + "not_encoded": 11 + }, + "baseline_basis": "ELF full live real-world sweep: 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs. The not_encoded jobs are represented as not_tested for this stage gate while preserving the raw not_encoded count.", + "comparison_judgment": "unchanged", + "regression_rule": "Any higher wrong_result/blocked/not_tested count, missing typed blocker, or unsupported broad competitor win claim is a regression.", + "improvement_rule": "An improvement requires reduced live wrong_result or not_tested counts with no weakened evidence-class boundary and no private/provider claim without inputs.", + "next_optimization_direction": "Rerun the full relevant competitor matrix after each product optimization and update the Markdown/JSON ledger with improved, regressed, unchanged, blocked, and not_tested buckets." + } + ] +} diff --git a/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json b/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json new file mode 100644 index 00000000..4f33fed9 --- /dev/null +++ b/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json @@ -0,0 +1,137 @@ +{ + "schema": "elf.live_consolidation_proposal_scoring_report/v1", + "report_id": "xy-934-live-consolidation-proposal-scoring-2026-06-16", + "authority": "XY-934", + "created_at": "2026-06-16T00:00:00Z", + "commands": [ + { + "command": "cargo make real-world-memory-consolidation", + "status": "pass", + "artifact": "tmp/real-world-memory/consolidation/report.json" + }, + { + "command": "cargo make real-world-memory-live-consolidation", + "status": "pass", + "artifact": "tmp/real-world-memory/live-consolidation/summary.json" + } + ], + "fixture_aggregate": { + "suite_id": "consolidation", + "evidence_class": "fixture_backed", + "encoded_job_count": 4, + "suite_status": "pass", + "proposal_count": 4, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "executable_gap_count": 0 + }, + "live_consolidation_results": { + "elf_live_real_world": { + "evidence_class": "live_real_world", + "suite_status": "pass", + "encoded_job_count": 4, + "proposal_count": 4, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "review_event_count": 6, + "artifact": "tmp/real-world-memory/live-consolidation/elf-report.json", + "materialization_artifact": "tmp/real-world-memory/live-consolidation/elf-materialization.json" + }, + "qmd_live_real_world": { + "evidence_class": "live_real_world", + "suite_status": "not_encoded", + "encoded_job_count": 4, + "proposal_count": 0, + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + } + }, + "jobs": [ + { + "job_id": "consolidation-project-summary-apply-001", + "status": "pass", + "proposal_kind": "project_summary", + "source_lineage_count": 2, + "usefulness_score": 0.93, + "min_usefulness_score": 0.8, + "review_action": "apply", + "final_review_state": "applied", + "review_event_count": 2, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-weekly-decision-summary-apply-001", + "status": "pass", + "proposal_kind": "weekly_decision_summary", + "source_lineage_count": 2, + "usefulness_score": 0.91, + "min_usefulness_score": 0.8, + "review_action": "apply", + "final_review_state": "applied", + "review_event_count": 2, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-preference-candidate-defer-001", + "status": "pass", + "proposal_kind": "preference_candidate", + "source_lineage_count": 2, + "usefulness_score": 0.86, + "min_usefulness_score": 0.75, + "review_action": "defer", + "final_review_state": "archived", + "review_event_count": 1, + "unsupported_claim_flag_count": 0, + "source_mutation_count": 0 + }, + { + "job_id": "consolidation-contradiction-report-discard-001", + "status": "pass", + "proposal_kind": "contradiction_report", + "source_lineage_count": 3, + "usefulness_score": 0.9, + "min_usefulness_score": 0.8, + "review_action": "discard", + "final_review_state": "rejected", + "review_event_count": 1, + "unsupported_claim_flag_count": 1, + "source_mutation_count": 0 + } + ], + "reference_positions": [ + { + "project": "qmd", + "position": "untested", + "reason": "qmd keeps consolidation jobs typed not_encoded in the full live sweep; no proposal generation or review-action audit runner exists for qmd." + }, + { + "project": "managed_dreaming_memory_systems", + "position": "product_reference", + "reason": "Managed dreaming motivates the derived proposal-review shape, but no contained runner emits comparable source ids, confidence, unsupported-claim flags, and review audit artifacts." + }, + { + "project": "always_on_memory_agent_patterns", + "position": "product_reference", + "reason": "Always-on scheduling remains a reference only; XY-934 does not implement scheduled consolidation and does not allow silent source-of-truth rewrites." + } + ], + "claim_boundary": { + "allowed": [ + "ELF live consolidation self-checks pass for proposal materialization, source lineage, confidence/usefulness thresholds, unsupported-claim flags, and apply/defer/discard audit transitions.", + "Fixture consolidation passes and live service-backed consolidation evidence are separate evidence classes.", + "qmd and other tracked projects remain untested or reference-only for live consolidation proposal scoring until a contained runner emits comparable artifacts.", + "Derived-output safety claims are tied to source lineage, immutable source snapshots, zero source mutations, and review-action artifacts." + ], + "not_allowed": [ + "Do not claim scheduled production consolidation exists.", + "Do not claim live provider-generated consolidation quality; the accepted elf.consolidation/v1 service boundary is deterministic fixture/manual proposal materialization.", + "Do not claim ELF broadly beats managed dreaming, Always-On Memory Agent, agentmemory, qmd, or llm-wiki on consolidation without comparable contained live runners.", + "Do not mix knowledge-page rebuild/lint scoring into the consolidation claim." + ] + } +} diff --git a/docs/research/2026-06-16-live-temporal-reconciliation-report.json b/docs/research/2026-06-16-live-temporal-reconciliation-report.json new file mode 100644 index 00000000..e6620577 --- /dev/null +++ b/docs/research/2026-06-16-live-temporal-reconciliation-report.json @@ -0,0 +1,149 @@ +{ + "schema": "elf.live_temporal_reconciliation_report/v1", + "report_id": "xy-905-live-temporal-reconciliation-2026-06-16", + "authority": "XY-905", + "generated_at": "2026-06-16T02:09:43Z", + "objective": "Record the before/after evidence for ELF live memory_evolution temporal reconciliation without claiming broader competitor superiority.", + "commands": [ + { + "command": "cargo make real-world-memory-evolution", + "status": "pass", + "artifact": "tmp/real-world-memory/evolution-report.json", + "purpose": "Fixture contract gate for current, historical, conflict, rationale, and temporal-validity scoring." + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "artifact": "tmp/real-world-memory/live-adapters/summary.json", + "purpose": "Docker-isolated live ELF/qmd real-world adapter sweep." + }, + { + "command": "cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1", + "status": "pass", + "artifact": "stdout", + "purpose": "Report/schema and scorer regression coverage, including selected-but-not-narrated conflicts." + } + ], + "baseline": { + "source": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "elf_memory_evolution": { + "encoded_jobs": 6, + "job_status_counts": { + "pass": 1, + "wrong_result": 5, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "score_mean": 0.492, + "expected_evidence_recall": 1.0, + "diagnosis": "ELF found the required evidence but did not narrate current-vs-historical lifecycle state for five jobs." + }, + "qmd_memory_evolution": { + "encoded_jobs": 6, + "job_status_counts": { + "pass": 0, + "wrong_result": 6, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "score_mean": 0.325, + "expected_evidence_recall": 0.769, + "diagnosis": "qmd had the same lifecycle gap and also missed required evidence including tombstone evidence." + } + }, + "post_stage": { + "source": "tmp/real-world-memory/live-adapters/summary.json", + "elf_memory_evolution": { + "encoded_jobs": 6, + "job_status_counts": { + "pass": 6, + "wrong_result": 0, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "score_mean": 1.0, + "expected_evidence_recall": 1.0, + "conflict_detection_count": 5, + "update_rationale_available_count": 6, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 1, + "selected_but_not_narrated_count": 0, + "suite_status": "pass" + }, + "qmd_memory_evolution": { + "encoded_jobs": 6, + "job_status_counts": { + "pass": 0, + "wrong_result": 6, + "blocked": 0, + "not_tested": 0, + "not_encoded": 0 + }, + "score_mean": 0.325, + "expected_evidence_recall": 0.769, + "conflict_detection_count": 0, + "update_rationale_available_count": 4, + "suite_status": "wrong_result" + }, + "elf_full_live_adapter_summary": { + "job_count": 55, + "pass": 40, + "wrong_result": 0, + "blocked": 5, + "not_encoded": 10, + "mean_score": 0.727, + "expected_evidence_recall": 0.655 + } + }, + "comparison_judgment": { + "current_vs_historical_correctness": "improved", + "preference_evolution": "improved", + "deletion_ttl_tombstone_behavior": "unchanged", + "final_competitor_retest_status": "unchanged" + }, + "trace_contract": { + "answer_fields": [ + "selected_current_evidence", + "selected_historical_evidence", + "selected_rationale_evidence", + "selected_tombstone_evidence", + "selected_invalidation_evidence", + "conflict_candidate_evidence", + "retrieved_but_dropped_evidence", + "selected_but_not_narrated_evidence" + ], + "materialization_fields": [ + "current_winner_evidence_ids", + "historical_loser_evidence_ids", + "supersession_rationale_evidence_ids", + "tombstone_evidence_ids", + "invalidation_evidence_ids", + "conflict_candidate_evidence_ids", + "retrieved_evidence_ids", + "selected_evidence_ids", + "absent_evidence_ids", + "retrieved_but_dropped_evidence_ids", + "selected_but_not_narrated_evidence_ids", + "contradicted_by_lifecycle_evidence_ids" + ], + "trace_stages": [ + "live_adapter.retrieve", + "temporal_reconciliation.current_winner", + "temporal_reconciliation.historical_loser", + "temporal_reconciliation.supersession_rationale", + "temporal_reconciliation.tombstone_invalidation", + "temporal_reconciliation.conflict_candidates" + ], + "negative_gate": "A selected conflict evidence id that is not attached to the required conflict claim still scores wrong_result." + }, + "claim_boundaries": [ + "This report supports only the encoded ELF live memory_evolution temporal reconciliation improvement.", + "This report does not claim ELF beats Graphiti/Zep, mem0/OpenMemory, Letta, qmd broadly, hosted memory products, or private-corpus production quality.", + "qmd remains a useful retrieval-debug reference despite this memory_evolution slice remaining wrong_result.", + "Graphiti/Zep temporal graph, mem0/OpenMemory history and UI/export, and private/provider-backed gates remain separate benchmark lanes." + ], + "next_optimization_direction": "Move the reconciliation contract from benchmark materialization toward service-native temporal answer/readback APIs, then measure against mem0/OpenMemory history and Graphiti/Zep temporal graph gates." +} diff --git a/docs/research/2026-06-16-proactive-brief-scoring-report.json b/docs/research/2026-06-16-proactive-brief-scoring-report.json new file mode 100644 index 00000000..e81a72d9 --- /dev/null +++ b/docs/research/2026-06-16-proactive-brief-scoring-report.json @@ -0,0 +1,131 @@ +{ + "schema": "elf.proactive_brief_scoring_report/v1", + "issue": "XY-953", + "created_at": "2026-06-16T14:33:01Z", + "purpose": "Record fixture-backed proactive project brief scoring without claiming scheduler, private-corpus, OpenAI Pulse, or hosted managed-memory parity.", + "evidence_class": "fixture_backed", + "commands": [ + { + "command": "cargo make real-world-memory-proactive-brief", + "status": "pass", + "artifact": "tmp/real-world-memory/proactive-brief/report.json", + "markdown_artifact": "tmp/real-world-memory/proactive-brief/report.md" + }, + { + "command": "cargo make real-world-memory", + "status": "pass", + "artifact": "tmp/real-world-memory/real-world-memory-report.json", + "markdown_artifact": "tmp/real-world-memory/real-world-memory-report.md" + } + ], + "proactive_brief_summary": { + "job_count": 5, + "pass": 4, + "blocked": 1, + "wrong_result": 0, + "unsupported_claim_count": 0, + "evidence_required_count": 8, + "evidence_covered_count": 8, + "expected_evidence_recall": 1.0, + "suggestion_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "recommended_count": 2, + "deferred_count": 2, + "rejected_count": 1, + "current_suggestion_count": 2, + "non_current_suggestion_count": 3, + "stale_warning_count": 3, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "root_fixture_summary_after_xy953": { + "job_count": 55, + "encoded_suite_count": 15, + "pass": 49, + "wrong_result": 0, + "incomplete": 0, + "blocked": 6, + "not_encoded": 0, + "unsupported_claim_count": 0, + "evidence_required_count": 123, + "evidence_covered_count": 123, + "expected_evidence_recall": 1.0, + "source_ref_coverage": 1.0, + "quote_coverage": 1.0, + "mean_score": 0.891 + }, + "scenario_results": [ + { + "job_id": "proactive-daily-project-brief-001", + "status": "pass", + "suggestion_kind": "daily_project_brief", + "decision": "recommend", + "evidence_refs": 2, + "freshness_status": "current" + }, + { + "job_id": "proactive-resume-work-brief-001", + "status": "pass", + "suggestion_kind": "resume_work", + "decision": "recommend", + "evidence_refs": 2, + "freshness_status": "current" + }, + { + "job_id": "proactive-stale-decision-audit-001", + "status": "pass", + "suggestion_kind": "stale_decision_audit", + "decision": "defer", + "evidence_refs": 2, + "freshness_status": "superseded" + }, + { + "job_id": "proactive-stale-plan-preference-warning-001", + "status": "pass", + "suggestion_kind": "stale_plan_preference_warning", + "decisions": ["defer", "reject"], + "evidence_refs": 5, + "freshness_statuses": ["expired", "superseded", "tombstoned"] + }, + { + "job_id": "proactive-private-corpus-refresh-blocked-001", + "status": "blocked", + "suggestion_kind": "private_corpus_refresh", + "blocker": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930." + } + ], + "stage_ledger_delta": { + "stage_id": "proactive_brief_readiness", + "baseline_counts": { + "pass": 0, + "wrong_result": 0, + "blocked": 0, + "not_tested": 1, + "not_encoded": 1 + }, + "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0 + }, + "comparison_judgment": "improved", + "next_optimization_direction": "Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind their owned lanes and operator inputs." + }, + "claim_boundaries": [ + "Do not claim OpenAI Pulse parity from this fixture-backed report.", + "Do not claim hosted managed-memory parity from this fixture-backed report.", + "Do not claim background scheduling or a morning-dashboard UI.", + "Do not claim private-corpus refresh quality without operator-owned inputs under XY-930.", + "Treat proactive briefs as derived output that must remain source-linked and reviewable." + ] +} diff --git a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json new file mode 100644 index 00000000..9bdae08b --- /dev/null +++ b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json @@ -0,0 +1,4107 @@ +{ + "schema": "elf.real_world_job_report/v1", + "run_id": "real-world-memory-scheduled", + "generated_at": "2026-06-16T16:29:13.720856Z", + "runner_version": "0.2.0-7f08eb504271123fa861e24e6e6861227682acda-aarch64-apple-darwin", + "corpus_profile": "mixed", + "adapter": { + "adapter_id": "fixture_scheduled_memory", + "name": "ELF scheduled memory fixture", + "behavior": "offline_fixture_response", + "storage": "not_encoded", + "runtime": "not_encoded", + "notes": "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter." + }, + "external_adapters": { + "schema": "elf.real_world_external_adapter_report/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." + ] + }, + "summary": { + "adapter_count": 23, + "external_project_count": 16, + "docker_default_count": 23, + "host_global_install_required_count": 0, + "fixture_backed_count": 1, + "live_baseline_only_count": 6, + "live_real_world_count": 5, + "research_gate_count": 11, + "overall_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 7, + "incomplete": 0, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 4, + "not_encoded": 5 + }, + "capability_status_counts": { + "real": 8, + "mocked": 1, + "unsupported": 6, + "blocked": 22, + "incomplete": 0, + "wrong_result": 10, + "lifecycle_fail": 0, + "pass": 30, + "not_encoded": 26 + }, + "suite_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 23, + "incomplete": 0, + "wrong_result": 7, + "lifecycle_fail": 0, + "pass": 27, + "not_encoded": 38 + }, + "scenario_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 3, + "blocked": 12, + "incomplete": 1, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 23, + "not_encoded": 11 + }, + "scenario_position_counts": { + "wins": 10, + "ties": 11, + "loses": 1, + "untested": 35 + }, + "scenario_outcome_counts": { + "win": 10, + "tie": 11, + "loss": 1, + "not_tested": 17, + "blocked": 13, + "non_goal": 5 + } + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." + ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." + }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defer/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." + ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." + ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." + }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." + ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." + }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "guide", + "ref": "docs/guide/research/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." + }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." + ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." + ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." + } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." + }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." + } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." + ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." + ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." + ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." + } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." + } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." + }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-lightrag-docker-context", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." + }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." + } + ], + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." + } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphrag-docker", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." + }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." + ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." + } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." + }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." + } + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." + } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." + }, + "run": { + "status": "not_encoded", + "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." + }, + "result": { + "status": "not_encoded", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." + }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No Letta materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "evidence": "Archival memory decision retrieval is not encoded for Letta." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/letta-ai/letta", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta repository", + "url": "https://github.com/letta-ai/letta", + "evidence": "Official source for Letta stateful agents and memory." + }, + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker/", + "evidence": "Official Docker deployment guide and embedding configuration boundary." + } + ], + "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", + "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", + "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", + "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" + }, + "notes": [] + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." + }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." + }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." + } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." + ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + }, + "notes": [] + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." + }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphify-docker-graph-report", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." + } + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." + } + } + ] + }, + "capture_integration": { + "real": [], + "fixture_backed": [], + "mocked": [], + "blocked": [], + "not_encoded": [ + "No capture/integration behavior was declared by encoded fixtures." + ], + "notes": [] + }, + "summary": { + "job_count": 5, + "encoded_suite_count": 1, + "pass": 4, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 1, + "not_encoded": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_total": 10, + "expected_evidence_matched": 10, + "expected_evidence_recall": 1.0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "wrong_result_stage_attribution_count": 0, + "mean_score": 0.8, + "mean_latency_ms": 2.0, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "evidence_required_count": 10, + "evidence_covered_count": 10, + "evidence_coverage": 1.0, + "source_ref_required_count": 10, + "source_ref_covered_count": 10, + "source_ref_coverage": 1.0, + "quote_required_count": 10, + "quote_covered_count": 10, + "quote_coverage": 1.0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_correctness": 0.0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case_count": 0, + "qdrant_rebuild_pass_count": 0, + "operator_debug_job_count": 0, + "raw_sql_needed_count": 0, + "trace_incomplete_count": 0, + "operator_ux_gap_count": 0, + "consolidation": { + "proposal_count": 0, + "proposal_usefulness": null, + "lineage_completeness": null, + "review_action_correctness": null, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gap_count": 0 + }, + "scheduled_memory": { + "job_count": 4, + "task_run_count": 4, + "output_count": 5, + "required_task_kind_count": 4, + "covered_required_task_kind_count": 4, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 5, + "evidence_ref_output_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "trace_required_count": 4, + "trace_complete_count": 4, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 2, + "non_current_output_count": 3, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 3, + "source_trace_tombstone_count": 1 + } + }, + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_summary", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "proactive_brief", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "context_trajectory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + } + ], + "jobs": [ + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled knowledge-page refresh suggestion: suggest a reviewable rebuild because lint found the old scheduled-memory blocked state, and do not silently rewrite source notes.", + "produced_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "Block private/provider scheduled tasks without operator inputs", + "status": "blocked", + "answer_type": "scheduled_memory_task", + "requires_caveat": true, + "requires_refusal": true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + "produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": null, + "cost": null, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-decision-audit-001", + "title": "Audit a stale project decision during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale decision audit: the consolidation-only readiness decision is superseded by the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "produced_evidence": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 1, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-preference-plan-audit-001", + "title": "Audit stale preferences and plans during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": "scheduled_silent_mutation_rejected", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale preference/plan audit: the old report plan is expired, the silent-mutation preference is historical, and the current path requires trace/readback plus reviewable derived output.", + "produced_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 2, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_output_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 2, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-weekly-project-status-summary-001", + "title": "Run a weekly project status summary from current memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite" + } + ], + "produced_answer": "Weekly scheduled summary: run cargo make real-world-memory-scheduled, update the XY-951 scheduled-memory-task readiness ledger, and do not claim hosted scheduled-product parity from fixture evidence.", + "produced_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + } + ], + "unsupported_claims": [], + "not_encoded_suites": [ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "memory_evolution", + "consolidation", + "memory_summary", + "proactive_brief", + "knowledge_compilation", + "operator_debugging_ux", + "capture_integration", + "production_ops", + "personalization", + "core_archival_memory", + "context_trajectory" + ], + "private_corpus_redaction": { + "policy": "publish evidence ids and bounded score summaries only; do not publish private text", + "private_fixture_count": 1 + }, + "evolution": { + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0 + }, + "follow_ups": [ + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "XY-930 private/provider scheduled-memory input gate", + "reason": "Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist." + } + ] +} \ No newline at end of file diff --git a/docs/research/external_memory_pattern_radar/cursor.json b/docs/research/external_memory_pattern_radar/cursor.json new file mode 100644 index 00000000..2ce50573 --- /dev/null +++ b/docs/research/external_memory_pattern_radar/cursor.json @@ -0,0 +1,1183 @@ +{ + "schema": "elf.external_memory_pattern_radar_cursor/v1", + "cadence": "weekly", + "generated_at": "2026-06-10T08:32:00.790878Z", + "source_docs": [ + "docs/guide/research/external_memory_improvement_plan.md", + "docs/guide/research/comparison_external_projects.md", + "docs/guide/research/research_projects_inventory.md", + "docs/spec/external_memory_pattern_radar_v1.md" + ], + "projects": [ + { + "id": "agentmemory", + "name": "agentmemory", + "repo": "rohitg00/agentmemory", + "homepage": "https://github.com/rohitg00/agentmemory", + "watch_focus": [ + "rw.operator-continuity", + "rw.resume-evidence", + "rw.lifecycle-staleness" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "docs/research/2026-06-08-agent-memory-selection.json", + "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + ], + "coverage_evidence": [ + { + "label": "adapter evidence boundary", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "agentmemory is tracked for operator continuity and resume evidence, but current benchmark evidence does not prove durable lifecycle quality." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/rohitg00/agentmemory", + "default_branch": "main", + "pushed_at": "2026-06-09T15:14:55Z", + "updated_at": "2026-06-10T08:30:03Z", + "latest_release": { + "tag_name": "v0.9.27", + "url": "https://github.com/rohitg00/agentmemory/releases/tag/v0.9.27", + "published_at": "2026-06-07T08:58:35Z" + }, + "stars": 22180, + "open_issues": 264, + "description": "#1 Persistent memory for AI coding agents based on real-world benchmarks" + } + }, + { + "id": "mem0", + "name": "mem0 / OpenMemory", + "repo": "mem0ai/mem0", + "homepage": "https://github.com/mem0ai/mem0", + "watch_focus": [ + "rw.lifecycle-staleness", + "rw.graph-temporal", + "rw.operator-continuity" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + ], + "coverage_evidence": [ + { + "label": "lifecycle and graph reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "mem0 remains the ecosystem and entity-scoped lifecycle reference while ELF keeps deterministic evidence-bound writes." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/mem0ai/mem0", + "default_branch": "main", + "pushed_at": "2026-06-10T07:16:28Z", + "updated_at": "2026-06-10T08:18:56Z", + "latest_release": { + "tag_name": "cli-node-v0.2.8", + "url": "https://github.com/mem0ai/mem0/releases/tag/cli-node-v0.2.8", + "published_at": "2026-06-01T20:18:36Z" + }, + "stars": 58237, + "open_issues": 413, + "description": "Universal memory layer for AI Agents" + } + }, + { + "id": "qmd", + "name": "qmd", + "repo": "tobi/qmd", + "homepage": "https://github.com/tobi/qmd", + "watch_focus": [ + "rw.retrieval-debug", + "rw.lifecycle-staleness", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "retrieval-debug baseline", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "qmd is the strongest local retrieval-debug reference and has targeted live real-world adapter evidence." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/tobi/qmd", + "default_branch": "main", + "pushed_at": "2026-06-08T16:50:52Z", + "updated_at": "2026-06-10T08:26:53Z", + "latest_release": { + "tag_name": "v2.5.3", + "url": "https://github.com/tobi/qmd/releases/tag/v2.5.3", + "published_at": "2026-05-29T03:24:20Z" + }, + "stars": 26365, + "open_issues": 124, + "description": "mini cli search engine for your docs, knowledge bases, meeting notes, whatever. Tracking current sota approaches while being all local" + } + }, + { + "id": "claude-mem", + "name": "claude-mem", + "repo": "thedotmack/claude-mem", + "homepage": "https://github.com/thedotmack/claude-mem", + "watch_focus": [ + "rw.operator-continuity", + "rw.resume-evidence", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + ], + "coverage_evidence": [ + { + "label": "progressive disclosure UX reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "claude-mem remains a product reference for progressive disclosure and viewer workflow, not a proven ELF replacement." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/thedotmack/claude-mem", + "default_branch": "main", + "pushed_at": "2026-06-10T07:22:33Z", + "updated_at": "2026-06-10T08:26:21Z", + "latest_release": { + "tag_name": "v13.5.4", + "url": "https://github.com/thedotmack/claude-mem/releases/tag/v13.5.4", + "published_at": "2026-06-10T07:22:17Z" + }, + "stars": 81523, + "open_issues": 80, + "description": "Persistent Context Across Sessions for Every Agent – Captures everything your agent does during sessions, compresses it with AI, and injects relevant context back into future sessions. Works with Claude Code, OpenClaw, Codex, Gemini, Hermes, Copilot, OpenCode + More" + } + }, + { + "id": "openviking", + "name": "OpenViking", + "repo": "volcengine/OpenViking", + "homepage": "https://github.com/volcengine/OpenViking", + "watch_focus": [ + "rw.context-trajectory", + "rw.resume-evidence", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "trajectory reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "OpenViking informs hierarchical context trajectory while current adapter evidence remains incomplete." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/volcengine/OpenViking", + "default_branch": "main", + "pushed_at": "2026-06-10T08:29:16Z", + "updated_at": "2026-06-10T08:29:49Z", + "latest_release": { + "tag_name": "v0.3.24", + "url": "https://github.com/volcengine/OpenViking/releases/tag/v0.3.24", + "published_at": "2026-06-05T08:05:34Z" + }, + "stars": 25438, + "open_issues": 221, + "description": "OpenViking is an open-source context database designed specifically for AI Agents(such as openclaw). OpenViking unifies the management of context (memory, resources, and skills) that Agents need through a file system paradigm, enabling hierarchical context delivery and self-evolving." + } + }, + { + "id": "graphiti", + "name": "Graphiti / Zep", + "repo": "getzep/graphiti", + "homepage": "https://github.com/getzep/graphiti", + "watch_focus": [ + "rw.graph-temporal", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "temporal graph reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "Graphiti/Zep remains the broader temporal graph workflow reference for current-versus-historical facts." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/getzep/graphiti", + "default_branch": "main", + "pushed_at": "2026-06-10T07:19:57Z", + "updated_at": "2026-06-10T08:29:29Z", + "latest_release": { + "tag_name": "v0.29.2", + "url": "https://github.com/getzep/graphiti/releases/tag/v0.29.2", + "published_at": "2026-06-08T14:25:35Z" + }, + "stars": 27240, + "open_issues": 365, + "description": "Build Real-Time Knowledge Graphs for AI Agents" + } + }, + { + "id": "letta", + "name": "Letta", + "repo": "letta-ai/letta", + "homepage": "https://github.com/letta-ai/letta", + "watch_focus": [ + "rw.core-archival", + "rw.operator-continuity" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "core versus archival memory reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "Letta informs core memory block ergonomics while ELF keeps archival notes source-of-truth bound." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/letta-ai/letta", + "default_branch": "main", + "pushed_at": "2026-05-14T17:14:23Z", + "updated_at": "2026-06-10T08:26:18Z", + "latest_release": { + "tag_name": "0.16.8", + "url": "https://github.com/letta-ai/letta/releases/tag/0.16.8", + "published_at": "2026-05-14T17:14:24Z" + }, + "stars": 23232, + "open_issues": 52, + "description": "Letta is the platform for building stateful agents: AI with advanced memory that can learn and self-improve over time." + } + }, + { + "id": "lightrag", + "name": "LightRAG", + "repo": "HKUDS/LightRAG", + "homepage": "https://github.com/HKUDS/LightRAG", + "watch_focus": [ + "rw.graph-navigation", + "rw.graph-temporal", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/research_projects_inventory.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "LightRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/HKUDS/LightRAG", + "default_branch": "main", + "pushed_at": "2026-06-09T11:24:04Z", + "updated_at": "2026-06-10T08:28:11Z", + "latest_release": { + "tag_name": "v1.5.1", + "url": "https://github.com/HKUDS/LightRAG/releases/tag/v1.5.1", + "published_at": "2026-06-09T08:32:30Z" + }, + "stars": 36379, + "open_issues": 227, + "description": "[EMNLP2025] \"LightRAG: Simple and Fast Retrieval-Augmented Generation\"" + } + }, + { + "id": "graphrag", + "name": "GraphRAG", + "repo": "microsoft/graphrag", + "homepage": "https://github.com/microsoft/graphrag", + "watch_focus": [ + "rw.graph-navigation", + "rw.knowledge-synthesis", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/research_projects_inventory.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "GraphRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/microsoft/graphrag", + "default_branch": "main", + "pushed_at": "2026-06-05T23:46:49Z", + "updated_at": "2026-06-10T08:27:19Z", + "latest_release": { + "tag_name": "v3.1.0", + "url": "https://github.com/microsoft/graphrag/releases/tag/v3.1.0", + "published_at": "2026-05-28T15:55:40Z" + }, + "stars": 33610, + "open_issues": 141, + "description": "A modular graph-based Retrieval-Augmented Generation (RAG) system" + } + }, + { + "id": "ragflow", + "name": "RAGFlow", + "repo": "infiniflow/ragflow", + "homepage": "https://github.com/infiniflow/ragflow", + "watch_focus": [ + "rw.resume-evidence", + "rw.graph-navigation", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/research_projects_inventory.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "RAGFlow is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/infiniflow/ragflow", + "default_branch": "main", + "pushed_at": "2026-06-10T08:09:36Z", + "updated_at": "2026-06-10T08:29:00Z", + "latest_release": { + "tag_name": "v0.25.6", + "url": "https://github.com/infiniflow/ragflow/releases/tag/v0.25.6", + "published_at": "2026-05-27T01:50:19Z" + }, + "stars": 82363, + "open_issues": 3360, + "description": "RAGFlow is a leading open-source Retrieval-Augmented Generation (RAG) engine that fuses cutting-edge RAG with Agent capabilities to create a superior context layer for LLMs" + } + }, + { + "id": "memsearch", + "name": "memsearch", + "repo": "zilliztech/memsearch", + "homepage": "https://github.com/zilliztech/memsearch", + "watch_focus": [ + "rw.lifecycle-staleness", + "rw.retrieval-debug", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md" + ], + "coverage_evidence": [ + { + "label": "markdown-first reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "memsearch remains a source-transparency reference while current adapter evidence is incomplete or wrong-result typed." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/zilliztech/memsearch", + "default_branch": "main", + "pushed_at": "2026-06-01T12:52:06Z", + "updated_at": "2026-06-10T08:11:17Z", + "latest_release": { + "tag_name": "v0.4.6", + "url": "https://github.com/zilliztech/memsearch/releases/tag/v0.4.6", + "published_at": "2026-05-29T07:28:49Z" + }, + "stars": 1955, + "open_issues": 219, + "description": "A persistent, unified memory layer for all your AI agents (e.g. Claude Code, Codex), backed by Markdown and Milvus." + } + }, + { + "id": "langgraph", + "name": "LangGraph", + "repo": "langchain-ai/langgraph", + "homepage": "https://github.com/langchain-ai/langgraph", + "watch_focus": [ + "rw.replay-regression", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md" + ], + "coverage_evidence": [ + { + "label": "replay regression reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "LangGraph informs replay and checkpoint regression workflows; ELF traces do not replace full agent-state replay." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/langchain-ai/langgraph", + "default_branch": "main", + "pushed_at": "2026-06-09T22:41:05Z", + "updated_at": "2026-06-10T08:30:43Z", + "latest_release": { + "tag_name": "1.2.4", + "url": "https://github.com/langchain-ai/langgraph/releases/tag/1.2.4", + "published_at": "2026-06-02T17:07:49Z" + }, + "stars": 34333, + "open_issues": 560, + "description": "Build resilient agents." + } + }, + { + "id": "nanograph", + "name": "nanograph", + "repo": "nanograph/nanograph", + "homepage": "https://github.com/nanograph/nanograph", + "watch_focus": [ + "rw.graph-temporal", + "rw.retrieval-debug" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md", + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + ], + "coverage_evidence": [ + { + "label": "typed graph ergonomics reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "nanograph is a typed graph DX reference, not a full memory backend benchmark claim." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/nanograph/nanograph", + "default_branch": "main", + "pushed_at": "2026-05-17T01:49:29Z", + "updated_at": "2026-06-10T01:45:26Z", + "latest_release": { + "tag_name": "v1.3.0", + "url": "https://github.com/nanograph/nanograph/releases/tag/v1.3.0", + "published_at": "2026-05-16T23:25:46Z" + }, + "stars": 150, + "open_issues": 0, + "description": "On-device property graph database. Schema-as-code. One CLI → One Folder. No Server. Think: DuckDB for graphs." + } + }, + { + "id": "llm-wiki", + "name": "llm-wiki", + "repo": "nvk/llm-wiki", + "homepage": "https://github.com/nvk/llm-wiki", + "watch_focus": [ + "rw.knowledge-synthesis", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md" + ], + "coverage_evidence": [ + { + "label": "derived knowledge pages reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "llm-wiki informs rebuildable cited knowledge pages and lint/repair loops." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/nvk/llm-wiki", + "default_branch": "master", + "pushed_at": "2026-05-23T16:07:33Z", + "updated_at": "2026-06-09T16:24:54Z", + "latest_release": { + "tag_name": "v0.10.2", + "url": "https://github.com/nvk/llm-wiki/releases/tag/v0.10.2", + "published_at": "2026-05-23T16:07:33Z" + }, + "stars": 549, + "open_issues": 3, + "description": "LLM-compiled knowledge bases for any AI agent. Parallel multi-agent research, thesis-driven investigation, source ingestion, wiki compilation, querying, and artifact generation. " + } + }, + { + "id": "gbrain", + "name": "gbrain", + "repo": "garrytan/gbrain", + "homepage": "https://github.com/garrytan/gbrain", + "watch_focus": [ + "rw.knowledge-synthesis", + "rw.operator-continuity" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md" + ], + "coverage_evidence": [ + { + "label": "operational brain reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "gbrain informs current-truth and timeline presentation while ELF source notes remain authoritative." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/garrytan/gbrain", + "default_branch": "master", + "pushed_at": "2026-06-10T05:32:26Z", + "updated_at": "2026-06-10T08:19:11Z", + "latest_release": null, + "stars": 21971, + "open_issues": 740, + "description": "Garry's Opinionated OpenClaw/Hermes Agent Brain" + } + }, + { + "id": "graphify", + "name": "graphify", + "repo": "safishamsi/graphify", + "homepage": "https://github.com/safishamsi/graphify", + "watch_focus": [ + "rw.graph-navigation", + "rw.knowledge-synthesis", + "rw.resume-evidence" + ], + "primary_references": [ + "docs/guide/research/comparison_external_projects.md" + ], + "coverage_evidence": [ + { + "label": "graph-compressed navigation reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "graphify informs rebuildable graph reports and pre-search guidance without replacing ELF storage." + } + ], + "last_seen": { + "observed_at": "2026-06-10T08:32:00.790878Z", + "source_url": "https://github.com/safishamsi/graphify", + "default_branch": "v8", + "pushed_at": "2026-06-08T22:58:46Z", + "updated_at": "2026-06-10T08:28:45Z", + "latest_release": { + "tag_name": "v0.8.36", + "url": "https://github.com/safishamsi/graphify/releases/tag/v0.8.36", + "published_at": "2026-06-08T22:58:46Z" + }, + "stars": 64475, + "open_issues": 330, + "description": "AI coding assistant skill (Claude Code, Codex, OpenCode, Cursor, Gemini CLI, and more). Turn any folder of code, SQL schemas, R scripts, shell scripts, docs, papers, images, or videos into a queryable knowledge graph. App code + database schema + infrastructure in one graph." + } + } + ], + "last_run": { + "schema": "elf.external_memory_pattern_radar_run/v1", + "run_id": "external-memory-pattern-radar-2026-06-10", + "generated_at": "2026-06-10T08:32:00.790878Z", + "mode": "live", + "summary": { + "project_count": 16, + "covered_count": 16, + "rejected_count": 0, + "gap_count": 0, + "create_issue_count": 0, + "defer_count": 0, + "no_issue_count": 16 + }, + "decisions": [ + { + "project_id": "agentmemory", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "adapter evidence boundary", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "agentmemory is tracked for operator continuity and resume evidence, but current benchmark evidence does not prove durable lifecycle quality." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/rohitg00/agentmemory", + "https://github.com/rohitg00/agentmemory/releases/tag/v0.9.27" + ] + }, + { + "project_id": "mem0", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "lifecycle and graph reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "mem0 remains the ecosystem and entity-scoped lifecycle reference while ELF keeps deterministic evidence-bound writes." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/mem0ai/mem0", + "https://github.com/mem0ai/mem0/releases/tag/cli-node-v0.2.8" + ] + }, + { + "project_id": "qmd", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "retrieval-debug baseline", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "qmd is the strongest local retrieval-debug reference and has targeted live real-world adapter evidence." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/tobi/qmd", + "https://github.com/tobi/qmd/releases/tag/v2.5.3" + ] + }, + { + "project_id": "claude-mem", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "progressive disclosure UX reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "claude-mem remains a product reference for progressive disclosure and viewer workflow, not a proven ELF replacement." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/thedotmack/claude-mem", + "https://github.com/thedotmack/claude-mem/releases/tag/v13.5.4" + ] + }, + { + "project_id": "openviking", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "trajectory reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "OpenViking informs hierarchical context trajectory while current adapter evidence remains incomplete." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/volcengine/OpenViking", + "https://github.com/volcengine/OpenViking/releases/tag/v0.3.24" + ] + }, + { + "project_id": "graphiti", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "temporal graph reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "Graphiti/Zep remains the broader temporal graph workflow reference for current-versus-historical facts." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/getzep/graphiti", + "https://github.com/getzep/graphiti/releases/tag/v0.29.2" + ] + }, + { + "project_id": "letta", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "core versus archival memory reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "Letta informs core memory block ergonomics while ELF keeps archival notes source-of-truth bound." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/letta-ai/letta", + "https://github.com/letta-ai/letta/releases/tag/0.16.8" + ] + }, + { + "project_id": "lightrag", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "LightRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/HKUDS/LightRAG", + "https://github.com/HKUDS/LightRAG/releases/tag/v1.5.1" + ] + }, + { + "project_id": "graphrag", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "GraphRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/microsoft/graphrag", + "https://github.com/microsoft/graphrag/releases/tag/v3.1.0" + ] + }, + { + "project_id": "ragflow", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "research gate", + "path": "docs/guide/research/research_projects_inventory.md", + "summary": "RAGFlow is a D0 watch item with a research gate; no adapter strength claim is allowed yet." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/infiniflow/ragflow", + "https://github.com/infiniflow/ragflow/releases/tag/v0.25.6" + ] + }, + { + "project_id": "memsearch", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "markdown-first reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "memsearch remains a source-transparency reference while current adapter evidence is incomplete or wrong-result typed." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/zilliztech/memsearch", + "https://github.com/zilliztech/memsearch/releases/tag/v0.4.6" + ] + }, + { + "project_id": "langgraph", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "replay regression reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "LangGraph informs replay and checkpoint regression workflows; ELF traces do not replace full agent-state replay." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/langchain-ai/langgraph", + "https://github.com/langchain-ai/langgraph/releases/tag/1.2.4" + ] + }, + { + "project_id": "nanograph", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "typed graph ergonomics reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "nanograph is a typed graph DX reference, not a full memory backend benchmark claim." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/nanograph/nanograph", + "https://github.com/nanograph/nanograph/releases/tag/v1.3.0" + ] + }, + { + "project_id": "llm-wiki", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "derived knowledge pages reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "llm-wiki informs rebuildable cited knowledge pages and lint/repair loops." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/nvk/llm-wiki", + "https://github.com/nvk/llm-wiki/releases/tag/v0.10.2" + ] + }, + { + "project_id": "gbrain", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "operational brain reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "gbrain informs current-truth and timeline presentation while ELF source notes remain authoritative." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/garrytan/gbrain" + ] + }, + { + "project_id": "graphify", + "upstream_change": "No GitHub metadata delta was observed since the prior cursor.", + "reusable_pattern": "No new candidate pattern was identified in this run.", + "elf_verdict": "covered", + "product_value": "Current ELF coverage remains represented by the comparison and inventory evidence.", + "duplicate_coverage_evidence": [ + { + "label": "graph-compressed navigation reference", + "path": "docs/guide/research/comparison_external_projects.md", + "summary": "graphify informs rebuildable graph reports and pre-search guidance without replacing ELF storage." + } + ], + "safety_boundary": "No external runtime is adopted by default; existing ELF evidence remains authoritative.", + "issue_decision": { + "action": "no_issue", + "rationale": "No issue was created because the run found no source-backed gap.", + "duplicate_search": { + "queried": false, + "query": "", + "result": "not_required_no_issue", + "evidence": [ + "No Linear search is required when the issue decision is no_issue." + ] + }, + "proposed_issue": null + }, + "acceptance_evidence": [ + "No-issue decision recorded in the cursor.", + "Coverage evidence points at checked-in ELF research docs." + ], + "source_links": [ + "https://github.com/safishamsi/graphify", + "https://github.com/safishamsi/graphify/releases/tag/v0.8.36" + ] + } + ] + } +} diff --git a/docs/research/external_memory_pattern_radar/latest.md b/docs/research/external_memory_pattern_radar/latest.md new file mode 100644 index 00000000..00cb8fa7 --- /dev/null +++ b/docs/research/external_memory_pattern_radar/latest.md @@ -0,0 +1,39 @@ +# External Memory Pattern Radar Summary + +Goal: Preserve the latest weekly ELF external memory pattern radar outcome. +Read this when: Feeding the next full comparison report or deciding whether a watched upstream memory project created an ELF follow-up. +Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes. +Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/guide/research/external_memory_pattern_radar.md`. +Outputs: Latest no-issue, rejection, or issue-ready radar decisions. + +- Run id: `external-memory-pattern-radar-2026-06-10` +- Generated at: `2026-06-10T08:32:00.790878Z` +- Mode: `live` +- Projects: `16`; covered: `16`; rejected: `0`; gaps: `0`; create_issue: `0` + +## Decisions + +| Project | Upstream change | ELF verdict | Issue decision | Acceptance evidence | +| --- | --- | --- | --- | --- | +| `agentmemory` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `mem0` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `qmd` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `claude-mem` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `openviking` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `graphiti` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `letta` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `lightrag` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `graphrag` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `ragflow` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `memsearch` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `langgraph` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `nanograph` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `llm-wiki` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `gbrain` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | +| `graphify` | No GitHub metadata delta was observed since the prior cursor. | `covered` | `no_issue` | No-issue decision recorded in the cursor.; Coverage evidence points at checked-in ELF research docs. | + +## Safety Boundary + +- The radar records upstream movement as a trigger for source review, not as proof of parity or a reason to adopt an external runtime. +- `create_issue` decisions are valid only when the cursor includes source links, repo evidence, non-goals, validation criteria, and Linear duplicate-search evidence. +- No-issue runs remain useful because each project records why ELF is already covered or why metadata-only movement was rejected. diff --git a/docs/spec/external_memory_pattern_radar_v1.md b/docs/spec/external_memory_pattern_radar_v1.md new file mode 100644 index 00000000..ccde7b34 --- /dev/null +++ b/docs/spec/external_memory_pattern_radar_v1.md @@ -0,0 +1,118 @@ +# External Memory Pattern Radar v1 + +Purpose: Define the durable cursor, run, and issue-decision contract for ELF's external +memory pattern radar. +Status: normative +Read this when: You are changing the weekly radar runner, cursor file, summary output, +or follow-up issue creation boundary. +Not this document: The current project comparison, benchmark results, or step-by-step +operator runbook. +Defines: `elf.external_memory_pattern_radar_cursor/v1` and +`elf.external_memory_pattern_radar_run/v1`. + +## Goal + +The radar keeps ELF aware of fast-moving memory, RAG, graph-memory, and +agent-continuity systems without weakening ELF's evidence-linked source-of-truth model. + +The radar is a decision-support workflow. It is not an adoption workflow. + +## Artifacts + +Canonical checked-in paths: + +- Cursor: `docs/research/external_memory_pattern_radar/cursor.json` +- Latest prose summary: `docs/research/external_memory_pattern_radar/latest.md` + +Temporary dry-run outputs may be written under `tmp/external-memory-pattern-radar/`. + +## Cursor Schema + +`cursor.json` must use: + +```json +{ + "schema": "elf.external_memory_pattern_radar_cursor/v1", + "cadence": "weekly", + "generated_at": "RFC3339 timestamp", + "source_docs": ["repo-relative path or URL"], + "projects": [], + "last_run": null +} +``` + +Each `projects[]` entry must contain: + +| Field | Type | Requirement | +| --- | --- | --- | +| `id` | string | Stable snake-case or kebab-safe project id. | +| `name` | string | Human-readable project name. | +| `repo` | string | GitHub `owner/name`. | +| `homepage` | string | Primary upstream URL. | +| `watch_focus` | string array | ELF benchmark or product dimensions watched for this project. | +| `primary_references` | string array | Repo-relative docs or source URLs used as current ELF context. | +| `coverage_evidence` | evidence array | Existing ELF evidence for duplicate/coverage checks. | +| `last_seen` | object or null | Last observed GitHub metadata. | + +`coverage_evidence[]` entries must contain `label`, `path`, and `summary`. + +## Run Schema + +`last_run` must use: + +```json +{ + "schema": "elf.external_memory_pattern_radar_run/v1", + "run_id": "string", + "generated_at": "RFC3339 timestamp", + "mode": "live|offline", + "summary": {}, + "decisions": [] +} +``` + +Every run must include one decision per project. + +## Decision Contract + +Every `decisions[]` entry must record: + +| Field | Requirement | +| --- | --- | +| `project_id` | Must match a cursor project id. | +| `upstream_change` | What changed upstream, or why no upstream fetch/change occurred. | +| `reusable_pattern` | Candidate reusable pattern, or why no pattern is claimed. | +| `elf_verdict` | One of `covered`, `reject`, or `gap`. | +| `product_value` | Product value or explicit no-value statement. | +| `duplicate_coverage_evidence` | Existing ELF docs, issues, benchmark records, or code pointers. | +| `safety_boundary` | Boundary preventing unsafe adoption, overclaiming, or hidden runtime changes. | +| `issue_decision` | No-issue, defer, or create-issue decision with rationale. | +| `acceptance_evidence` | Evidence that the radar decision itself met this contract. | +| `source_links` | Upstream links used by the decision. | + +Metadata-only upstream movement must not produce `elf_verdict = "gap"`. Metadata-only +movement may only produce `covered` or `reject`, because stars, push timestamps, and +release tags are review triggers rather than architecture evidence. + +## Issue Creation Boundary + +`issue_decision.action = "create_issue"` is valid only when all of the following are +present in the same decision record: + +- `elf_verdict = "gap"` +- upstream source links +- repo evidence showing the ELF gap or missing coverage +- explicit non-goals +- validation criteria +- Linear duplicate-search evidence with `duplicate_search.queried = true` + +If any item is missing, the decision must be `no_issue` or `defer`. + +## Scheduled Workflow Boundary + +GitHub Actions may refresh metadata and upload read-only artifacts. GitHub Actions must +not make AI source-review judgments, create Linear issues, or claim adoption value from +activity alone. + +Codex or Decodex automation may promote a radar observation into a follow-up issue only +after source review and duplicate search satisfy this spec. diff --git a/docs/spec/index.md b/docs/spec/index.md index 758ae782..86c90cd8 100644 --- a/docs/spec/index.md +++ b/docs/spec/index.md @@ -1,24 +1,76 @@ # Spec Index -Purpose: Provide the canonical entry point for repository specifications. +Purpose: Route agents to normative documents that define repository truth. +Status: normative +Read this when: You need to find the authoritative contract before changing code or data. +Not this document: Step-by-step execution guidance or saved planning artifacts. +Defines: Routing rules for normative documents under `docs/spec/`. -Audience: This documentation is written for LLM consumption and should remain explicit and unambiguous. +Question this index answers: "what must remain true?" -## Structure +## Use this index when -- Store specs directly under `docs/spec/` (flat structure). -- Use descriptive file names with stable prefixes (`system_`, `t0_`, `t1_`, `trace_`, `search_`). -- Link new specs from `docs/index.md` or `docs/guide/index.md` when relevant. +- You need an invariant, contract, schema, enum, state model, interface, or required + behavior. +- You are deciding whether code or data is correct. +- A guide says "see the governing spec" and you need the authoritative source. -## Specs +## Do not use this index when -- `docs/spec/system_elf_memory_service_v1.md` - ELF Memory Service v1.0 specification. +- You need step-by-step instructions, maintenance actions, migrations, or incident + response. +- You need a planning-tool artifact or a saved execution plan under `docs/plans/`. +- You want rationale only, without an authoritative contract. -## Authoring guidance (LLM-first) +## What belongs in `docs/spec/` -- Use explicit nouns instead of pronouns whenever possible. -- Define acronyms and domain terms on first use. -- Prefer short sentences with one idea each. -- Include canonical field names, enums, units, and constraints. -- Provide small, concrete examples for non-obvious flows. -- Keep links stable and prefer absolute repo paths. +- Contracts and invariants. +- Data shapes, canonical field names, enums, defaults, units, and limits. +- State transitions and protocol rules. +- Behavior that tests, code, or operators should treat as authoritative. + +## Documents + +- `system_elf_memory_service_v2.md`: Core ELF memory service contract, API semantics, + and storage invariants. +- `system_consolidation_proposals_v1.md`: Reviewable derived consolidation run and + proposal contract over immutable source evidence. +- `system_memory_summary_v1.md`: Reviewable current/background/stale/superseded/ + tombstoned/derived memory summary and source-trace contract. +- `system_knowledge_pages_v1.md`: Derived project/entity/concept/issue/decision page + storage, rebuild, citation, and stale-source lint contract. +- `system_competitive_parity_gate_v1.md`: Docker-only adoption gate that decides + whether ELF meets or exceeds selected external memory-system baselines. +- `production_corpus_manifest_v1.md`: Sanitized/private coding-agent production + corpus manifest schema for adoption benchmark runs. +- `real_world_agent_memory_benchmark_v1.md`: Real-world agent memory benchmark job + schema, suite taxonomy, scoring dimensions, and report state semantics. +- `external_memory_pattern_radar_v1.md`: Weekly external memory pattern radar cursor, + run, decision, and issue-creation boundary schema. + +## Spec document contract + +Start each spec with a compact routing header: + +- `Purpose` +- `Status: normative` +- `Read this when` +- `Not this document` +- `Defines` + +Then keep the body explicit: + +- Prefer concrete nouns over pronouns. +- Separate facts from rationale. +- Include canonical names exactly as code or data uses them. +- Include a small example when it removes ambiguity. +- Link to related guides instead of embedding procedures. + +## Structure policy + +- Prefer shallow paths while the spec set is small. +- Add subfolders only when they mirror stable system boundaries or materially reduce + ambiguity. +- Do not require fixed filename prefixes up front. +- Choose names for topic clarity and retrieval quality, not visual uniformity. +- If a guide depends on a spec, the guide links back to the governing spec. diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md new file mode 100644 index 00000000..36347823 --- /dev/null +++ b/docs/spec/production_corpus_manifest_v1.md @@ -0,0 +1,104 @@ +# Production Corpus Manifest v1 + +Purpose: Define the sanitized/private coding-agent production corpus manifest used by +ELF adoption benchmarks. +Status: normative +Read this when: You are creating, validating, or running a production-style personal +agent memory benchmark corpus. +Not this document: Docker benchmark run commands, report publication steps, or private +fixture storage procedures. +Defines: `elf.production_corpus_manifest/v1` fields, required evidence categories, +query tasks, evidence expectations, and private-content safety rules. + +## Contract + +A production corpus manifest is a JSON object with: + +- `schema`: exactly `elf.production_corpus_manifest/v1`. +- `manifest_id`: stable lower-risk identifier for the corpus snapshot. Allowed + shape: `[a-z0-9][a-z0-9_.-]{1,80}`. +- `description`: optional English summary. +- `evidence`: non-empty array of production-style memory evidence items. +- `queries`: non-empty array of task-oriented retrieval checks. + +The checked-in benchmark fixture must be synthetic and sanitized. Real private +production content must not be committed. + +## Evidence Items + +Each `evidence[]` item must include: + +- `evidence_id`: lower-case ASCII identifier safe for filenames. Allowed shape: + `[a-z0-9][a-z0-9_.-]{1,80}`. +- `category`: one of `issue`, `pr`, `worktree`, `runbook`, `decision`, `blocker`, + or `recovery_note`. +- `title`: short English title. +- Exactly one of: + - `text`: sanitized inline English evidence text. + - `local_path`: path to a local sanitized text/Markdown file, resolved relative to + the manifest when not absolute. + +Evidence text must not contain secrets, tokens, private keys, personal credentials, or +unsanitized private conversation content. + +## Query Cases + +Each `queries[]` item must include: + +- `query_id`: stable query identifier. Allowed shape: + `[a-z0-9][a-z0-9_.-]{1,80}`. +- `task`: one of `resume_lane`, `recover_exact_command`, `explain_stale_blocker`, + `find_prior_decision`, `compare_project_status`, or + `detect_contradiction_update`. +- `query`: English task-oriented search query. +- `expected_evidence_ids`: non-empty array of evidence IDs that satisfy the query. +- `allowed_alternate_evidence_ids`: array of acceptable alternate evidence IDs. Use + an empty array when no alternate is allowed. +- `expected_terms`: non-empty array of terms that should appear in the matched + evidence snippet when the expected note key is not the top result. + +Every query must record both expected evidence IDs and allowed alternates, even when +the allowed alternate list is empty. + +## Benchmark Mapping + +The Docker benchmark materializes each evidence item as a temporary Markdown document +inside the benchmark work directory. The source document filename is +`<evidence_id>.md`. Reports must expose evidence IDs and allowed alternates, not local +private file paths. + +For `production-private` runs, the runner must fail closed when the manifest is absent, +the manifest references a missing `local_path`, or any query references an unknown +evidence ID. It must not silently fall back to the checked-in synthetic corpus. + +## Minimal Example + +```json +{ + "schema": "elf.production_corpus_manifest/v1", + "manifest_id": "local-private-prod-corpus-2026-06-09", + "evidence": [ + { + "evidence_id": "issue-xy123-resume", + "category": "issue", + "title": "XY-123 Resume State", + "text": "XY-123 resumes on branch y/example with command `cargo make check`." + } + ], + "queries": [ + { + "query_id": "q-resume-xy123", + "task": "resume_lane", + "query": "How do I resume XY-123?", + "expected_evidence_ids": ["issue-xy123-resume"], + "allowed_alternate_evidence_ids": [], + "expected_terms": ["XY-123", "cargo make check"] + } + ] +} +``` + +## Related Guides + +- `docs/guide/benchmarking/live_baseline_benchmark.md`: run commands, private fixture + placement, and report publication. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md new file mode 100644 index 00000000..b371e9a5 --- /dev/null +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -0,0 +1,672 @@ +# Real-World Agent Memory Benchmark v1 + +Purpose: Define the v1 benchmark contract for evaluating agent memory systems through +real user jobs instead of isolated top-k retrieval queries. +Status: normative +Read this when: You are implementing, validating, reporting, or extending real-world +agent memory benchmark suites. +Not this document: Runner implementation steps, large fixture generation, operator +commands, or production adoption verdicts. +Defines: `real_world_job` schema, suite taxonomy, scoring dimensions, report states, +allowed uncertainty, and external reference mapping. + +## Scope + +The benchmark unit is `real_world_job`: a replayable user job that combines a corpus, +timeline, user prompt, expected answer, required evidence, negative traps, scoring +rubric, and allowed uncertainty. A job is intended to answer one question: would this +memory system help an agent do real work correctly, with less repetition and fewer +unsupported claims? + +This contract is future benchmark authority only. Existing live baseline reports remain +valid evidence for their encoded retrieval and lifecycle checks. A project must not +claim wins under this v1 suite until a runner encodes the relevant suites and publishes +a report against this contract. + +## Design Goals + +- Evaluate job completion, not only whether one expected chunk appears in top-k. +- Reward evidence-backed answers, stale-fact handling, and recoverable reasoning. +- Penalize confident but unsupported claims even when retrieval looks plausible. +- Preserve typed failure states instead of flattening every result into one leaderboard. +- Keep external project strengths visible as suite references, not as automatic + superiority claims. + +## Why The Current Benchmark Is Incomplete + +The June 2026 live baseline is necessary but biased toward service-style retrieval and +encoded lifecycle checks. ELF and qmd leading that matrix proves that those systems can +retrieve expected evidence and pass encoded update/delete/cold-start checks under the +selected Docker profiles. It does not prove that they help an agent resume a lane, +explain a decision, debug a failed retrieval, reconcile stale notes, compile durable +knowledge, or avoid unsupported claims during an end-to-end user job. + +This suite fixes that bias by making the job transcript, expected answer, required +evidence, traps, and scoring rubric first-class. A system can pass retrieval and still +fail a real-world job if it repeats completed work, cites obsolete evidence, omits a +blocking caveat, or fabricates a decision that is not in the corpus. + +## Real-World Job Schema + +A `real_world_job` record MUST include the fields below. JSON is the canonical exchange +shape; YAML fixtures MAY be used only when converted to the same field names before +runner execution. + +```json +{ + "schema": "elf.real_world_job/v1", + "job_id": "trust-sot-restore-001", + "suite": "trust_source_of_truth", + "title": "Recover the authoritative restore decision", + "corpus": {}, + "timeline": [], + "prompt": {}, + "expected_answer": {}, + "required_evidence": [], + "negative_traps": [], + "scoring_rubric": {}, + "allowed_uncertainty": {}, + "operator_debug": {}, + "encoding": {}, + "memory_evolution": {}, + "memory_summary": {}, + "tags": [] +} +``` + +### Required Top-Level Fields + +| Field | Type | Required semantics | +| --- | --- | --- | +| `schema` | string | MUST equal `elf.real_world_job/v1`. | +| `job_id` | string | Stable ASCII identifier unique within a suite. | +| `suite` | string | One suite id from the Suite Taxonomy section. | +| `title` | string | Human-readable job title. | +| `corpus` | object | Documents, memory items, traces, source refs, and adapter setup needed to replay the job. | +| `timeline` | array | Ordered events that establish what happened before the user prompt. | +| `prompt` | object | The user-facing request sent to the evaluated memory system or agent harness. | +| `expected_answer` | object | Required answer content, accepted uncertainty, and forbidden claims. | +| `required_evidence` | array | Evidence ids, source refs, quotes, or trace handles that must support the answer. | +| `negative_traps` | array | Distractors, stale facts, or misleading memories that must not drive the answer. | +| `scoring_rubric` | object | Dimensions, weights, thresholds, and hard-fail rules for this job. | +| `allowed_uncertainty` | object | Explicit uncertainty language and fallback behavior accepted for the job. | +| `operator_debug` | object or null | Optional for most suites; required for `operator_debugging_ux` jobs. Records trace/viewer evidence and operator workflow scoring inputs. | +| `encoding` | object | Optional job-level limitation declaration. Only `not_encoded`, `blocked`, and `incomplete` statuses are allowed here. | +| `memory_evolution` | object or null | Optional for most suites; used by `memory_evolution` jobs to report current evidence, historical evidence, stale traps, conflicts, update rationale, and temporal-validity limitations. | +| `memory_summary` | object or null | Optional for most suites; used by `memory_summary` jobs to report reviewable summary/source-trace metrics defined in `system_memory_summary_v1.md`. | +| `tags` | array | Optional labels such as `private_corpus`, `synthetic`, `adapter_required`, or `no_live_claim`. | + +### `corpus` + +`corpus` MUST identify all replay inputs without relying on hidden host state. + +Required fields: + +- `corpus_id`: stable id. +- `profile`: `synthetic`, `private_sanitized`, `generated_public`, or `external_adapter`. +- `items`: array of corpus items. + +Each `items[]` entry MUST include: + +- `evidence_id`: stable id used by `required_evidence` and `negative_traps`. +- `kind`: `note`, `document`, `trace`, `issue`, `pr`, `runbook`, `decision`, `message`, + `compiled_page`, or `adapter_state`. +- `text` or `local_ref`: inline sanitized text or a local fixture pointer. +- `source_ref`: object; MAY be `{}` only for generated synthetic fixtures. +- `created_at`: RFC3339 timestamp or `null` when time is intentionally irrelevant. + +Each `items[]` entry MAY include: + +- `capture`: object used by live capture/write-policy materializers. Supported fields: + - `action`: `store` or `exclude`. `exclude` means the item is an expected capture + input but MUST NOT be stored in the evaluated memory system. + - `source_id`: optional stable source identifier that must be preserved in the + resulting source reference when the item is stored. + - `evidence_binding`: optional label for the evidence-binding mode the live adapter + must preserve. + - `write_policy`: optional write-policy object applied before storage. Redactions + and exclusions from this policy must be counted in the materialization artifact. + +Optional corpus fields: + +- `capture_behaviors`: object used by `capture_integration` jobs and fixture-backed + suites to classify integration evidence. Supported arrays are `real`, + `fixture_backed`, `mocked`, `blocked`, `not_encoded`, and `notes`. + `fixture_backed` means the behavior is represented by checked-in fixture evidence, + not by a live adapter pass. Reports MUST NOT convert `fixture_backed`, `mocked`, + `blocked`, or `not_encoded` behavior into a live integration success claim. + +Private corpus fixtures MUST use sanitized inline text or local refs excluded from git. +Reports MAY publish evidence ids and score summaries without publishing private text. + +### External Adapter Manifest + +Real-world reports MAY include an external adapter manifest. When present, the manifest +MUST use this schema id: + +```text +elf.real_world_external_adapter_manifest/v1 +``` + +The manifest is the stable adapter-pack contract for comparing external memory projects +against `real_world_job` suites. It records what an adapter actually executed, which +coverage is only fixture-backed or live-baseline-only, and which suites remain blocked, +unsupported, incomplete, or not encoded. It MUST NOT be used to convert retrieval-only +live-baseline evidence into a real-world suite win. + +Required manifest fields: + +- `manifest_id`: stable ASCII id for the checked-in or generated manifest. +- `docker_isolation`: object describing the default execution boundary. +- `adapters`: array of adapter records. + +`docker_isolation` MUST include: + +- `default`: boolean; MUST be `true` for repository-supported external adapter runs + unless a separate issue records why Docker is impossible. +- `compose_file`: Docker Compose file used by the supported runner. +- `runner`: script or command entrypoint used inside the Compose boundary. +- `artifact_dir`: relative artifact directory for logs and reports. +- `host_global_installs_required`: boolean; MUST be `false` for default external + runs. +- `notes`: optional bounded explanatory strings. + +Each `adapters[]` record MUST include: + +- `adapter_id`: stable id unique within the manifest. +- `project`: display name such as `qmd`, `agentmemory`, or `mem0/OpenMemory`. +- `adapter_kind`: local execution shape, for example `docker_cli_same_corpus`, + `docker_sdk_same_corpus`, `offline_fixture_response`, or `research_gate`. +- `evidence_class`: one of `fixture_backed`, `live_baseline_only`, + `live_real_world`, or `research_gate`. +- `docker_default`: boolean. +- `host_global_installs_required`: boolean. +- `overall_status`: one adapter status from the table below. +- `setup`, `run`, and `result`: evidence objects with `status`, `evidence`, and + optional `command` and `artifact`. +- `capabilities`: array of capability coverage records with `capability`, `status`, + and `evidence`. +- `suites`: array of real-world suite coverage records with `suite_id`, `status`, and + `evidence`. +- `scenarios`: optional array of scenario judgment records with `scenario_id`, + optional `suite_id`, `status`, `elf_position`, optional `comparison_outcome`, + `evidence`, and optional `command` and `artifact`. `elf_position` MUST be one of + `wins`, `ties`, `loses`, or `untested`. `comparison_outcome`, when present, MUST be + one of `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal`. Scenario rows + with `status = "blocked"` MUST set `comparison_outcome = "blocked"` explicitly so a + blocked evidence path is not derived from `elf_position = "untested"` as + `not_tested`. Reports SHOULD derive `comparison_outcome` from `elf_position` when + omitted for non-blocked rows, but SHOULD use the explicit field for scenarios where + the legacy ELF-relative position is less precise than the report outcome. Scenario + judgments are report inputs for dimension-level comparison; they MUST NOT convert + live-baseline-only evidence into real-world suite pass claims. +- `evidence`: array of evidence pointers with `kind`, `ref`, and `status`. +- `notes`: optional bounded explanatory strings. +- `follow_up`: optional `title` and `reason`. +- `execution_metadata`: optional object used by expanded adapter packs and research + gates. When present, it MUST include `sources`, `setup_path`, + `runtime_boundary`, `resource_expectation`, and `retry_guidance`. It MAY include + `research_depth`. + +`research_gate` evidence class means the adapter record is a checked-in gating record +for future implementation, not a benchmark execution result. It is used when a project +needs D1/D2 research, resource sizing, credentials, Docker runtime proof, or source +mapping before a fair adapter can run. A `research_gate` record MUST NOT be counted as +fixture-backed, live-baseline-only, or live-real-world evidence. + +`execution_metadata.sources[]` entries MUST include: + +- `label`: short source label. +- `url`: official source, docs, or repository URL. +- `evidence`: bounded description of why the source matters. + +`execution_metadata` fields: + +- `setup_path`: intended setup path or the setup blocker to resolve. +- `runtime_boundary`: Docker/service/CLI/process boundary expected for safe runs. +- `resource_expectation`: expected resource or credential envelope, including unknowns. +- `retry_guidance`: one or more concrete next checks before claiming pass/fail. +- `research_depth`: optional `D0`, `D1`, or `D2` research state. + +Adapter coverage status terms: + +| Term | Meaning | +| --- | --- | +| `real` | The adapter capability is exercised through the project's real local API, CLI, storage, or service surface. | +| `mocked` | The adapter uses a mock, in-memory substitute, fixture replay, or other non-durable stand-in for the named capability. | +| `unsupported` | The project or safe Docker profile does not expose the capability. This is not a quality penalty. | +| `blocked` | The check cannot run safely without credentials, manual setup, durable runtime integration, private input, or host integration outside the run scope. | +| `incomplete` | Setup, build, dependency, adapter wiring, parse, or runtime execution did not reach the behavioral check. | +| `wrong_result` | The adapter reached execution but produced the wrong answer, memory, evidence, or action. | +| `lifecycle_fail` | Retrieval may work, but encoded update, delete, expiry, cold-start, persistence, history, or supersession behavior failed. | +| `pass` | The declared adapter check completed and met its encoded expectations. | +| `not_encoded` | The capability, suite, or adapter path is not implemented in the runner, so no pass/fail claim is allowed. | + +Reports that load a manifest MUST emit an `external_adapters` section with schema id +`elf.real_world_external_adapter_report/v1`, the manifest id, Docker isolation +metadata, per-adapter records, and summary counters for: + +- adapter count, external project count, Docker-default count, host-global-install + count; +- `fixture_backed`, `live_baseline_only`, `live_real_world`, and `research_gate` + evidence classes; +- overall adapter statuses; +- capability coverage statuses; +- real-world suite coverage statuses. + +For `elf.real_world_external_adapter_report/v1`, `adapter_count` is the number of +adapter records in the loaded manifest. `external_project_count` is the number of +unique non-ELF project names represented by those records, not the number of non-ELF +adapter records. Multiple adapter records for the same external project MUST count as +one external project in this summary. + +Adapter-pack issues SHOULD add new projects by appending adapter records to this +manifest shape. They MUST NOT change these status meanings to make a project look +better or worse. + +### `timeline` + +`timeline` MUST model the user job as prior agent work, not just a bag of documents. + +Each event MUST include: + +- `event_id` +- `ts` +- `actor`: `user`, `agent`, `tool`, `system`, `operator`, or `external` +- `action`: short verb phrase such as `created_issue`, `made_decision`, + `ran_command`, `hit_blocker`, `updated_memory`, `deleted_memory`, or + `published_report` +- `evidence_ids`: one or more ids from `corpus.items[]` +- `summary`: compact English summary + +Timeline order is normative. If a later event supersedes an earlier fact, the expected +answer MUST follow the later event unless `allowed_uncertainty` permits a historical +answer. + +### `prompt` + +`prompt` MUST include: + +- `role`: normally `user`. +- `content`: the exact user request. +- `job_mode`: `resume`, `answer`, `debug`, `decide`, `compile`, `personalize`, or + `operate`. +- `constraints`: array of explicit instructions such as `do_not_run_live_actions`, + `cite_evidence`, `avoid_repeating_completed_work`, or `state_blockers`. + +The evaluated system MAY retrieve memory, inspect its own state, or call adapter tools +only when the runner profile permits those actions. + +### `expected_answer` + +`expected_answer` MUST define answer correctness at the job level. + +Required fields: + +- `must_include`: array of claims or actions that must appear. +- `must_not_include`: array of forbidden claims, stale facts, or unsafe actions. +- `evidence_links`: mapping from required claim ids to acceptable evidence ids. +- `answer_type`: `direct_answer`, `work_plan`, `resume_summary`, `debug_report`, + `decision_record`, `compiled_knowledge`, or `ops_runbook`. + +Optional fields: + +- `accepted_alternates`: array of alternate phrasings or equivalent evidence ids. +- `requires_caveat`: boolean; when true, omitting the caveat is a scoring failure. +- `requires_refusal`: boolean; when true, the correct answer is to decline or stop + because the memory system lacks evidence or authority. + +### `required_evidence` + +Each required evidence entry MUST include: + +- `evidence_id` +- `claim_id` +- `requirement`: `cite`, `use`, `avoid`, or `explain` +- `quote` or `selector`: exact quote for inline fixtures, or a stable selector for + local/private fixtures. + +An answer that states a required claim without any acceptable evidence link is an +`unsupported_claim` unless the job's `allowed_uncertainty` explicitly permits an +uncited low-confidence statement. + +### Optional `adapter_response.answer.pages` + +Knowledge-compilation fixtures MAY include generated page artifacts in +`corpus.adapter_response.answer.pages[]`. These page artifacts are benchmark outputs, +not authoritative source truth. Any checked-in generated page fixture MUST be clearly +marked as a benchmark artifact. + +Each page entry MUST include: + +- `page_id`: stable page identifier, such as `project:elf-benchmark-suite`. +- `page_type`: `project`, `entity`, `concept`, `issue_timeline`, or another + fixture-defined type. +- `title`: human-readable page title. +- `path`: optional fixture path for a checked-in benchmark artifact page. +- `sections`: generated page sections. +- `backlinks`: zero or more page, entity, concept, issue, or evidence identifiers. +- `lint_findings`: zero or more stale, unsupported, or contradiction findings. +- `rebuild`: optional rebuild comparison record. + +Each `sections[]` entry MUST include: + +- `section_id` +- `heading` +- `role`: examples include `current_truth`, `history`, `timeline`, `backlinks`, and + `summary`. +- `content`: bounded fixture text. +- `evidence_ids`: zero or more ids from `corpus.items[]`. +- `timeline_event_ids`: zero or more ids from `timeline[]`. +- `unsupported_reason`: optional reason why the section is intentionally unsupported. + +Every generated page section MUST trace back to at least one `evidence_id` or +`timeline_event_id`, or it MUST include `unsupported_reason`. A section that lacks both +trace evidence and an unsupported flag is an `unsupported_claim`. A section with +`role = "summary"` and `unsupported_reason` is counted as an unsupported summary, but it +is not a hidden unsupported claim because the page explicitly marks the gap. + +Each `lint_findings[]` entry SHOULD include: + +- `finding_id` +- `finding_type`: for example `stale_claim`, `unsupported_claim`, or + `contradiction`. +- `severity` +- `text` +- `evidence_ids` +- `trap_id`: optional link to `negative_traps[]`. + +Each `rebuild` record SHOULD include: + +- `first_hash` +- `second_hash` +- `deterministic`: true when repeat rebuilds produced byte-stable output. +- `allowed_variance`: explanations for accepted non-semantic variance. + +Knowledge-compilation reports SHOULD include citation coverage, stale claim detection, +rebuild determinism, page usefulness, backlink counts, unsupported summary count, and +untraced section count. Rebuild results are acceptable only when repeated output is +deterministic enough for regression comparison or every allowed variance is explicitly +reported. + +### `negative_traps` + +Negative traps MUST be explicit so systems are tested against realistic memory failure +modes. + +Trap types: + +- `stale_fact`: once true but superseded later in the timeline. +- `near_duplicate`: semantically close but wrong project, user, tenant, or time. +- `decoy_evidence`: shares query terms but does not support the expected claim. +- `unsafe_action`: would perform live, destructive, credentialed, or out-of-scope work. +- `unsupported_prior`: plausible prior decision not present in the corpus. +- `privacy_leak`: private or excluded content that must not appear in the answer. + +Each trap MUST include `trap_id`, `type`, `evidence_ids`, and `failure_if_used`. + +### `encoding` + +`encoding` declares a fixture that is intentionally not scored as a runnable pass +because the benchmark capability is not encoded or cannot run yet. + +Allowed `status` values: + +- `not_encoded`: the fixture documents a capability gap and must not claim pass. +- `blocked`: required adapter, corpus, or system support is missing. +- `incomplete`: fixture execution cannot reach a complete scored state. + +When `status` is present, `reason` MUST be a non-empty explanation. `follow_up` is +optional, but when present it MUST include non-empty `title` and `reason` fields. + +### `memory_evolution` + +`memory_evolution` is used by jobs that test whether an answer distinguishes current +facts, historical facts, stale facts, conflicts, corrected memories, and missing +temporal validity support. + +Fields: + +- `current_evidence_ids`: evidence ids that support the current answer. +- `historical_evidence_ids`: evidence ids that are historically true but not current + answers unless the prompt asks for history. +- `tombstone_evidence_ids`: evidence ids that prove a deleted memory, TTL expiry, or + DELETE outbox tombstone should suppress an older fact. +- `invalidation_evidence_ids`: evidence ids that prove a fact was invalidated by a + higher-priority lifecycle event even if it remains available as history. +- `stale_trap_ids`: negative trap ids that represent stale answers. +- `conflicts`: array of conflicts with `conflict_id`, `claim_id`, + `current_evidence_id`, `historical_evidence_id`, and optional + `resolved_by_evidence_id`. +- `update_rationale`: optional object with `claim_id`, `evidence_ids`, and + `available` to show whether the answer can explain why the memory changed. +- `temporal_validity`: optional object with `required`, `encoded`, and optional + `follow_up`. When `required = true` and `encoded = false`, the job MUST declare + `encoding.status = "not_encoded"` or `encoding.status = "blocked"`. + When `encoded = true`, the job is scored normally and must include concrete + produced evidence for current and historical validity behavior. + +### `operator_debug` + +`operator_debug` is required when `suite = "operator_debugging_ux"` and optional +elsewhere. It records whether a human operator can identify the root cause through +viewer, trace, or CLI readback without raw SQL. + +Required fields: + +- `failure_mode`: stable label such as `expected_evidence_dropped`, + `rerank_promoted_bad_candidate`, `provider_latency_or_failure`, + `rebuild_changed_results`, or `relation_context_misled_search`. +- `trace_id`: trace handle when available. +- `viewer_url`: read-only viewer path that opens the trace evidence when available. +- `admin_trace_bundle_url`: direct admin trace bundle path when available. +- `root_cause`: concise expected diagnosis. +- `steps_to_root_cause`: number of viewer or CLI steps needed to reach the diagnosis. +- `raw_sql_needed`: must be `false` for a pass under this suite. +- `dropped_candidate_visibility`: whether dropped, retained, or misleading candidates + are visible through trace/viewer evidence. +- `trace_completeness`: `complete`, `partial`, or `missing`. +- `repair_action_clarity`: `clear`, `partial`, or `missing`. +- `viewer_panels`: viewer panels used, such as `Replay Candidates`, `Stage Details`, + `Providers And Ranking`, or `Relation Context`. +- `cli_steps`: equivalent CLI or endpoint steps. +- `trace_evidence`: evidence ids used for the diagnosis. +- `ux_gaps`: array of focused follow-up pointers when a needed panel or endpoint is + absent. + +Each `ux_gaps[]` entry MUST include `gap_id`, `severity`, `description`, and +`follow_up_issue`. If a fixture requires a missing panel, the report must encode the +gap instead of hiding it behind a wrong-result score. + +### `scoring_rubric` + +The rubric MUST be job-specific but use the shared dimensions below. + +Required dimensions: + +- `answer_correctness`: expected answer content and action selection. +- `evidence_grounding`: correct use of required evidence and source refs. +- `trap_avoidance`: avoidance of stale, decoy, privacy, and unsafe traps. +- `uncertainty_handling`: honest caveats when evidence is missing or ambiguous. +- `workflow_helpfulness`: whether the answer advances the user job without needless + repetition. + +Optional dimensions: + +- `lifecycle_behavior`: update, delete, expiry, supersession, or cold-start behavior. +- `debuggability`: trace, timeline, viewer, or explanation quality. +- `latency_resource`: bounded runtime, cost proxy, or resource envelope. +- `personalization_fit`: correct user/project preference application without leakage. + +Rubric fields: + +- `dimensions`: object keyed by dimension name, each with `weight`, `max_points`, and + `criteria`. +- `pass_threshold`: total normalized score required for `pass`. +- `hard_fail_rules`: array of rules that force a non-pass status regardless of score. + +Hard-fail rules MUST include: + +- unsupported high-confidence claim about a required decision or fact; +- unsafe live/destructive action when the prompt forbids it; +- use of a negative trap marked `failure_if_used = true`; +- missing required refusal when the job has `requires_refusal = true`. + +### `allowed_uncertainty` + +`allowed_uncertainty` MUST distinguish honest uncertainty from failure. + +Required fields: + +- `can_answer_unknown`: boolean. +- `acceptable_phrases`: array of accepted uncertainty phrases or patterns. +- `fallback_action`: `ask_for_evidence`, `state_blocker`, `cite_partial_evidence`, + `refuse`, or `continue_with_caveat`. + +If `can_answer_unknown = false`, an answer that refuses despite sufficient evidence is +`wrong_result`. If `can_answer_unknown = true`, an answer that invents missing evidence +is `unsupported_claim`. + +## Suite Taxonomy + +Suite ids are stable public names. Each suite MUST contain at least one +`real_world_job` before a report may claim suite coverage. + +| Suite id | Goal | User-job examples | Evidence requirements | Scoring dimensions | Strongest external references | +| --- | --- | --- | --- | --- | --- | +| `trust_source_of_truth` | Verify authoritative storage, provenance, rebuild, and non-authoritative derived index handling. | Restore a note after Qdrant rebuild; identify whether a compiled page is derived; explain why a source ref supports a claim. | Source note/document ids, restore or rebuild trace, source_ref lineage, no hidden index-only evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | ELF, memsearch, OpenViking. | +| `work_resume` | Help an agent resume real work without repeating completed steps or losing blockers. | Resume a retained lane; identify next command after a failed run; summarize what remains blocked. | Timeline events, issue/PR ids, run summaries, latest blocker evidence. | answer_correctness, workflow_helpfulness, uncertainty_handling, trap_avoidance. | agentmemory, claude-mem, OpenViking. | +| `project_decisions` | Recover durable decisions, rationale, reversals, and current policy. | Explain why a design was chosen; distinguish old vs current validation gate; cite decision evidence. | Decision records, superseding events, accepted alternatives, current-policy timestamp. | answer_correctness, evidence_grounding, trap_avoidance, uncertainty_handling. | ELF, gbrain, llm-wiki, Letta. | +| `retrieval` | Measure task-relevant retrieval quality beyond top-k keyword matching. | Answer a task query with expected evidence; find alternate phrasing; avoid near-duplicate project evidence. | Expected evidence ids, allowed alternates, decoy evidence ids, trace ids when available. | answer_correctness, evidence_grounding, trap_avoidance, latency_resource. | qmd, ELF, memsearch, OpenViking. | +| `memory_evolution` | Verify updates, deletes, expiry, supersession, contradiction handling, and history. | Apply a new preference; suppress a deleted memory; explain what superseded an old fact. | Before/after memory versions, ingest decision rows or adapter history, current timeline event. | lifecycle_behavior, answer_correctness, evidence_grounding, trap_avoidance. | mem0, ELF, Graphiti/Zep, Letta. | +| `consolidation` | Test reviewable derived memory formation without hidden source mutation. | Produce a consolidation proposal; identify unsupported claims; discard stale synthesis. | Source inputs, derived proposal id, lineage, review state, conflict markers. | answer_correctness, evidence_grounding, uncertainty_handling, debuggability. | Claude Dreams, Gemini CLI Auto Memory, Always-On Memory Agent, ELF. | +| `memory_summary` | Test reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile memory readback. | Produce a current memory summary; downgrade stale memory; expose a TTL tombstone; refuse an unsupported derived profile claim. | Summary entry source refs, freshness and validity markers, source trace, inclusion/downgrade/exclusion rationale, unsupported-claim flags. | answer_correctness, evidence_grounding, lifecycle_behavior, trap_avoidance, uncertainty_handling. | OpenAI Dreaming, Claude Dreams, Always-On Memory Agent, ELF. | +| `knowledge_compilation` | Compile evidence into maintained project/entity/concept pages while preserving provenance. | Build a project status page; answer from compiled truth plus timeline; lint a stale page section. | Page section sources, backlinks, timeline entries, lint evidence. | answer_correctness, evidence_grounding, workflow_helpfulness, trap_avoidance. | llm-wiki, gbrain, graphify, ELF. | +| `operator_debugging_ux` | Show whether a wrong or ambiguous memory result can be debugged without raw store spelunking. | Explain why a result ranked first; inspect a trace; identify which stage dropped expected evidence. | Trace bundle, retrieval trajectory, candidate metrics, viewer or CLI readback. | debuggability, evidence_grounding, workflow_helpfulness, answer_correctness. | claude-mem, qmd, agentmemory, ELF. | +| `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. | +| `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | +| `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. | Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. | +| `core_archival_memory` | Verify always-loaded core memory behavior separately from archival note search and derived retrieval indexes. | Read an attached core block; enforce core block scope; detect stale core state from archival evidence; fall back to archival notes; recover a decision from core routing plus archival rationale. | Core block ids, attachment ids, read_profile/scope metadata, source_ref and audit history, archival note evidence ids, stale-core traps, and explicit no-Qdrant-core-block boundary evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior, workflow_helpfulness. | Letta, ELF. | +| `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, expansion paths, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. | + +## Report Semantics + +Reports MUST preserve typed outcomes at job, suite, and project levels. A report MUST +NOT collapse the results into a single overall leaderboard without the underlying typed +state table. + +Outcome terms: + +| Term | Meaning | +| --- | --- | +| `pass` | The job or suite is encoded, ran to completion, met the pass threshold, satisfied required evidence, and hit no hard-fail rule. | +| `wrong_result` | The system completed the job but selected the wrong answer, wrong action, wrong current fact, or missed required evidence despite enough available evidence. | +| `lifecycle_fail` | The answer surface may be correct for retrieval, but encoded update, delete, expiry, cold-start, persistence, history, or supersession behavior failed. | +| `incomplete` | The runner could not reach the behavioral check because install, build, dependency, adapter wiring, parse, or runtime setup failed. | +| `blocked` | The check cannot be run safely without credentials, manual setup, private corpus input, durable runtime integration, or host integration outside the run scope. | +| `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. | +| `unsupported_claim` | The system produced a substantive claim, decision, evidence citation, or capability claim that is not supported by the job corpus, required evidence, or report metadata. | + +`unsupported_claim` is distinct from `wrong_result`: `wrong_result` can be a supported +but incorrect selection, while `unsupported_claim` is an evidentiary failure. When both +apply, reports SHOULD surface `unsupported_claim` because it is higher risk for memory +systems used by agents. + +Suite status rules: + +- A suite is `pass` only when all encoded required jobs pass. +- A suite is `lifecycle_fail` when at least one lifecycle-scored job proves lifecycle + behavior wrong and no higher-risk `unsupported_claim` is present. +- A suite is `wrong_result` when at least one required job returns the wrong result and + no higher-risk `unsupported_claim` is present. +- A suite is `unsupported_claim` when any hard-fail unsupported claim occurs. +- A suite is `incomplete` or `blocked` when required jobs cannot run for those reasons. +- A suite is `not_encoded` when no job in that suite is implemented, or when an + encoded fixture declares a job-level capability gap that prevents a suite pass claim. + +Reports MUST include: + +- run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; +- per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; +- per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer + is allowed, so current-decision, historical-decision, rationale, and caveat cases are + distinguishable in generated reports; +- expected evidence recall and irrelevant context ratio at job, suite, and summary + levels when the runner can derive them from fixture evidence ids; +- trace explainability metadata when an adapter or fixture can identify retrieval + stages, especially for wrong-result stage attribution; +- per-suite typed status and score distribution; +- unsupported claim list with claim text or a bounded redacted description; +- for encoded knowledge-compilation jobs with page artifacts: citation coverage, stale + claim detection, rebuild determinism, page usefulness, backlink counts, unsupported + summary count, and untraced section count; +- explicit `not_encoded` suite list; +- private-corpus redaction policy when private fixtures are used. +- capture/integration coverage classes when any fixture declares `capture_behaviors`, + preserving the `real`, `fixture_backed`, `mocked`, `blocked`, and `not_encoded` + distinction. +- external adapter coverage when an external adapter manifest is loaded, preserving + `fixture_backed`, `live_baseline_only`, `live_real_world`, `research_gate`, + `real`, `mocked`, `unsupported`, `blocked`, `incomplete`, `wrong_result`, + `lifecycle_fail`, `pass`, and `not_encoded` distinctions. Scenario summaries MUST + preserve status counts, legacy `elf_position` counts, and normalized + `comparison_outcome` counts when scenario judgments are present. + +Reports that encode `memory_evolution` jobs SHOULD also include stale-answer counts, +conflict detection counts, update rationale availability, and temporal-validity +`not_encoded` counts. A temporal graph validity job MUST NOT be reported as `pass` +unless the runner can evaluate current-only versus historical relation facts. + +Reports that encode `memory_summary` jobs MUST also include: + +- summary artifact count and entry count; +- source-ref coverage for included or downgraded summary entries; +- freshness-marker and rationale coverage; +- stale-current violation count for top-of-mind entries; +- derived entries missing both source refs and unsupported-claim flags; +- unsupported derived candidate count. +- unsupported derived entries included as current memory. + +A `memory_summary` job MUST NOT pass when stale, superseded, or tombstoned entries are +presented as current top-of-mind facts. A derived project-profile entry MUST NOT pass +unless it has source refs or explicit unsupported-claim flags. A derived entry with +unsupported-claim flags MUST NOT pass when it is included as current memory instead of +being excluded or downgraded for review. + +Consolidation suite reports MUST also include: + +- proposal usefulness score, or `null` when the job has no proposal payloads; +- lineage completeness score over expected source refs; +- review action correctness for `apply`, `discard`, and `defer` outcomes; +- proposal unsupported-claim count for contradiction/staleness reports; +- source mutation count. + +For proposal-only consolidation jobs, source mutation count MUST be `0`. If the runner +or adapter cannot execute the consolidation primitive it claims to evaluate, the report +MUST include an executable gap with a precise follow-up issue or issue title. Offline +fixtures MAY still pass when they verify checked-in proposal payloads and clearly avoid +claiming scheduled provider-backed generation. + +## Claim Rules + +- A project MAY claim a suite pass only for suites with encoded jobs and a published + report using this contract. +- A project MUST NOT use generated public jobs to claim private production readiness. +- A project MUST NOT treat `blocked`, `incomplete`, or `not_encoded` as evidence of + weakness or strength; those states only describe benchmark coverage. +- A project MUST NOT claim "best memory system" from this suite. Reports SHOULD describe + dimension-specific results and typed limitations. +- Existing ELF/qmd-leading live baseline results MAY be cited as retrieval/lifecycle + evidence, but MUST NOT be reinterpreted as real-world job suite wins. + +## Downstream Implementation Contract + +Runner implementation issues can cite this spec and choose any subset of suites. The +minimum useful runner increment is: + +- one encoded `real_world_job` fixture; +- one adapter path; +- scoring for all required rubric dimensions in that job; +- typed report output using the Report Semantics section. + +Implementation issues MUST state which suites remain `not_encoded`. diff --git a/docs/spec/system_competitive_parity_gate_v1.md b/docs/spec/system_competitive_parity_gate_v1.md new file mode 100644 index 00000000..7c130f7f --- /dev/null +++ b/docs/spec/system_competitive_parity_gate_v1.md @@ -0,0 +1,147 @@ +# Competitive Parity Gate v1 Specification + +Purpose: Define the adoption gate ELF must pass before it can be treated as production-eligible memory infrastructure. +Status: normative +Read this when: You are deciding whether ELF is at least as usable as the external memory systems it is being compared against. +Not this document: A market survey, implementation plan, or claim that architecture alone makes ELF better. +Defines: `elf.competitive_parity_gate/v1` dimensions, Docker isolation rules, baseline families, hard thresholds, and report schema. + +Related inputs: + +- `docs/research/2026-06-08-agent-memory-selection.json` +- `docs/guide/research/comparison_external_projects.md` +- `docs/guide/research/agentmemory_adapter.md` +- `docs/spec/system_elf_memory_service_v2.md` +- `docs/spec/system_consolidation_proposals_v1.md` + +## Core Rule + +ELF is adoption-eligible only when current test evidence shows that it meets or +exceeds the selected baseline projects in user-visible value. A design advantage, +unchecked capability table, or speculative architecture claim is not sufficient. + +The gate must fail closed. If ELF cannot run the comparison, preserve evidence, +retrieve expected memory, expose inspection surfaces, or cleanly isolate state, the +gate result is `fail`. + +## Contract Schema + +Canonical schema identifier: + +```text +elf.competitive_parity_gate/v1 +``` + +Every parity report must carry: + +```json +{ + "schema": "elf.competitive_parity_gate.report/v1", + "gate_schema": "elf.competitive_parity_gate/v1" +} +``` + +## Docker Isolation + +Competitive parity runs must use Docker Compose as the execution boundary. + +Required properties: + +- The host may invoke `docker compose`, but benchmark code, service processes, + Postgres, Qdrant, Cargo builds, and test commands must run inside containers. +- The parity compose file must not publish service ports to the host by default. +- Postgres, Qdrant, Cargo registry, Cargo git cache, and Rust target output must use + Docker-managed volumes. +- The only allowed host artifact is the parity report directory, normally + `tmp/parity/`. +- A parity runner must refuse to run on the host unless an explicit + `ELF_PARITY_ALLOW_HOST=1` override is supplied for debugging. +- Cleanup must be possible with `docker compose -f docker-compose.parity.yml down -v + --remove-orphans`. + +## Baseline Families + +The gate tracks baseline families separately so evidence can grow without changing +the core contract: + +- `agentmemory_fixture`: sanitized offline agentmemory-style session exports mapped + through the ELF-owned fixture adapter. +- `agentmemory_live_container`: future containerized agentmemory service comparisons + against the same private evaluation cases. +- `claude_mem_fixture`: future fixture import and retrieval comparison for + progressive-disclosure Claude memory workflows. +- `mem0_openmemory_fixture`: future local OpenMemory-style workflow comparison. +- `qmd_memsearch_fixture`: future local retrieval-quality comparison against + CLI/MCP-first hybrid retrieval systems. + +External projects are baselines and product references. They must not become hidden +runtime dependencies of ELF core memory semantics unless a separate design spec +explicitly adopts that dependency. + +## Gate Dimensions + +Each completed gate report must evaluate these dimensions: + +| Dimension | Meaning | First hard threshold | +| --------- | ------- | -------------------- | +| `docker_isolation` | The full run used container services and container-local build state. | `pass` | +| `adapter_coverage` | Baseline fixture records are mapped into candidate ELF notes, docs, queries, and ignored reasons. | agentmemory sample emits 2 note candidates, 2 doc candidates, 1 baseline query, and 1 ignored item | +| `provenance_integrity` | Candidate writes keep source-system, session, and item references. | agentmemory note candidate provenance completeness is `1.0` | +| `unsafe_rejection` | Unsupported or unsafe external memory items are rejected explicitly. | at least one ignored item with reason `unsupported_memory_kind` | +| `retrieval_quality` | ELF returns the expected memory for parity queries after normal ingestion/indexing. | consolidation harness after-run recall is not below baseline recall | +| `context_efficiency` | Retrieval/consolidation does not require more context to preserve recall. | consolidation harness after-run context chars are not above baseline | +| `source_safety` | Consolidation output remains derived and reviewable; authoritative source records are not destructively rewritten. | consolidation proposal/source immutability contract remains satisfied | +| `operator_inspectability` | A local operator can inspect memory state without write authority. | admin `GET /viewer` returns 200 during the Docker service run | +| `cleanup` | Test state can be removed without host database or vector-store residue. | documented compose cleanup command exists and succeeds when run | + +These are minimum thresholds. Passing them only proves that the checked-in gate is +alive. Personal production use requires the same gate shape to pass against a larger +private fixture pack and at least one live containerized baseline. + +## First Gate Scope + +The first checked-in executable gate covers: + +- Docker-only execution through `docker-compose.parity.yml`. +- Offline `agentmemory_fixture` adapter validation using the sanitized sample fixture. +- Service-backed ELF consolidation/retrieval validation using Postgres and Qdrant + containers. +- Admin viewer availability during the service-backed run. +- A machine-readable report under `tmp/parity/competitive-parity-report.json`. + +The first gate does not claim broad market superiority. It establishes a hard, +repeatable lower bound that must stay green before broader baselines are meaningful. + +## Report Schema + +Parity reports must be JSON objects with at least: + +- `schema`: `elf.competitive_parity_gate.report/v1` +- `gate_schema`: `elf.competitive_parity_gate/v1` +- `gate_id`: stable or timestamped run identifier +- `verdict`: `pass` or `fail` +- `docker_only`: boolean +- `baselines`: object keyed by baseline family +- `dimensions`: object keyed by gate dimension +- `thresholds`: object describing the hard thresholds used by the run +- `artifacts`: object with relative paths to preserved run evidence + +Reports may include extra metrics, but extra fields must not weaken the hard +thresholds in this spec. + +## Adoption Decision + +Treat ELF as `not_adoptable_for_production` while any of these are true: + +- The Docker parity gate fails. +- The gate only passes the checked-in toy fixture and has not passed a private + personal fixture pack. +- At least one selected external baseline outperforms ELF on retrieval quality, + migration fidelity, operator inspectability, or failure recovery without a + documented compensating ELF advantage. +- Evidence cannot be reproduced from the report artifacts. + +Treat ELF as `personal_production_candidate` only after the Docker gate passes on +both the checked-in fixture and a private personal fixture pack, and after at least +one live external baseline comparison is no worse than ELF on the selected +acceptance metrics. diff --git a/docs/spec/system_consolidation_proposals_v1.md b/docs/spec/system_consolidation_proposals_v1.md new file mode 100644 index 00000000..35f2f95a --- /dev/null +++ b/docs/spec/system_consolidation_proposals_v1.md @@ -0,0 +1,279 @@ +# Consolidation Proposals v1 Specification + +Purpose: Define the reviewable consolidation run and proposal contract for derived memory output. +Status: normative +Read this when: You are implementing, validating, or reviewing dreaming-inspired consolidation storage, jobs, proposals, or review flows. +Not this document: Live LLM consolidation generation, viewer UI behavior, retrieval observability panels, or agentmemory import adapters. +Defines: `elf.consolidation/v1` runs, proposals, source snapshots, lineage, review lifecycle, and source immutability rules. + +Related inputs: + +- `docs/research/2026-06-08-agent-memory-selection.json` +- `docs/guide/research/comparison_external_projects.md` +- `docs/spec/system_elf_memory_service_v2.md` + +## Core Rule + +Consolidation output is derived and reviewable. It must never destructively rewrite +authoritative source notes, events, docs, traces, graph facts, or search traces. + +The authoritative source-of-truth remains the ELF Core storage defined by +`docs/spec/system_elf_memory_service_v2.md`. Consolidation stores proposals over +immutable input snapshots. A proposal may later create or update a derived artifact, +but source evidence remains inspectable and unchanged. + +## Contract Schema + +Canonical schema identifier: + +```text +elf.consolidation/v1 +``` + +Every persisted run and proposal must carry `contract_schema = "elf.consolidation/v1"`. + +## Source References + +`source_refs` is a non-empty array of immutable input pointers. + +Each item has: + +- `kind`: one of `note`, `event`, `trace`, `trace_item`, `doc`, `doc_chunk` +- `id`: UUID of the referenced source artifact +- `snapshot`: source snapshot metadata captured before proposal storage + +`snapshot` must contain at least one freshness or replay guard: + +- `status` +- `updated_at` +- `content_hash` +- `embedding_version` +- `trace_version` +- non-empty `source_ref` +- non-empty `metadata` + +`source_ref` and `metadata` must be JSON objects. + +## Run Contract + +Storage table: `consolidation_runs`. + +Required fields: + +- `run_id` +- `tenant_id` +- `project_id` +- `agent_id` +- `contract_schema` +- `job_kind` +- `status` +- `input_refs` +- `source_snapshot` +- `lineage` +- `error` +- `created_at` +- `updated_at` +- `completed_at` + +`job_kind` identifies how the run was registered, for example `fixture`, `manual`, or +future `scheduled`. This issue only permits fixture-driven or manually supplied +proposal payloads. It does not permit live provider generation. + +Run states: + +- `pending` +- `running` +- `completed` +- `failed` +- `cancelled` + +Allowed run transitions: + +- `pending -> running` +- `pending -> cancelled` +- `running -> completed` +- `running -> failed` +- `running -> cancelled` + +Terminal states are `completed`, `failed`, and `cancelled`. + +## Worker Job Contract + +Storage table: `consolidation_run_jobs`. + +The first runtime implementation is queue-backed and deterministic. Creating a +fixture or manual consolidation run stores the immutable run input snapshot, enqueues +one worker job, and returns the run plus `job_id`. The worker materializes queued +proposal payloads into `consolidation_proposals`; API creation must not call LLM, +embedding, rerank, or external provider adapters. + +Required fields: + +- `job_id` +- `run_id` +- `tenant_id` +- `project_id` +- `agent_id` +- `job_kind` +- `status` +- `payload` +- `attempts` +- `last_error` +- `available_at` +- `created_at` +- `updated_at` + +Job states: + +- `PENDING` +- `CLAIMED` +- `DONE` +- `FAILED` + +`payload` is a JSON object with: + +- `contract_schema = "elf.consolidation/v1"` +- `proposals`: array of proposal contracts matching this spec + +Worker rules: + +- Claim one due `PENDING`, expired `CLAIMED`, or retryable `FAILED` job with a lease. +- Validate `payload.contract_schema` and every proposal before persistence. +- Transition the run through `pending -> running -> completed` when materialization + succeeds. +- Insert proposals with `review_state = proposed`. +- Mark the job `DONE` in the same transaction as the proposal and run-state writes. +- On failure, mark the job `FAILED`, increment attempts, preserve a bounded error, and + schedule retry. +- Never mutate authoritative source notes, events, docs, traces, graph facts, or + search traces. + +## Proposal Contract + +Storage table: `consolidation_proposals`. + +Required fields: + +- `proposal_id` +- `run_id` +- `tenant_id` +- `project_id` +- `agent_id` +- `contract_schema` +- `proposal_kind` +- `apply_intent` +- `review_state` +- `source_refs` +- `source_snapshot` +- `lineage` +- `diff` +- `confidence` +- `unsupported_claim_flags` +- `contradiction_markers` +- `staleness_markers` +- `target_ref` +- `proposed_payload` +- `reviewer_agent_id` +- `review_comment` +- `reviewed_at` +- `created_at` +- `updated_at` + +`confidence` must be finite and in the inclusive range `0.0..=1.0`. + +`lineage` must include non-empty `source_refs`. It may also include `parent_run_id` +and `parent_proposal_ids`. + +`unsupported_claim_flags` is a reviewer prompt array. Each flag has: + +- `claim_id`: optional stable claim identifier +- `message`: non-empty reviewer-facing text +- `source`: optional source reference + +`contradiction_markers` and `staleness_markers` are review prompts. Each marker has: + +- `severity`: `low`, `medium`, or `high` +- `message`: non-empty reviewer-facing text +- `source`: optional source reference + +## Diff And Apply Intent + +`diff` is a JSON object with: + +- `summary`: non-empty text +- `before`: JSON object +- `after`: JSON object + +The diff must describe a derived output change. It must not include source mutation +keys such as `source_mutation`, `source_mutations`, `source_note_updates`, +`delete_source`, `delete_sources`, `source_delete`, or `overwrite_source`. + +Allowed `apply_intent` values: + +- `create_derived_note` +- `update_derived_note` +- `create_derived_knowledge_page` +- `update_derived_knowledge_page` +- `create_derived_graph_view` +- `no_op` + +No `apply_intent` may update, delete, overwrite, or deprecate authoritative source +notes, docs, events, traces, or graph facts. + +## Review Lifecycle + +Review states: + +- `proposed` +- `approved` +- `rejected` +- `applied` +- `archived` + +Allowed review transitions: + +- `proposed -> approved` +- `proposed -> rejected` +- `proposed -> archived` +- `approved -> applied` +- `approved -> rejected` +- `approved -> archived` + +Terminal states are `rejected`, `applied`, and `archived`. + +`applied` means the proposal has been approved and marked as applied to the derived +target. It does not mean authoritative source memory was changed. + +Operator review actions map to the lifecycle states: + +- `approve`: `proposed -> approved` +- `apply`: `approved -> applied`, or `proposed -> approved -> applied` with both + transitions audited +- `discard`: `proposed|approved -> rejected` +- `defer`: `proposed|approved -> archived` + +Every review transition must write an append-only audit event with proposal id, run id, +reviewer agent id, action, prior state, next state, optional comment, and timestamp. + +## Service Boundary + +The first implementation exposes fixture-driven service flows: + +- create a consolidation run with optional proposal payloads and queued worker `job_id` +- list consolidation runs +- get a consolidation run +- list consolidation proposals +- get a consolidation proposal +- transition proposal review state through `approve`, `apply`, `discard`, and `defer` + actions with review-event readback + +These flows must not call LLM, embedding, rerank, or external provider adapters. + +## Future Connections + +Future viewer work should render proposals as reviewable records with source refs, +snapshots, lineage, diff, confidence, contradiction markers, and staleness markers. + +Future derived knowledge pages may use approved proposals as input, but those pages +remain rebuildable derived output. They must retain source pointers and must not become +a hidden replacement for evidence-bound ELF Core memory. diff --git a/docs/spec/system_doc_chunking_profiles_v1.md b/docs/spec/system_doc_chunking_profiles_v1.md new file mode 100644 index 00000000..20ad1fd8 --- /dev/null +++ b/docs/spec/system_doc_chunking_profiles_v1.md @@ -0,0 +1,54 @@ +# System: `doc_chunking_profiles/v1` for `docs_put` + +Purpose: Define token-based chunking profiles used by Doc Extension v1 ingestion. +Status: normative +Read this when: You are implementing, validating, or debugging `docs_put` chunking behavior. +Not this document: Retrieval ranking, filter semantics, or end-to-end ingestion workflow steps. +Defines: `doc_chunking_profiles/v1`, selected profiles, and chunking invariants for `docs_put`. + +Identifiers: +- Envelope identifier: `doc_chunking_profiles/v1` +- File: `docs/spec/system_doc_chunking_profiles_v1.md` + +Scope: +- Applies to `POST /v2/docs` (`docs_put`) chunking behavior in `packages/elf-service/src/docs.rs`. +- Profiles are selected by `doc_type`. + +Design goals: +- Deterministic chunking across ingesters when `doc_type` and input text are equal. +- Token-based boundaries to avoid byte-length split artifacts in Unicode/UTF-8 text. +- Small overlap to preserve continuity at boundaries. + +================================================== +1) Profile matrix +================================================== + +The following profile values are used unless overridden by a future `*_v2` contract: + +| `doc_type` | `max_tokens` | `overlap_tokens` | +|------------|--------------|------------------| +| `chat` | 1024 | 128 | +| `search` | 1024 | 128 | +| `dev` | 2048 | 256 | +| `knowledge`| 2048 | 256 | + +================================================== +2) Validation rules +================================================== + +Each profile must satisfy: +- `max_tokens > 0` +- `overlap_tokens >= 0` +- `overlap_tokens < max_tokens` + +================================================== +3) Compatibility rules +================================================== + +Forward compatibility: +- Consumers may accept additional profile keys or optional extension metadata. +- Unknown profile metadata is ignored by core chunking behavior. + +Backward compatibility: +- This profile set is normative for `doc_chunking_profiles/v1`. +- Clients must not invent alternative `max_tokens`/`overlap_tokens` values for these `doc_type` values without introducing a new version identifier. diff --git a/docs/spec/system_doc_extension_v1_filters.md b/docs/spec/system_doc_extension_v1_filters.md new file mode 100644 index 00000000..3046881c --- /dev/null +++ b/docs/spec/system_doc_extension_v1_filters.md @@ -0,0 +1,101 @@ +# System: Document Extension v1 Filter and Payload Contract + +Purpose: Define the `docs_search_filters/v1` filter contract for +`POST /v2/docs/search/l0` and MCP `elf_docs_search_l0`. +Status: normative +Read this when: You are implementing or validating Doc Extension filter fields, payload shape, or Qdrant index requirements. +Not this document: Retrieval ranking logic, query rewriting, or document ingestion flow design. +Defines: `docs_search_filters/v1` and `doc_extension_payload/v1`. + +Registry identifiers: +- `docs_search_filters/v1`: API filter compatibility contract for `docs_search_l0`. +- `doc_extension_payload/v1`: Qdrant payload + index compatibility contract for doc chunks. + +Status: shipped with Doc Extension v1. + +================================================== +Scope +================================================== + +- Defines filter parameters and Qdrant payload/index requirements for `docs_search_l0`. +- Does not define ranking, vector geometry, query text handling, or ingestion internals. + +================================================== +1) Filter Parameters +================================================== + +- `scope` (optional string): one of `agent_private`, `project_shared`, `org_shared`. +- `status` (optional string): defaults to `active` when omitted. Current implementation matches + this value exactly against stored doc status (`active`/`deleted` in current schema). +- `doc_type` (optional string): exact-match filter. +- `sparse_mode` (optional string): retrieval fusion control mode: + `auto` (default), `on`, `off`. +- `agent_id` (optional string): exact-match filter. +- `thread_id` (optional string): exact-match filter for `thread_id` payload field. +- `domain` (optional string): exact-match filter for `domain` payload field. +- `repo` (optional string): exact-match filter for `repo` payload field. +- `updated_after` (optional string): RFC3339 timestamp lower bound for `updated_at`. +- `updated_before` (optional string): RFC3339 timestamp upper bound for `updated_at`. +- `ts_gte` (optional string): RFC3339 timestamp lower bound for `doc_ts`. +- `ts_lte` (optional string): RFC3339 timestamp upper bound for `doc_ts`. +- Timestamp bounds are exclusive (`updated_after < updated_at < updated_before`), and values are parsed + as timezone-aware RFC3339 datetimes. +- `ts_gte`/`ts_lte` bounds are inclusive (`ts_gte <= doc_ts <= ts_lte`), and values are parsed + as timezone-aware RFC3339 datetimes. +- `level` on `POST /v2/docs/excerpts` is `L0|L1|L2` where `L0` is a compact 256-byte retrieval window. +- `explain` is an optional boolean on `docs_search_l0` and `docs_excerpts_get` responses that requests + staged diagnostics. + +Filter evaluation: +- Every supplied filter is combined with logical AND. +- `status` defaults to `active` when omitted. +- `sparse_mode` is validated as one of `auto|on|off` (default `auto`). +- `domain` requires `doc_type=search` and is rejected with `400` when used with other + `doc_type` values or when `doc_type` is omitted. +- `repo` requires `doc_type=dev` and is rejected with `400` when used with other + `doc_type` values or when `doc_type` is omitted. +- Invalid date values or `updated_after >= updated_before` are rejected with `400`. +- Invalid date values or `ts_gte >= ts_lte` are rejected with `400`. +- In `auto` sparse mode, sparse retrieval is enabled only when the query is judged as + symbol-heavy / exact-match oriented; otherwise only dense retrieval is used. +- `sparse_mode=on` runs both dense and sparse retrieval; `sparse_mode=off` runs dense-only. + +Response behavior: +- `docs_search_l0` always returns `trace_id`. +- `docs_excerpts_get` always returns `trace_id` and `locator`. +- When `explain=true`, both endpoints additionally return optional `trajectory` under + `doc_retrieval_trajectory/v1`. + +================================================== +2) Qdrant Payload Contract +================================================== + +Each point used by `docs_search_l0` MUST include payload fields: +- `scope` +- `status` +- `doc_type` +- `agent_id` +- `thread_id` +- `domain` +- `repo` +- `updated_at` +- `doc_ts` + +Payload field names are part of `docs_search_filters/v1` and `doc_extension_payload/v1` compatibility. + +================================================== +3) Qdrant Index Requirements +================================================== + +Implementations MUST provision payload indexes for: +- `scope` (keyword) +- `status` (keyword) +- `doc_type` (keyword) +- `agent_id` (keyword) +- `thread_id` (keyword) +- `domain` (keyword) +- `repo` (keyword) +- `updated_at` (datetime) +- `doc_ts` (datetime) + +Indexing is a deploy-time requirement before filtered production traffic is enabled. diff --git a/docs/spec/system_doc_extension_v1_trajectory.md b/docs/spec/system_doc_extension_v1_trajectory.md new file mode 100644 index 00000000..e13e542e --- /dev/null +++ b/docs/spec/system_doc_extension_v1_trajectory.md @@ -0,0 +1,170 @@ +# System: Doc Extension v1 Retrieval Trajectory (`doc_retrieval_trajectory/v1`) + +Purpose: Define the optional, response-only stage traces for Doc Extension v1 retrieval +(`docs_search_l0` and `docs_excerpts_get`) when `explain=true`. +Status: normative +Read this when: You are shaping, validating, or consuming response-only retrieval trajectories for Doc Extension v1. +Not this document: Persistent trace storage, ranking policy, or request-routing guidance. +Defines: `doc_retrieval_trajectory/v1`. + +This schema is intentionally lightweight and not persisted. It is returned directly in API +responses to support explainability and debugging. + +================================================== +1) Schema +================================================== + +- Identifier: `doc_retrieval_trajectory/v1` +- Type: JSON payload for response-only trajectory traces. +- Shape: + +```json +{ + "schema": "doc_retrieval_trajectory/v1", + "stages": [ + { + "stage_order": 0, + "stage_name": "request_validation", + "stats": {} + } + ] +} +``` + +================================================== +2) Stage Names +================================================== + +Endpoints: +- `POST /v2/docs/search/l0` (`DocsSearchL0Response`) +- `POST /v2/docs/excerpts` (`DocsExcerptResponse`) + +Allowed/expected stage names (in order): + +1. `request_validation` + Input validation and request-shape checks. + +2. `query_embedding` + Embedding request preparation/dispatch. + +3. `vector_dimension_check` + Ensures returned vector size matches the configured model/vector size. + +4. `vector_search` + Dense and optional sparse retrieval from Qdrant. + Dense retrieval runs first on every request; sparse retrieval is controlled by + `sparse_mode` (`auto`, `on`, `off`). + - `auto`: sparse retrieval only for symbol-heavy / exact-match style queries. + - `on`: always run both dense and sparse retrieval. + - `off`: dense-only retrieval. + +5. `dedupe` + Chunk-id deduplication between retrieval tiers. + +6. `chunk_lookup` + Document/chunk metadata hydration from Postgres. + +7. `result_projection` + Final scored item projection and output truncation. + Implementations apply a recency tie-break using `updated_at` and expose the + policy knobs in stage stats when available (`recency_tau_days`, `tie_breaker_weight`). + +8. `level_selection` (excerpts only) + `L0|L1|L2` selection and byte budget. + +9. `match_resolution` (excerpts only) + Selector resolution for `chunk_id` / `quote` / `position`. + +10. `window_projection` (excerpts only) + Byte-window expansion to the requested level. + +11. `verification` (excerpts only) + Verification flag/error summary and excerpt hash metadata. + +Any implementation may choose to emit a subset of stages, but stage order must be stable +and `stage_name` values should be non-empty and meaningful for downstream readers. + +================================================== +3) Examples +================================================== + +```json +{ + "schema": "doc_retrieval_trajectory/v1", + "stages": [ + { + "stage_order": 0, + "stage_name": "request_validation", + "stats": { "query_len": 23, "top_k": 5, "candidate_k": 30 } + }, + { + "stage_order": 1, + "stage_name": "vector_search", + "stats": { + "sparse_mode": "auto", + "channels": ["dense"], + "dense_raw_points": 24, + "sparse_raw_points": 0, + "raw_points": 24 + } + }, + { + "stage_order": 2, + "stage_name": "result_projection", + "stats": { + "returned_items": 5, + "pre_authorization_candidates": 8, + "recency_tau_days": 60, + "tie_breaker_weight": 0.12 + } + } + ] +} +``` + +================================================== +5) Evaluation Scenarios +================================================== + +- English dense-first over mixed-language docs (expected dense-first) + - Request `sparse_mode` omitted or `off` for a normal English query. + - Example: natural-language question with low symbol density from mixed `chat/dev` content. + - `trajectory.stages.vector_search` should show `channels=["dense"]` and `sparse_raw_points=0` (or absent). + - `trajectory.stages.result_projection` should show normal ranking output and no symbolic jump from sparse-only terms. + +- Exact-match cases (`auto` vs `on`) + - Query contains symbols / identifiers (`/`, `:`, `#`, hex, URLs, error codes like `ERR_...`, full stack traces, full identifiers). + - With `sparse_mode=auto`, expect `channels=["dense"]` for generic prose and `channels` may include `"sparse"` when the query is symbol-heavy. + - With `sparse_mode=on`, expect `channels` to include both `"dense"` and `"sparse"` even if `auto` would stay dense-only. + - Compare `vector_search.raw_points` and `result_projection` stability across modes for the same corpus; `sparse_mode=on` should improve retrieval of exact token patterns in symbol-heavy queries. + +- Recency bias checks + - Configure `cfg.ranking.recency_tau_days` and `cfg.ranking.tie_breaker_weight` > 0. + - In `trajectory.stages.result_projection`, verify fields: + - `recency_tau_days` (current effective value), + - `tie_breaker_weight` (current effective weight), + - `pre_authorization_candidates` and `returned_items`. + - Expected signal: newer `updated_at` chunks should move upward when fusion scores are close and tie-break is active. + +```json +{ + "schema": "doc_retrieval_trajectory/v1", + "stages": [ + { + "stage_order": 0, + "stage_name": "request_validation", + "stats": { "doc_id": "..." } + }, + { + "stage_order": 1, + "stage_name": "match_resolution", + "stats": { "selector_kind": "quote", "match_start": 84, "match_end": 120 } + }, + { + "stage_order": 2, + "stage_name": "verification", + "stats": { "verified": true, "error_count": 0 } + } + ] +} +``` diff --git a/docs/spec/system_doc_source_ref_v1.md b/docs/spec/system_doc_source_ref_v1.md new file mode 100644 index 00000000..c11d4f4f --- /dev/null +++ b/docs/spec/system_doc_source_ref_v1.md @@ -0,0 +1,190 @@ +# System: `doc_source_ref/v1` for `docs_put` + +Purpose: Define a minimal, versioned `source_ref` convention for docs ingested +through `POST /v2/docs` / MCP `elf_docs_put`. +Status: normative +Read this when: You are producing or validating `source_ref` payloads for `docs_put`. +Not this document: Note-level evidence pointers or retrieval-time document pointer resolution. +Defines: `doc_source_ref/v1`. + +Identifiers: +- Envelope identifier: `doc_source_ref/v1` +- File: `docs/spec/system_doc_source_ref_v1.md` + +Scope: +- Covers `doc_documents.source_ref` for docs ingested via `docs_put`. +- Covers doc types: `knowledge`, `chat`, `search`, `dev`. +- This schema is for provenance and deterministic filtering keys, not for + note-level evidence pointers (`source_ref/v1`). + +`source_ref` is required for `docs_put` and must conform to this spec. +Legacy `{}` or non-`doc_source_ref/v1` shapes are rejected for `docs_put`. + +Design goals: +- Deterministic and replayable: two independent ingesters SHOULD emit identical + keys for the same source event. +- Flat keys: fields SHOULD be top-level to support stable projection into vector + payloads and filter indexes. +- Minimal requirements: the service MAY accept additional keys, but downstream + filtering MUST rely only on keys enumerated by this spec. + +================================================== +1) Top-level shape and required keys +================================================== + +When `source_ref` is provided, it MUST be a JSON object with these required keys: + +- `schema` (string): exact value `doc_source_ref/v1`. +- `doc_type` (string): one of `knowledge`, `chat`, `search`, `dev`. +- `ts` (string): RFC3339 timestamp for event time (not ingest time). + +================================================== +2) Per-type required keys (minimal) +================================================== + +All required fields are top-level. + +-------------------------------------------------- +2.1) `doc_type="chat"` +-------------------------------------------------- + +Required: +- `thread_id` (string): stable thread identifier. +- `role` (string): stable role marker (producer-defined). Examples: `user`, `assistant`, `tool`. + +Optional (examples): +- `message_id` (string) + +-------------------------------------------------- +2.2) `doc_type="search"` +-------------------------------------------------- + +Required: +- `query` (string): literal query string. +- `url` (string): canonical URL for the selected result. +- `domain` (string): canonical domain for the URL, used as a stable filter key. + +Optional (examples): +- `provider` (string) + +-------------------------------------------------- +2.3) `doc_type="dev"` +-------------------------------------------------- + +Required: +- `repo` (string): repository identifier (producer-defined; SHOULD be stable and human-readable). +- Exactly one of: + - `commit_sha` (string) + - `pr_number` (integer) + - `issue_number` (integer) + +Optional (examples): +- `path` (string): file path within the repo. + +-------------------------------------------------- +2.4) `doc_type="knowledge"` +-------------------------------------------------- + +Required: +- No additional required keys beyond section (1). + +Optional: +- `uri` (string): canonical URI/path/URN for the knowledge source. + +================================================== +3) Identifier stability and parsing rules +================================================== + +The following fields are machine identifiers and MUST be byte-stable when +re-ingesting the same event: + +- `schema` +- `doc_type` +- `thread_id` +- `domain` +- `repo` +- `commit_sha` / `pr_number` / `issue_number` + +Timestamp rules: +- `ts` MUST be a timezone-aware RFC3339 datetime string. +- `ts` is the source event time. Do not use ingest time unless the source does + not provide event time. + +================================================== +4) Compatibility rules +================================================== + +Forward compatibility: +- Producers MAY include additional keys. +- Consumers MUST ignore unknown keys. + +Backward compatibility: +- This contract is strict for `docs_put` writes. Backward-compatible fallback + mappings are not performed. + +================================================== +5) Examples +================================================== + +Chat: + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-02-25T19:05:15Z", + "thread_id": "thread-8f7e2f9a", + "role": "assistant", + "message_id": "message-1c3d" +} +``` + +Search: + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "search", + "ts": "2026-02-25T19:05:15Z", + "query": "qdrant payload index keyword vs text", + "url": "https://qdrant.tech/documentation/concepts/payload/", + "domain": "qdrant.tech", + "provider": "web" +} +``` + +Dev (commit): + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": "2026-02-25T19:05:15Z", + "repo": "hack-ink/ELF", + "commit_sha": "9f1f4e6d0a5b7c2e11c93b5a2c8a3f5e5a1b2c3d", + "path": "packages/elf-service/src/docs.rs" +} +``` + +Dev (PR): + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": "2026-02-25T19:05:15Z", + "repo": "hack-ink/ELF", + "pr_number": 123 +} +``` + +Knowledge: + +```json +{ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T19:05:15Z", + "uri": "docs://kb/architecture/2026/02/overview" +} +``` diff --git a/docs/spec/system_elf_memory_service_v1.md b/docs/spec/system_elf_memory_service_v1.md deleted file mode 100644 index 435fa10b..00000000 --- a/docs/spec/system_elf_memory_service_v1.md +++ /dev/null @@ -1,1040 +0,0 @@ -# ELF Memory Service v1.0 Specification - -Description: ELF means Evidence-linked fact memory for agents. - -Audience: Implementation LLM or engineer agent. -Language: English only. No CJK characters are allowed anywhere in this document. -Contract: English-only API inputs and outputs. Reject any CJK at the API boundary. -Implementation target: Rust is recommended. The spec is language agnostic. - -Core idea: -- Postgres with pgvector is the only source of truth for notes, chunk embeddings, audit history, and the indexing outbox. -- Note-level embeddings are derived pooled vectors for update and duplicate checks. -- Qdrant is a derived index for candidate retrieval only. Qdrant must be rebuildable from Postgres vectors without calling the embedding API. -- Two write APIs have hard semantic differences: - - add_note is deterministic and must not call any LLM. - - add_event is LLM-driven extraction and must bind evidence for every stored note. - -Multi-tenant namespace: -- tenant_id, project_id, agent_id, scope, read_profile. - -Optional future work: -- Graph memory backend (Neo4j) is reserved and out of scope for v1.0. - -============================================================ -0. INVARIANTS (MUST HOLD) -============================================================ -I1. Postgres with pgvector is the only source of truth for: - - memory notes - - chunk embedding vectors - - chunk metadata - - pooled note embeddings (derived) - - audit and version history - - hit logs (optional) - - indexing outbox jobs -I2. Qdrant is derived and rebuildable: - - Qdrant may be dropped and recreated at any time. - - Qdrant must be rebuildable from Postgres vectors without calling the embedding API. -I3. Online retrieval: - - Qdrant returns candidate chunk_ids. - - Postgres returns authoritative notes and re-validates status, TTL, and scope. -I4. English-only contract: - - Any API input containing CJK must be rejected with HTTP 422. - - Upstream agents must canonicalize to English before calling ELF. -I5. add_note must not call any LLM under any circumstance. -I6. add_event must call the LLM extractor and must bind evidence with verbatim substring checks. - -============================================================ -1. CONFIGURATION (TOML) -============================================================ -File: elf.toml - -Rules: -- The config file path is required and must be provided with --config or -c. -- No default values are allowed in code. Every field below must be present in elf.toml unless explicitly marked optional. -- No environment variables are allowed for configuration. All values are stored in elf.toml. -- Provider api_key values must be present and non-empty. -- providers.embedding.dimensions must match storage.qdrant.vector_dim. -- chunking.enabled must be true. -- chunking.max_tokens must be greater than zero. -- chunking.overlap_tokens must be less than chunking.max_tokens. -- chunking.tokenizer_repo may be empty or omitted to inherit providers.embedding.model. - -Template (all values required): - -[service] -http_bind = "<REQUIRED_HOST:PORT>" -mcp_bind = "<REQUIRED_HOST:PORT>" -admin_bind = "<REQUIRED_HOST:PORT>" -log_level = "<REQUIRED_LOG_LEVEL>" - -[storage.postgres] -dsn = "<REQUIRED_POSTGRES_DSN>" -pool_max_conns = <REQUIRED_INT> - -[storage.qdrant] -url = "<REQUIRED_URL>" -collection = "mem_notes_v1" -vector_dim = <REQUIRED_INT> - -[providers.embedding] -provider_id = "<REQUIRED_ID>" -api_base = "<REQUIRED_URL>" -api_key = "<REQUIRED_NON_EMPTY>" -path = "<REQUIRED_PATH>" -model = "<REQUIRED_MODEL>" -dimensions = "<REQUIRED_INT>" -timeout_ms = <REQUIRED_INT> -# Must exist. Empty map is allowed. -default_headers = {} - -[providers.rerank] -provider_id = "<REQUIRED_ID>" -api_base = "<REQUIRED_URL>" -api_key = "<REQUIRED_NON_EMPTY>" -path = "<REQUIRED_PATH>" -model = "<REQUIRED_MODEL>" -timeout_ms = <REQUIRED_INT> -# Must exist. Empty map is allowed. -default_headers = {} - -[providers.llm_extractor] -provider_id = "<REQUIRED_ID>" -api_base = "<REQUIRED_URL>" -api_key = "<REQUIRED_NON_EMPTY>" -path = "<REQUIRED_PATH>" -model = "<REQUIRED_MODEL>" -temperature = <REQUIRED_FLOAT> -timeout_ms = <REQUIRED_INT> -# Must exist. Empty map is allowed. -default_headers = {} - -[scopes] -allowed = ["agent_private", "project_shared", "org_shared"] - -[scopes.read_profiles] -private_only = ["agent_private"] -private_plus_project = ["agent_private", "project_shared"] -all_scopes = ["agent_private", "project_shared", "org_shared"] - -[scopes.precedence] -agent_private = 30 -project_shared = 20 -org_shared = 10 - -[scopes.write_allowed] -agent_private = true -project_shared = true -org_shared = true - -[memory] -max_notes_per_add_event = 3 -max_note_chars = 240 -# Similarity thresholds -dup_sim_threshold = 0.92 -update_sim_threshold = 0.85 -# Retrieval sizes -candidate_k = 60 -top_k = 12 - -[chunking] -enabled = true -max_tokens = <REQUIRED_INT> -overlap_tokens = <REQUIRED_INT> -# Optional. Empty or omitted uses providers.embedding.model. -tokenizer_repo = "<OPTIONAL_STRING>" - -[search.expansion] -mode = "off|always|dynamic" -max_queries = <REQUIRED_INT> -include_original = <REQUIRED_BOOL> - -[search.dynamic] -min_candidates = <REQUIRED_INT> -min_top_score = <REQUIRED_FLOAT> - -[search.prefilter] -max_candidates = <REQUIRED_INT> - -[search.cache] -enabled = <REQUIRED_BOOL> -expansion_ttl_days = <REQUIRED_INT> -rerank_ttl_days = <REQUIRED_INT> -# Optional. Omit to disable payload size limits. -max_payload_bytes = <OPTIONAL_INT> -expansion_version = "<REQUIRED_NON_EMPTY>" -rerank_version = "<REQUIRED_NON_EMPTY>" - -[search.explain] -retention_days = <REQUIRED_INT> - -[ranking] -recency_tau_days = 60 -tie_breaker_weight = 0.1 - -[lifecycle.ttl_days] -plan = 14 -fact = 180 -preference = 0 -constraint = 0 -decision = 0 -profile = 0 - -[lifecycle] -purge_deleted_after_days = 30 -purge_deprecated_after_days = 180 - -[security] -bind_localhost_only = true -reject_cjk = true -redact_secrets_on_write = true -# Evidence rules for add_event -evidence_min_quotes = 1 -evidence_max_quotes = 2 -evidence_max_quote_chars = 320 - -============================================================ -2. CLI AND CONFIG LOADING -============================================================ -- elf-api, elf-worker, and elf-mcp are separate binaries. -- Each binary requires a config path via --config or -c. -- Startup must fail with a clear error if any required config field is missing. -- security.reject_cjk must be true. Startup must fail if it is false. - -============================================================ -3. ENGLISH-ONLY BOUNDARY -============================================================ -Definition: -- CJK detection is the presence of any codepoint in the following Unicode blocks: - - CJK Unified Ideographs - - CJK Symbols and Punctuation - - Hiragana - - Katakana - - Hangul - -Policy: -- If security.reject_cjk is true, any CJK in any string field listed below must return HTTP 422. - -Fields to check: -- add_note: notes[].text, notes[].key (optional), source_ref string fields if any -- add_event: messages[].content -- search: query - -Error response: -HTTP 422 -{ - "error_code": "NON_ENGLISH_INPUT", - "message": "CJK detected; upstream must canonicalize to English before calling ELF.", - "fields": ["$.messages[2].content", "$.notes[0].text"] -} - -============================================================ -4. DOMAIN MODEL -============================================================ -4.1 Memory types (exactly 6) -- preference -- constraint -- decision -- profile -- fact -- plan - -4.2 Canonical note -- A note is a short English sentence and must be <= max_note_chars. -- Format is not enforced. Recommended prefixes for consistency: - "Preference: ...", "Constraint: ...", "Decision: ...", "Profile: ...", "Fact: ...", "Plan: ..." - -4.3 Keys -- key is optional but strongly recommended for stable updates. -- key examples: preferred_language, no_secrets_policy, architecture_sot, project_workflow, long_term_goal. - -============================================================ -5. POSTGRES SCHEMA (SOURCE OF TRUTH + PGVECTOR) -============================================================ -Startup must: -- CREATE EXTENSION IF NOT EXISTS vector; -- Execute sql/init.sql. - -Schema location: -- All schema and index DDL must live under sql/ and be orchestrated by sql/init.sql. -- sql/init.sql must be idempotent and include the per-table files in dependency order. - -5.1 memory_notes (authoritative notes) -Columns: -- note_id uuid primary key -- tenant_id text not null -- project_id text not null -- agent_id text not null -- scope text not null -- type text not null -- key text null -- text text not null -- importance real not null -- confidence real not null -- status text not null -- created_at timestamptz not null -- updated_at timestamptz not null -- expires_at timestamptz null -- embedding_version text not null -- source_ref jsonb not null -- hit_count bigint not null default 0 -- last_hit_at timestamptz null - -Indexes (minimum): -- idx_notes_scope_status: (tenant_id, project_id, scope, status) -- idx_notes_key: (tenant_id, project_id, agent_id, scope, type, key) WHERE key IS NOT NULL -- idx_notes_expires: (expires_at) - -5.2 memory_note_chunks (chunk metadata) -Columns: -- chunk_id uuid primary key -- note_id uuid not null references memory_notes(note_id) on delete cascade -- chunk_index int not null -- start_offset int not null -- end_offset int not null -- text text not null -- embedding_version text not null -- created_at timestamptz not null default now() - -Indexes (minimum): -- idx_note_chunks_note: (note_id) -- idx_note_chunks_note_index: (note_id, chunk_index) - -5.3 note_chunk_embeddings (source of truth vectors; pgvector) -- chunk_id uuid references memory_note_chunks(chunk_id) on delete cascade -- embedding_version text not null -- embedding_dim int not null -- vec vector(<vector_dim>) not null -- created_at timestamptz not null default now() -primary key(chunk_id, embedding_version) - -Rules: -- Every memory_note_chunks row must have a corresponding note_chunk_embeddings row for its embedding_version. -- Chunk embeddings are the source of truth for retrieval and rebuild. - -5.4 note_embeddings (derived pooled vectors; pgvector) -- note_id uuid references memory_notes(note_id) on delete cascade -- embedding_version text not null -- embedding_dim int not null -- vec vector(<vector_dim>) not null -- created_at timestamptz not null default now() -primary key(note_id, embedding_version) - -Rules: -- note_embeddings is derived by mean pooling chunk embeddings for (note_id, embedding_version). -- note_embeddings must be refreshed whenever chunk embeddings change. - -5.5 memory_note_versions (append-only audit) -- version_id uuid primary key -- note_id uuid not null -- op text not null -- prev_snapshot jsonb null -- new_snapshot jsonb null -- reason text not null -- actor text not null -- ts timestamptz not null default now() - -5.6 memory_hits (optional) -- hit_id uuid primary key -- note_id uuid not null -- chunk_id uuid null -- query_hash text not null -- rank int not null -- final_score real not null -- ts timestamptz not null default now() - -5.7 indexing_outbox (guaranteed indexing) -- outbox_id uuid primary key -- note_id uuid not null -- op text not null -- embedding_version text not null -- status text not null -- attempts int not null default 0 -- last_error text null -- available_at timestamptz not null default now() -- created_at timestamptz not null default now() -- updated_at timestamptz not null default now() - -Indexes: -- idx_outbox_status_available: (status, available_at) -- idx_outbox_note_op_status: (note_id, op, status) - -5.8 search_traces (search explainability) -- trace_id uuid primary key -- tenant_id text not null -- project_id text not null -- agent_id text not null -- read_profile text not null -- query text not null -- expansion_mode text not null -- expanded_queries jsonb not null -- allowed_scopes jsonb not null -- candidate_count int not null -- top_k int not null -- config_snapshot jsonb not null -- trace_version int not null -- created_at timestamptz not null -- expires_at timestamptz not null - -Indexes: -- idx_search_traces_expires: (expires_at) -- idx_search_traces_context: (tenant_id, project_id, created_at) - -5.9 search_trace_items (per-result explain data) -- item_id uuid primary key -- trace_id uuid not null references search_traces(trace_id) on delete cascade -- note_id uuid not null -- chunk_id uuid null -- rank int not null -- retrieval_score real null -- retrieval_rank int null -- rerank_score real not null -- tie_breaker_score real not null -- final_score real not null -- boosts jsonb not null -- matched_terms jsonb not null -- matched_fields jsonb not null - -Indexes: -- idx_search_trace_items_trace: (trace_id, rank) -- idx_search_trace_items_note: (note_id) - -5.10 search_trace_outbox (async trace persistence) -- outbox_id uuid primary key -- trace_id uuid not null -- status text not null -- attempts int not null default 0 -- last_error text null -- available_at timestamptz not null default now() -- payload jsonb not null -- created_at timestamptz not null default now() -- updated_at timestamptz not null default now() - -Indexes: -- idx_trace_outbox_status_available: (status, available_at) -- idx_trace_outbox_trace_status: (trace_id, status) - -5.11 llm_cache (LLM response cache) -- cache_id uuid primary key -- cache_kind text not null -- cache_key text not null -- payload jsonb not null -- created_at timestamptz not null -- last_accessed_at timestamptz not null -- expires_at timestamptz not null -- hit_count bigint not null default 0 - -Indexes: -- idx_llm_cache_key: (cache_kind, cache_key) unique -- idx_llm_cache_expires: (expires_at) - -============================================================ -6. QDRANT COLLECTION (DERIVED INDEX ONLY) -============================================================ -- Collection: storage.qdrant.collection -- Dense vector: named `dense` with size storage.qdrant.vector_dim (cosine distance). -- Sparse vector: named `bm25` with `idf` modifier and model `qdrant/bm25`. -- Point id: chunk_id (string UUID) -- Payload fields (minimum): - note_id, chunk_id, chunk_index, start_offset, end_offset, - tenant_id, project_id, agent_id, scope, type, key, status, - updated_at, expires_at, importance, confidence, embedding_version -- Chunk text is not stored in Qdrant payload. - -IMPORTANT: -- Qdrant may be stale. Postgres is authoritative. - -============================================================ -7. PROVIDER ADAPTERS (HTTP) -============================================================ -7.1 EmbeddingProvider -Function: -- embed(texts[]) -> vectors[][] - -Contract: -- Output vector count equals input text count. -- Each vector length equals vector_dim. - -Implementation: -- POST {api_base}{path} - { "model": model, "input": [texts...], "dimensions": dimensions } -- Send Authorization: Bearer <api_key>. -- Merge default_headers into the request. -- Map response to float32[D]. - -embedding_version: -- "<provider_id>:<model>:<vector_dim>" - -7.2 RerankProvider -Function: -- rerank(query, docs[]) -> scores[] - -Contract: -- Scores are aligned to docs indexes. - -Implementation: -- POST {api_base}{path} - { "model": model, "query": "...", "documents": ["..."] } -- Send Authorization: Bearer <api_key>. -- Merge default_headers into the request. -- Map response into aligned float[] (some providers return indexes). - -7.3 LLM Extractor Provider -Function: -- extract(messages[]) -> JSON notes - -Contract: -- Strict JSON output. -- If response_format is available, use it. -- Otherwise enforce JSON-only with at most 2 retries. - -Implementation: -- POST {api_base}{path} - { "model": model, "temperature": temperature, "messages": [...] } -- Send Authorization: Bearer <api_key>. -- Merge default_headers into the request. - -============================================================ -8. API SEMANTICS: add_note vs add_event (HARD DIFFERENCES) -============================================================ -8.1 add_note (deterministic write) -MUST: -- Must not call any LLM. -- Must treat input notes as authoritative content with no rewriting. -- Must apply WriteGate, UpdateResolver, persistence, and indexing outbox. -- Must return per-note op result: ADD, UPDATE, NONE, or REJECTED with reason_code. - -MUST NOT: -- Must not infer missing type, scope, or key beyond validation defaults. -- Must not generate new text. - -8.2 add_event (LLM extraction write) -MUST: -- Must call the LLM extractor exactly once per request. -- Must require evidence binding for each candidate note. -- Must enforce max_notes_per_add_event on the server. -- Must apply WriteGate and UpdateResolver after extraction. -- Should support dry_run to return candidates without persisting. - -MUST NOT: -- Must not store notes lacking evidence or failing evidence substring checks. -- Must not store raw full logs as memory notes. - - If evidence.quote is not a verbatim substring of the cited message, return REJECTED with reason_code REJECT_EVIDENCE_MISMATCH. - -============================================================ -9. WRITEGATE (SERVER SIDE, ALWAYS ON) -============================================================ -Reject a note if any of the following are true: -- The note contains CJK. -- The type is not in the 6-type allowlist. -- The scope is not allowed or write not allowed. -- The text length is greater than max_note_chars. -- Secrets or PII are detected (regex and heuristics). -- The text is empty or whitespace only. - -On rejection: -- op = REJECTED -- reason_code is one of: - REJECT_CJK, REJECT_TOO_LONG, REJECT_SECRET, REJECT_INVALID_TYPE, - REJECT_SCOPE_DENIED, REJECT_EMPTY - -============================================================ -10. UPDATE RESOLVER (IN-PLACE UPDATE, STABLE note_id) -============================================================ -Resolution namespace group: -(tenant_id, project_id, agent_id, scope, type) - -Order: -1) Key-based: - - If key is not null and an active note exists with the same key: - -> UPDATE in place (same note_id). -2) Similarity-based (when key is null): - - Compute embedding for incoming text. - - Compare cosine similarity vs existing active notes in the group using Postgres-stored vec. - - If sim >= dup_sim_threshold -> NONE. - - Else if sim >= update_sim_threshold -> UPDATE best match in place. - - Else -> ADD new note_id. - -On UPDATE: -- Preserve note_id. -- Write memory_note_versions with prev and new snapshots. -- Update memory_notes.text, updated_at, expires_at, source_ref, confidence, importance. -- Enqueue outbox UPSERT. - -============================================================ -11. TTL AND LIFECYCLE -============================================================ -TTL assignment on write: -- If request.ttl_days is provided and > 0 -> expires_at = now + ttl_days. -- Else if lifecycle.ttl_days[type] > 0 -> expires_at = now + ttl_days[type]. -- Else expires_at = NULL. - -GC job (daily): -- If status = deleted and deleted age > purge_deleted_after_days -> hard purge row (cascade). -- If status = deprecated and last_hit_at older than purge_deprecated_after_days -> delete or purge. -- If expires_at < now -> set status = deleted + version row + outbox DELETE. - -============================================================ -12. PERSISTENCE AND INDEXING (SOURCE OF TRUTH FIRST + OUTBOX) -============================================================ -For every ADD, UPDATE, DEPRECATE, or DELETE, the Postgres transaction must: -- Update memory_notes. -- Write memory_note_versions. -- Insert indexing_outbox (UPSERT or DELETE) as PENDING. -- Commit. - -After commit: -- Best-effort inline outbox processing may run. -- Correctness is guaranteed by the background worker. - -Worker rules: -- For UPSERT: - - Fetch memory_notes row. - - If not active or expired -> mark outbox DONE and skip indexing. - - Split note text into sentence-aware chunks. - - Upsert memory_note_chunks rows for (note_id, chunk_index). - - Call embedding API for chunk text and upsert note_chunk_embeddings. - - Compute pooled note vector by mean pooling chunk embeddings and upsert note_embeddings. - - Upsert one Qdrant point per chunk with dense and bm25 vectors plus payload. - - Mark outbox DONE. -- For DELETE: - - Delete Qdrant points by note_id filter (ignore not found). - - Mark DONE. -- Failures: - - status = FAILED, attempts += 1, available_at = now + backoff(attempts). - -Search trace outbox (best-effort): -- Search enqueues trace payloads into search_trace_outbox with status = PENDING. -- Worker leases available jobs, inserts search_traces and search_trace_items, then marks DONE. -- On failure, status = FAILED, attempts += 1, last_error set, available_at = now + backoff(attempts). -- Failures must not affect the original search response. - -Periodic cleanup: -- Worker deletes expired search_traces (search_trace_items cascade). -- Worker deletes expired llm_cache rows. - -============================================================ -13. SEARCH PIPELINE (ONLINE) -============================================================ -Input: -- tenant_id, project_id, agent_id -- read_profile -- query (English only) -- optional top_k, candidate_k, record_hits - -Config: -- search.expansion.mode = off|always|dynamic -- search.expansion.max_queries -- search.expansion.include_original (default true) -- search.dynamic.min_candidates -- search.dynamic.min_top_score -- search.prefilter.max_candidates (0 or >= candidate_k means no prefilter) -- search.cache.enabled -- search.cache.expansion_ttl_days -- search.cache.rerank_ttl_days -- search.cache.max_payload_bytes (optional) -- search.cache.expansion_version -- search.cache.rerank_version -- search.explain.retention_days - -Steps: -1) English-only boundary check. -2) Resolve allowed_scopes = scopes.read_profiles[read_profile]. -3) Resolve expansion mode: - - off: use only original query. - - always: expand with LLM. - - dynamic: run a baseline hybrid search for the original query, then expand if - candidate_count < min_candidates OR top1_fusion_score < min_top_score. -4) If expansion is enabled, resolve expanded queries with cache support. - - Build an expansion cache key from: query (trimmed), provider_id, model, temperature, - expansion_version, max_queries, include_original. - - If search.cache.enabled and a non-expired cache entry exists, use cached queries. - - On cache miss, call the LLM expansion prompt and receive queries[]. - - Deduplicate, strip CJK, and cap at max_queries. - - Ensure original query is present when include_original = true. - - If search.cache.enabled and payload size is within max_payload_bytes (when set), - store the expanded queries with TTL = expansion_ttl_days. -5) For each query, embed -> query_vec (embedding API). -6) For each query, run Qdrant fusion query candidate_k with payload filters (dense + bm25): - tenant_id, project_id, status = active (best-effort), and scope filters: - - If scope = agent_private, require agent_id match. - - Otherwise scope in allowed_scopes. -7) Fuse all query results with RRF to produce candidate chunk_ids. -8) Prefilter (optional): if max_candidates > 0 and max_candidates < candidate_k, - keep only top max_candidates by fusion score. -9) Fetch authoritative notes from Postgres by note_id and re-apply filters: - status = active, not expired, scope allowed, and if scope = agent_private then agent_id must match. -10) Fetch chunk metadata for candidate chunks and immediate neighbors from memory_note_chunks. -11) Stitch snippets from chunk text (chunk + neighbors). -12) Rerank once using the original query, with cache support: - - Build a rerank cache key from: query (trimmed), provider_id, model, rerank_version, - and the candidate signature [(chunk_id, note_updated_at)...]. - - If search.cache.enabled and a cache entry exists that matches the candidate signature, - reuse cached scores. - - On cache miss, call the rerank provider: - scores = rerank(original_query, docs = [snippet ...]). - - If search.cache.enabled and payload size is within max_payload_bytes (when set), - store the rerank scores with TTL = rerank_ttl_days. -13) Tie-break: - base = (1 + 0.6 * importance) * exp(-age_days / recency_tau_days) - final = rerank_score + tie_breaker_weight * base -14) Aggregate by note using top-1 chunk score, then sort and take top_k. -15) Update hits (optional, when record_hits is true): - hit_count++, last_hit_at, memory_hits insert with chunk_id. -16) Build search trace payload with trace_id and per-item result_handle, then enqueue - search_trace_outbox (best-effort; failures do not fail the search). - - expires_at = now + search.explain.retention_days. -17) Return results. - -Cache notes: -- Cache key material is serialized as JSON and hashed with BLAKE3 (256-bit hex). -- Cache read/write failures are treated as misses and must not fail the search request. - -============================================================ -14. ADMIN: REBUILD QDRANT FROM POSTGRES (NO EMBED API) -============================================================ -Endpoint (localhost only): -POST /v1/admin/rebuild_qdrant - -Behavior: -- Scan memory_note_chunks joined to memory_notes where status = active and not expired. -- For each chunk: - - Load vec from note_chunk_embeddings (chunk_id, embedding_version). - - Upsert Qdrant point with chunk vectors and payload. -- Must not call the embedding API. - -Report: -- rebuilt_count -- missing_vector_count (notes without vec) -- error_count - -============================================================ -15. HTTP API (PUBLIC) -============================================================ -Base: service.http_bind - -POST /v1/memory/add_note -Body: -{ - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "scope": "agent_private|project_shared|org_shared", - "notes": [ - { - "type": "preference|constraint|decision|profile|fact|plan", - "key": "string|null", - "text": "English-only sentence", - "importance": 0.0-1.0, - "confidence": 0.0-1.0, - "ttl_days": number|null, - "source_ref": { ... } - } - ] -} -Response: -{ - "results": [ - { "note_id": "uuid", "op": "ADD|UPDATE|NONE|REJECTED", "reason_code": "..." } - ] -} - -POST /v1/memory/add_event -Body: -{ - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "scope": "optional-scope", - "dry_run": false, - "messages": [ - { "role": "user|assistant|tool", "content": "English-only", "ts": "optional", "msg_id": "optional" } - ] -} -Response: -{ - "extracted": [ ...extractor output... ], - "results": [ - { "note_id": "uuid|null", "op": "ADD|UPDATE|NONE|REJECTED", "reason_code": "...", "reason": "..." } - ] -} -Notes: -- reason_code values include WriteGate rejection codes and REJECT_EVIDENCE_MISMATCH. - -POST /v1/memory/search -Body: -{ - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "read_profile": "private_only|private_plus_project|all_scopes", - "query": "English-only", - "top_k": 12, - "candidate_k": 60, - "record_hits": false -} -Response: -{ - "trace_id": "uuid", - "items": [ - { - "result_handle": "uuid", - "note_id": "uuid", - "chunk_id": "uuid", - "chunk_index": 0, - "start_offset": 0, - "end_offset": 0, - "snippet": "...", - "type": "...", - "key": null, - "scope": "...", - "importance": 0.0, - "confidence": 0.0, - "updated_at": "...", - "expires_at": "...|null", - "final_score": 0.0, - "source_ref": { ... }, - "explain": { - "retrieval_score": 0.0|null, - "retrieval_rank": 1|null, - "rerank_score": 0.0, - "tie_breaker_score": 0.0, - "final_score": 0.0, - "boosts": [{"name": "recency_importance", "score": 0.0}], - "matched_terms": ["..."], - "matched_fields": ["text","key"] - } - } - ] -} -Notes: -- result_handle is a stable handle for search explain. -- record_hits defaults to false when omitted. - -GET /v1/memory/search/explain?result_handle=... -Response: -{ - "trace": { - "trace_id": "uuid", - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "read_profile": "...", - "query": "...", - "expansion_mode": "off|always|dynamic", - "expanded_queries": ["..."], - "allowed_scopes": ["..."], - "candidate_count": 0, - "top_k": 0, - "config_snapshot": { ... }, - "trace_version": 1, - "created_at": "..." - }, - "item": { - "result_handle": "uuid", - "note_id": "uuid", - "chunk_id": "uuid", - "rank": 1, - "explain": { - "retrieval_score": 0.0|null, - "retrieval_rank": 1|null, - "rerank_score": 0.0, - "tie_breaker_score": 0.0, - "final_score": 0.0, - "boosts": [{"name": "recency_importance", "score": 0.0}], - "matched_terms": ["..."], - "matched_fields": ["text","key"] - } - } -} -Notes: -- If result_handle is unknown or the trace has not been persisted yet, return INVALID_REQUEST. - -GET /v1/memory/notes/{note_id} -Response: -{ - "note_id": "uuid", - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "scope": "...", - "type": "...", - "key": null, - "text": "...", - "importance": 0.0, - "confidence": 0.0, - "status": "...", - "updated_at": "...", - "expires_at": "...|null", - "source_ref": { ... } -} - -GET /v1/memory/list?tenant_id=...&project_id=...&scope=...&status=...&type=...&agent_id=... -Notes: -- If scope = agent_private, agent_id is required. -- If scope is omitted, agent_private notes are excluded. - -POST /v1/memory/update -Body: -{ - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "note_id": "uuid", - "text": "optional", - "importance": 0.0-1.0 optional, - "confidence": 0.0-1.0 optional, - "ttl_days": number|null -} -Notes: -- If ttl_days is omitted, expires_at remains unchanged. -- If ttl_days <= 0, apply default TTL rules for the note type. -Response: -{ - "note_id": "uuid", - "op": "UPDATE|NONE|REJECTED", - "reason_code": "optional" -} - -POST /v1/memory/delete -Body: -{ - "tenant_id": "...", - "project_id": "...", - "agent_id": "...", - "note_id": "uuid" -} -Response: -{ - "note_id": "uuid", - "op": "DELETE|NONE" -} -GET /health - -Error codes (common): -- NON_ENGLISH_INPUT (422) -- SCOPE_DENIED (403) -- INVALID_REQUEST (400) -- INVALID_REQUEST (400) -- INTERNAL_ERROR (500) - -============================================================ -16. LLM QUERY EXPANSION PROMPT (search) - APPENDIX -============================================================ -LLM output must be JSON only and match the schema below. - -Schema: -{ - "queries": ["string", "..."] -} - -Hard rules: -- queries.length <= MAX_QUERIES -- Each query must be English only and must not contain any CJK characters. -- Each query must be a single sentence. -- Include the original query unless INCLUDE_ORIGINAL is false. - -System prompt (Expansion): -"You are a query expansion engine for a memory retrieval system. -Output must be valid JSON only and must match the provided schema exactly. -Generate short English-only query variations that preserve the original intent. -Do not include any CJK characters. Do not add explanations or extra fields." - -User prompt template: -"Return JSON matching this exact schema: -<SCHEMA_JSON> -Constraints: -- MAX_QUERIES = <MAX_QUERIES> -- INCLUDE_ORIGINAL = <INCLUDE_ORIGINAL> -Original query: -<QUERY>" - -============================================================ -17. MCP ADAPTER (SEPARATE PROCESS) -============================================================ -- Separate binary: elf-mcp. -- Streamable HTTP MCP server. -- Tools map 1:1 to HTTP endpoints: - memory_add_note, memory_add_event, memory_search, memory_list, memory_update, memory_delete. -- The MCP server must contain zero business logic or policy. -- All policy remains in elf-api. - -============================================================ -18. LLM EXTRACTOR PROMPT (add_event) - APPENDIX -============================================================ -LLM output must be JSON only and match the schema below. - -Schema: -{ - "notes": [ - { - "type": "preference|constraint|decision|profile|fact|plan", - "key": "string|null", - "text": "English-only sentence <= MAX_NOTE_CHARS", - "importance": 0.0, - "confidence": 0.0, - "ttl_days": number|null, - "scope_suggestion": "agent_private|project_shared|org_shared|null", - "evidence": [ - { "message_index": number, "quote": "string" } - ], - "reason": "string" - } - ] -} - -Hard rules: -- notes.length <= MAX_NOTES -- text must contain no CJK -- each note must be one sentence -- evidence must be 1..2 quotes -- each evidence.quote must be a verbatim substring of messages[message_index].content -- do not store secrets or PII - -System prompt (Extractor): -"You are a memory extraction engine for an agent memory system. -Output must be valid JSON only and must match the provided schema exactly. -Extract at most MAX_NOTES high-signal, cross-session reusable memory notes from the given messages. -Each note must be one English sentence and must not contain any CJK characters. -Preserve numbers, dates, percentages, currency amounts, tickers, URLs, and code snippets exactly. -Never store secrets or PII: API keys, tokens, private keys, seed phrases, passwords, bank IDs, personal addresses. -For every note, provide 1 to 2 evidence quotes copied verbatim from the input messages and include the message_index. -If you cannot provide verbatim evidence, omit the note. -If content is ephemeral or not useful long-term, return an empty notes array." - -User prompt template: -"Return JSON matching this exact schema: -<SCHEMA_JSON> -Constraints: -- MAX_NOTES = <MAX_NOTES> -- MAX_NOTE_CHARS = <MAX_NOTE_CHARS> -Here are the messages as JSON: -<MESSAGES_JSON>" - -============================================================ -19. TESTS AND ACCEPTANCE CRITERIA -============================================================ -A. add_note does not call LLM: -- Instrument LLM client call count. It must remain 0 during add_note tests. -B. English-only boundary: -- Any CJK in add_note, add_event, or search returns HTTP 422 with field path. -C. Evidence binding: -- If extractor evidence.quote is not a substring -> REJECTED with REJECT_EVIDENCE_MISMATCH. -D. Rebuild: -- Drop Qdrant collection, recreate, call /admin/rebuild_qdrant. -- Must succeed without calling embedding API. -E. Source of truth vectors: -- For every active chunk, note_chunk_embeddings row exists and vec dim matches config. -- note_embeddings exists for active notes as derived pooled vectors. -F. Idempotency: -- add_note same payload twice -> second op = NONE. -G. Outbox eventual consistency: -- Simulate embedding provider outage. -- Outbox goes FAILED and later retries to DONE after provider recovers. - -============================================================ -20. OUT OF SCOPE (v1.0) -============================================================ -- Translation or multilingual retrieval (handled by upstream agents). -- Graph memory backend (reserved for later). -- Public internet exposure and auth (localhost only in v1.0). diff --git a/docs/spec/system_elf_memory_service_v2.md b/docs/spec/system_elf_memory_service_v2.md new file mode 100644 index 00000000..b33588e9 --- /dev/null +++ b/docs/spec/system_elf_memory_service_v2.md @@ -0,0 +1,2418 @@ +# ELF Memory Service v2.0 Specification + +Purpose: Define the ELF Memory Service v2.0 contract, invariants, and storage model. +Status: normative +Read this when: You are implementing, validating, or reviewing the core ELF memory service behavior. +Not this document: Operator runbooks, local setup steps, or work-item triage workflows. +Defines: ELF Memory Service v2.0 API semantics, ingestion boundaries, and storage invariants. + +Description: ELF means Evidence-linked fact memory for agents. + +Audience: Implementation LLM or engineer agent. +Language: English only. +Contract: English-only API inputs and outputs. Reject non-English input at the API boundary. +Implementation target: Rust is recommended. The spec is language agnostic. + +Core idea: +- Postgres with pgvector is the only source of truth for notes, chunk embeddings, audit history, and the indexing outbox. +- Note-level embeddings are derived pooled vectors for update and duplicate checks. +- Qdrant is a derived index for candidate retrieval only. Qdrant must be rebuildable from Postgres vectors without calling the embedding API. +- Two write APIs have hard semantic differences: + - add_note is deterministic and must not call any LLM. + - add_event is LLM-driven extraction and must bind evidence for every stored note. + +Core vs Extensions: +- ELF Core is the high-trust, facts-first memory service defined by this specification. + - It owns: notes/events ingestion semantics, scopes/sharing, search, auditability, and the English gate. + - It must remain simple, deterministic where specified, and operable without any optional components. +- ELF Extensions are optional capability modules that may evolve independently without changing Core semantics. + - Extensions must not weaken Core invariants or introduce hidden dependencies into Core flows. + - Extensions should integrate via stable contracts (e.g., versioned source_ref pointers and bounded excerpt hydration). + - Example extension (future): an Evidence Store / Doc Platform used for long-form evidence storage and progressive loading. + +Multi-tenant namespace: +- tenant_id, project_id, agent_id, scope, read_profile. + +Optional future work: +- Graph memory backend is defined in Postgres in `system_graph_memory_postgres_v1.md` and kept aligned with this specification. + +============================================================ +0. INVARIANTS (MUST HOLD) +============================================================ +I1. Postgres with pgvector is the only source of truth for: + - memory notes + - scoped core memory blocks and attachments + - chunk embedding vectors + - chunk metadata + - pooled note embeddings (derived) + - audit and version history + - hit logs (optional) + - indexing outbox jobs +I2. Qdrant is derived and rebuildable: + - Qdrant may be dropped and recreated at any time. + - Qdrant must be rebuildable from Postgres vectors without calling the embedding API. +I3. Online retrieval: + - Qdrant returns candidate chunk_ids. + - Postgres returns authoritative notes and re-validates status, TTL, and scope. +I4. English-only contract: + - Any API input that fails the English gate (defined below) must be rejected with HTTP 422. + - Upstream agents must canonicalize to English before calling ELF. +I5. add_note must not call any LLM under any circumstance. +I6. add_event must call the LLM extractor and must bind evidence with verbatim substring checks. + +============================================================ +1. CONFIGURATION (TOML) +============================================================ +File: elf.toml + +Rules: +- The config file path is required and must be provided with --config or -c. +- No default values are allowed in code. Every field below must be present in elf.toml unless explicitly marked optional. +- No environment variables are allowed for configuration. All values are stored in elf.toml. +- Provider api_key values must be present and non-empty. +- providers.embedding.dimensions must match storage.qdrant.vector_dim. +- chunking.enabled must be true. +- chunking.max_tokens must be greater than zero. +- chunking.overlap_tokens must be less than chunking.max_tokens. +- chunking.tokenizer_repo must be present and non-empty. + +Template (all values required): + +[service] +http_bind = "<REQUIRED_HOST:PORT>" +mcp_bind = "<REQUIRED_HOST:PORT>" +admin_bind = "<REQUIRED_HOST:PORT>" +log_level = "<REQUIRED_LOG_LEVEL>" + +[storage.postgres] +dsn = "<REQUIRED_POSTGRES_DSN>" +pool_max_conns = <REQUIRED_INT> + +[storage.qdrant] +url = "<REQUIRED_URL>" +collection = "mem_notes_v2" +docs_collection = "doc_chunks_v1" +vector_dim = <REQUIRED_INT> + +[providers.embedding] +provider_id = "<REQUIRED_ID>" +api_base = "<REQUIRED_URL>" +api_key = "<REQUIRED_NON_EMPTY>" +path = "<REQUIRED_PATH>" +model = "<REQUIRED_MODEL>" +dimensions = <REQUIRED_INT> +timeout_ms = <REQUIRED_INT> +# Must exist. Empty map is allowed. +default_headers = {} + +[providers.rerank] +provider_id = "<REQUIRED_ID>" +api_base = "<REQUIRED_URL>" +api_key = "<REQUIRED_NON_EMPTY>" +path = "<REQUIRED_PATH>" +model = "<REQUIRED_MODEL>" +timeout_ms = <REQUIRED_INT> +# Must exist. Empty map is allowed. +default_headers = {} + +[providers.llm_extractor] +provider_id = "<REQUIRED_ID>" +api_base = "<REQUIRED_URL>" +api_key = "<REQUIRED_NON_EMPTY>" +path = "<REQUIRED_PATH>" +model = "<REQUIRED_MODEL>" +temperature = <REQUIRED_FLOAT> +timeout_ms = <REQUIRED_INT> +# Must exist. Empty map is allowed. +default_headers = {} + +[scopes] +allowed = ["agent_private", "project_shared", "org_shared"] + +[scopes.read_profiles] +private_only = ["agent_private"] +private_plus_project = ["agent_private", "project_shared"] +all_scopes = ["agent_private", "project_shared", "org_shared"] + +[scopes.precedence] +agent_private = 30 +project_shared = 20 +org_shared = 10 + +[scopes.write_allowed] +agent_private = true +project_shared = true +org_shared = true + +[memory] +max_notes_per_add_event = 3 +max_note_chars = 240 +# Similarity thresholds +dup_sim_threshold = 0.92 +update_sim_threshold = 0.85 +# Retrieval sizes +candidate_k = 60 +top_k = 12 + +[memory.policy] + +[[memory.policy.rules]] +note_type = "fact|plan|preference|constraint|decision|profile" +scope = "agent_private|project_shared|org_shared" +min_confidence = <OPTIONAL_FLOAT> +min_importance = <OPTIONAL_FLOAT> + +[chunking] +enabled = true +max_tokens = <REQUIRED_INT> +overlap_tokens = <REQUIRED_INT> +tokenizer_repo = "<REQUIRED_NON_EMPTY_STRING>" + +[search.expansion] +mode = "off|always|dynamic" +max_queries = <REQUIRED_INT> +include_original = <REQUIRED_BOOL> + +[search.dynamic] +min_candidates = <REQUIRED_INT> +min_top_score = <REQUIRED_FLOAT> + +[search.prefilter] +max_candidates = <REQUIRED_INT> + +[search.cache] +enabled = <REQUIRED_BOOL> +expansion_ttl_days = <REQUIRED_INT> +rerank_ttl_days = <REQUIRED_INT> +# Optional. Omit to disable payload size limits. +max_payload_bytes = <OPTIONAL_INT> + +[search.explain] +retention_days = <REQUIRED_INT> +capture_candidates = <REQUIRED_BOOL> +candidate_retention_days = <REQUIRED_INT> +write_mode = "outbox|inline" + +[search.recursive] +enabled = <REQUIRED_BOOL> +max_depth = <REQUIRED_INT> +max_children_per_node = <REQUIRED_INT> +max_nodes_per_scope = <REQUIRED_INT> +max_total_nodes = <REQUIRED_INT> + +[search.graph_context] +enabled = <REQUIRED_BOOL> +max_facts_per_item = <REQUIRED_INT> +max_evidence_notes_per_fact = <REQUIRED_INT> + +[ranking] +recency_tau_days = 60 +tie_breaker_weight = 0.1 + +[ranking.deterministic] +enabled = <REQUIRED_BOOL> + +[ranking.deterministic.lexical] +enabled = <REQUIRED_BOOL> +weight = <REQUIRED_FLOAT> +min_ratio = <REQUIRED_FLOAT> +max_query_terms = <REQUIRED_INT> +max_text_terms = <REQUIRED_INT> + +[ranking.deterministic.hits] +enabled = <REQUIRED_BOOL> +weight = <REQUIRED_FLOAT> +half_saturation = <REQUIRED_FLOAT> +last_hit_tau_days = <REQUIRED_FLOAT> + +[ranking.deterministic.decay] +enabled = <REQUIRED_BOOL> +weight = <REQUIRED_FLOAT> +tau_days = <REQUIRED_FLOAT> + +[ranking.blend] +enabled = <REQUIRED_BOOL> +rerank_normalization = "<REQUIRED_STRING>" +retrieval_normalization = "<REQUIRED_STRING>" + +[[ranking.blend.segments]] +max_retrieval_rank = <REQUIRED_INT> +retrieval_weight = <REQUIRED_FLOAT> + +[ranking.diversity] +enabled = <REQUIRED_BOOL> +sim_threshold = <REQUIRED_FLOAT> +mmr_lambda = <REQUIRED_FLOAT> +max_skips = <REQUIRED_INT> + +[ranking.retrieval_sources] +fusion_weight = <REQUIRED_FLOAT> +structured_field_weight = <REQUIRED_FLOAT> +fusion_priority = <REQUIRED_INT> +structured_field_priority = <REQUIRED_INT> + +[lifecycle.ttl_days] +plan = 14 +fact = 180 +preference = 0 +constraint = 0 +decision = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +bind_localhost_only = true +reject_non_english = true +redact_secrets_on_write = true +# Evidence rules for add_event +evidence_min_quotes = 1 +evidence_max_quotes = 2 +evidence_max_quote_chars = 320 +auth_mode = "off|static_keys" +# Must exist. Empty array is allowed only when auth_mode = "off". +auth_keys = [] + +# Required when auth_mode = "static_keys"; replace auth_keys = [] with one or more entries. +# [[security.auth_keys]] +# token_id = "<REQUIRED_ID>" +# token = "<REQUIRED_NON_EMPTY>" +# tenant_id = "<REQUIRED_ID>" +# project_id = "<REQUIRED_ID>" +# agent_id = "<REQUIRED_ID>" +# read_profile = "private_only|private_plus_project|all_scopes" +# role = "user|admin|super_admin" + +[context] +# Optional. Context metadata used to disambiguate retrieval across projects and scopes. +# +# project_descriptions keys: +# - "<tenant_id>:<project_id>" (recommended) +# - "<project_id>" (fallback) +project_descriptions = { "<OPTIONAL_KEY>" = "<OPTIONAL_STRING>" } +# scope_descriptions keys are scope labels, e.g. "project_shared". +scope_descriptions = { "<SCOPE>" = "<OPTIONAL_STRING>" } +# Optional. Additive score boost applied when query tokens match a scope description. +# Must be a finite number in the range 0.0-1.0. When greater than zero, scope_descriptions must be present. +scope_boost_weight = <OPTIONAL_FLOAT> + +[mcp] +# Optional. Used by elf-mcp to attach required context headers when forwarding to elf-api. +# This section is required when running elf-mcp. +tenant_id = "<REQUIRED_ID>" +project_id = "<REQUIRED_ID>" +agent_id = "<REQUIRED_ID>" +read_profile = "private_only|private_plus_project|all_scopes" + +============================================================ +2. CLI AND CONFIG LOADING +============================================================ +- elf-api, elf-worker, and elf-mcp are separate binaries. +- Each binary requires a config path via --config or -c. +- Startup must fail with a clear error if any required config field is missing. +- security.reject_non_english must be true. Startup must fail if it is false. + +============================================================ +3. ENGLISH GATE (ENGLISH-ONLY BOUNDARY) +============================================================ +Policy: +- ELF is English-only. All externally supplied text fields must be English. +- Translation or multilingual retrieval is out of scope and must be handled upstream. + +English gate algorithm (normative): +1) Normalize: + - Apply Unicode NFKC normalization. + - Reject if the normalized text contains control characters or zero-width/invisible + characters (implementation-defined denylist). +2) Script gate (hard reject): + - Reject if any codepoint is in a disallowed script. + - Normative allowlist: + - Allow: Latin, Common, Inherited. + - Reject: any other script (e.g., Han, Hiragana, Katakana, Hangul, Cyrillic, Arabic). +3) Language identification gate (LID) (conditional reject): + - Only apply LID to natural-language fields (note text, query, doc text). Do not + apply LID to structured identifiers (urls, ids, keys) to avoid false rejects. + - Only apply LID when the input is sufficiently long and letter-dense + (implementation-defined thresholds). + - If LID classifies the text as NOT English with confidence >= threshold, reject. + - If LID is low-confidence/unknown, do not reject (to avoid false positives). + +Fields to check: +- add_note: notes[].text, notes[].key (optional), source_ref string fields if any +- add_event: messages[].content +- search: query + +Error response: +HTTP 422 +{ + "error_code": "NON_ENGLISH_INPUT", + "message": "Non-English input detected; upstream must canonicalize to English before calling ELF.", + "fields": ["$.messages[2].content", "$.notes[0].text"] +} + +============================================================ +4. DOMAIN MODEL +============================================================ +4.1 Memory types (exactly 6) +- preference +- constraint +- decision +- profile +- fact +- plan + +4.2 Canonical note +- A note is a short English sentence and must be <= max_note_chars. +- Format is not enforced. Recommended prefixes for consistency: + "Preference: ...", "Constraint: ...", "Decision: ...", "Profile: ...", "Fact: ...", "Plan: ..." + +4.3 Keys +- key is optional but strongly recommended for stable updates. +- key examples: preferred_language, no_secrets_policy, architecture_sot, project_workflow, long_term_goal. + +4.4 source_ref (evidence pointer) +- source_ref is an optional, versioned pointer to supporting evidence for a stored note. +- Core requirement: ELF Core stores and returns source_ref as an opaque JSON object. Core does not interpret or dereference it. +- When source_ref is provided, it MUST be a JSON object and not a primitive value. +- Extensions requirement: ELF Extensions may define resolvers that can dereference source_ref into bounded excerpts for progressive loading. +- source_ref must be JSON-serializable, ASCII-safe, and stable over time. + +Recommended shape (informative): +{ + "schema": "source_ref/v1", + "resolver": "string", + "ref": { "...": "resolver-specific" }, + "state": { "...": "optional snapshot/version info" }, + "locator": { "...": "optional in-source excerpt selector(s)" }, + "hashes": { "...": "optional integrity checks" }, + "hints": { "...": "optional debug/UX fields" } +} + +Defined resolvers: +- `elf_doc_ext/v1`: Doc Extension v1 document pointer resolver. Defined in `docs/spec/system_source_ref_doc_pointer_v1.md`. + +Resolver tiers (informative): +- reproducible: dereference is stable and replayable given (ref + state) (example: fs_git with a commit SHA). +- best_effort: dereference may change over time (example: external conversation thread id); resolvers should expose whether excerpt verification succeeded. + +============================================================ +5. POSTGRES SCHEMA (SOURCE OF TRUTH + PGVECTOR) +============================================================ +Startup must: +- CREATE EXTENSION IF NOT EXISTS vector; +- Execute sql/init.sql. + +Schema location: +- All schema and index DDL must live under sql/ and be orchestrated by sql/init.sql. +- sql/init.sql must be idempotent and include the per-table files in dependency order. + +5.1 memory_notes (authoritative notes) +Columns: +- note_id uuid primary key +- tenant_id text not null +- project_id text not null +- agent_id text not null +- scope text not null +- type text not null +- key text null +- text text not null +- importance real not null +- confidence real not null +- status text not null +- created_at timestamptz not null +- updated_at timestamptz not null +- expires_at timestamptz null +- embedding_version text not null +- source_ref jsonb not null +- hit_count bigint not null default 0 +- last_hit_at timestamptz null + +Indexes (minimum): +- idx_notes_scope_status: (tenant_id, project_id, scope, status) +- idx_notes_key: (tenant_id, project_id, agent_id, scope, type, key) WHERE key IS NOT NULL +- idx_notes_expires: (expires_at) + +5.2 memory_note_chunks (chunk metadata) +Columns: +- chunk_id uuid primary key +- note_id uuid not null references memory_notes(note_id) on delete cascade +- chunk_index int not null +- start_offset int not null +- end_offset int not null +- text text not null +- embedding_version text not null +- created_at timestamptz not null default now() + +Indexes (minimum): +- idx_note_chunks_note: (note_id) +- idx_note_chunks_note_index: (note_id, chunk_index) + +5.3 note_chunk_embeddings (source of truth vectors; pgvector) +- chunk_id uuid references memory_note_chunks(chunk_id) on delete cascade +- embedding_version text not null +- embedding_dim int not null +- vec vector(<vector_dim>) not null +- created_at timestamptz not null default now() +primary key(chunk_id, embedding_version) + +Rules: +- Every memory_note_chunks row must have a corresponding note_chunk_embeddings row for its embedding_version. +- Chunk embeddings are the source of truth for retrieval and rebuild. + +5.4 note_embeddings (derived pooled vectors; pgvector) +- note_id uuid references memory_notes(note_id) on delete cascade +- embedding_version text not null +- embedding_dim int not null +- vec vector(<vector_dim>) not null +- created_at timestamptz not null default now() +primary key(note_id, embedding_version) + +Rules: +- note_embeddings is derived by mean pooling chunk embeddings for (note_id, embedding_version). +- note_embeddings must be refreshed whenever chunk embeddings change. + +5.5 memory_note_versions (append-only audit) +- version_id uuid primary key +- note_id uuid not null +- op text not null +- prev_snapshot jsonb null +- new_snapshot jsonb null +- reason text not null +- actor text not null +- ts timestamptz not null default now() + +5.6 memory_hits (optional) +- hit_id uuid primary key +- note_id uuid not null +- chunk_id uuid null +- query_hash text not null +- rank int not null +- final_score real not null +- ts timestamptz not null default now() + +5.7 indexing_outbox (guaranteed indexing) +- outbox_id uuid primary key +- note_id uuid not null +- op text not null +- embedding_version text not null +- status text not null +- attempts int not null default 0 +- last_error text null +- available_at timestamptz not null default now() +- created_at timestamptz not null default now() +- updated_at timestamptz not null default now() + +Indexes: +- idx_outbox_status_available: (status, available_at) +- idx_outbox_note_op_status: (note_id, op, status) + +5.8 search_traces (search explainability) +- trace_id uuid primary key +- tenant_id text not null +- project_id text not null +- agent_id text not null +- read_profile text not null +- query text not null +- expansion_mode text not null +- expanded_queries jsonb not null +- allowed_scopes jsonb not null +- candidate_count int not null +- top_k int not null +- config_snapshot jsonb not null +- trace_version int not null +- created_at timestamptz not null +- expires_at timestamptz not null + +Indexes: +- idx_search_traces_expires: (expires_at) +- idx_search_traces_context: (tenant_id, project_id, created_at) + +5.9 search_trace_items (per-result explain data) +- item_id uuid primary key +- trace_id uuid not null references search_traces(trace_id) on delete cascade +- note_id uuid not null +- chunk_id uuid null +- rank int not null +- final_score real not null +- explain jsonb not null + +Indexes: +- idx_search_trace_items_trace: (trace_id, rank) +- idx_search_trace_items_note: (note_id) + +5.10 search_trace_stages (stage-level retrieval trajectory) +- stage_id uuid primary key +- trace_id uuid not null references search_traces(trace_id) on delete cascade +- stage_order int not null +- stage_name text not null +- stage_payload jsonb not null +- created_at timestamptz not null + +Indexes: +- idx_search_trace_stages_trace_order: (trace_id, stage_order) +- idx_search_trace_stages_trace_name: (trace_id, stage_name) + +5.11 search_trace_stage_items (per-stage item metrics) +- id uuid primary key +- stage_id uuid not null references search_trace_stages(stage_id) on delete cascade +- item_id uuid null +- note_id uuid null +- chunk_id uuid null +- metrics jsonb not null + +Indexes: +- idx_search_trace_stage_items_stage_item: (stage_id, item_id) + +5.12 search_trace_outbox (async trace persistence) +- outbox_id uuid primary key +- trace_id uuid not null +- status text not null +- attempts int not null default 0 +- last_error text null +- available_at timestamptz not null default now() +- payload jsonb not null +- created_at timestamptz not null default now() +- updated_at timestamptz not null default now() + +Indexes: +- idx_trace_outbox_status_available: (status, available_at) +- idx_trace_outbox_trace_status: (trace_id, status) + +5.13 llm_cache (LLM response cache) +- cache_id uuid primary key +- cache_kind text not null +- cache_key text not null +- payload jsonb not null +- created_at timestamptz not null +- last_accessed_at timestamptz not null +- expires_at timestamptz not null +- hit_count bigint not null default 0 + +Indexes: +- idx_llm_cache_key: (cache_kind, cache_key) unique +- idx_llm_cache_expires: (expires_at) + +5.14 memory_ingest_decisions (ingest policy audit) +- decision_id uuid primary key +- tenant_id text not null +- project_id text not null +- agent_id text not null +- scope text not null +- pipeline text not null +- note_type text not null +- note_key text null +- note_id uuid null +- note_version_id uuid null +- base_decision text not null +- policy_decision text not null +- note_op text not null +- reason_code text null +- details jsonb not null +- ts timestamptz not null + +Indexing: +- idx_memory_ingest_decisions_tenant_scope_pipeline: (tenant_id, project_id, agent_id, scope, pipeline, ts) +- idx_memory_ingest_decisions_note_version_id: (note_version_id) + +details must include: +- similarity_best +- key_match +- matched_dup +- dup_sim_threshold +- update_sim_threshold +- confidence +- importance +- structured_present +- graph_present +- policy_rule +- min_confidence +- min_importance +- write_policy_audits (add_note: single object, add_event: array of message audits, optional) + +5.15 core_memory_blocks (authoritative always-attached context blocks) +- block_id uuid primary key +- tenant_id text not null +- project_id text not null +- agent_id text not null +- scope text not null +- key text not null +- title text not null +- content text not null +- source_ref jsonb not null +- status text not null +- created_at timestamptz not null +- updated_at timestamptz not null + +Rules: +- Core blocks are small read-only operating context, separate from archival note search. +- Core blocks must not be indexed into Qdrant or returned by archival search unless a future explicit contract says so. +- source_ref must be a JSON object and is returned with block readback. +- scope, write permission, English gate, auth, and shared-grant rules apply. + +Indexes: +- uq_core_memory_blocks_active_key: (tenant_id, project_id, agent_id, scope, key) WHERE status = 'active' +- idx_core_memory_blocks_scope_status: (tenant_id, project_id, scope, status) + +5.16 core_memory_block_attachments (explicit block attachment) +- attachment_id uuid primary key +- block_id uuid not null references core_memory_blocks(block_id) on delete cascade +- tenant_id text not null +- project_id text not null +- agent_id text not null +- read_profile text not null +- attached_by_agent_id text not null +- attached_at timestamptz not null +- detached_by_agent_id text null +- detached_at timestamptz null + +Rules: +- Active attachment is exact to tenant_id, project_id, agent_id, read_profile, and block_id. +- Attachment does not bypass scope access. Readback still applies read_profile scope resolution, + private-owner checks, shared grants, and block status. +- Detached rows remain as audit evidence. + +Indexes: +- uq_core_memory_block_attachments_active: + (tenant_id, project_id, agent_id, read_profile, block_id) WHERE detached_at IS NULL +- idx_core_memory_block_attachments_read: + (tenant_id, project_id, agent_id, read_profile, detached_at) +- idx_core_memory_block_attachments_block: (block_id, detached_at) + +5.17 core_memory_block_events (append-only block audit) +- event_id uuid primary key +- block_id uuid not null references core_memory_blocks(block_id) on delete cascade +- attachment_id uuid null references core_memory_block_attachments(attachment_id) on delete set null +- tenant_id text not null +- project_id text not null +- actor_agent_id text not null +- event_type text not null +- target_agent_id text null +- read_profile text null +- prev_snapshot jsonb null +- new_snapshot jsonb null +- reason text not null +- ts timestamptz not null + +event_type values: +- block_created +- block_updated +- attachment_added +- attachment_removed + +Rules: +- Every block create/update and attachment add/remove writes one event. +- Block readback may include audit history for returned blocks. + +============================================================ +6. QDRANT COLLECTION (DERIVED INDEX ONLY) +============================================================ +- Collection: storage.qdrant.collection +- Dense vector: named `dense` with size storage.qdrant.vector_dim (cosine distance). +- Sparse vector: named `bm25` with `idf` modifier and model `qdrant/bm25`. +- Point id: chunk_id (string UUID) +- Payload fields (minimum): + note_id, chunk_id, chunk_index, start_offset, end_offset, + tenant_id, project_id, agent_id, scope, type, key, status, + updated_at, expires_at, importance, confidence, embedding_version +- Chunk text is not stored in Qdrant payload. + +IMPORTANT: +- Qdrant may be stale. Postgres is authoritative. + +============================================================ +7. PROVIDER ADAPTERS (HTTP) +============================================================ +7.1 EmbeddingProvider +Function: +- embed(texts[]) -> vectors[][] + +Contract: +- Output vector count equals input text count. +- Each vector length equals vector_dim. + +Implementation: +- POST {api_base}{path} + { "model": model, "input": [texts...], "dimensions": dimensions } +- Send Authorization: Bearer <api_key>. +- Merge default_headers into the request. +- Map response to float32[D]. + +embedding_version: +- "<provider_id>:<model>:<vector_dim>" + +7.2 RerankProvider +Function: +- rerank(query, docs[]) -> scores[] + +Contract: +- Scores are aligned to docs indexes. + +Implementation: +- POST {api_base}{path} + { "model": model, "query": "...", "documents": ["..."] } +- Send Authorization: Bearer <api_key>. +- Merge default_headers into the request. +- Map response into aligned float[] (some providers return indexes). + +7.3 LLM Extractor Provider +Function: +- extract(messages[]) -> JSON notes + +Contract: +- Strict JSON output. +- If response_format is available, use it. +- Otherwise enforce JSON-only with at most 2 retries. + +Implementation: +- POST {api_base}{path} + { "model": model, "temperature": temperature, "messages": [...] } +- Send Authorization: Bearer <api_key>. +- Merge default_headers into the request. + +============================================================ +8. API SEMANTICS: add_note vs add_event (HARD DIFFERENCES) +============================================================ +8.1 add_note (deterministic write) +MUST: +- Must not call any LLM. +- Must treat input notes as authoritative content with no rewriting. +- Must apply WriteGate, UpdateResolver, persistence, and indexing outbox. +- Must return per-note op result: ADD, UPDATE, NONE, or REJECTED with reason_code. + +MUST NOT: +- Must not infer missing type, scope, or key beyond validation defaults. +- Must not generate new text. + +8.2 add_event (LLM extraction write) +MUST: +- Must call the LLM extractor exactly once per request. +- Must require evidence binding for each candidate note. +- Each input message MAY include optional write_policy for per-message redact/exclude policy. +- Must enforce max_notes_per_add_event on the server. +- Must apply WriteGate and UpdateResolver after extraction. +- Should support dry_run to return candidates without persisting. + +MUST NOT: +- Must not store notes lacking evidence or failing evidence substring checks. +- Must not store raw full logs as memory notes. + - If evidence.quote is not a verbatim substring of the cited message, return REJECTED with reason_code REJECT_EVIDENCE_MISMATCH. + - If write_policy is present and evidence mismatch is a byproduct of transformed content, return REJECTED with reason_code REJECT_WRITE_POLICY_MISMATCH. + +8.3 Policy decision pipeline (both add_note and add_event) +Stage-1 (base decision) is computed from resolver outcome + side-effect presence: +- Add -> remember +- Update -> update +- None + (structured_present || graph_present) -> update +- None + (!structured_present && !graph_present) -> ignore + +Stage-2 (policy stage) evaluates `memory.policy` rules and may only: +- keep base decision remember/update +- or downgrade remember/update -> ignore when thresholds fail + +Decision taxonomy: +- remember +- update +- ignore +- reject + +When policy downgrades to ignore: +- `memory_notes` must not be inserted/updated/deleted +- `memory_note_fields` must not be written +- graph memory rows must not be written +- indexing/search outbox rows must not be written +- only an audit row must be written via `memory_ingest_decisions` + +Ignore reason codes: +- `IGNORE_DUPLICATE`: base=ignore and duplicate match was detected (`metadata.matched_dup = true`) +- `IGNORE_POLICY_THRESHOLD`: base=remember/update and policy stage threshold/guard downgraded to ignore + +============================================================ +9. WRITEGATE (SERVER SIDE, ALWAYS ON) +============================================================ +Reject a note if any of the following are true: +- The note contains non-English input (fails the English gate). +- The type is not in the 6-type allowlist. +- The scope is not allowed or write not allowed. +- The text length is greater than max_note_chars. +- Secrets or PII are detected (regex and heuristics). +- The text is empty or whitespace only. + +On rejection: +- op = REJECTED +- reason_code is one of: + REJECT_NON_ENGLISH, REJECT_TOO_LONG, REJECT_SECRET, REJECT_INVALID_TYPE, + REJECT_SCOPE_DENIED, REJECT_EMPTY + +============================================================ +10. UPDATE RESOLVER (IN-PLACE UPDATE, STABLE note_id) +============================================================ +Resolution namespace group: +(tenant_id, project_id, agent_id, scope, type) + +Order: +1) Key-based: + - If key is not null and an active note exists with the same key: + -> UPDATE in place (same note_id). +2) Similarity-based (when key is null): + - Compute embedding for incoming text. + - Compare cosine similarity vs existing active notes in the group using Postgres-stored vec. + - If sim >= dup_sim_threshold -> NONE. + - Else if sim >= update_sim_threshold -> UPDATE best match in place. + - Else -> ADD new note_id. + +On UPDATE: +- Preserve note_id. +- Write memory_note_versions with prev and new snapshots. +- Update memory_notes.text, updated_at, expires_at, source_ref, confidence, importance. +- Enqueue outbox UPSERT. + +============================================================ +11. TTL AND LIFECYCLE +============================================================ +TTL assignment on write: +- If request.ttl_days is provided and > 0 -> expires_at = now + ttl_days. +- Else if lifecycle.ttl_days[type] > 0 -> expires_at = now + ttl_days[type]. +- Else expires_at = NULL. + +GC job (daily): +- If status = deleted and deleted age > purge_deleted_after_days -> hard purge row (cascade). +- If status = deprecated and last_hit_at older than purge_deprecated_after_days -> delete or purge. +- If expires_at < now -> set status = deleted + version row + outbox DELETE. + +============================================================ +12. PERSISTENCE AND INDEXING (SOURCE OF TRUTH FIRST + OUTBOX) +============================================================ +For every ADD, UPDATE, DEPRECATE, or DELETE, the Postgres transaction must: +- Update memory_notes. +- Write memory_note_versions. +- Insert indexing_outbox (UPSERT or DELETE) as PENDING. +- Commit. + +After commit: +- Best-effort inline outbox processing may run. +- Correctness is guaranteed by the background worker. + +Worker rules: +- For UPSERT: + - Fetch memory_notes row. + - If not active or expired -> mark outbox DONE and skip indexing. + - Split note text into sentence-aware chunks. + - Upsert memory_note_chunks rows for (note_id, chunk_index). + - Call embedding API for chunk text and upsert note_chunk_embeddings. + - Compute pooled note vector by mean pooling chunk embeddings and upsert note_embeddings. + - Upsert one Qdrant point per chunk with dense and bm25 vectors plus payload. + - Mark outbox DONE. +- For DELETE: + - Delete Qdrant points by note_id filter (ignore not found). + - Mark DONE. +- Failures: + - status = FAILED, attempts += 1, available_at = now + backoff(attempts). + +Search trace outbox (best-effort): +- Search enqueues trace payloads into search_trace_outbox with status = PENDING. +- Worker leases available jobs, inserts search_traces, search_trace_items, search_trace_stages, and search_trace_stage_items, then marks DONE. +- On failure, status = FAILED, attempts += 1, last_error set, available_at = now + backoff(attempts). +- Failures must not affect the original search response. + +Periodic cleanup: +- Worker deletes expired search_traces (search_trace_items/search_trace_stages/search_trace_stage_items cascade). +- Worker deletes expired llm_cache rows. + +============================================================ +13. SEARCH PIPELINE (ONLINE) +============================================================ +Input: +- tenant_id, project_id, agent_id +- read_profile +- query (English only) +- mode (`quick_find` or `planned_search`) - required +- optional top_k, candidate_k, filter, record_hits + +Config: +- search.expansion.mode = off|always|dynamic +- search.expansion.max_queries +- search.expansion.include_original (default true) +- search.dynamic.min_candidates +- search.dynamic.min_top_score +- search.prefilter.max_candidates (0 or >= candidate_k means no prefilter) +- search.cache.enabled +- search.cache.expansion_ttl_days +- search.cache.rerank_ttl_days +- search.cache.max_payload_bytes (optional) +- search.explain.retention_days + +Steps: +1) English-only boundary check. +2) Resolve allowed_scopes = scopes.read_profiles[read_profile]. +3) Resolve expansion mode: + - off: use only original query. + - always: expand with LLM. + - dynamic: run a baseline hybrid search for the original query, then expand if + candidate_count < min_candidates OR top1_fusion_score < min_top_score. +4) If expansion is enabled, resolve expanded queries with cache support. + - Build an expansion cache key from: query (trimmed), provider_id, model, temperature, + and the expansion cache schema version (hardcoded), plus max_queries and include_original. + - If search.cache.enabled and a non-expired cache entry exists, use cached queries. + - On cache miss, call the LLM expansion prompt and receive queries[]. + - Deduplicate, drop any non-English variants (English gate), and cap at max_queries. + - Ensure original query is present when include_original = true. + - If search.cache.enabled and payload size is within max_payload_bytes (when set), + store the expanded queries with TTL = expansion_ttl_days. +5) Resolve optional project context description: + - If context.project_descriptions is present, look up by key "tenant_id:project_id". + - If not found, try key "project_id" as a fallback. +6) For each query, embed -> query_vec (embedding API). + - Dense embedding input is: + - query, or + - query + "\n\nProject context:\n" + project_context_description (when present). + - BM25 input remains the raw query text (no context suffix). +7) For each query, run Qdrant fusion query candidate_k with payload filters (dense + bm25): + tenant_id, project_id, status = active (best-effort), and scope filters: + - If scope = agent_private, require agent_id match. + - Otherwise scope in allowed_scopes. + If filter is present, do not push filter criteria into Qdrant. +8) Fuse all query results with RRF to produce candidate chunk_ids. +9) Prefilter (optional): if max_candidates > 0 and max_candidates < candidate_k, + keep only top max_candidates by fusion score. +10) Fetch authoritative notes from Postgres by note_id and re-apply consistency checks: + status = active, not expired, scope allowed, and if scope = agent_private then agent_id must match. +11) If filter is present, apply service-side candidate filtering using the authoritative note metadata: + - effective_candidate_k = min(MAX_CANDIDATE_K, requested_candidate_k * 3), then clamp to >= top_k. + - The filter is evaluated after candidate retrieval and consistency checks. + - The filter is not pushed down to Qdrant or SQL. +12) Fetch chunk metadata for candidate chunks and immediate neighbors from memory_note_chunks. +13) Stitch snippets from chunk text (chunk + neighbors). +14) Rerank once using the original query, with cache support: + - Build a rerank cache key from: query (trimmed), provider_id, model, rerank cache schema version (hardcoded), + and the candidate signature [(chunk_id, note_updated_at)...]. + - If search.cache.enabled and a cache entry exists that matches the candidate signature, + reuse cached scores. + - On cache miss, call the rerank provider: + scores = rerank(original_query, docs = [snippet ...]). + - If search.cache.enabled and payload size is within max_payload_bytes (when set), + store the rerank scores with TTL = rerank_ttl_days. +15) Tie-break: + base = (1 + 0.6 * importance) * exp(-age_days / recency_tau_days) + final = rerank_score + tie_breaker_weight * base +16) Optional scope context boost: + - If context.scope_boost_weight > 0 and context.scope_descriptions contains scope labels, + apply an additive boost to items in that scope based on query token matches. + - Token matching uses case-insensitive ASCII alphanumeric tokens (length >= 2). + - boost = scope_boost_weight * (matched_token_count / query_token_count). +17) Aggregate by note using top-1 chunk score, then sort and take top_k. +18) Update hits (optional, when record_hits is true): + hit_count++, last_hit_at, memory_hits insert with chunk_id. +19) Build search trace payload with trace_id and per-item result_handle, then enqueue + search_trace_outbox (best-effort; failures do not fail the search). + - expires_at = now + search.explain.retention_days. +20) Return results. + +Cache notes: +- Cache key material is serialized as JSON and hashed with BLAKE3 (256-bit hex). +- Cache read/write failures are treated as misses and must not fail the search request. + +============================================================ +14. ADMIN HTTP API (DEBUGGING) +============================================================ +Base: http://{service.admin_bind} + +Note: Admin endpoints are intended for localhost use only. They are not exposed on the public bind. + +Authentication: +- security.auth_mode = "off": no auth header is required. +- security.auth_mode = "static_keys": admin requests must include `Authorization: Bearer <token>`. +- In `static_keys` mode, the matched `security.auth_keys` entry must have `admin = true` for admin endpoints. + +Request correlation: +- `X-ELF-Request-Id` is optional on admin endpoints. +- If omitted, elf-api generates a new UUID. +- Response includes `X-ELF-Request-Id` header and `request_id` in JSON responses. + +GET /viewer + +Behavior: +- Serves the local read-only web viewer from the admin bind only. +- Must not be mounted on the public HTTP bind by default. +- The viewer uses admin-bind same-origin requests and only calls read-only endpoints. +- In `static_keys` mode, the viewer page may load without credentials, but data requests still require an admin bearer token. + +Admin read-only session mirror: +- POST /v2/admin/searches +- GET /v2/admin/searches/{search_id} +- GET /v2/admin/searches/{search_id}/timeline +- POST /v2/admin/searches/{search_id}/notes + +Behavior: +- These endpoints mirror the public progressive search session endpoints for local admin viewer use. +- They are read-only with respect to notes; detail hydration must default to `record_hits = false` when the viewer calls it. +- They require the same context headers as the public session endpoints, plus admin authentication when `security.auth_mode = "static_keys"`. + +Admin read-only note mirror: +- GET /v2/admin/notes +- GET /v2/admin/notes/{note_id} + +Behavior: +- These endpoints mirror the public note list/detail reads for local admin viewer use. +- Note metadata that includes `created_at`, `hit_count`, and `last_hit_at` is available through `GET /v2/admin/notes/{note_id}/provenance`. + +Admin core memory block management: +- POST /v2/admin/core-blocks +- POST /v2/admin/core-blocks/{block_id}/attachments +- DELETE /v2/admin/core-blocks/attachments/{attachment_id} + +Behavior: +- These endpoints create/update core blocks and attach/detach them for exact tenant/project/agent/read_profile readback. +- Core blocks are read-only to normal public callers; public callers only read attached blocks. +- Mutations write append-only `core_memory_block_events`. +- Core blocks are not note-search hits and do not write Qdrant points, search sessions, search traces, or note outbox rows. + +Admin consolidation proposal review: +- POST /v2/admin/consolidation/runs +- GET /v2/admin/consolidation/runs +- GET /v2/admin/consolidation/runs/{run_id} +- GET /v2/admin/consolidation/proposals +- GET /v2/admin/consolidation/proposals/{proposal_id} +- POST /v2/admin/consolidation/proposals/{proposal_id}/review + +Behavior: +- These endpoints expose fixture-driven or manually supplied consolidation runs and + reviewable derived proposals. +- Creating a consolidation run enqueues a deterministic `consolidation_run_jobs` + worker job and returns `job_id`; the worker materializes supplied proposal payloads + into `consolidation_proposals`. +- Proposal payloads must follow `elf.consolidation/v1`, carry source refs and + snapshots, and may include unsupported-claim flags, contradiction markers, and + staleness markers for reviewer inspection. +- Review action values are `approve`, `apply`, `discard`, and `defer`. +- `apply` records an approval transition before the applied transition when a proposal + starts from `proposed`. +- Every review action writes append-only review audit events returned by proposal + detail readback. +- These endpoints must not call LLM, embedding, rerank, or external provider adapters. +- They must not mutate authoritative source notes, docs, events, traces, graph facts, + or search traces. + +Admin derived knowledge pages: +- POST /v2/admin/knowledge/pages/rebuild +- GET /v2/admin/knowledge/pages +- POST /v2/admin/knowledge/pages/search +- GET /v2/admin/knowledge/pages/{page_id} +- POST /v2/admin/knowledge/pages/{page_id}/lint + +Behavior: +- These endpoints expose deterministic rebuild, list/detail readback, and stale-source + lint for derived knowledge pages. The search endpoint exposes derived page section + snippets with visible citations, source coverage, lint summary, trust state, and + repair/rebuild guidance. +- Page payloads must follow `elf.knowledge_page/v1`, preserve section citations, and + write normalized source refs for lint. +- Pages are derived and rebuildable; rebuilding or linting a page must not mutate + authoritative notes, event audits, graph facts, consolidation proposals, docs, + traces, or source pointers. +- Page snippets are not authoritative note search hits and must be labeled as derived + knowledge page snippets wherever surfaced. +- The detailed contract is defined in `system_knowledge_pages_v1.md`. + +Admin reviewable memory summary readback: + +Behavior: +- Memory summary readback is a derived, reviewable artifact surface, not + authoritative note search and not a hidden note rewrite path. +- Summary entries must follow `elf.memory_summary/v1`, carry source refs, freshness or + validity metadata, and inclusion/downgrade/exclusion rationale for top-of-mind, + background, stale, superseded, tombstoned, and derived project-profile entries. +- Stale, superseded, or tombstoned entries must not be returned as current + top-of-mind facts. +- Derived project-profile entries must either cite source refs or carry explicit + unsupported-claim flags when excluded. +- Memory summaries must not call provider adapters, mutate authoritative source notes, + create Qdrant points, create search sessions, or record note hits in v1 contract + validation. +- The detailed contract is defined in `system_memory_summary_v1.md`. + +POST /v2/admin/qdrant/rebuild + +Behavior: +- Rebuild the Qdrant chunk index from Postgres chunk vectors. +- Must not call the embedding API. +- Qdrant is derived and can be dropped and recreated at any time. + +Response: +{ + "rebuilt_count": 0, + "missing_vector_count": 0, + "error_count": 0 +} + +POST /v2/admin/searches/raw + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) +- X-ELF-Read-Profile (required): private_only|private_plus_project|all_scopes + +Body: +{ + "query": "English-only", + "mode": "quick_find", + "top_k": 12, + "candidate_k": 60, + "payload_level": "l0", + "filter": { + "schema": "search_filter_expr/v1", + "expr": { + "op": "gte", + "field": "importance", + "value": 0.5 + } + } +} + +Response: +{ + "trace_id": "uuid", + "items": [ + { + "result_handle": "uuid", + "note_id": "uuid", + "chunk_id": "uuid", + "chunk_index": 0, + "start_offset": 0, + "end_offset": 0, + "snippet": "...", + "type": "fact|plan|preference|constraint|decision|profile", + "key": null, + "scope": "agent_private|project_shared|org_shared", + "importance": 0.0, + "confidence": 0.0, + "updated_at": "...", + "expires_at": "...|null", + "final_score": 0.0, + "source_ref": { ... }, + "explain": { + "match": { + "matched_terms": ["..."], + "matched_fields": ["text", "key"] + }, + "ranking": { + "schema": "search_ranking_explain/v2", + "policy_id": "ranking_v2:...", + "final_score": 0.0, + "terms": [ + { "name": "blend.retrieval", "value": 0.0 }, + { "name": "blend.rerank", "value": 0.0 }, + { "name": "tie_breaker", "value": 0.0 }, + { "name": "context.scope_boost", "value": 0.0 }, + { "name": "deterministic.lexical_bonus", "value": 0.0 }, + { "name": "deterministic.hit_boost", "value": 0.0 }, + { "name": "deterministic.decay_penalty", "value": 0.0 } + ] + }, + "relation_context": [ + { + "fact_id": "uuid", + "scope": "project_shared", + "subject": { "canonical": "string", "kind": "person|concept|null" }, + "predicate": "string", + "object": { + "entity": { "canonical": "string", "kind": "person|concept|null" }, + "value": null + }, + "valid_from": "...", + "valid_to": null, + "temporal_status": "current|historical|future", + "evidence_note_ids": ["uuid", "uuid"] + } + ] + } + } + } + ] +} + +Notes: +- `relation_context` is omitted unless `search.graph_context.enabled` is true. +- When present, relation context is evidence-bound and bounded by `search.graph_context.max_facts_per_item` and + `search.graph_context.max_evidence_notes_per_fact`. +- `relation_context.temporal_status` is derived from the graph fact validity window at the search read timestamp. + Historical facts may be returned when they are evidence-linked to a selected note; they must be labeled + `historical` instead of being presented as current. +- It is included wherever `SearchExplain` is returned, including admin trace surfaces (`/v2/admin/traces/*` and + `/v2/admin/trace-items/*`), in addition to search responses. +- Admin trace endpoints validate `tenant_id` + `project_id` only for access control. They are intended for + project-scoped operations and do not require the requesting `agent_id` to match the stored trace owner. +- This endpoint is intended for debugging and evaluation. It returns chunk-level items and explain components. +- The public search endpoint returns a compact note-level index view. + +GET /v2/admin/traces/recent + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Query: +- limit (optional): default `50`, max `200`. +- cursor_created_at (optional, RFC3339): timestamp cursor value. +- cursor_trace_id (optional, uuid): cursor trace id. +- agent_id (optional): filter traces by creator. +- read_profile (optional): filter by read_profile. +- created_after (optional, RFC3339): strict lower bound on `created_at`. +- created_before (optional, RFC3339): strict upper bound on `created_at`. + +Requirements: +- `cursor_created_at` and `cursor_trace_id` must be provided together or omitted together. + +Response: +{ + "schema": "elf.recent_traces/v1", + "traces": [ + { + "trace_id": "uuid", + "tenant_id": "string", + "project_id": "string", + "agent_id": "string", + "read_profile": "private_only|private_plus_project|all_scopes", + "query": "string", + "created_at": "..." + } + ], + "next_cursor": { + "created_at": "...", + "trace_id": "uuid" + } | null +} + +Ordering: +- `created_at DESC`, then `trace_id DESC`. +- The page cursor for the next page uses `(created_at, trace_id) < cursor`. + +GET /v2/admin/traces/{trace_id}/bundle + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Query: +- mode: `bounded` (default) or `full`. +- stage_items_limit (optional): max items per trajectory stage. +- candidates_limit (optional): max candidate count for `candidates`. + +Response: +{ + "schema": "elf.trace_bundle/v1", + "generated_at": "...", + "trace": { ... }, + "items": [ ... ], + "trajectory_summary": { + "schema": "search_retrieval_trajectory/v1", + "stages": [ ... ] + } | null, + "stages": [ ... ], + "candidates": [ ... ] | null +} +- `stage_items_limit`: `64` in `bounded` mode (cap `256`), `256` in `full` mode. +- `candidates_limit`: `0` in `bounded` mode (no candidates), `200` in `full` mode. +- Candidate snapshot is decoded to `TraceReplayCandidate`. +- `candidates` is omitted as `null` when not requested. + +GET /v2/admin/traces/{trace_id} + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Response: +{ + "trace": { ... }, + "items": [ ... ], + "trajectory_summary": { + "schema": "search_retrieval_trajectory/v1", + "stages": [ ... ] + } +} +`items[*].explain` follows the same `SearchExplain` schema as search responses (including optional `relation_context`). + +GET /v2/admin/trajectories/{trace_id} + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Response: +{ + "trace": { ... }, + "trajectory": { + "schema": "search_retrieval_trajectory/v1", + "stages": [ ... ] + }, + "stages": [ + { + "stage_order": 1, + "stage_name": "rewrite.expansion", + "stage_payload": { ... }, + "items": [ ... ] + } + ] +} + +GET /v2/admin/trace-items/{item_id} + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Response: +{ + "trace": { ... }, + "item": { ... }, + "trajectory": { + "schema": "search_retrieval_trajectory/v1", + "stages": [ ... ] + } +} +`item.explain` follows the same `SearchExplain` schema as search responses (including optional `relation_context`). + +GET /v2/admin/graph/predicates?scope=... + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Query: +- scope (optional): tenant_project|project|global|all (default: all) + +Response: +{ + "predicates": [ + { + "predicate_id": "uuid", + "scope_key": "string", + "tenant_id": "string|null", + "project_id": "string|null", + "canonical": "string", + "canonical_norm": "string", + "cardinality": "single|multi", + "status": "pending|active|deprecated", + "created_at": "...", + "updated_at": "..." + } + ] +} + +PATCH /v2/admin/graph/predicates/{predicate_id} + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Body: +{ + "status": "pending|active|deprecated|null", + "cardinality": "single|multi|null" +} + +Behavior: +- At least one of status or cardinality is required. +- Allowed status transitions: pending->active, pending->deprecated, active->deprecated. +- Deprecated predicates cannot be modified (409). +- Global predicates are immutable (403). +- Note: Global predicate mutations remain follow-up work and are not covered by this contract. + +Response: +{ + "predicate_id": "uuid", + "scope_key": "string", + "tenant_id": "string|null", + "project_id": "string|null", + "canonical": "string", + "canonical_norm": "string", + "cardinality": "single|multi", + "status": "pending|active|deprecated", + "created_at": "...", + "updated_at": "..." +} + +POST /v2/admin/graph/predicates/{predicate_id}/aliases + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Body: +{ + "alias": "string" +} + +Behavior: +- alias must be non-empty. +- Deprecated predicates cannot be modified (409). +- Global predicates are immutable (403). +- Note: Global predicate mutations remain follow-up work and are not covered by this contract. + +Response: +{ + "predicate_id": "uuid", + "aliases": [ + { + "alias_id": "uuid", + "predicate_id": "uuid", + "scope_key": "string", + "alias": "string", + "alias_norm": "string", + "created_at": "..." + } + ] +} + +GET /v2/admin/graph/predicates/{predicate_id}/aliases + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Response: +{ + "predicate_id": "uuid", + "aliases": [ + { + "alias_id": "uuid", + "predicate_id": "uuid", + "scope_key": "string", + "alias": "string", + "alias_norm": "string", + "created_at": "..." + } + ] +} + +GET /v2/admin/notes/{note_id}/provenance + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Path: +- note_id: uuid + +Response: +{ + "schema": "elf.note_provenance_bundle/v1", + "note": { ... }, + "ingest_decisions": [...], + "note_versions": [...], + "indexing_outbox": [...], + "recent_traces": [...] +} + +GET /v2/admin/notes/{note_id}/history + +Headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Path: +- note_id: uuid + +Response: +{ + "schema": "elf.memory_history/v1", + "note_id": "uuid", + "events": [ + { + "event_id": "string", + "event_type": "add|update|ignore|reject|expire|delete|derived|applied|invalidated|related", + "subject_type": "note", + "note_id": "uuid", + "source_table": "string", + "source_id": "uuid|null", + "related_note_version_id": "uuid|null", + "related_decision_id": "uuid|null", + "related_proposal_id": "uuid|null", + "actor": "string|null", + "op": "string|null", + "reason_code": "string|null", + "summary": "string", + "details": { ... }, + "ts": "..." + } + ] +} + +Notes: +- History events are a chronological read-only projection over durable source tables. +- Ingest decisions that produce note versions should set `note_version_id` so history + can link the decision to the resulting note version. +- Derived, applied, and invalidated events come from consolidation proposals and + review events that reference the note in `source_refs`. + +============================================================ +15. HTTP API (PUBLIC) +============================================================ +Base: http://{service.http_bind} + +All /v2 endpoints except GET /health require context headers: +- X-ELF-Tenant-Id (required) +- X-ELF-Project-Id (required) +- X-ELF-Agent-Id (required) + +Request correlation: +- `X-ELF-Request-Id` is optional on public endpoints. +- If omitted, elf-api generates a new UUID. +- Response includes `X-ELF-Request-Id` header and `request_id` in JSON responses. + +Search creation and graph query endpoints also require: +- X-ELF-Read-Profile (required): private_only|private_plus_project|all_scopes + +Header rules: +- Headers must be valid UTF-8 strings. +- Headers must be non-empty and at most 128 characters. +- Headers must pass the English identifier gate (no non-Latin scripts, no zero-width/control characters). + +Authentication: +- security.auth_mode = "off": no auth header is required. +- security.auth_mode = "static_keys": requests must include `Authorization: Bearer <token>`, matched against `security.auth_keys`. + +POST /v2/notes/ingest + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "scope": "agent_private|project_shared|org_shared", + "notes": [ + { + "type": "preference|constraint|decision|profile|fact|plan", + "key": "string|null", + "text": "English-only sentence", + "importance": 0.0, + "confidence": 0.0, + "ttl_days": 180, + "write_policy": "optional", + "structured": { + "summary": "string|null", + "facts": "string[]|null", + "concepts": "string[]|null", + "entities": [ + { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + } + ]|null, + "relations": [ + { + "subject": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }, + "predicate": "string", + "object": { + "entity": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }|null, + "value": "string|null" + }, + "valid_from": "ISO8601 datetime|null", + "valid_to": "ISO8601 datetime|null" + } + ]|null + }|null, + "source_ref": { ... } + } + ] +} + +Notes: +- Exactly one of object.entity and object.value must be non-null. + +Response: +{ + "results": [ + { + "note_id": "uuid|null", + "op": "ADD|UPDATE|NONE|DELETE|REJECTED", + "policy_decision": "remember|update|ignore|reject", + "reason_code": "optional", + "field_path": "optional" + } + ] +} + +Notes: +- This endpoint is deterministic and must not call any LLM. + +POST /v2/events/ingest + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "scope": "optional-scope", + "dry_run": false, + "ingestion_profile": { + "id": "default", + "version": 1 + }, + "messages": [ + { + "role": "user|assistant|tool", + "content": "English-only", + "ts": "optional", + "msg_id": "optional", + "write_policy": "optional" + } + ] +} + +Response: +{ + "ingestion_profile": { + "id": "string", + "version": 1 + }, + "extracted": { ...extractor output... }, + "results": [ + { + "note_id": "uuid|null", + "op": "ADD|UPDATE|NONE|DELETE|REJECTED", + "policy_decision": "remember|update|ignore|reject", + "reason_code": "optional", + "reason": "optional", + "field_path": "optional", + "write_policy_audits": [ + { + "exclusions": [{ "start": 0, "end": 4 }], + "redactions": [{ "span": { "start": 0, "end": 4 }, "replacement": "***" }] + } + ] + } + ] +} + +Notes: +- reason_code values include writegate rejection codes, REJECT_EVIDENCE_MISMATCH, and REJECT_WRITE_POLICY_MISMATCH. +- `ingestion_profile.id` is required when profile override is provided, and when `version` is omitted, latest version for that id is used. +- If `ingestion_profile` is omitted, the tenant/project default profile is used. + +GET /v2/admin/events/ingestion-profiles + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Response: +{ + "profiles": [ + { + "profile_id": "string", + "version": 1, + "created_at": "...", + "created_by": "agent_id" + } + ] +} + +POST /v2/admin/events/ingestion-profiles + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "profile_id": "string", + "version": 1, + "profile": {}, + "created_by": "agent_id" +} + +Response: +{ + "profile_id": "string", + "version": 1, + "profile": { ... }, + "created_at": "...", + "created_by": "agent_id" +} + +GET /v2/admin/events/ingestion-profiles/{profile_id}?version=1 + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Query: +- version (optional) + +Response: +{ + "profile_id": "string", + "version": 1, + "profile": { ... }, + "created_at": "...", + "created_by": "agent_id" +} + +GET /v2/admin/events/ingestion-profiles/{profile_id}/versions + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Response: +{ + "profiles": [ + { + "profile_id": "string", + "version": 1, + "created_at": "...", + "created_by": "agent_id" + } + ] +} + +GET /v2/admin/events/ingestion-profiles/default + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Response: +{ + "profile_id": "string", + "version": 1, + "updated_at": "..." +} + +PUT /v2/admin/events/ingestion-profiles/default + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "profile_id": "string", + "version": 1 +} + +Response: +{ + "profile_id": "string", + "version": 1, + "updated_at": "..." +} + +POST /v2/graph/query + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id +- X-ELF-Read-Profile + +Body: +{ + "subject": { "entity_id": "uuid" } | { "surface": "string" }, + "predicate": { "predicate_id": "uuid" } | { "surface": "string" } | null, + "scopes": ["agent_private|project_shared|org_shared"] | null, + "as_of": "RFC3339 datetime|null", + "limit": 50, + "explain": false +} + +Response: +{ + "as_of": "...", + "subject": { + "entity_id": "uuid", + "canonical": "string", + "kind": "string|null" + }, + "predicate": { + "predicate_id": "uuid", + "canonical": "string" + } | null, + "scopes": ["agent_private|project_shared|org_shared"], + "truncated": false, + "facts": [ + { + "fact_id": "uuid", + "scope": "agent_private|project_shared|org_shared", + "actor": "agent_id", + "predicate": "string", + "predicate_id": "uuid|null", + "valid_from": "...", + "valid_to": "...|null", + "temporal_status": "current|historical|future", + "object": { + "entity": { + "entity_id": "uuid", + "canonical": "string", + "kind": "string|null" + } | null, + "value": "string|null" + }, + "evidence_note_ids": ["uuid"] + } + ], + "explain": { + "schema": "elf.graph_query/v1", + "as_of": "...", + "requested_limit": 50, + "allowed_scopes": ["..."], + "effective_scopes": ["..."], + "queried_rows": 51, + "returned_rows": 50, + "truncated": true + } | null +} + +Notes: +- `subject` is required and accepts exactly one lookup shape: `entity_id` or `surface`. +- `predicate` is optional; when omitted, matching facts across predicates are eligible. +- `X-ELF-Read-Profile` is required and gates readable scopes via `[scopes.read_profiles]`. +- `scopes` is optional. If omitted, the endpoint uses all scopes allowed by `read_profile`. If provided, each scope must be allowed by `read_profile`. +- Shared scopes still apply grant checks; unreadable shared facts are not returned. +- `limit` defaults to 50 and must be in the range 1..200. +- `truncated = true` means additional facts matched but were clipped by `limit`. +- `evidence_note_ids` is ordered by evidence creation time and capped to 16 IDs per fact. +- `explain` defaults to false; when true, response includes `explain.schema = "elf.graph_query/v1"`. + +GET /v2/core-blocks + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id +- X-ELF-Read-Profile + +Response: +{ + "schema": "elf.core_memory_blocks/v1", + "tenant_id": "string", + "project_id": "string", + "agent_id": "string", + "read_profile": "private_only|private_plus_project|all_scopes", + "items": [ + { + "block_id": "uuid", + "attachment_id": "uuid", + "tenant_id": "string", + "project_id": "string", + "agent_id": "block-owner-agent", + "scope": "agent_private|project_shared|org_shared", + "key": "string", + "title": "string", + "content": "small English operating context", + "source_ref": { ... }, + "status": "active", + "updated_at": "...", + "attached_at": "...", + "attached_by_agent_id": "string", + "audit_history": [ ... ] + } + ] +} + +Notes: +- This endpoint is not archival search. It does not embed, rerank, search Qdrant, + create a search session, or record note hits. +- A block is returned only when it has an active attachment for the exact + tenant/project/agent/read_profile and the block is readable under that read_profile's + scopes and shared grants. + +POST /v2/searches + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id +- X-ELF-Read-Profile + +Body: +{ + "mode": "quick_find", + "query": "English-only", + "top_k": 12, + "candidate_k": 60, + "payload_level": "l0", + "filter": { + "schema": "search_filter_expr/v1", + "expr": { + "op": "and", + "args": [ + { "op": "eq", "field": "scope", "value": "project_shared" }, + { "op": "gte", "field": "importance", "value": 0.5 } + ] + } + } +} + +Response: +{ + "mode": "quick_find", + "trace_id": "uuid", + "search_id": "uuid", + "expires_at": "...", + "trajectory_summary": { + "schema": "search_retrieval_trajectory/v1", + "stages": [ ... ] + } | null, + "items": [ + { + "note_id": "uuid", + "type": "...", + "key": null, + "scope": "...", + "importance": 0.0, + "confidence": 0.0, + "updated_at": "...", + "expires_at": "...|null", + "final_score": 0.0, + "summary": "..." + } + ] +} + +Notes: +- This endpoint creates a search session and returns a compact note index view. +- `trajectory_summary` is optional and includes staged retrieval trajectory metadata via `search_retrieval_trajectory/v1`, with `stages` only containing summary-level stats per stage (e.g., counts/timing); it intentionally excludes full stage internals. +- `mode` is required and controls how much planning/latency tradeoff the query uses: `quick_find` for lower-latency paths, `planned_search` for planning-focused retrieval. +- `query_plan` is included only when `mode` is `planned_search`. +- record_hits is always false for this endpoint. +- `payload_level` is optional and defaults to `l0`. +- This endpoint does not return full note text; use `/v2/searches/{search_id}/notes` for progressive note hydration. + +GET /v2/searches/{search_id}?top_k=12&touch=true + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Query parameters: +- top_k (optional): Override the number of items returned. +- touch (optional, default true): When true, extend the search session TTL. +- payload_level (optional, default l0): Accepted for request parity with note-detail shaping. + +Response: Same as POST /v2/searches. + +GET /v2/searches/{search_id}/timeline?group_by=day + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Query parameters: +- group_by (optional, default day): day|none +- payload_level (optional, default l0): if `group_by` is omitted, this endpoint defaults to `none` for l0 and `day` for other levels. + +Response: +{ + "search_id": "uuid", + "expires_at": "...", + "groups": [ + { "date": "YYYY-MM-DD|all", "items": [ ... ] } + ] +} + +Notes: +- This endpoint touches the search session and extends its TTL. + +POST /v2/searches/{search_id}/notes + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "note_ids": ["uuid"], + "payload_level": "l0", + "record_hits": true +} + +Response: +{ + "search_id": "uuid", + "expires_at": "...", + "results": [ + { + "note_id": "uuid", + "note": { ...full note... }, + "error": null + } + ] +} + +Notes: +- record_hits defaults to true when omitted. +- This endpoint touches the search session and extends its TTL. + +Payload-level semantics for search note details: + +| payload_level | `searches/{search_id}/notes`.text | `searches/{search_id}/notes`.structured | `searches/{search_id}/notes`.source_ref | `/admin/searches/raw`.source_ref | +| --- | --- | --- | --- | --- | +| l0 | compact summary (bounded by `max_note_chars`) | `null` | `{}` | `{}` | +| l1 | compact summary (structured summary if available, else compact text) | object | `{}` | `{}` | +| l2 | full text | object | full object | full object | + +Notes: +- Omitted `payload_level` defaults to `l0` on both `/v2/searches/{search_id}/notes` and `/v2/admin/searches/raw`. + +GET /v2/notes?scope=project_shared&status=active&type=fact + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Notes: +- If scope is omitted, agent_private notes are excluded. +- If scope is agent_private, the calling agent_id is required and enforced. + +GET /v2/notes/{note_id} + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +PATCH /v2/notes/{note_id} + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "text": "optional", + "importance": 0.0, + "confidence": 0.0, + "ttl_days": 180 +} + +Response: +{ + "note_id": "uuid", + "op": "ADD|UPDATE|NONE|DELETE|REJECTED", + "reason_code": "optional" +} + +DELETE /v2/notes/{note_id} + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Response: +{ + "note_id": "uuid", + "op": "ADD|UPDATE|NONE|DELETE|REJECTED" +} + +Notes: +- Shared scopes (`project_shared`, `org_shared`) are not implicitly readable by other agents. +- Access to a shared note requires an explicit `memory_space_grants` entry for the requesting agent/project. +- `team_shared` is the public API alias for internal `project_shared`. + +POST /v2/notes/{note_id}/publish + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "space": "team_shared|org_shared" +} + +Response: +{ + "note_id": "uuid", + "space": "team_shared|org_shared" +} + +Behavior: +- Publishing a private note to `team_shared` changes visibility to shared scope and creates a project-wide grant so all agents in the same project can read the note when requested explicitly from shared scope. + +POST /v2/notes/{note_id}/unpublish + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Body: +{ + "space": "team_shared|org_shared" +} + +Response: +{ + "note_id": "uuid", + "space": "agent_private" +} + +GET /v2/spaces/{space}/grants + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Path: +- space: team_shared|org_shared + +Response: +{ + "grants": [ + { + "space": "team_shared|org_shared", + "grantee_kind": "project|agent", + "grantee_agent_id": null, + "granted_by_agent_id": "agent_id", + "granted_at": "..." + } + ] +} + +POST /v2/spaces/{space}/grants + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Path: +- space: team_shared|org_shared + +Body: +{ + "grantee_kind": "project|agent", + "grantee_agent_id": "optional-agent-id" +} + +Response: +{ + "space": "team_shared|org_shared", + "grantee_kind": "project|agent", + "grantee_agent_id": null, + "granted": true +} + +POST /v2/spaces/{space}/grants/revoke + +Headers: +- X-ELF-Tenant-Id, X-ELF-Project-Id, X-ELF-Agent-Id + +Path: +- space: team_shared|org_shared + +Body: +{ + "grantee_kind": "project|agent", + "grantee_agent_id": "optional-agent-id" +} + +Response: +{ + "revoked": true +} + +GET /health + +Error body: +{ + "error_code": "NON_ENGLISH_INPUT|SCOPE_DENIED|INVALID_REQUEST|INTERNAL_ERROR", + "message": "Human readable string.", + "fields": ["$.headers.X-ELF-Tenant-Id", "$.notes[0].text"] +} + +============================================================ +16. LLM QUERY EXPANSION PROMPT (search) - APPENDIX +============================================================ +LLM output must be JSON only and match the schema below. + +Schema: +{ + "queries": ["string", "..."] +} + +Hard rules: +- queries.length <= MAX_QUERIES +- Each query must be English only and must not contain any non-English text. +- Each query must be a single sentence. +- Include the original query unless INCLUDE_ORIGINAL is false. + +System prompt (Expansion): +"You are a query expansion engine for a memory retrieval system. +Output must be valid JSON only and must match the provided schema exactly. +Generate short English-only query variations that preserve the original intent. +Do not include any non-English text. Do not add explanations or extra fields." + +User prompt template: +"Return JSON matching this exact schema: +<SCHEMA_JSON> +Constraints: +- MAX_QUERIES = <MAX_QUERIES> +- INCLUDE_ORIGINAL = <INCLUDE_ORIGINAL> +Original query: +<QUERY>" + +============================================================ +17. MCP ADAPTER (SEPARATE PROCESS) +============================================================ +- Separate binary: elf-mcp. +- Streamable HTTP MCP server that forwards tool calls to the public HTTP API. +- elf-mcp reads the optional [mcp] config section and attaches these headers on every request: + - X-ELF-Tenant-Id + - X-ELF-Project-Id + - X-ELF-Agent-Id + - X-ELF-Read-Profile (defaults to mcp.read_profile; may be overridden per tool call) +- Tools map 1:1 to v2 endpoints: + - elf_notes_ingest -> POST /v2/notes/ingest + - elf_events_ingest -> POST /v2/events/ingest + - elf_core_blocks_get -> GET /v2/core-blocks + - elf_graph_query -> POST /v2/graph/query + - elf_searches_create -> POST /v2/searches + - elf_searches_get -> GET /v2/searches/{search_id} + - elf_searches_timeline -> GET /v2/searches/{search_id}/timeline + - elf_searches_notes -> POST /v2/searches/{search_id}/notes + - elf_docs_put -> POST /v2/docs + - elf_docs_get -> GET /v2/docs/{doc_id} + - elf_docs_search_l0 -> POST /v2/docs/search/l0 + - elf_docs_excerpts_get -> POST /v2/docs/excerpts + - elf_notes_list -> GET /v2/notes + - elf_notes_get -> GET /v2/notes/{note_id} + - elf_notes_patch -> PATCH /v2/notes/{note_id} + - elf_notes_delete -> DELETE /v2/notes/{note_id} + - elf_notes_publish -> POST /v2/notes/{note_id}/publish + - elf_notes_unpublish -> POST /v2/notes/{note_id}/unpublish + - elf_space_grants_list -> GET /v2/spaces/{space}/grants + - elf_space_grant_upsert -> POST /v2/spaces/{space}/grants + - elf_space_grant_revoke -> POST /v2/spaces/{space}/grants/revoke + - elf_admin_events_ingestion_profiles_list -> GET /v2/admin/events/ingestion-profiles + - elf_admin_events_ingestion_profiles_create -> POST /v2/admin/events/ingestion-profiles + - elf_admin_events_ingestion_profile_get -> GET /v2/admin/events/ingestion-profiles/{profile_id} + - elf_admin_events_ingestion_profile_versions_list -> GET /v2/admin/events/ingestion-profiles/{profile_id}/versions + - elf_admin_events_ingestion_profile_default_get -> GET /v2/admin/events/ingestion-profiles/default + - elf_admin_events_ingestion_profile_default_set -> PUT /v2/admin/events/ingestion-profiles/default + - elf_admin_traces_recent_list -> GET /v2/admin/traces/recent + - elf_admin_trace_get -> GET /v2/admin/traces/{trace_id} + - elf_admin_trajectory_get -> GET /v2/admin/trajectories/{trace_id} + - elf_admin_trace_item_get -> GET /v2/admin/trace-items/{item_id} + - elf_admin_trace_bundle_get -> GET /v2/admin/traces/{trace_id}/bundle + - elf_admin_note_provenance_get -> GET /v2/admin/notes/{note_id}/provenance + - elf_admin_memory_history_get -> GET /v2/admin/notes/{note_id}/history +- The MCP server must contain zero business logic or policy. +- All policy remains in elf-api and elf-service. + +============================================================ +18. LLM EXTRACTOR PROMPT (add_event) - APPENDIX +============================================================ +LLM output must be JSON only and match the schema below. + +Schema: +{ + "notes": [ + { + "type": "preference|constraint|decision|profile|fact|plan", + "key": "string|null", + "text": "English-only sentence <= MAX_NOTE_CHARS", + "importance": 0.0, + "confidence": 0.0, + "ttl_days": number|null, + "structured": { + "summary": "string|null", + "facts": "string[]|null", + "concepts": "string[]|null", + "entities": [ + { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + } + ]|null, + "relations": [ + { + "subject": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }, + "predicate": "string", + "object": { + "entity": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }|null, + "value": "string|null" + }, + "valid_from": "ISO8601 datetime|null", + "valid_to": "ISO8601 datetime|null" + } + ]|null + }|null, + "scope_suggestion": "agent_private|project_shared|org_shared|null", + "evidence": [ + { "message_index": number, "quote": "string" } + ], + "reason": "string" + } + ] +} + +Notes: +- Exactly one of object.entity and object.value must be non-null. + +Hard rules: +- notes.length <= MAX_NOTES +- text must be English-only (must pass the English gate) +- each note must be one sentence +- evidence must be 1..2 quotes +- each evidence.quote must be a verbatim substring of messages[message_index].content +- when write_policy is provided on a source message, evidence checks run after policy transforms +- do not store secrets or PII + +System prompt (Extractor): +"You are a memory extraction engine for an agent memory system. +Output must be valid JSON only and must match the provided schema exactly. +Extract at most MAX_NOTES high-signal, cross-session reusable memory notes from the given messages. +Each note must be one English sentence and must not contain any non-English text. +Preserve numbers, dates, percentages, currency amounts, tickers, URLs, and code snippets exactly. +Never store secrets or PII: API keys, tokens, private keys, seed phrases, passwords, bank IDs, personal addresses. +For every note, provide 1 to 2 evidence quotes copied verbatim from the input messages and include the message_index. +If you cannot provide verbatim evidence, omit the note. +If content is ephemeral or not useful long-term, return an empty notes array." + +User prompt template: +"Return JSON matching this exact schema: +<SCHEMA_JSON> +Constraints: +- MAX_NOTES = <MAX_NOTES> +- MAX_NOTE_CHARS = <MAX_NOTE_CHARS> +Here are the messages as JSON: +<MESSAGES_JSON>" + +============================================================ +19. TESTS AND ACCEPTANCE CRITERIA +============================================================ +A. add_note does not call LLM: +- Instrument LLM client call count. It must remain 0 during add_note tests. +B. English-only boundary: +- Any input that fails the English gate (Section 3) in add_note, add_event, or search + returns HTTP 422 with a JSONPath-like field path. +C. Evidence binding: +- If extractor evidence.quote is not a substring -> REJECTED with REJECT_EVIDENCE_MISMATCH. +- If mismatch is introduced when requested message write_policy transforms content -> REJECTED with REJECT_WRITE_POLICY_MISMATCH. +D. Rebuild: +- Drop Qdrant collection, recreate, call /admin/rebuild_qdrant. +- Must succeed without calling embedding API. +E. Source of truth vectors: +- For every active chunk, note_chunk_embeddings row exists and vec dim matches config. +- note_embeddings exists for active notes as derived pooled vectors. +F. Idempotency: +- add_note same payload twice -> second op = NONE. +G. Outbox eventual consistency: +- Simulate embedding provider outage. +- Outbox goes FAILED and later retries to DONE after provider recovers. + +============================================================ +20. OUT OF SCOPE (v2.0) +============================================================ +- Translation or multilingual retrieval (handled by upstream agents). +- Graph memory backend (reserved for later). +- Public internet exposure and auth (localhost only in v2.0). diff --git a/docs/spec/system_graph_memory_postgres_v1.md b/docs/spec/system_graph_memory_postgres_v1.md new file mode 100644 index 00000000..92012ae0 --- /dev/null +++ b/docs/spec/system_graph_memory_postgres_v1.md @@ -0,0 +1,238 @@ +# Graph Memory Postgres v1.0 Specification + +Purpose: Define the canonical entity/fact temporal memory schema and invariants for PostgreSQL-backed graph memory. +Status: normative +Read this when: You are implementing, migrating, or validating ELF graph-memory persistence behavior. +Not this document: Graph query runbooks, external comparisons, or service rollout procedures. +Defines: Graph Memory Postgres v1.0 tables, keys, and temporal invariants. + +Description: Canonical entity/fact temporal memory schema and invariants for PostgreSQL-backed graph memory. +Language: English only. + +Purpose: +- Persist entities, aliases, temporal facts, and evidence links for ELF graph memory. +- Keep one active fact per `(tenant, project, scope, subject, predicate, value-or-entity)` combination. + +Core tables: +- `graph_entities` +- `graph_entity_aliases` +- `graph_predicates` +- `graph_predicate_aliases` +- `graph_facts` +- `graph_fact_evidence` +- `graph_fact_supersessions` + +============================================================ +1. ENTITIES +============================================================ + +`graph_entities` columns: +- `entity_id uuid PRIMARY KEY` +- `tenant_id text NOT NULL` +- `project_id text NOT NULL` +- `canonical text NOT NULL` +- `canonical_norm text NOT NULL` +- `kind text NULL` +- `created_at timestamptz NOT NULL DEFAULT now()` +- `updated_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (tenant_id, project_id, canonical_norm)` + +Constraint and behavior: +- Canonical values are normalized by application helper before insert/upsert. +- Normalized canonical names allow idempotent upsert behavior across whitespace/case differences. + +`graph_entity_aliases` columns: +- `alias_id uuid PRIMARY KEY` +- `entity_id uuid NOT NULL REFERENCES graph_entities(entity_id) ON DELETE CASCADE` +- `alias text NOT NULL` +- `alias_norm text NOT NULL` +- `created_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (entity_id, alias_norm)` +- `INDEX (alias_norm)` + +============================================================ +2. PREDICATES +============================================================ + +Predicates are modeled as a controlled vocabulary with a self-growing registry. + +The system stores two values per fact: +- `predicate` (surface string as provided by ingestion) +- `predicate_id` (canonical predicate identity; stable across aliases) + +`graph_predicates` columns: +- `predicate_id uuid PRIMARY KEY` +- `scope_key text NOT NULL` +- `tenant_id text NULL` +- `project_id text NULL` +- `canonical text NOT NULL` +- `canonical_norm text NOT NULL` +- `cardinality text NOT NULL` (`single` or `multi`) +- `status text NOT NULL` (`pending`, `active`, `deprecated`) +- `created_at timestamptz NOT NULL DEFAULT now()` +- `updated_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (scope_key, canonical_norm)` +- `INDEX (tenant_id, project_id, status)` + +`graph_predicate_aliases` columns: +- `alias_id uuid PRIMARY KEY` +- `predicate_id uuid NOT NULL REFERENCES graph_predicates(predicate_id) ON DELETE CASCADE` +- `scope_key text NOT NULL` +- `alias text NOT NULL` +- `alias_norm text NOT NULL` +- `created_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (scope_key, alias_norm)` +- `INDEX (predicate_id)` +- `INDEX (alias_norm)` + +Scope resolution: +- Predicates are resolved by `alias_norm` within `scope_key`, with precedence: + - `${tenant_id}:${project_id}` + - `__project__:${project_id}` + - `__global__` + +Registration behavior: +- If an incoming predicate alias does not resolve, it is registered in the tenant+project scope as: + - `status = pending` + - `cardinality = multi` (safe default) +- This avoids unsafe auto-supersession until an operator activates/configures the predicate. + +============================================================ +3. FACTS +============================================================ + +`graph_facts` columns: +- `fact_id uuid PRIMARY KEY` +- `tenant_id text NOT NULL` +- `project_id text NOT NULL` +- `agent_id text NOT NULL` +- `scope text NOT NULL` +- `subject_entity_id uuid NOT NULL REFERENCES graph_entities(entity_id)` +- `predicate text NOT NULL` +- `predicate_id uuid NULL REFERENCES graph_predicates(predicate_id)` +- `object_entity_id uuid NULL REFERENCES graph_entities(entity_id)` +- `object_value text NULL` +- `valid_from timestamptz NOT NULL` +- `valid_to timestamptz NULL` +- `created_at timestamptz NOT NULL DEFAULT now()` +- `updated_at timestamptz NOT NULL DEFAULT now()` + +Checks: +- Exactly one object reference per fact: + - `(object_entity_id IS NULL AND object_value IS NOT NULL)` OR + `(object_entity_id IS NOT NULL AND object_value IS NULL)` +- `valid_to IS NULL OR valid_to > valid_from` + +Indexes: +- `(tenant_id, project_id, subject_entity_id, predicate_id)` +- `(tenant_id, project_id, valid_to)` +- `(tenant_id, project_id, object_entity_id) WHERE object_entity_id IS NOT NULL` +- `UNIQUE (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_entity_id) + WHERE valid_to IS NULL AND object_entity_id IS NOT NULL` +- `UNIQUE (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_value) + WHERE valid_to IS NULL AND object_value IS NOT NULL` + +============================================================ +4. EVIDENCE +============================================================ + +`graph_fact_evidence` columns: +- `evidence_id uuid PRIMARY KEY` +- `fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE` +- `note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE` +- `created_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (fact_id, note_id)` +- `(note_id)` +- `(fact_id)` + +============================================================ +5. SUPERSESSION +============================================================ + +Supersession records provenance for fact invalidation and supports knowledge correction. + +`graph_fact_supersessions` columns: +- `supersession_id uuid PRIMARY KEY` +- `tenant_id text NOT NULL` +- `project_id text NOT NULL` +- `from_fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE` +- `to_fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE` +- `note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE` +- `effective_at timestamptz NOT NULL` +- `created_at timestamptz NOT NULL DEFAULT now()` + +Indexes: +- `UNIQUE (from_fact_id, to_fact_id, note_id)` +- `INDEX (from_fact_id)` +- `INDEX (to_fact_id)` +- `INDEX (note_id)` + +Supersession rule (write-time): +- If a predicate is configured as `status = active` and `cardinality = single`, and a new fact is + inserted with `valid_to IS NULL` and `valid_from <= now`, then any other open-ended facts for the + same `(tenant, project, scope, subject_entity_id, predicate_id)` are invalidated by setting + `valid_to = new.valid_from`, and a row is inserted into `graph_fact_supersessions` linking the + old fact to the new fact with provenance (`note_id`). + +============================================================ +6. INVARIANTS +============================================================ +- `graph_entities.canonical_norm` must be deterministic using: + - trim + - whitespace collapse to one space + - lowercase +- An active fact is defined by: `valid_from <= now AND (valid_to IS NULL OR valid_to > now)`. +- Active duplicate prevention is enforced by partial unique indexes. +- When ingestion reintroduces a note equivalent to an existing active fact, the system reuses the existing fact row and appends additional evidence rows for the new note instead of creating another active duplicate fact row. +- Graph read APIs should expose relation temporal state derived from the validity window: + - `current` when `valid_from <= read_at AND (valid_to IS NULL OR valid_to > read_at)`. + - `historical` when `valid_to <= read_at`. + - `future` when `valid_from > read_at`. +- Search relation context may include historical facts when they are evidence-linked to a returned note, but it must label them as historical instead of silently treating them as current. + +============================================================ +7. CALL EXAMPLES +============================================================ + +``` +canonical = normalize_entity_name(" Alice Example ") +=> "alice example" + +upsert_entity("tenant-a", "project-b", canonical, Some("person")) -> entity_id +upsert_entity_alias(entity_id, "A. Example") + +predicate = resolve_or_register_predicate("tenant-a", "project-b", "connected_to") -> predicate_id + +insert_fact_with_evidence( + "tenant-a", + "project-b", + "agent-c", + "project_shared", + subject_entity_id, + "connected_to", + predicate_id, + Some(object_entity_id), + None, + now, + None, + &[note_id_1, note_id_2], +) + +fetch_active_facts_for_subject( + "tenant-a", + "project-b", + "project_shared", + subject_entity_id, + now, +) +``` diff --git a/docs/spec/system_knowledge_pages_v1.md b/docs/spec/system_knowledge_pages_v1.md new file mode 100644 index 00000000..a30336f9 --- /dev/null +++ b/docs/spec/system_knowledge_pages_v1.md @@ -0,0 +1,167 @@ +# Derived Knowledge Pages v1 Specification + +Purpose: Define derived knowledge page storage, rebuild, citation, and lint contracts. +Status: normative +Read this when: You are implementing, validating, or reviewing project/entity/concept/issue/decision page rebuild behavior. +Not this document: Viewer integration, search ranking, live LLM page generation, or source-note mutation. +Defines: `elf.knowledge_page/v1` pages, sections, source refs, lint findings, and deterministic rebuild metadata. + +## Core Rule + +Knowledge pages are derived artifacts. They must never replace or mutate authoritative +notes, docs, event audits, graph facts, consolidation proposals, traces, or source +pointers. + +Postgres remains the storage authority for both source memory and derived page records. +Knowledge pages are rebuildable from explicit source references and may be deleted or +rebuilt without changing source memory. + +## Storage + +The v1 storage tables are: + +- `knowledge_pages` +- `knowledge_page_sections` +- `knowledge_page_source_refs` +- `knowledge_page_lint_findings` + +`knowledge_pages.contract_schema` must be `elf.knowledge_page/v1`. + +Allowed `knowledge_pages.page_kind` values: + +- `project` +- `entity` +- `concept` +- `issue` +- `decision` + +Allowed `knowledge_page_source_refs.source_kind` values: + +- `note` +- `event` +- `relation` +- `proposal` + +`event` currently means a durable `add_event` audit row in `memory_ingest_decisions`. + +## Citation Contract + +Every persisted page section must have at least one citation or an explicit +`unsupported_reason`. + +Each citation must be persisted twice: + +- in `knowledge_page_sections.citations` for section-local readback +- in `knowledge_page_source_refs` for normalized lint and stale-source detection + +The normalized source ref must preserve: + +- `source_kind` +- `source_id` +- source status when available +- source `updated_at` or equivalent freshness timestamp when available +- source content hash when available +- source snapshot metadata + +## Rebuild Contract + +The v1 rebuild path is deterministic for the same explicit source snapshot. + +Rebuild input sources may include: + +- active or historical `memory_notes` +- durable `add_event` audit rows from `memory_ingest_decisions` +- `graph_facts` plus `graph_fact_evidence` +- applied `consolidation_proposals` + +Unreviewed consolidation proposals must not be used as source input for persisted pages. + +`knowledge_pages.source_coverage` must include: + +- `schema = "elf.knowledge_page.source_coverage/v1"` +- page kind and page key +- per-kind source counts +- total source count +- cited source count +- section count +- unsupported section count +- `coverage_complete` + +`knowledge_pages.rebuild_metadata` must include: + +- `schema = "elf.knowledge_page.rebuild/v1"` +- `source_snapshot_hash` +- `deterministic` +- `provider_metadata` +- `allowed_variance` + +When future provider-backed or LLM-derived page text is persisted, +`rebuild_metadata.deterministic` must be false unless the provider output is fully +replayable from recorded metadata. + +## Lint Contract + +The v1 lint path compares stored normalized source refs with current source rows. + +At minimum, lint must detect: + +- missing source rows +- changed source status +- changed source freshness timestamp +- changed source content hash +- persisted sections with no citations and no explicit unsupported reason +- persisted sections with an explicit unsupported reason +- sections whose citations have no normalized source backlinks +- page-level low source coverage where `coverage_complete` is false or the cited + source count differs from the total source count + +Stale or missing source references must be stored in `knowledge_page_lint_findings` +with `finding_type = "stale_source_ref"` and enough `details` to show stored versus +current source state. + +Unsupported sections must be stored with `finding_type = "unsupported_claim"`. +Missing citations must use `finding_type = "missing_citation"`. +Missing normalized source backlinks must use `finding_type = "missing_source_ref"`. +Incomplete page coverage must use `finding_type = "low_source_coverage"`. +Every lint finding response must include repair or rebuild guidance. Guidance is +advisory and must not mutate source memory. + +Lint findings are derived diagnostics. They must not mutate authoritative source +memory. + +## Search and Viewer Readback + +Knowledge page search is a derived-artifact readback surface, not the authoritative +note search surface. Page snippets may be shown beside search sessions only when they +are labeled as derived knowledge page snippets and include visible citation and source +coverage metadata. + +Page search results must include: + +- result type discriminator `knowledge_page_section` +- page id, page kind, page key, title, status, section id, section key, heading, role +- bounded section snippet +- section citations and normalized source backlinks +- page source coverage metadata +- lint summary and trust state that distinguishes clean, warning, error, and low + coverage results +- a derived-result notice that source notes, event audits, relation facts, and applied + proposals remain authoritative +- repair or rebuild guidance when lint or source coverage indicates stale, + unsupported, missing, or weakly covered content + +Knowledge page snippets must not be inserted into note search results as if they were +authoritative memory notes. + +## Admin API + +Minimal admin readback endpoints: + +- `POST /v2/admin/knowledge/pages/rebuild` +- `GET /v2/admin/knowledge/pages` +- `POST /v2/admin/knowledge/pages/search` +- `GET /v2/admin/knowledge/pages/{page_id}` +- `POST /v2/admin/knowledge/pages/{page_id}/lint` + +These endpoints are local admin/operator surfaces. They must not call LLM, embedding, +rerank, or external provider adapters in v1. diff --git a/docs/spec/system_memory_summary_v1.md b/docs/spec/system_memory_summary_v1.md new file mode 100644 index 00000000..0db2fe57 --- /dev/null +++ b/docs/spec/system_memory_summary_v1.md @@ -0,0 +1,171 @@ +# Reviewable Memory Summary v1 Specification + +Purpose: Define the reviewable memory summary and source-trace contract. +Status: normative +Read this when: You are implementing, validating, or reviewing summary readback for top-of-mind, background, stale, superseded, tombstoned, or derived project-profile memory. +Not this document: Scheduled background jobs, polished viewer UI, live provider generation, or authoritative note mutation. +Defines: `elf.memory_summary/v1` summary artifacts, entries, source traces, freshness markers, and inclusion rationale. + +## Core Rule + +Memory summaries are derived readback artifacts. They must never replace, rewrite, +delete, deprecate, or silently update authoritative notes, docs, event audits, graph +facts, consolidation proposals, traces, or source pointers. + +Postgres remains the source of truth for source memory. A summary may be rebuilt, +discarded, archived, or regenerated without changing the source memory that produced +it. A summary is useful only when an operator can inspect why each entry is current, +background, stale, superseded, tombstoned, or excluded. + +## Contract Schema + +Canonical schema identifier: + +```text +elf.memory_summary/v1 +``` + +Every persisted or benchmarked summary artifact must carry +`contract_schema = "elf.memory_summary/v1"`. + +## Summary Artifact + +Required fields: + +- `summary_id`: stable summary artifact id. +- `contract_schema`: `elf.memory_summary/v1`. +- `generated_at`: RFC3339 timestamp for the readback artifact. +- `tenant_id`, `project_id`, `agent_id`, and `read_profile`: context used to build the + readback. +- `entries`: non-empty array of summary entries. +- `source_trace`: source selection and exclusion metadata. + +The artifact may include provider metadata in future lanes, but v1 summary readback +does not require provider execution and must not hide source selection behind provider +state. + +## Entry Categories + +`entries[].category` must be one of: + +- `top_of_mind`: current high-priority memory that may be attached or shown first. +- `background`: current lower-priority memory that is useful context but not urgent. +- `stale`: non-current memory retained only to explain why it is stale. +- `superseded`: historical memory replaced by newer source evidence. +- `tombstone`: delete, TTL, invalidation, or suppression evidence. +- `derived_project_profile`: derived profile or project-summary entry. + +`top_of_mind` entries must have `freshness.status = "current"`. A stale, +superseded, tombstoned, historical, unsupported, or unknown entry must not be surfaced +as top-of-mind. + +## Entry Contract + +Each summary entry must include: + +- `entry_id`: stable id within the summary. +- `category`: one of the categories above. +- `text`: bounded English summary text. +- `source_refs`: source evidence ids or source-ref handles used for the entry. +- `freshness`: validity metadata. +- `rationale`: inclusion, downgrade, or exclusion rationale. +- `unsupported_claim_flags`: reviewer prompts for claims that are not supported well + enough to include as current derived memory. + +`source_refs` must be non-empty for every included or downgraded entry. A +`derived_project_profile` entry may have empty `source_refs` only when +`rationale.decision = "excluded"` and `unsupported_claim_flags` is non-empty. That +shape records a refused derived claim, not a usable memory entry. + +## Freshness + +`freshness` must include: + +- `status`: one of `current`, `background`, `historical`, `stale`, `superseded`, + `tombstoned`, or `unsupported`. +- `observed_at`: RFC3339 timestamp when the source was observed, or `null` when the + source is intentionally untimed. +- `valid_from`: RFC3339 timestamp or `null`. +- `valid_to`: RFC3339 timestamp or `null`. +- `last_confirmed_at`: RFC3339 timestamp or `null`. +- `superseded_by`: array of entry ids or source ids that supersede this entry. +- `tombstone_refs`: array of source ids or source-ref handles proving deletion, TTL + expiry, invalidation, or suppression. + +For `category = "superseded"`, `freshness.superseded_by` must be non-empty. +For `category = "tombstone"`, `freshness.tombstone_refs` must be non-empty and +`freshness.status` must be `tombstoned`. + +## Rationale + +`rationale` must include: + +- `decision`: one of `included`, `downgraded`, or `excluded`. +- `reason_code`: stable code for why the entry appears in its category. +- `reason`: reviewer-facing explanation. + +Allowed reason-code families: + +- `TOP_OF_MIND_*` +- `BACKGROUND_*` +- `DOWNGRADED_STALE_*` +- `SUPERSEDED_*` +- `TOMBSTONE_*` +- `DERIVED_PROFILE_*` +- `EXCLUDED_UNSUPPORTED_*` + +The rationale must say why an entry is included, downgraded, or excluded. It is not +enough to say that an entry exists. + +## Source Trace + +`source_trace` must include: + +- `selected_source_refs`: sources used for included or downgraded entries. +- `dropped_source_refs`: candidates not used in the final summary. +- `stale_source_refs`: stale source candidates and their downgrade reason. +- `superseded_source_refs`: superseded sources and the source that superseded them. +- `tombstone_source_refs`: tombstone or TTL invalidation sources. +- `unsupported_claim_flags`: page-level or entry-level unsupported derived claims. + +Each source trace item should preserve source status, source `updated_at` or +equivalent freshness timestamp when available, and source snapshot metadata. Empty +trace arrays are allowed only when the category is absent from the summary. + +## Readback Rules + +Summary readback must: + +- Label the artifact as derived and reviewable. +- Return entries with source refs, freshness metadata, and rationale. +- Preserve current-vs-historical truth: current facts may be top-of-mind, while old + facts must be stale, superseded, tombstoned, or excluded. +- Preserve tombstones and TTL invalidations as suppression evidence instead of + restating the deleted fact as current. +- Preserve unsupported derived candidates as reviewer prompts, not as current facts. + +Summary readback must not: + +- Present a stale, superseded, or tombstoned source as current top-of-mind memory. +- Treat a derived profile entry as authoritative source memory. +- Omit source refs from included or downgraded entries. +- Include a derived project-profile entry with neither source refs nor unsupported + claim flags. +- Claim parity with managed memory or Dreaming products from this local contract alone. + +## Benchmark Requirements + +The `memory_summary` real-world benchmark suite must fail when: + +- stale, superseded, or tombstoned entries appear as current top-of-mind facts; +- included or downgraded entries lack source refs; +- entries lack freshness or rationale metadata; +- derived project-profile entries lack both source refs and unsupported-claim flags; +- unsupported derived claims are silently included as current memory. + +Unsupported derived claims may appear only as reviewer prompts. A summary entry with +`unsupported_claim_flags` must not also be included as current memory. + +Fixture-backed evidence proves only the contract shape. Live top-of-mind behavior and +scheduled background generation require separate live reports before product-quality +claims are allowed. diff --git a/docs/spec/system_provenance_mapping_v1.md b/docs/spec/system_provenance_mapping_v1.md new file mode 100644 index 00000000..fdffaf11 --- /dev/null +++ b/docs/spec/system_provenance_mapping_v1.md @@ -0,0 +1,163 @@ +# System: Note Provenance Mapping (v1) + +Purpose: Define the provenance bundle contract used by admin operations and traceability workflows. +Status: normative +Read this when: You are implementing or validating note-provenance responses and admin traceability outputs. +Not this document: Operator debugging procedure or request-correlation runbooks. +Defines: `elf.note_provenance_bundle/v1`. + +Identifier: +- `elf.note_provenance_bundle/v1` +- `elf.memory_history/v1` + +Status: active. + +================================================== +Scope +================================================== + +- Defines the response contract for `/v2/admin/notes/{note_id}/provenance`. +- Defines the response contract for `/v2/admin/notes/{note_id}/history`. +- Captures the same note-level artifacts needed for auditability and debugging: + - source note state + - ingest decisions + - note version history + - indexing outbox state + - recent traces involving the note + - normalized memory history events +- Does not define any mutation semantics. + +================================================== +1) Endpoint contract +================================================== + +`GET /v2/admin/notes/{note_id}/provenance` + +This admin endpoint returns a single JSON object that **must** use: + +```json +{ + "schema": "elf.note_provenance_bundle/v1", + "note": { ... }, + "ingest_decisions": [...], + "note_versions": [...], + "indexing_outbox": [...], + "recent_traces": [...], + "history": [...] +} +``` + +Headers: +- `X-ELF-Request-Id` is accepted and echoed via response body `request_id` plus response header. +- Standard admin headers from section 14 apply. + +`note` fields are a copy of the requested note with: + +- core fields (`note_id`, `tenant_id`, `project_id`, `agent_id`, `scope`, `type`, `status`, ...), +- `source_ref` and `embedding_version`, +- `hit_count` / `last_hit_at`. + +`ingest_decisions` is joined from `memory_ingest_decisions` by: +- `note_id`, `tenant_id`, `project_id` +and ordered by `ts DESC`. + +`note_versions` is joined from `memory_note_versions` by: +- `note_id`, `tenant_id`, `project_id` +and ordered by `ts DESC`. + +`indexing_outbox` is joined from `indexing_outbox` by: +- `note_id`, `tenant_id`, `project_id` +and ordered by `updated_at DESC`. + +`recent_traces` is joined from: +- `search_traces` and `search_trace_items` +where the trace references the note id, ordered by `created_at DESC, trace_id DESC`. + +`history` is a normalized chronological projection joined from: +- `memory_note_versions` for add/update/delete/publish/unpublish and related transitions. +- `memory_ingest_decisions` for ignore/reject decisions and for decision-to-version links. +- `memory_notes.expires_at` for persisted expiry readback when the note has reached its + TTL timestamp and no explicit expiry version row exists. +- `consolidation_proposals` and `consolidation_proposal_reviews` for derived, + applied, and invalidated proposal outcomes that reference the note in + `source_refs`. + +================================================== +2) Response field shape +================================================== + +Core envelope: + +- `schema` (string, required): `elf.note_provenance_bundle/v1`. +- `note` (object, required): note snapshot for the requested `note_id`. +- `ingest_decisions` (array, required): ordered ingest audit entries. +- `note_versions` (array, required): ordered historical versions. +- `indexing_outbox` (array, required): active/retry indexing jobs for the note. +- `recent_traces` (array, required): bounded traces involving this note. +- `history` (array, required): bounded chronological memory events. + +No additional top-level keys are required by this contract. + +================================================== +3) History endpoint +================================================== + +`GET /v2/admin/notes/{note_id}/history` + +This admin endpoint returns: + +```json +{ + "schema": "elf.memory_history/v1", + "note_id": "uuid", + "events": [ + { + "event_id": "string", + "event_type": "add|update|ignore|reject|expire|delete|derived|applied|invalidated|related", + "subject_type": "note", + "note_id": "uuid", + "source_table": "string", + "source_id": "uuid|null", + "related_note_version_id": "uuid|null", + "related_decision_id": "uuid|null", + "related_proposal_id": "uuid|null", + "actor": "string|null", + "op": "string|null", + "reason_code": "string|null", + "summary": "string", + "details": {}, + "ts": "RFC3339 timestamp" + } + ] +} +``` + +History ordering is chronological by `ts ASC`, then `event_id ASC`. Events are +bounded by service limits. + +================================================== +4) MCP exposure +================================================== + +MCP tool: + +- `elf_admin_note_provenance_get` -> `GET /v2/admin/notes/{note_id}/provenance` +- `elf_admin_memory_history_get` -> `GET /v2/admin/notes/{note_id}/history` + +Request input: + +```json +{ + "note_id": "uuid" +} +``` + +================================================== +5) Operational guidance +================================================== + +- Keep `recent_traces` small (bounded by service defaults) to avoid large admin payloads. +- Use this endpoint for: + - explainability investigation, + - evidence lineage checks, + - outbox lag/metadata checks before manual remediation. diff --git a/docs/spec/system_search_filter_expr_v1.md b/docs/spec/system_search_filter_expr_v1.md new file mode 100644 index 00000000..55635e73 --- /dev/null +++ b/docs/spec/system_search_filter_expr_v1.md @@ -0,0 +1,172 @@ +# System: Search Filter Expression Contract v1 + +Purpose: Define the structured filter payload used by search endpoints via `search_filter_expr/v1`. +Status: normative +Read this when: You are implementing, validating, or parsing structured search filters. +Not this document: Ranking behavior, retrieval fusion policy, or search troubleshooting steps. +Defines: `search_filter_expr/v1`. + +Registry identifier: +- `search_filter_expr/v1`: Structured filter request envelope. + +Status: active. + +================================================== +Scope +================================================== + +- Defines valid `filter` JSON wrappers for search request payloads. +- Defines allowed comparison operators and fields. +- Defines validation and parsing limits. +- Does not define ranking or retrieval algorithm details. + +================================================== +1) Envelope +================================================== + +`filter` MUST be an object with this exact shape: + +```json +{ + "schema": "search_filter_expr/v1", + "expr": { + "op": "and|or|not|eq|neq|in|contains|gt|gte|lt|lte", + "args|expr|field|value": "..." + } +} +``` + +`schema` is required and must be exactly `search_filter_expr/v1`. +`expr` is required. + +================================================== +2) Expression model +================================================== + +Allowed operators: + +- logical + - `and`: logical AND of `args`. + - `or`: logical OR of `args`. + - `not`: logical NOT of `expr`. +- leaf comparisons + - `eq`: equality. + - `neq`: inequality. + - `contains`: substring contains. + - `in`: membership in an array. + - `gt`, `gte`, `lt`, `lte`: numeric/date comparisons. + +Node shapes: + +- Logical: + - `{ "op": "and", "args": [<node>, ...] }` + - `{ "op": "or", "args": [<node>, ...] }` + - `{ "op": "not", "expr": <node> }` +- Leaf: + - `{ "op": "eq|neq|contains|gt|gte|lt|lte", "field": <field>, "value": <value> }` + - `{ "op": "in", "field": <field>, "value": [<value>, ...] }` + +`field` is required for all leaf ops. +`args`/`expr` are required for logical ops. + +================================================== +3) Field allowlist +================================================== + +Only these fields are allowed: + +- `type` +- `key` +- `scope` +- `agent_id` +- `importance` +- `confidence` +- `updated_at` +- `expires_at` +- `hit_count` +- `last_hit_at` + +Requests using any other field name are rejected as validation errors. + +================================================== +4) Value constraints +================================================== + +- `importance`, `confidence`, `hit_count`: JSON number. +- `updated_at`, `expires_at`, `last_hit_at`: RFC3339 datetime strings. +- `type`, `key`, `scope`, `agent_id`: strings (trimmed). +- `contains` values must be strings. +- `in` value must be array. + +================================================== +2b) Filter impact payload +================================================== + +When filter is provided, search trajectory payload `recall.candidates` includes: + +```json +{ + "filter_impact": { + "schema": "search_filter_impact/v1", + "requested_candidate_k": 10, + "effective_candidate_k": 30, + "candidate_count_pre": 100, + "candidate_count_post": 60, + "dropped_total": 40, + "top_drop_reasons": [ + { "reason": "eq:scope", "count": 20 }, + { "reason": "in:type", "count": 15 } + ], + "filter": { + "schema": "search_filter_expr/v1", + "expr": { + "op": "eq", + "field": "scope", + "value": "project_shared" + } + } + } +} +``` + +- `requested_candidate_k`: candidate_k passed by the caller. +- `effective_candidate_k`: internal candidate overfetch value when filter is present. + `effective_candidate_k = min(MAX_CANDIDATE_K, requested_candidate_k * 3)` then clamped to be >= `top_k`. +- `candidate_count_pre`: candidates before filter evaluation (after consistency checks). +- `candidate_count_post`: candidates after filter evaluation. +- `dropped_total`: `candidate_count_pre - candidate_count_post`. +- `top_drop_reasons`: up to five reasons with highest drop counts, sorted by count desc then reason asc. +- `filter`: the validated filter payload that was evaluated. + +================================================== +5) Parse/validation limits +================================================== + +- Max depth: `<= 8` +- Max node count: `<= 128` +- `in` list limit: `<= 128` +- String size limit: UTF-8 bytes `<= 512` + +Validation errors are reported as `Error::InvalidRequest` equivalents and include JSONPath in the +message (for example, `$.filter.expr[0].field` for bad field declarations). + +================================================== +6) Error reporting +================================================== + +Errors are actionable and include the exact JSONPath where validation failed. +Examples: +- `$.filter.expr` +- `$.filter.expr.value` +- `$.filter.expr.args[1]` + +================================================== +7) Service-side application +================================================== + +`search_filter_expr/v1` is evaluated after retrieval candidate generation and +Postgres consistency checks. + +- It is **not** pushed down to Qdrant payload filters. +- It is **not** translated into SQL filters. +- It is evaluated against authoritative Postgres note metadata. diff --git a/docs/spec/system_source_ref_doc_pointer_v1.md b/docs/spec/system_source_ref_doc_pointer_v1.md new file mode 100644 index 00000000..ae83154d --- /dev/null +++ b/docs/spec/system_source_ref_doc_pointer_v1.md @@ -0,0 +1,211 @@ +# System: `source_ref` Doc Pointer Resolver (v1) + +Purpose: Define a concrete, versioned `source_ref` schema for document pointers so agents can reliably hydrate long-form evidence after a note is retrieved. +Status: normative +Read this when: You are implementing or validating note-level document-pointer hydration for retrieved evidence. +Not this document: `docs_put` ingestion-time `doc_source_ref/v1` rules or operator retrieval workflows. +Defines: `source_ref/v1` with `resolver = "elf_doc_ext/v1"`. + +Audience: LLM agents and implementers integrating ELF Core + Doc Extension v1. + +Scope: +- This spec defines a `source_ref/v1` payload with `resolver = "elf_doc_ext/v1"`. +- It targets Doc Extension v1 (PG source of truth + bounded excerpt hydration). + +Non-goals: +- Defining a translation pipeline. +- Defining non-ELF doc backends (S3/Git/threads/etc.). Those should use different `resolver` identifiers. + +============================================================ +1. Background +============================================================ + +ELF Core stores `source_ref` as an opaque JSON object and does not interpret it. Extensions and agents may interpret `source_ref` to hydrate supporting evidence on demand. + +This spec standardizes one common case: + +- A short English note in ELF Core references long-form evidence stored in Doc Extension v1. +- The note’s `source_ref` contains a stable pointer (doc_id + optional chunk_id + optional selector hints). +- When needed, an agent can call `docs_excerpts_get` and obtain a bounded excerpt plus verification signals. + +============================================================ +2. Identifiers (versioned) +============================================================ + +Envelope schema identifier: +- `schema = "source_ref/v1"` + +Doc pointer resolver identifier (this spec): +- `resolver = "elf_doc_ext/v1"` + +============================================================ +3. Data model (normative) +============================================================ + +### 3.1 Top-level object + +The `source_ref` object MUST be a JSON object and MUST include: + +- `schema` (string): `"source_ref/v1"` +- `resolver` (string): `"elf_doc_ext/v1"` +- `ref` (object): stable document identifiers (see 3.2) + +The `source_ref` object MAY include: + +- `state` (object): integrity and snapshot fields (see 3.3) +- `locator` (object): excerpt selector hints (see 3.4) +- `hashes` (object): optional integrity checks (see 3.5) +- `hints` (object): optional UX/debug fields (see 3.6) + +All keys and string values SHOULD be ASCII-safe and stable over time. + +### 3.2 `ref` (required) + +`ref` MUST include: + +- `doc_id` (string): UUID of the document in Doc Extension v1. + +`ref` MAY include: + +- `chunk_id` (string): UUID of a specific chunk. Use when the pointer came from `docs_search_l0`. + +Notes: +- `doc_id` is the canonical lookup key for hydration. +- `chunk_id` is an optional anchor that can help choose a small search neighborhood. + +### 3.3 `state` (optional but recommended) + +`state` MAY include: + +- `content_hash` (string): blake3 hex of the authoritative document content bytes as stored by Doc Extension v1. +- `chunk_hash` (string): blake3 hex of the authoritative chunk text (when `ref.chunk_id` is present). +- `doc_updated_at` (string): RFC3339 timestamp. Informative for debugging and cache keys. + +If provided, these fields allow agents to detect drift and to report stronger provenance. + +### 3.4 `locator` (optional) + +`locator` carries excerpt selector hints. The canonical selector vocabulary is: + +- `quote` (object): `TextQuoteSelector` with: + - `exact` (string, required) + - `prefix` (string, optional) + - `suffix` (string, optional) +- `position` (object): `TextPositionSelector` with: + - `start` (integer, required) + - `end` (integer, required) + +Rules: +- When both `quote` and `position` are present, agents SHOULD prefer `quote` and treat `position` as a fallback. +- `position` is byte-offset based (UTF-8), and is more brittle under content edits than `quote`. + +Optional fields: +- `level` (string): `"L0"`, `"L1"` or `"L2"` as a suggested excerpt size tier for hydration. If omitted, agents should choose based on context budget. + +### 3.5 `hashes` (optional) + +`hashes` MAY include: + +- `content_hash` (string): same meaning as `state.content_hash` (duplicated here to support simpler consumers). +- `excerpt_hash` (string): blake3 hex of a previously-hydrated excerpt, when the agent wants to pin a specific excerpt payload. + +Notes: +- `excerpt_hash` is only meaningful when the hydration request (selector + level) is stable and replayable. +- Doc Extension v1 returns `content_hash` and `excerpt_hash` along with `verified` and `verification_errors`. + +### 3.6 `hints` (optional) + +`hints` MAY include: + +- `title` (string) +- `uri` (string): canonical location (informative; not required for dereference) +- `mime_type` (string) + +These fields are convenience-only and MUST NOT be used as the sole dereference mechanism for this resolver. + +============================================================ +4. Hydration procedure (informative) +============================================================ + +Given a note with: + +- `source_ref.schema = "source_ref/v1"` +- `source_ref.resolver = "elf_doc_ext/v1"` + +An agent typically hydrates evidence by calling: + +- `docs_excerpts_get` with: + - `doc_id` from `ref.doc_id` + - optional `chunk_id` from `ref.chunk_id` + - optional selector hints from `locator.quote` and/or `locator.position` + - `level` from `locator.level` or an agent default + +The agent SHOULD: + +- Prefer excerpts with `verification.verified = true`. +- Preserve `content_hash` and `excerpt_hash` returned by Doc Extension v1 when storing derived facts or when building audit trails. + +============================================================ +5. English-only boundary interaction (normative) +============================================================ + +- ELF Core note fields (`notes[].text`, `notes[].key`, and other natural-language fields) MUST comply with the English-only boundary defined by the ELF Memory Service v2 spec. +- Doc Extension v1 MAY store original long-form evidence; agents should store English facts in ELF notes and keep originals in docs. +- `source_ref` pointers are metadata and MAY contain identifiers/URIs that are not English sentences. + +============================================================ +6. Examples (informative) +============================================================ + +### 6.1 Minimal doc pointer (doc_id only) + +```json +{ + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "6b5b2f08-9a89-4c6c-9b6b-9c0c2f0b1f2d" + } +} +``` + +### 6.2 Pointer anchored to a chunk (from docs_search_l0) + +```json +{ + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "6b5b2f08-9a89-4c6c-9b6b-9c0c2f0b1f2d", + "chunk_id": "b2e8a8d2-4c10-4a1b-98f8-7a8702fd0cc1" + }, + "state": { + "content_hash": "baf7cfd2d5b71f5b0f5d5a08a3c38d7b43cf7a2e5a4f75d5c1b4a9072f6dd3b8", + "chunk_hash": "bd85b0e07464bde3a7f3a2b2f3c2d5d4c1c9f0d0c1a2b3c4d5e6f7a8b9c0d1e2" + } +} +``` + +### 6.3 Pointer with quote + fallback position selector + +```json +{ + "schema": "source_ref/v1", + "resolver": "elf_doc_ext/v1", + "ref": { + "doc_id": "6b5b2f08-9a89-4c6c-9b6b-9c0c2f0b1f2d" + }, + "locator": { + "level": "L1", + "quote": { + "exact": "Deployment steps for service.", + "prefix": "Fact: ", + "suffix": "\\n" + }, + "position": { + "start": 1234, + "end": 1262 + } + } +} +``` diff --git a/docs/spec/system_version_registry.md b/docs/spec/system_version_registry.md new file mode 100644 index 00000000..efe338af --- /dev/null +++ b/docs/spec/system_version_registry.md @@ -0,0 +1,195 @@ +# System Version Registry + +Purpose: Provide a single registry for versioned identifiers used across ELF. +Status: normative +Read this when: You are introducing, validating, or auditing a versioned identifier used by ELF. +Not this document: Detailed behavior for any one subsystem or the procedural rollout for a version bump. +Defines: The canonical registry of ELF versioned identifiers. + +This document is normative. When a new versioned identifier is introduced, it must be added here. + +## Registry + +### HTTP API version + +- Identifier: `/v2` (URL path prefix). +- Type: HTTP API version. +- Defined in: `apps/elf-api/src/routes.rs`, `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: Clients calling the ELF Memory Service API, `apps/elf-mcp`. +- Bump rule: Introduce a new prefix (for example, `/v3`) only for breaking API contract changes. Add a new spec file and keep old specs stable. + +### source_ref envelope schema + +- Identifier: `source_ref/v1`. +- Type: `source_ref` JSON envelope schema identifier. +- Defined in: `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: Note/event ingestion payloads, persisted `source_ref` fields, extensions and agents that hydrate evidence. +- Bump rule: Introduce `source_ref/v2` only when the envelope becomes incompatible with v1. Keep older identifiers immutable. + +### source_ref envelope for docs_put + +- Identifier: `doc_source_ref/v1`. +- Type: `docs_put.source_ref` JSON envelope schema identifier. +- Defined in: `docs/spec/system_doc_source_ref_v1.md`. +- Consumers: Docs ingestion (`POST /v2/docs`, MCP `elf_docs_put`) and any doc evidence consumers that need durable source provenance. +- Bump rule: Introduce `doc_source_ref/v2` only when the required/optional key contract becomes incompatible with v1. Keep older identifiers immutable. + +### source_ref resolver: Doc Extension v1 doc pointer + +- Identifier: `elf_doc_ext/v1`. +- Type: `source_ref.resolver` identifier for Doc Extension v1 pointers. +- Defined in: `docs/spec/system_source_ref_doc_pointer_v1.md`. +- Consumers: Agents that hydrate doc excerpts and build evidence-linked facts; Doc Extension v1 excerpt endpoints. +- Bump rule: Introduce `elf_doc_ext/v2` only when the dereference contract (required fields, semantics, or verification surface) becomes incompatible. + +### Note provenance bundle schema + +- Identifier: `elf.note_provenance_bundle/v1`. +- Type: Admin provenance response envelope for note-level audit and correlation. +- Defined in: `docs/spec/system_provenance_mapping_v1.md`. +- Consumers: Admin tooling and MCP adapter (`elf_admin_note_provenance_get`), diagnostics runbooks. +- Bump rule: Introduce a new bundle version only when existing keys/shape/required joins become incompatible with v1 clients. + +### Memory history schema + +- Identifier: `elf.memory_history/v1`. +- Type: Admin memory history response envelope for chronological memory evolution readback. +- Defined in: `docs/spec/system_provenance_mapping_v1.md`. +- Consumers: Admin tooling and MCP adapter (`elf_admin_memory_history_get`), diagnostics runbooks, lifecycle benchmarks. +- Bump rule: Introduce a new history version only when event shape or ordering semantics become incompatible with v1 clients. + +### Doc Extension v1 docs filters contract + +- Identifier: `docs_search_filters/v1`. +- Type: Filter parameters and required Qdrant payload/index requirements for + `docs_search_l0` (HTTP/MCP). +- Defined in: `docs/spec/system_doc_extension_v1_filters.md`. +- Consumers: `apps/elf-api/src/routes.rs`, `apps/elf-mcp/src/server.rs`, `packages/elf-service/src/docs.rs`. +- Bump rule: Introduce `docs_search_filters/v2` only if accepted filter keys, + value constraints, evaluation semantics, or required Qdrant filter/index fields + become incompatible. + +### Doc Extension v1 payload/index contract + +- Identifier: `doc_extension_payload/v1`. +- Type: Qdrant payload shape and required indexes for doc chunk points. +- Defined in: `docs/spec/system_doc_extension_v1_filters.md`. +- Consumers: `apps/elf-worker/src/worker.rs`, `packages/elf-service/src/docs.rs`. +- Bump rule: Introduce `doc_extension_payload/v2` only when payload shape changes break compatible filter deployment. + +### Doc chunking profiles for doc ingestion + +- Identifier: `doc_chunking_profiles/v1`. +- Type: `docs_put` chunking profile identifier for token-window settings. +- Defined in: `docs/spec/system_doc_chunking_profiles_v1.md`. +- Consumers: `packages/elf-service/src/docs.rs`, `apps/elf-api` clients relying on typed `doc_type` behavior for deterministic token chunking. +- Bump rule: Introduce `doc_chunking_profiles/v2` only when required chunk window fields and defaults become incompatible with v1. + +### Graph query explain schema + +- Identifier: `elf.graph_query/v1`. +- Type: Graph query explain payload schema identifier. +- Defined in: `packages/elf-service/src/graph_query.rs` (`ELF_GRAPH_QUERY_SCHEMA_V1`), `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: `POST /v2/graph/query` responses (`explain.schema`), `apps/elf-api`, `apps/elf-mcp`. +- Bump rule: Introduce `elf.graph_query/v2` only when explain payload fields or semantics become incompatible with v1. + +### Search ranking explain schema + +- Identifier: `search_ranking_explain/v2`. +- Type: JSON schema identifier for `SearchExplain.ranking`. +- Defined in: `packages/elf-service/src/ranking_explain_v2.rs`. +- Consumers: Search responses, trace items (`explain` JSON), evaluation harness. +- Bump rule: Change the identifier only when the payload becomes incompatible with the previous version. Do not reuse older identifiers. +- Notes: The v2 model is additive. `final_score` must equal the sum of `terms[].value`. + +### Search retrieval trajectory schema + +- Identifier: `search_retrieval_trajectory/v1`. +- Type: JSON schema identifier for staged retrieval trajectory payloads. +- Defined in: `packages/elf-service/src/search.rs` (`SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1`). +- Consumers: Search responses (`/v2/searches`, `/v2/searches/{search_id}`), admin trajectory endpoints, trace summaries, item explain trajectory output, evaluation attribution. +- Bump rule: Change the identifier only for incompatible trajectory payload changes. Keep previous identifiers immutable. + +### Recent traces admin list schema + +- Identifier: `elf.recent_traces/v1`. +- Type: Admin trace list response payload identifier. +- Defined in: `packages/elf-service/src/search.rs` (`RECENT_TRACES_SCHEMA_V1`) and + `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: `GET /v2/admin/traces/recent` API response, `apps/elf-api`, `apps/elf-mcp`. +- Bump rule: Introduce a new identifier only if this response payload becomes incompatible. + +### Trace bundle schema + +- Identifier: `elf.trace_bundle/v1`. +- Type: Trace bundle response payload identifier for diagnostics. +- Defined in: `packages/elf-service/src/search.rs` (`TRACE_BUNDLE_SCHEMA_V1`) and + `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: `GET /v2/admin/traces/{trace_id}/bundle` API response, `apps/elf-api`, `apps/elf-mcp`. +- Bump rule: Introduce a new identifier only if this response payload becomes incompatible. + +### Search filter expression schema + +- Identifier: `search_filter_expr/v1`. +- Type: JSON envelope schema for structured search filters (`filter` request payload on search endpoints). +- Defined in: `docs/spec/system_search_filter_expr_v1.md`, `apps/elf-api/src/routes.rs`, `apps/elf-mcp/src/server.rs`, `packages/elf-service/src/search.rs` (`SearchFilter`). +- Consumers: Search creation endpoints (`/v2/searches`, `/v2/admin/searches/raw`) and admin/observability surfaces. +- Bump rule: Introduce `search_filter_expr/v2` only if filter field allowlist, operators, parsing limits, value typing, or parse error model become incompatible. + +### Search filter impact schema + +- Identifier: `search_filter_impact/v1`. +- Type: Search trajectory payload for filter outcome diagnostics. +- Defined in: `docs/spec/system_search_filter_expr_v1.md`, `packages/elf-service/src/search/filter.rs` (`SearchFilterImpact`), `packages/elf-service/src/search.rs` (`SearchFilterImpact::to_stage_payload`). +- Consumers: Search trajectory stage `recall.candidates` stage payload (`search_retrieval_trajectory/v1`). +- Bump rule: Introduce `search_filter_impact/v2` only when impact fields become incompatible. + +### Doc retrieval trajectory schema + +- Identifier: `doc_retrieval_trajectory/v1`. +- Type: JSON schema identifier for staged retrieval/excerpt diagnostics in doc endpoints. +- Defined in: `packages/elf-service/src/docs.rs` (`DOC_RETRIEVAL_TRAJECTORY_SCHEMA_V1`). +- Consumers: `DocsSearchL0Response` and `DocsExcerptResponse` when `explain=true`, MCP adapters forwarding doc routes. +- Bump rule: Change the identifier only when stage format or stage ordering semantics become incompatible. + +### Ranking policy identifier + +- Identifier: `ranking_v2:<hash>`. +- Type: Ranking policy identifier recorded in traces. +- Defined in: `packages/elf-service/src/search.rs`, `docs/spec/system_elf_memory_service_v2.md`. +- Consumers: Trace inspection, evaluation replay, debugging. +- Bump rule: If the policy encoding or semantics change in a way that makes old and new policies non-comparable, introduce a new prefix (for example, `ranking_v3:`). + +### Search trace version + +- Identifier: `trace_version` (integer), current value `3`. +- Type: Trace schema version for search traces. +- Defined in: `packages/elf-service/src/search.rs` (`TRACE_VERSION`), `sql/tables/006_search_traces.sql`. +- Consumers: Worker trace persistence, trace readers, evaluation harness. +- Bump rule: Increment only when a trace schema change requires explicit version gating in readers or replay logic. + +### Embedding version + +- Identifier: `embedding_version` (string), format `{provider_id}:{model}:{vector_dim}`. +- Type: Embedding compatibility identifier. +- Defined in: `packages/elf-service/src/lib.rs` (`embedding_version(cfg)`). +- Consumers: Postgres keys (`note_embeddings`, `note_chunk_embeddings`, outbox), Qdrant payload filtering, rebuild flows. +- Bump rule: This is not a numeric version. Treat the full string as an immutable identifier. A change to any component (`provider_id`, `model`, or `vector_dim`) produces a new `embedding_version`. + +### LLM cache payload schema versions + +- Identifier: `schema_version` (integer), `expansion` current value `1`, `rerank` current value `1`. +- Type: Cache payload schema version. +- Defined in: `packages/elf-service/src/search.rs` (`EXPANSION_CACHE_SCHEMA_VERSION`, `RERANK_CACHE_SCHEMA_VERSION`). +- Consumers: Search cache read and write paths. +- Bump rule: Increment when the cached payload shape changes such that older entries must be rejected or migrated. + +## Repository process identifiers + +### Commit message schema + +- Identifier: `cmsg/1`. +- Type: Commit message schema identifier. +- Defined in: `AGENTS.md`. +- Consumers: Automated agents and repository tooling. +- Bump rule: Introduce `cmsg/2` only when the schema becomes incompatible with existing automation. diff --git a/elf.example.toml b/elf.example.toml index 5666d413..47648141 100644 --- a/elf.example.toml +++ b/elf.example.toml @@ -1,46 +1,53 @@ [service] -admin_bind = "127.0.0.1:8090" -http_bind = "127.0.0.1:8089" +admin_bind = "127.0.0.1:51891" +http_bind = "127.0.0.1:51892" log_level = "info" -mcp_bind = "127.0.0.1:8091" +mcp_bind = "127.0.0.1:51893" [storage.postgres] dsn = "postgres://postgres:postgres@127.0.0.1:5432/elf" pool_max_conns = 10 [storage.qdrant] -collection = "mem_notes_v1" -url = "http://127.0.0.1:6334" -vector_dim = 4096 +collection = "mem_notes_v2" +docs_collection = "doc_chunks_v1" +url = "http://127.0.0.1:6334" +vector_dim = 4_096 + +[mcp] +agent_id = "local-agent" +project_id = "local-project" +read_profile = "private_plus_project" +tenant_id = "local-tenant" [providers.embedding] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" default_headers = {} -dimensions = 4096 +dimensions = 4_096 model = "embedding-model" path = "/embeddings" provider_id = "provider-id" -timeout_ms = 20000 +timeout_ms = 20_000 [providers.rerank] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" default_headers = {} model = "rerank-model" path = "/rerank" provider_id = "provider-id" -timeout_ms = 20000 +timeout_ms = 20_000 [providers.llm_extractor] -api_base = "https://provider.example/v1" +api_base = "https://provider.example" api_key = "REPLACE_ME" default_headers = {} model = "llm-model" path = "/chat/completions" provider_id = "provider-id" temperature = 0.1 -timeout_ms = 30000 +timeout_ms = 30_000 [scopes] allowed = ["agent_private", "org_shared", "project_shared"] @@ -68,12 +75,19 @@ max_notes_per_add_event = 3 top_k = 12 update_sim_threshold = 0.85 +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.9 +min_importance = 0.75 +note_type = "preference" +scope = "agent_private" + [chunking] enabled = true max_tokens = 512 overlap_tokens = 128 -# If empty, uses providers.embedding.model -tokenizer_repo = "" +tokenizer_repo = "REPLACE_ME" [search.expansion] include_original = true @@ -90,18 +104,81 @@ max_candidates = 0 [search.cache] enabled = true expansion_ttl_days = 7 -expansion_version = "v1" -max_payload_bytes = 262144 +max_payload_bytes = 262_144 rerank_ttl_days = 7 -rerank_version = "v1" [search.explain] -retention_days = 7 +candidate_retention_days = 2 +capture_candidates = false +retention_days = 7 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 [ranking] recency_tau_days = 60 tie_breaker_weight = 0.1 +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.05 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.05 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.05 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.5 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + [lifecycle.ttl_days] constraint = 0 decision = 0 @@ -115,9 +192,45 @@ purge_deleted_after_days = 30 purge_deprecated_after_days = 180 [security] +auth_keys = [] +auth_mode = "off" bind_localhost_only = true evidence_max_quote_chars = 320 evidence_max_quotes = 2 evidence_min_quotes = 1 redact_secrets_on_write = true -reject_cjk = true +reject_non_english = true + +# Explicit auth mode: +# - "off": no auth checks; only safe for local loopback binds. +# - "static_keys": require Authorization: Bearer <token> and derive context from keys. +# +# When auth_mode is "static_keys", every request context is derived from the matched key. +# Caller-provided context headers are ignored/overridden. +# [[security.auth_keys]] +# token_id = "dev-client" +# token = "replace-with-opaque-secret" +# tenant_id = "t" +# project_id = "p" +# agent_id = "a" +# read_profile = "private_plus_project" +# role = "user" + +[context] +# Optional. Context metadata used to disambiguate retrieval across projects and scopes. +# +# project_descriptions keys: +# - "<tenant_id>:<project_id>" (recommended) +# - "<project_id>" (fallback) +# Optional. Additive score boost applied when query tokens match a scope description. +# Set to 0.0 to disable. +# Must be a finite number in the range 0.0-1.0. When greater than zero, scope_descriptions must be present. +scope_boost_weight = 0.05 + +[context.project_descriptions] +"t:p" = "Example project context description." + +[context.scope_descriptions] +agent_private = "Personal notes for a single agent." +org_shared = "Organization-wide policies and shared operating context." +project_shared = "Project-specific shared notes and technical context." diff --git a/packages/elf-chunking/Cargo.toml b/packages/elf-chunking/Cargo.toml index 05b3a0e9..fb8c36cd 100644 --- a/packages/elf-chunking/Cargo.toml +++ b/packages/elf-chunking/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2024" name = "elf-chunking" -version = "0.1.0" +version = "0.2.0" [dependencies] tokenizers = { workspace = true } diff --git a/packages/elf-chunking/src/lib.rs b/packages/elf-chunking/src/lib.rs index 7186c2dc..00c25670 100644 --- a/packages/elf-chunking/src/lib.rs +++ b/packages/elf-chunking/src/lib.rs @@ -1,34 +1,54 @@ -// crates.io -pub use tokenizers::Tokenizer; -use unicode_segmentation::UnicodeSegmentation; +//! Sentence-aware token chunking utilities for ELF ingestion paths. + +pub use tokenizers::{Error, Tokenizer}; -pub type TokenizerError = tokenizers::Error; +use std::path::Path; + +use unicode_segmentation::UnicodeSegmentation; -#[derive(Debug, Clone)] +/// Token-window settings used when splitting text into chunks. +#[derive(Clone, Debug)] pub struct ChunkingConfig { + /// Maximum tokens allowed in one output chunk. pub max_tokens: u32, + /// Number of tail tokens carried into the next chunk. pub overlap_tokens: u32, } -#[derive(Debug, Clone)] +/// One token-bounded text chunk with offsets into the original input. +#[derive(Clone, Debug)] pub struct Chunk { + /// Zero-based chunk position in the output sequence. pub chunk_index: i32, + /// Byte offset where this chunk starts in the original text. pub start_offset: usize, + /// Byte offset where this chunk ends in the original text. pub end_offset: usize, + /// Chunk text slice copied from the original input. pub text: String, } -pub fn load_tokenizer(repo: &str) -> Result<Tokenizer, TokenizerError> { +/// Loads a tokenizer from a local JSON file path or Hugging Face repository identifier. +pub fn load_tokenizer(repo: &str) -> Result<Tokenizer, Error> { + let path = Path::new(repo); + + if path.exists() && path.is_file() { + return Tokenizer::from_file(path); + } + Tokenizer::from_pretrained(repo, None) } +/// Splits text into sentence-aware chunks that honor the configured token window. +/// +/// Returned chunks preserve byte offsets into the original `text`. pub fn split_text(text: &str, cfg: &ChunkingConfig, tokenizer: &Tokenizer) -> Vec<Chunk> { let sentences: Vec<(usize, &str)> = text.split_sentence_bound_indices().collect(); let mut chunks = Vec::new(); let mut current = String::new(); - let mut current_start = 0usize; - let mut last_end = 0usize; - let mut chunk_index = 0i32; + let mut current_start = 0_usize; + let mut last_end = 0_usize; + let mut chunk_index = 0_i32; for (idx, sentence) in sentences { let candidate = format!("{}{}", current, sentence); @@ -36,9 +56,11 @@ pub fn split_text(text: &str, cfg: &ChunkingConfig, tokenizer: &Tokenizer) -> Ve Ok(encoding) => encoding.len(), Err(err) => { tracing::error!(error = %err, "Tokenizer failed to encode sentence candidate."); + 0 }, }; + if token_count as u32 > cfg.max_tokens && !current.is_empty() { chunks.push(Chunk { chunk_index, @@ -46,17 +68,23 @@ pub fn split_text(text: &str, cfg: &ChunkingConfig, tokenizer: &Tokenizer) -> Ve end_offset: last_end, text: current.clone(), }); + chunk_index += 1; + let overlap = overlap_tail(¤t, cfg.overlap_tokens, tokenizer); + current_start = last_end.saturating_sub(overlap.len()); current = overlap; } if current.is_empty() { current_start = idx; } + current.push_str(sentence); + last_end = idx + sentence.len(); } + if !current.is_empty() { chunks.push(Chunk { chunk_index, @@ -65,6 +93,7 @@ pub fn split_text(text: &str, cfg: &ChunkingConfig, tokenizer: &Tokenizer) -> Ve text: current, }); } + chunks } @@ -72,20 +101,24 @@ fn overlap_tail(text: &str, overlap_tokens: u32, tokenizer: &Tokenizer) -> Strin if overlap_tokens == 0 { return String::new(); } + let encoding = match tokenizer.encode(text, false) { Ok(encoding) => encoding, Err(err) => { tracing::error!(error = %err, "Tokenizer failed to encode overlap tail."); + return String::new(); }, }; let tokens = encoding.get_ids(); let start = tokens.len().saturating_sub(overlap_tokens as usize); let tail_ids = &tokens[start..]; + match tokenizer.decode(tail_ids, true) { Ok(decoded) => decoded, Err(err) => { tracing::error!(error = %err, "Tokenizer failed to decode overlap tail."); + String::new() }, } @@ -93,14 +126,35 @@ fn overlap_tail(text: &str, overlap_tokens: u32, tokenizer: &Tokenizer) -> Strin #[cfg(test)] mod tests { - use super::*; + use crate::ChunkingConfig; + + fn local_dev_tokenizer_path() -> std::path::PathBuf { + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../config/local/tokenizer.wordlevel.json") + } #[test] - fn splits_into_chunks_with_overlap() { + fn loads_local_dev_tokenizer_fixture() { + let path = local_dev_tokenizer_path(); + let tokenizer = crate::load_tokenizer(path.to_str().expect("Path must be valid UTF-8")) + .expect("Local dev tokenizer must load."); let cfg = ChunkingConfig { max_tokens: 10, overlap_tokens: 2 }; - let tokenizer = load_tokenizer("Qwen/Qwen3-Embedding-8B").unwrap(); - let chunks = split_text("One. Two. Three. Four.", &cfg, &tokenizer); + let chunks = crate::split_text("One local note. Another local note.", &cfg, &tokenizer); + assert!(!chunks.is_empty()); + assert!(chunks[0].text.contains("local note")); + } + + #[test] + fn splits_into_chunks_with_overlap() { + let cfg = ChunkingConfig { max_tokens: 2, overlap_tokens: 1 }; + let path = local_dev_tokenizer_path(); + let tokenizer = crate::load_tokenizer(path.to_str().expect("Path must be valid UTF-8")) + .expect("Local dev tokenizer must load."); + let chunks = crate::split_text("One. Two. Three. Four.", &cfg, &tokenizer); + + assert!(chunks.len() > 1); assert!(chunks[0].text.contains("One")); + assert!(chunks.last().expect("Chunk should exist").text.contains("Four")); } } diff --git a/packages/elf-cli/Cargo.toml b/packages/elf-cli/Cargo.toml index 49983e2c..182c91cf 100644 --- a/packages/elf-cli/Cargo.toml +++ b/packages/elf-cli/Cargo.toml @@ -2,7 +2,7 @@ build = "../../build.rs" edition = "2024" name = "elf-cli" -version = "0.1.0" +version = "0.2.0" [dependencies] clap = { workspace = true } diff --git a/packages/elf-cli/src/lib.rs b/packages/elf-cli/src/lib.rs index 1c207829..5cc0deb1 100644 --- a/packages/elf-cli/src/lib.rs +++ b/packages/elf-cli/src/lib.rs @@ -1,9 +1,11 @@ -// crates.io +//! Shared CLI metadata and style helpers for ELF binaries. + use clap::builder::{ Styles, styling::{AnsiColor, Effects}, }; +/// Build-time version string including git SHA and target triple. pub const VERSION: &str = concat!( env!("CARGO_PKG_VERSION"), "-", @@ -12,6 +14,7 @@ pub const VERSION: &str = concat!( env!("VERGEN_CARGO_TARGET_TRIPLE"), ); +/// Returns the shared clap style palette for ELF CLIs. pub fn styles() -> Styles { Styles::styled() .header(AnsiColor::Red.on_default() | Effects::BOLD) diff --git a/packages/elf-config/Cargo.toml b/packages/elf-config/Cargo.toml index e4c83153..d6723f99 100644 --- a/packages/elf-config/Cargo.toml +++ b/packages/elf-config/Cargo.toml @@ -1,10 +1,10 @@ [package] edition = "2024" name = "elf-config" -version = "0.1.0" +version = "0.2.0" [dependencies] -color-eyre = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +thiserror = { workspace = true } toml = { workspace = true } diff --git a/packages/elf-config/src/error.rs b/packages/elf-config/src/error.rs new file mode 100644 index 00000000..a6702a94 --- /dev/null +++ b/packages/elf-config/src/error.rs @@ -0,0 +1,29 @@ +/// Result alias for ELF configuration loading and validation. +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Errors returned while reading, parsing, or validating an ELF config file. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Reading the config file from disk failed. + #[error("Failed to read config file at {path:?}.")] + ReadConfig { + /// Path of the config file that failed to load. + path: std::path::PathBuf, + /// Underlying filesystem error. + source: std::io::Error, + }, + /// Parsing the TOML config into the typed schema failed. + #[error("Failed to parse config file at {path:?}.")] + ParseConfig { + /// Path of the config file that failed to parse. + path: std::path::PathBuf, + /// Underlying TOML decode error. + source: toml::de::Error, + }, + /// A semantic validation rule rejected the config contents. + #[error("{message}")] + Validation { + /// Human-readable validation failure message. + message: String, + }, +} diff --git a/packages/elf-config/src/lib.rs b/packages/elf-config/src/lib.rs index 457a397d..bf865c3a 100644 --- a/packages/elf-config/src/lib.rs +++ b/packages/elf-config/src/lib.rs @@ -1,98 +1,781 @@ +//! ELF configuration loading and validation. + +mod error; mod types; -// std -use std::{fs, path::Path}; +pub use self::{ + error::{Error, Result}, + types::{ + Chunking, Config, Context, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, + McpContext, Memory, MemoryPolicy, MemoryPolicyRule, Postgres, ProviderConfig, Providers, + Qdrant, Ranking, RankingBlend, RankingBlendSegment, RankingDeterministic, + RankingDeterministicDecay, RankingDeterministicHits, RankingDeterministicLexical, + RankingDiversity, RankingRetrievalSources, ReadProfiles, ScopePrecedence, + ScopeWriteAllowed, Scopes, Search, SearchCache, SearchDynamic, SearchExpansion, + SearchExplain, SearchGraphContext, SearchPrefilter, SearchRecursive, Security, + SecurityAuthKey, SecurityAuthRole, Service, Storage, TtlDays, + }, +}; -// crates.io -use color_eyre::eyre; +use std::{collections::HashSet, fs, path::Path}; -// self -pub use types::{ - Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, Postgres, - ProviderConfig, Providers, Qdrant, Ranking, ReadProfiles, ScopePrecedence, ScopeWriteAllowed, - Scopes, Search, SearchCache, SearchDynamic, SearchExpansion, SearchExplain, SearchPrefilter, - Security, Service, Storage, TtlDays, -}; +/// Loads, deserializes, and validates an ELF TOML configuration file. +pub fn load(path: &Path) -> Result<Config> { + let raw = fs::read_to_string(path) + .map_err(|err| Error::ReadConfig { path: path.to_path_buf(), source: err })?; + let cfg: Config = toml::from_str(&raw) + .map_err(|err| Error::ParseConfig { path: path.to_path_buf(), source: err })?; -pub fn load(path: &Path) -> color_eyre::Result<Config> { - let raw = fs::read_to_string(path)?; - let mut cfg: Config = toml::from_str(&raw)?; - normalize(&mut cfg); validate(&cfg)?; + Ok(cfg) } -fn normalize(cfg: &mut Config) { - if cfg.chunking.tokenizer_repo.as_deref().map(|repo| repo.trim().is_empty()).unwrap_or(false) { - cfg.chunking.tokenizer_repo = None; +/// Validates a deserialized ELF configuration against repository runtime rules. +pub fn validate(cfg: &Config) -> Result<()> { + validate_security(cfg)?; + validate_service(cfg)?; + validate_storage(cfg)?; + validate_providers(cfg)?; + validate_memory(cfg)?; + validate_search(cfg)?; + validate_ranking(cfg)?; + validate_chunking(cfg)?; + validate_context(cfg)?; + validate_mcp(cfg)?; + validate_search_graph_context(cfg)?; + + Ok(()) +} + +fn validate_storage(cfg: &Config) -> Result<()> { + if cfg.storage.postgres.dsn.trim().is_empty() { + return Err(Error::Validation { + message: "storage.postgres.dsn must be non-empty.".to_string(), + }); + } + if cfg.storage.qdrant.url.trim().is_empty() { + return Err(Error::Validation { + message: "storage.qdrant.url must be non-empty.".to_string(), + }); + } + if cfg.storage.qdrant.collection.trim().is_empty() { + return Err(Error::Validation { + message: "storage.qdrant.collection must be non-empty.".to_string(), + }); + } + if cfg.storage.qdrant.docs_collection.trim().is_empty() { + return Err(Error::Validation { + message: "storage.qdrant.docs_collection must be non-empty.".to_string(), + }); } + if cfg.storage.qdrant.vector_dim == 0 { + return Err(Error::Validation { + message: "storage.qdrant.vector_dim must be greater than zero.".to_string(), + }); + } + + Ok(()) +} + +fn validate_memory(cfg: &Config) -> Result<()> { + let mut seen_rules = HashSet::new(); + + for (idx, rule) in cfg.memory.policy.rules.iter().enumerate() { + let path = format!("memory.policy.rules[{idx}]"); + + if let Some(note_type) = rule.note_type.as_ref() { + if note_type.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.note_type cannot be blank or whitespace-only."), + }); + } + if !matches!( + note_type.as_str(), + "preference" | "constraint" | "decision" | "profile" | "fact" | "plan" + ) { + return Err(Error::Validation { + message: format!( + "{path}.note_type must be one of preference, constraint, decision, profile, fact, or plan." + ), + }); + } + } + if let Some(scope) = rule.scope.as_ref() { + if scope.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.scope cannot be blank or whitespace-only."), + }); + } + if !cfg.scopes.allowed.iter().any(|allowed_scope| allowed_scope == scope) { + return Err(Error::Validation { + message: format!("{path}.scope must be one of allowed scopes."), + }); + } + } + if let Some(min_confidence) = rule.min_confidence { + if !min_confidence.is_finite() { + return Err(Error::Validation { + message: format!("{path}.min_confidence must be a finite number."), + }); + } + if !(0.0..=1.0).contains(&min_confidence) { + return Err(Error::Validation { + message: format!("{path}.min_confidence must be between 0.0 and 1.0."), + }); + } + } + if let Some(min_importance) = rule.min_importance { + if !min_importance.is_finite() { + return Err(Error::Validation { + message: format!("{path}.min_importance must be a finite number."), + }); + } + if !(0.0..=1.0).contains(&min_importance) { + return Err(Error::Validation { + message: format!("{path}.min_importance must be between 0.0 and 1.0."), + }); + } + } + + let rule_key = (rule.note_type.clone(), rule.scope.clone()); + + if !seen_rules.insert(rule_key) { + return Err(Error::Validation { + message: format!("{path} has a duplicate note_type and scope pair."), + }); + } + } + + Ok(()) } -pub fn validate(cfg: &Config) -> color_eyre::Result<()> { - if !cfg.security.reject_cjk { - return Err(eyre::eyre!("security.reject_cjk must be true.")); +fn validate_security(cfg: &Config) -> Result<()> { + if !cfg.security.reject_non_english { + return Err(Error::Validation { + message: "security.reject_non_english must be true.".to_string(), + }); + } + + let auth_mode = cfg.security.auth_mode.trim(); + + if !matches!(auth_mode, "off" | "static_keys") { + return Err(Error::Validation { + message: "security.auth_mode must be one of off or static_keys.".to_string(), + }); + } + if auth_mode == "off" { + if !cfg.security.auth_keys.is_empty() { + return Err(Error::Validation { + message: "security.auth_keys must be empty when security.auth_mode is off." + .to_string(), + }); + } + + return Ok(()); } + if cfg.security.auth_keys.is_empty() { + return Err(Error::Validation { + message: "security.auth_keys must be non-empty when security.auth_mode is static_keys." + .to_string(), + }); + } + + let mut token_ids = HashSet::new(); + let mut tokens = HashSet::new(); + + for (idx, key) in cfg.security.auth_keys.iter().enumerate() { + let path = format!("security.auth_keys[{idx}]"); + + if key.token_id.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.token_id must be non-empty."), + }); + } + if key.token.trim().is_empty() { + return Err(Error::Validation { message: format!("{path}.token must be non-empty.") }); + } + if key.tenant_id.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.tenant_id must be non-empty."), + }); + } + if key.project_id.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.project_id must be non-empty."), + }); + } + if key.read_profile.trim().is_empty() { + return Err(Error::Validation { + message: format!("{path}.read_profile must be non-empty."), + }); + } + if !matches!( + key.read_profile.as_str(), + "private_only" | "private_plus_project" | "all_scopes" + ) { + return Err(Error::Validation { + message: format!( + "{path}.read_profile must be one of private_only, private_plus_project, or all_scopes." + ), + }); + } + + if let Some(agent_id) = key.agent_id.as_ref() + && agent_id.trim().is_empty() + { + return Err(Error::Validation { + message: format!("{path}.agent_id must be non-empty when provided."), + }); + } + + if key.agent_id.as_ref().map(|agent_id| agent_id.trim().is_empty()).unwrap_or(true) { + return Err(Error::Validation { + message: format!( + "{path}.agent_id is required when security.auth_mode is static_keys." + ), + }); + } + if !token_ids.insert(key.token_id.as_str()) { + return Err(Error::Validation { + message: format!("{path}.token_id must be unique across security.auth_keys."), + }); + } + if !tokens.insert(key.token.as_str()) { + return Err(Error::Validation { + message: format!("{path}.token must be unique across security.auth_keys."), + }); + } + } + + Ok(()) +} + +fn validate_service(cfg: &Config) -> Result<()> { if cfg.service.mcp_bind.trim().is_empty() { - return Err(eyre::eyre!("service.mcp_bind must be non-empty.")); + return Err(Error::Validation { + message: "service.mcp_bind must be non-empty.".to_string(), + }); } + + Ok(()) +} + +fn validate_providers(cfg: &Config) -> Result<()> { if cfg.providers.embedding.dimensions == 0 { - return Err(eyre::eyre!("providers.embedding.dimensions must be greater than zero.")); + return Err(Error::Validation { + message: "providers.embedding.dimensions must be greater than zero.".to_string(), + }); } if cfg.providers.embedding.dimensions != cfg.storage.qdrant.vector_dim { - return Err(eyre::eyre!( - "providers.embedding.dimensions must match storage.qdrant.vector_dim." - )); + return Err(Error::Validation { + message: "providers.embedding.dimensions must match storage.qdrant.vector_dim." + .to_string(), + }); + } + + for (label, key) in [ + ("embedding", &cfg.providers.embedding.api_key), + ("rerank", &cfg.providers.rerank.api_key), + ("llm_extractor", &cfg.providers.llm_extractor.api_key), + ] { + if key.trim().is_empty() { + return Err(Error::Validation { + message: format!("Provider {label} api_key must be non-empty."), + }); + } } + + Ok(()) +} + +fn validate_search(cfg: &Config) -> Result<()> { + validate_search_expansion(cfg)?; + validate_search_dynamic(cfg)?; + validate_search_cache(cfg)?; + validate_search_explain(cfg)?; + validate_search_explain_write_mode(cfg)?; + validate_search_recursive(cfg)?; + + Ok(()) +} + +fn validate_search_expansion(cfg: &Config) -> Result<()> { let expansion_mode = cfg.search.expansion.mode.as_str(); + if !matches!(expansion_mode, "off" | "always" | "dynamic") { - return Err(eyre::eyre!("search.expansion.mode must be one of off, always, or dynamic.")); + return Err(Error::Validation { + message: "search.expansion.mode must be one of off, always, or dynamic.".to_string(), + }); } if cfg.search.expansion.max_queries == 0 { - return Err(eyre::eyre!("search.expansion.max_queries must be greater than zero.")); + return Err(Error::Validation { + message: "search.expansion.max_queries must be greater than zero.".to_string(), + }); } + + Ok(()) +} + +fn validate_search_dynamic(cfg: &Config) -> Result<()> { if cfg.search.dynamic.min_candidates == 0 { - return Err(eyre::eyre!("search.dynamic.min_candidates must be greater than zero.")); + return Err(Error::Validation { + message: "search.dynamic.min_candidates must be greater than zero.".to_string(), + }); } if cfg.search.dynamic.min_top_score < 0.0 { - return Err(eyre::eyre!("search.dynamic.min_top_score must be zero or greater.")); + return Err(Error::Validation { + message: "search.dynamic.min_top_score must be zero or greater.".to_string(), + }); } + + Ok(()) +} + +fn validate_search_cache(cfg: &Config) -> Result<()> { if cfg.search.cache.expansion_ttl_days <= 0 { - return Err(eyre::eyre!("search.cache.expansion_ttl_days must be greater than zero.")); + return Err(Error::Validation { + message: "search.cache.expansion_ttl_days must be greater than zero.".to_string(), + }); } if cfg.search.cache.rerank_ttl_days <= 0 { - return Err(eyre::eyre!("search.cache.rerank_ttl_days must be greater than zero.")); + return Err(Error::Validation { + message: "search.cache.rerank_ttl_days must be greater than zero.".to_string(), + }); } + if let Some(max) = cfg.search.cache.max_payload_bytes && max == 0 { - return Err(eyre::eyre!("search.cache.max_payload_bytes must be greater than zero.")); + return Err(Error::Validation { + message: "search.cache.max_payload_bytes must be greater than zero.".to_string(), + }); + } + + Ok(()) +} + +fn validate_search_explain(cfg: &Config) -> Result<()> { + if cfg.search.explain.retention_days <= 0 { + return Err(Error::Validation { + message: "search.explain.retention_days must be greater than zero.".to_string(), + }); } - if cfg.search.cache.expansion_version.trim().is_empty() { - return Err(eyre::eyre!("search.cache.expansion_version must be non-empty.")); + if cfg.search.explain.candidate_retention_days <= 0 { + return Err(Error::Validation { + message: "search.explain.candidate_retention_days must be greater than zero." + .to_string(), + }); } - if cfg.search.cache.rerank_version.trim().is_empty() { - return Err(eyre::eyre!("search.cache.rerank_version must be non-empty.")); + if cfg.search.explain.candidate_retention_days > cfg.search.explain.retention_days { + return Err(Error::Validation { + message: + "search.explain.candidate_retention_days must be less than or equal to search.explain.retention_days." + .to_string(), + }); } - if cfg.search.explain.retention_days <= 0 { - return Err(eyre::eyre!("search.explain.retention_days must be greater than zero.")); + + Ok(()) +} + +fn validate_search_explain_write_mode(cfg: &Config) -> Result<()> { + match cfg.search.explain.write_mode.trim().to_ascii_lowercase().as_str() { + "outbox" | "inline" => Ok(()), + other => Err(Error::Validation { + message: format!( + "search.explain.write_mode must be one of: outbox, inline. Got {other}." + ), + }), + } +} + +fn validate_search_recursive(cfg: &Config) -> Result<()> { + if !cfg.search.recursive.enabled { + return Ok(()); + } + if cfg.search.recursive.max_depth == 0 { + return Err(Error::Validation { + message: "search.recursive.max_depth must be greater than zero.".to_string(), + }); + } + if cfg.search.recursive.max_depth > 8 { + return Err(Error::Validation { + message: "search.recursive.max_depth must be 8 or less.".to_string(), + }); + } + if cfg.search.recursive.max_children_per_node == 0 { + return Err(Error::Validation { + message: "search.recursive.max_children_per_node must be greater than zero." + .to_string(), + }); + } + if cfg.search.recursive.max_children_per_node > 64 { + return Err(Error::Validation { + message: "search.recursive.max_children_per_node must be 64 or less.".to_string(), + }); + } + if cfg.search.recursive.max_nodes_per_scope == 0 { + return Err(Error::Validation { + message: "search.recursive.max_nodes_per_scope must be greater than zero.".to_string(), + }); + } + if cfg.search.recursive.max_nodes_per_scope > 250 { + return Err(Error::Validation { + message: "search.recursive.max_nodes_per_scope must be 250 or less.".to_string(), + }); + } + if cfg.search.recursive.max_total_nodes == 0 { + return Err(Error::Validation { + message: "search.recursive.max_total_nodes must be greater than zero.".to_string(), + }); + } + if cfg.search.recursive.max_total_nodes > 2_000 { + return Err(Error::Validation { + message: "search.recursive.max_total_nodes must be 2_000 or less.".to_string(), + }); } + if cfg.search.recursive.max_total_nodes < cfg.search.recursive.max_nodes_per_scope { + return Err(Error::Validation { + message: + "search.recursive.max_total_nodes must be at least search.recursive.max_nodes_per_scope." + .to_string(), + }); + } + + Ok(()) +} + +fn validate_search_graph_context(cfg: &Config) -> Result<()> { + if !cfg.search.graph_context.enabled { + return Ok(()); + } + + let ctx = &cfg.search.graph_context; + + if ctx.max_facts_per_item == 0 { + return Err(Error::Validation { + message: "search.graph_context.max_facts_per_item must be greater than zero." + .to_string(), + }); + } + if ctx.max_facts_per_item > 1_000 { + return Err(Error::Validation { + message: "search.graph_context.max_facts_per_item must be 1,000 or less.".to_string(), + }); + } + if ctx.max_evidence_notes_per_fact == 0 { + return Err(Error::Validation { + message: "search.graph_context.max_evidence_notes_per_fact must be greater than zero." + .to_string(), + }); + } + if ctx.max_evidence_notes_per_fact > 1_000 { + return Err(Error::Validation { + message: "search.graph_context.max_evidence_notes_per_fact must be 1,000 or less." + .to_string(), + }); + } + + Ok(()) +} + +fn validate_ranking(cfg: &Config) -> Result<()> { + validate_ranking_core(cfg)?; + validate_ranking_blend(cfg)?; + validate_ranking_diversity(cfg)?; + validate_ranking_retrieval_sources(cfg)?; + validate_ranking_deterministic(cfg)?; + + Ok(()) +} + +fn validate_ranking_core(cfg: &Config) -> Result<()> { + if cfg.ranking.tie_breaker_weight < 0.0 { + return Err(Error::Validation { + message: "ranking.tie_breaker_weight must be zero or greater.".to_string(), + }); + } + if !cfg.ranking.tie_breaker_weight.is_finite() { + return Err(Error::Validation { + message: "ranking.tie_breaker_weight must be a finite number.".to_string(), + }); + } + if cfg.ranking.recency_tau_days < 0.0 { + return Err(Error::Validation { + message: "ranking.recency_tau_days must be zero or greater.".to_string(), + }); + } + if !cfg.ranking.recency_tau_days.is_finite() { + return Err(Error::Validation { + message: "ranking.recency_tau_days must be a finite number.".to_string(), + }); + } + + Ok(()) +} + +fn validate_ranking_blend(cfg: &Config) -> Result<()> { + if !cfg.ranking.blend.enabled { + return Ok(()); + } + if cfg.ranking.blend.segments.is_empty() { + return Err(Error::Validation { + message: "ranking.blend.segments must be non-empty when enabled.".to_string(), + }); + } + + for segment in &cfg.ranking.blend.segments { + if !segment.retrieval_weight.is_finite() { + return Err(Error::Validation { + message: "ranking.blend.segments.retrieval_weight must be a finite number." + .to_string(), + }); + } + if !(0.0..=1.0).contains(&segment.retrieval_weight) { + return Err(Error::Validation { + message: "ranking.blend.segments.retrieval_weight must be in the range 0.0-1.0." + .to_string(), + }); + } + if segment.max_retrieval_rank == 0 { + return Err(Error::Validation { + message: "ranking.blend.segments.max_retrieval_rank must be greater than zero." + .to_string(), + }); + } + } + + Ok(()) +} + +fn validate_ranking_diversity(cfg: &Config) -> Result<()> { + let diversity = &cfg.ranking.diversity; + + if !diversity.sim_threshold.is_finite() { + return Err(Error::Validation { + message: "ranking.diversity.sim_threshold must be a finite number.".to_string(), + }); + } + if !(0.0..=1.0).contains(&diversity.sim_threshold) { + return Err(Error::Validation { + message: "ranking.diversity.sim_threshold must be in the range 0.0-1.0.".to_string(), + }); + } + if !diversity.mmr_lambda.is_finite() { + return Err(Error::Validation { + message: "ranking.diversity.mmr_lambda must be a finite number.".to_string(), + }); + } + if !(0.0..=1.0).contains(&diversity.mmr_lambda) { + return Err(Error::Validation { + message: "ranking.diversity.mmr_lambda must be in the range 0.0-1.0.".to_string(), + }); + } + + Ok(()) +} + +fn validate_ranking_retrieval_sources(cfg: &Config) -> Result<()> { + let retrieval_sources = &cfg.ranking.retrieval_sources; + + for (path, value) in [ + ("ranking.retrieval_sources.fusion_weight", retrieval_sources.fusion_weight), + ( + "ranking.retrieval_sources.structured_field_weight", + retrieval_sources.structured_field_weight, + ), + ] { + if !value.is_finite() { + return Err(Error::Validation { message: format!("{path} must be a finite number.") }); + } + if value < 0.0 { + return Err(Error::Validation { message: format!("{path} must be zero or greater.") }); + } + } + + if retrieval_sources.fusion_weight <= 0.0 && retrieval_sources.structured_field_weight <= 0.0 { + return Err(Error::Validation { + message: "At least one retrieval source weight must be greater than zero.".to_string(), + }); + } + + Ok(()) +} + +fn validate_ranking_deterministic(cfg: &Config) -> Result<()> { + let det = &cfg.ranking.deterministic; + let det_lex = &det.lexical; + let det_hits = &det.hits; + let det_decay = &det.decay; + + for (path, weight) in [ + ("ranking.deterministic.lexical", det_lex.weight), + ("ranking.deterministic.hits", det_hits.weight), + ("ranking.deterministic.decay", det_decay.weight), + ] { + if weight < 0.0 { + return Err(Error::Validation { + message: format!("{path}.weight must be zero or greater."), + }); + } + if !weight.is_finite() { + return Err(Error::Validation { + message: format!("{path}.weight must be a finite number."), + }); + } + } + + if det.enabled && det_lex.enabled { + if !det_lex.min_ratio.is_finite() { + return Err(Error::Validation { + message: "ranking.deterministic.lexical.min_ratio must be a finite number." + .to_string(), + }); + } + if !(0.0..=1.0).contains(&det_lex.min_ratio) { + return Err(Error::Validation { + message: "ranking.deterministic.lexical.min_ratio must be in the range 0.0-1.0." + .to_string(), + }); + } + if det_lex.max_query_terms == 0 { + return Err(Error::Validation { + message: "ranking.deterministic.lexical.max_query_terms must be greater than zero." + .to_string(), + }); + } + if det_lex.max_text_terms == 0 { + return Err(Error::Validation { + message: "ranking.deterministic.lexical.max_text_terms must be greater than zero." + .to_string(), + }); + } + } + if det.enabled && det_hits.enabled { + if !det_hits.half_saturation.is_finite() { + return Err(Error::Validation { + message: "ranking.deterministic.hits.half_saturation must be a finite number." + .to_string(), + }); + } + if det_hits.half_saturation <= 0.0 { + return Err(Error::Validation { + message: "ranking.deterministic.hits.half_saturation must be greater than zero." + .to_string(), + }); + } + if !det_hits.last_hit_tau_days.is_finite() { + return Err(Error::Validation { + message: "ranking.deterministic.hits.last_hit_tau_days must be a finite number." + .to_string(), + }); + } + if det_hits.last_hit_tau_days < 0.0 { + return Err(Error::Validation { + message: "ranking.deterministic.hits.last_hit_tau_days must be zero or greater." + .to_string(), + }); + } + } + if det.enabled && det_decay.enabled { + if !det_decay.tau_days.is_finite() { + return Err(Error::Validation { + message: "ranking.deterministic.decay.tau_days must be a finite number." + .to_string(), + }); + } + if det_decay.tau_days <= 0.0 { + return Err(Error::Validation { + message: "ranking.deterministic.decay.tau_days must be greater than zero." + .to_string(), + }); + } + } + + Ok(()) +} + +fn validate_chunking(cfg: &Config) -> Result<()> { if !cfg.chunking.enabled { - return Err(eyre::eyre!("chunking.enabled must be true.")); + return Err(Error::Validation { message: "chunking.enabled must be true.".to_string() }); + } + if cfg.chunking.tokenizer_repo.trim().is_empty() { + return Err(Error::Validation { + message: "chunking.tokenizer_repo must be a non-empty string.".to_string(), + }); } if cfg.chunking.max_tokens == 0 { - return Err(eyre::eyre!("chunking.max_tokens must be greater than zero.")); + return Err(Error::Validation { + message: "chunking.max_tokens must be greater than zero.".to_string(), + }); } if cfg.chunking.overlap_tokens >= cfg.chunking.max_tokens { - return Err(eyre::eyre!("chunking.overlap_tokens must be less than chunking.max_tokens.")); + return Err(Error::Validation { + message: "chunking.overlap_tokens must be less than chunking.max_tokens.".to_string(), + }); } - for (label, key) in [ - ("embedding", &cfg.providers.embedding.api_key), - ("rerank", &cfg.providers.rerank.api_key), - ("llm_extractor", &cfg.providers.llm_extractor.api_key), + + Ok(()) +} + +fn validate_context(cfg: &Config) -> Result<()> { + if let Some(context) = cfg.context.as_ref() + && let Some(weight) = context.scope_boost_weight + { + if !weight.is_finite() { + return Err(Error::Validation { + message: "context.scope_boost_weight must be a finite number.".to_string(), + }); + } + if weight < 0.0 { + return Err(Error::Validation { + message: "context.scope_boost_weight must be zero or greater.".to_string(), + }); + } + if weight > 1.0 { + return Err(Error::Validation { + message: "context.scope_boost_weight must be 1.0 or less.".to_string(), + }); + } + if weight > 0.0 + && context + .scope_descriptions + .as_ref() + .map(|descriptions| descriptions.is_empty()) + .unwrap_or(true) + { + return Err(Error::Validation { + message: "context.scope_descriptions must be non-empty when context.scope_boost_weight is greater than zero." + .to_string(), + }); + } + } + + Ok(()) +} + +fn validate_mcp(cfg: &Config) -> Result<()> { + let Some(mcp) = cfg.mcp.as_ref() else { return Ok(()) }; + + for (label, value) in [ + ("mcp.tenant_id", &mcp.tenant_id), + ("mcp.project_id", &mcp.project_id), + ("mcp.agent_id", &mcp.agent_id), + ("mcp.read_profile", &mcp.read_profile), ] { - if key.trim().is_empty() { - return Err(eyre::eyre!("Provider {label} api_key must be non-empty.")); + if value.trim().is_empty() { + return Err(Error::Validation { message: format!("{label} must be non-empty.") }); } } + + if !matches!(mcp.read_profile.as_str(), "private_only" | "private_plus_project" | "all_scopes") + { + return Err(Error::Validation { + message: + "mcp.read_profile must be one of private_only, private_plus_project, or all_scopes." + .to_string(), + }); + } + Ok(()) } diff --git a/packages/elf-config/src/types.rs b/packages/elf-config/src/types.rs index 5d435601..ff7144e0 100644 --- a/packages/elf-config/src/types.rs +++ b/packages/elf-config/src/types.rs @@ -1,207 +1,568 @@ -// crates.io +use std::collections::HashMap; + use serde::Deserialize; +use serde_json::{Map, Value}; +/// Complete ELF runtime configuration loaded from `elf.toml`. #[derive(Debug, Deserialize)] pub struct Config { + /// Network bind and log-level settings for ELF services. pub service: Service, + /// Postgres and Qdrant storage backends. pub storage: Storage, + /// Provider settings for embedding, rerank, and extraction calls. pub providers: Providers, + /// Scope labels, read profiles, precedence, and write permissions. pub scopes: Scopes, + /// Write-path limits and memory policy controls. pub memory: Memory, + /// Sentence-aware chunking settings used by ingestion paths. pub chunking: Chunking, + /// Query expansion, caching, explainability, and recursive search settings. pub search: Search, + /// Retrieval ranking, blending, and diversity settings. pub ranking: Ranking, + /// TTL and purge windows for stored notes. pub lifecycle: Lifecycle, + /// Bind-localhost, evidence, and auth settings. pub security: Security, + /// Optional retrieval context metadata used to boost project and scope matches. + pub context: Option<Context>, + /// Optional MCP forwarding context used by `elf-mcp`. + pub mcp: Option<McpContext>, } +/// Optional metadata used to improve retrieval disambiguation across projects and scopes. +#[derive(Debug, Deserialize)] +pub struct Context { + /// Optional. Map keys are either "<tenant_id>:<project_id>" or "<project_id>". + pub project_descriptions: Option<HashMap<String, String>>, + /// Optional. Map keys are scope labels, e.g. "project_shared". + pub scope_descriptions: Option<HashMap<String, String>>, + /// Optional. Additive boost applied to final scores when a query's tokens match a scope + /// description. + pub scope_boost_weight: Option<f32>, +} + +/// Static forwarding context attached by `elf-mcp` to proxied requests. +#[derive(Clone, Debug, Deserialize)] +pub struct McpContext { + /// Tenant identifier attached to proxied MCP requests. + pub tenant_id: String, + /// Project identifier attached to proxied MCP requests. + pub project_id: String, + /// Agent identifier attached to proxied MCP requests. + pub agent_id: String, + /// Read profile attached to proxied MCP requests. + pub read_profile: String, +} + +/// Bind addresses and logging settings for ELF services. #[derive(Debug, Deserialize)] pub struct Service { + /// Bind address for the public HTTP API. pub http_bind: String, + /// Bind address for the MCP server entrypoint. pub mcp_bind: String, + /// Bind address for the admin HTTP API. pub admin_bind: String, + /// Default service log level. pub log_level: String, } +/// Storage backend configuration for persisted note and document data. #[derive(Debug, Deserialize)] pub struct Storage { + /// Postgres source-of-truth settings. pub postgres: Postgres, + /// Qdrant derived-index settings. pub qdrant: Qdrant, } +/// Postgres connection settings. #[derive(Debug, Deserialize)] pub struct Postgres { + /// Postgres DSN used by ELF services. pub dsn: String, + /// Maximum number of pooled Postgres connections. pub pool_max_conns: u32, } +/// Qdrant collection settings for note and document vectors. #[derive(Debug, Deserialize)] pub struct Qdrant { + /// Qdrant base URL used by clients in this workspace. pub url: String, + /// Primary notes collection name. pub collection: String, + /// Document-chunk collection name. + pub docs_collection: String, + /// Vector dimension expected by both note and document collections. pub vector_dim: u32, } +/// Provider configuration bundle for all external model calls. #[derive(Debug, Deserialize)] pub struct Providers { + /// Embedding provider used for vector generation. pub embedding: EmbeddingProviderConfig, + /// Rerank provider used for late-stage scoring. pub rerank: ProviderConfig, + /// LLM provider used by extraction flows such as `add_event`. pub llm_extractor: LlmProviderConfig, } +/// Embedding-provider settings. #[derive(Debug, Deserialize)] pub struct EmbeddingProviderConfig { + /// Provider implementation identifier. pub provider_id: String, + /// Base URL for embedding API requests. pub api_base: String, + /// Non-empty API key for embedding requests. pub api_key: String, + /// Request path appended to `api_base`. pub path: String, + /// Embedding model identifier. pub model: String, + /// Expected embedding vector dimension. pub dimensions: u32, + /// Request timeout in milliseconds. pub timeout_ms: u64, - pub default_headers: serde_json::Map<String, serde_json::Value>, + /// Extra HTTP headers sent with embedding requests. + pub default_headers: Map<String, Value>, } +/// Generic provider settings shared by non-embedding APIs such as rerank. #[derive(Debug, Deserialize)] pub struct ProviderConfig { + /// Provider implementation identifier. pub provider_id: String, + /// Base URL for provider API requests. pub api_base: String, + /// Non-empty API key for provider requests. pub api_key: String, + /// Request path appended to `api_base`. pub path: String, + /// Provider model identifier. pub model: String, + /// Request timeout in milliseconds. pub timeout_ms: u64, - pub default_headers: serde_json::Map<String, serde_json::Value>, + /// Extra HTTP headers sent with provider requests. + pub default_headers: Map<String, Value>, } +/// LLM extractor provider settings. #[derive(Debug, Deserialize)] pub struct LlmProviderConfig { + /// Provider implementation identifier. pub provider_id: String, + /// Base URL for extraction API requests. pub api_base: String, + /// Non-empty API key for extraction requests. pub api_key: String, + /// Request path appended to `api_base`. pub path: String, + /// LLM model identifier. pub model: String, + /// Sampling temperature for extraction requests. pub temperature: f32, + /// Request timeout in milliseconds. pub timeout_ms: u64, - pub default_headers: serde_json::Map<String, serde_json::Value>, + /// Extra HTTP headers sent with extraction requests. + pub default_headers: Map<String, Value>, } +/// Scope labels and access policy used by memory operations. #[derive(Debug, Deserialize)] pub struct Scopes { + /// All scope labels allowed by this deployment. pub allowed: Vec<String>, + /// Scope sets referenced by named read profiles. pub read_profiles: ReadProfiles, + /// Relative precedence used when multiple scopes are eligible. pub precedence: ScopePrecedence, + /// Scope-level write permissions. pub write_allowed: ScopeWriteAllowed, } +/// Scope lists used by named read profiles. #[derive(Debug, Deserialize)] pub struct ReadProfiles { + /// Scope set for `private_only`. pub private_only: Vec<String>, + /// Scope set for `private_plus_project`. pub private_plus_project: Vec<String>, + /// Scope set for `all_scopes`. pub all_scopes: Vec<String>, } +/// Integer precedence used to break ties between scope classes. #[derive(Debug, Deserialize)] pub struct ScopePrecedence { + /// Precedence assigned to `agent_private`. pub agent_private: i32, + /// Precedence assigned to `project_shared`. pub project_shared: i32, + /// Precedence assigned to `org_shared`. pub org_shared: i32, } +/// Scope-level write toggles. #[derive(Debug, Deserialize)] pub struct ScopeWriteAllowed { + /// Whether writes to `agent_private` are allowed. pub agent_private: bool, + /// Whether writes to `project_shared` are allowed. pub project_shared: bool, + /// Whether writes to `org_shared` are allowed. pub org_shared: bool, } +/// Write-path limits and policy controls for note ingestion. #[derive(Debug, Deserialize)] pub struct Memory { + /// Maximum number of notes accepted per `add_event` request. pub max_notes_per_add_event: u32, + /// Maximum character length for an individual note. pub max_note_chars: u32, + /// Similarity threshold for duplicate detection. pub dup_sim_threshold: f32, + /// Similarity threshold for update-vs-insert decisions. pub update_sim_threshold: f32, + /// Candidate pool size used before final top-k selection. pub candidate_k: u32, + /// Final top-k size for note retrieval. pub top_k: u32, + /// Optional downgrade rules applied after base memory decisions. + pub policy: MemoryPolicy, +} + +/// Collection of memory-policy downgrade rules. +#[derive(Debug, Deserialize)] +pub struct MemoryPolicy { + /// Ordered policy rules evaluated against note type, scope, and scores. + pub rules: Vec<MemoryPolicyRule>, +} + +/// A single memory-policy rule matched by note metadata and confidence/importance thresholds. +#[derive(Debug, Default, Deserialize)] +pub struct MemoryPolicyRule { + /// Optional note type selector. + pub note_type: Option<String>, + /// Optional scope selector. + pub scope: Option<String>, + /// Optional minimum confidence required for the rule to match. + pub min_confidence: Option<f32>, + /// Optional minimum importance required for the rule to match. + pub min_importance: Option<f32>, } +/// Sentence-aware token chunking settings. #[derive(Debug, Deserialize)] pub struct Chunking { + /// Whether chunking support is enabled. pub enabled: bool, + /// Maximum tokens allowed in one chunk. pub max_tokens: u32, + /// Number of tail tokens overlapped into the next chunk. pub overlap_tokens: u32, - pub tokenizer_repo: Option<String>, + /// Hugging Face tokenizer repo used for token counting. + pub tokenizer_repo: String, } +/// Query-time search settings. #[derive(Debug, Deserialize)] pub struct Search { + /// Query expansion behavior. pub expansion: SearchExpansion, + /// Dynamic-expansion trigger thresholds. pub dynamic: SearchDynamic, + /// Prefilter candidate cap. pub prefilter: SearchPrefilter, + /// Search cache settings. pub cache: SearchCache, + /// Explainability retention settings. pub explain: SearchExplain, + /// Recursive retrieval traversal settings. + pub recursive: SearchRecursive, + /// Graph-context enrichment settings. + pub graph_context: SearchGraphContext, } +/// Query expansion settings. #[derive(Debug, Deserialize)] pub struct SearchExpansion { + /// Expansion mode such as `off`, `always`, or `dynamic`. pub mode: String, + /// Maximum number of expansion queries emitted. pub max_queries: u32, + /// Whether the original query is retained alongside expansions. pub include_original: bool, } +/// Thresholds that determine when dynamic expansion is activated. #[derive(Debug, Deserialize)] pub struct SearchDynamic { + /// Minimum initial candidate count before dynamic expansion is skipped. pub min_candidates: u32, + /// Minimum top score before dynamic expansion is skipped. pub min_top_score: f32, } +/// Candidate prefilter settings. #[derive(Debug, Deserialize)] pub struct SearchPrefilter { + /// Maximum number of candidates kept before later stages. pub max_candidates: u32, } +/// Cache settings for expansion and rerank outputs. #[derive(Debug, Deserialize)] pub struct SearchCache { + /// Whether search caching is enabled. pub enabled: bool, + /// TTL in days for cached expansion outputs. pub expansion_ttl_days: i64, + /// TTL in days for cached rerank outputs. pub rerank_ttl_days: i64, + /// Optional upper bound on cached payload size in bytes. pub max_payload_bytes: Option<u64>, - pub expansion_version: String, - pub rerank_version: String, } +/// Search explainability retention and write-path settings. #[derive(Debug, Deserialize)] pub struct SearchExplain { + /// Retention window for explain rows in days. pub retention_days: i64, + /// Whether candidate snapshots are captured. + pub capture_candidates: bool, + /// Retention window for candidate snapshots in days. + pub candidate_retention_days: i64, + /// Explainability write mode. + pub write_mode: String, +} + +/// Recursive retrieval traversal limits. +#[derive(Debug, Deserialize)] +pub struct SearchRecursive { + /// Whether recursive retrieval is enabled. + pub enabled: bool, + /// Maximum recursion depth. + pub max_depth: u32, + /// Maximum children expanded per node. + pub max_children_per_node: u32, + /// Maximum nodes retained per scope. + pub max_nodes_per_scope: u32, + /// Maximum nodes retained across the whole traversal. + pub max_total_nodes: u32, } +/// Graph-context enrichment limits applied to search responses. +#[derive(Debug, Deserialize)] +pub struct SearchGraphContext { + /// Whether graph-context enrichment is enabled. + pub enabled: bool, + /// Maximum facts attached to one response item. + pub max_facts_per_item: u32, + /// Maximum evidence notes attached to one fact. + pub max_evidence_notes_per_fact: u32, +} + +/// Ranking settings for retrieval and rerank fusion. #[derive(Debug, Deserialize)] pub struct Ranking { + /// Recency decay window in days. pub recency_tau_days: f32, + /// Small deterministic tie-breaker weight. pub tie_breaker_weight: f32, + /// Retrieval/rerank blending configuration. + pub blend: RankingBlend, + /// Optional deterministic scoring overlays. + pub deterministic: RankingDeterministic, + /// Diversity settings applied during selection. + pub diversity: RankingDiversity, + /// Source weighting and priority between fusion and structured fields. + pub retrieval_sources: RankingRetrievalSources, +} + +/// Deterministic ranking overlays applied on top of model scores. +#[derive(Debug, Deserialize)] +pub struct RankingDeterministic { + /// Whether deterministic overlays are enabled. + pub enabled: bool, + /// Lexical-overlap term settings. + pub lexical: RankingDeterministicLexical, + /// Historical-hit term settings. + pub hits: RankingDeterministicHits, + /// Decay term settings. + pub decay: RankingDeterministicDecay, } +/// Lexical-overlap deterministic term. +#[derive(Debug, Deserialize)] +pub struct RankingDeterministicLexical { + /// Whether the lexical term is enabled. + pub enabled: bool, + /// Weight assigned to the lexical term. + pub weight: f32, + /// Minimum overlap ratio required before the term applies. + pub min_ratio: f32, + /// Maximum number of query terms examined. + pub max_query_terms: u32, + /// Maximum number of text terms examined. + pub max_text_terms: u32, +} + +/// Historical-hit deterministic term. +#[derive(Debug, Deserialize)] +pub struct RankingDeterministicHits { + /// Whether the hits term is enabled. + pub enabled: bool, + /// Weight assigned to the hits term. + pub weight: f32, + /// Half-saturation parameter for hit-count scaling. + pub half_saturation: f32, + /// Decay window in days for the last-hit component. + pub last_hit_tau_days: f32, +} + +/// Decay-based deterministic term. +#[derive(Debug, Deserialize)] +pub struct RankingDeterministicDecay { + /// Whether the decay term is enabled. + pub enabled: bool, + /// Weight assigned to the decay term. + pub weight: f32, + /// Decay window in days. + pub tau_days: f32, +} + +/// Retrieval/rerank blending configuration. +#[derive(Debug, Deserialize)] +pub struct RankingBlend { + /// Whether blend mode is enabled. + pub enabled: bool, + /// Normalization strategy applied to rerank scores. + pub rerank_normalization: String, + /// Normalization strategy applied to retrieval scores. + pub retrieval_normalization: String, + /// Retrieval-rank segments that assign retrieval weights. + pub segments: Vec<RankingBlendSegment>, +} + +/// One retrieval-rank segment used by blend mode. +#[derive(Debug, Deserialize)] +pub struct RankingBlendSegment { + /// Inclusive maximum retrieval rank for this segment. + pub max_retrieval_rank: u32, + /// Retrieval weight applied within this segment. + pub retrieval_weight: f32, +} + +/// Diversity controls used when selecting final results. +#[derive(Debug, Deserialize)] +pub struct RankingDiversity { + /// Whether diversity filtering is enabled. + pub enabled: bool, + /// Similarity threshold above which candidates may be skipped. + pub sim_threshold: f32, + /// Lambda used by MMR-style balancing. + pub mmr_lambda: f32, + /// Maximum number of skipped candidates before backfilling. + pub max_skips: u32, +} + +/// Source weighting and priority between fusion and structured-field retrieval. +#[derive(Debug, Deserialize)] +pub struct RankingRetrievalSources { + /// Weight applied to fused retrieval results. + pub fusion_weight: f32, + /// Weight applied to structured-field matches. + pub structured_field_weight: f32, + /// Priority assigned to fused retrieval results. + pub fusion_priority: u32, + /// Priority assigned to structured-field matches. + pub structured_field_priority: u32, +} + +/// Lifecycle retention and purge settings. #[derive(Debug, Deserialize)] pub struct Lifecycle { + /// Note-type-specific TTL settings. pub ttl_days: TtlDays, + /// Days to retain deleted notes before purge. pub purge_deleted_after_days: i64, + /// Days to retain deprecated notes before purge. pub purge_deprecated_after_days: i64, } +/// TTL values in days for each note type. #[derive(Debug, Deserialize)] pub struct TtlDays { + /// TTL for `plan` notes. pub plan: i64, + /// TTL for `fact` notes. pub fact: i64, + /// TTL for `preference` notes. pub preference: i64, + /// TTL for `constraint` notes. pub constraint: i64, + /// TTL for `decision` notes. pub decision: i64, + /// TTL for `profile` notes. pub profile: i64, } +/// Request security, evidence, and auth settings. #[derive(Debug, Deserialize)] pub struct Security { + /// Whether services must bind only to loopback interfaces. pub bind_localhost_only: bool, - pub reject_cjk: bool, + /// Whether non-English input is rejected at the API boundary. + pub reject_non_english: bool, + /// Whether secret-like text is redacted before write. pub redact_secrets_on_write: bool, + /// Minimum number of quotes required for evidence binding. pub evidence_min_quotes: u32, + /// Maximum number of quotes allowed for evidence binding. pub evidence_max_quotes: u32, + /// Maximum characters allowed in one evidence quote. pub evidence_max_quote_chars: u32, + /// Authentication mode such as `off` or `static_keys`. + pub auth_mode: String, + /// Static bearer-token entries used when `auth_mode` is `static_keys`. + pub auth_keys: Vec<SecurityAuthKey>, +} + +/// A single static bearer-token entry. +#[derive(Debug, Deserialize)] +pub struct SecurityAuthKey { + /// Stable token identifier used for auditing. + pub token_id: String, + /// Bearer token value matched from incoming requests. + pub token: String, + /// Tenant identifier granted by this token. + pub tenant_id: String, + /// Project identifier granted by this token. + pub project_id: String, + + /// Optional agent identifier restriction. + pub agent_id: Option<String>, + /// Read profile granted by this token. + pub read_profile: String, + /// Role assigned to this token. + pub role: SecurityAuthRole, +} + +/// Role values accepted by static auth keys. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SecurityAuthRole { + /// Standard user token. + User, + /// Admin token with elevated write privileges. + Admin, + /// Super-admin token for global admin operations. + SuperAdmin, } diff --git a/packages/elf-config/tests/config_validation.rs b/packages/elf-config/tests/config_validation.rs index aad49576..26554a07 100644 --- a/packages/elf-config/tests/config_validation.rs +++ b/packages/elf-config/tests/config_validation.rs @@ -1,227 +1,752 @@ -// std +#![allow(unused_crate_dependencies)] + +//! Config validation tests for the ELF configuration loader. + use std::{ + collections::HashMap, env, fs, path::PathBuf, + process, + sync::atomic::{AtomicU64, Ordering}, time::{SystemTime, UNIX_EPOCH}, }; -fn sample_toml(reject_cjk: bool) -> String { - sample_toml_with_cache(reject_cjk, 7, 7, true, "v1", "v1") +use toml::Value; + +use elf_config::{self, Config, Context, Error, MemoryPolicyRule}; + +const SAMPLE_CONFIG_TEMPLATE_TOML: &str = include_str!("fixtures/sample_config.template.toml"); +const TRACE_GATE_CONFIG_TOML: &str = + include_str!("../../../.github/fixtures/trace_gate/config.toml"); + +fn sample_toml(reject_non_english: bool) -> String { + sample_toml_with_recursive(reject_non_english, false, 2, 4, 32, 256) +} + +fn sample_toml_with_recursive( + reject_non_english: bool, + recursive_enabled: bool, + max_depth: i64, + max_children_per_node: i64, + max_nodes_per_scope: i64, + max_total_nodes: i64, +) -> String { + let mut value: Value = + toml::from_str(SAMPLE_CONFIG_TEMPLATE_TOML).expect("Failed to parse template config."); + let root = value.as_table_mut().expect("Template config must be a table."); + let search = root + .get_mut("search") + .and_then(Value::as_table_mut) + .expect("Template config must include [search]."); + let recursive = search + .get_mut("recursive") + .and_then(Value::as_table_mut) + .expect("Template config must include [search.recursive]."); + + recursive.insert("enabled".to_string(), Value::Boolean(recursive_enabled)); + recursive.insert("max_depth".to_string(), Value::Integer(max_depth)); + recursive.insert("max_children_per_node".to_string(), Value::Integer(max_children_per_node)); + recursive.insert("max_nodes_per_scope".to_string(), Value::Integer(max_nodes_per_scope)); + recursive.insert("max_total_nodes".to_string(), Value::Integer(max_total_nodes)); + + let security = root + .get_mut("security") + .and_then(Value::as_table_mut) + .expect("Template config must include [security]."); + + security.insert("reject_non_english".to_string(), Value::Boolean(reject_non_english)); + + toml::to_string(&value).expect("Failed to render template config.") } fn sample_toml_with_cache( - reject_cjk: bool, + reject_non_english: bool, expansion_ttl_days: i64, rerank_ttl_days: i64, cache_enabled: bool, - expansion_version: &str, - rerank_version: &str, ) -> String { - format!( - r#"[service] -http_bind = "127.0.0.1:8080" -mcp_bind = "127.0.0.1:9090" -admin_bind = "127.0.0.1:8081" -log_level = "info" - -[storage.postgres] -dsn = "postgres://user:pass@127.0.0.1:5432/elf" -pool_max_conns = 5 - -[storage.qdrant] -url = "http://127.0.0.1:6334" -collection = "mem_notes_v1" -vector_dim = 1536 - -[providers.embedding] -provider_id = "embed" -api_base = "http://localhost" -api_key = "key" -path = "/embeddings" -model = "model" -dimensions = 1536 -timeout_ms = 1000 -default_headers = {{}} - -[providers.rerank] -provider_id = "rerank" -api_base = "http://localhost" -api_key = "key" -path = "/rerank" -model = "model" -timeout_ms = 1000 -default_headers = {{}} - -[providers.llm_extractor] -provider_id = "llm" -api_base = "http://localhost" -api_key = "key" -path = "/chat/completions" -model = "model" -temperature = 0.1 -timeout_ms = 1000 -default_headers = {{}} - -[scopes] -allowed = ["agent_private"] - -[scopes.read_profiles] -private_only = ["agent_private"] -private_plus_project = ["agent_private"] -all_scopes = ["agent_private"] - -[scopes.precedence] -agent_private = 30 -project_shared = 20 -org_shared = 10 - -[scopes.write_allowed] -agent_private = true -project_shared = true -org_shared = true - -[memory] -max_notes_per_add_event = 3 -max_note_chars = 240 -dup_sim_threshold = 0.92 -update_sim_threshold = 0.85 -candidate_k = 60 -top_k = 12 - -[chunking] -enabled = true -max_tokens = 512 -overlap_tokens = 128 -tokenizer_repo = "" - -[search.expansion] -mode = "dynamic" -max_queries = 4 -include_original = true - -[search.dynamic] -min_candidates = 10 -min_top_score = 0.12 - -[search.prefilter] -max_candidates = 0 - -[search.cache] -enabled = {cache_enabled} -expansion_ttl_days = {expansion_ttl_days} -rerank_ttl_days = {rerank_ttl_days} -max_payload_bytes = 262144 -expansion_version = "{expansion_version}" -rerank_version = "{rerank_version}" - -[search.explain] -retention_days = 7 - -[ranking] -recency_tau_days = 60.0 -tie_breaker_weight = 0.1 - -[lifecycle.ttl_days] -plan = 14 -fact = 180 -preference = 0 -constraint = 0 -decision = 0 -profile = 0 - -[lifecycle] -purge_deleted_after_days = 30 -purge_deprecated_after_days = 180 - -[security] -bind_localhost_only = true -reject_cjk = {reject_cjk} -redact_secrets_on_write = true -evidence_min_quotes = 1 -evidence_max_quotes = 2 -evidence_max_quote_chars = 320 -"#, - reject_cjk = reject_cjk, - cache_enabled = cache_enabled, - expansion_ttl_days = expansion_ttl_days, - rerank_ttl_days = rerank_ttl_days, - expansion_version = expansion_version, - rerank_version = rerank_version - ) + let mut value: Value = + toml::from_str(&sample_toml_with_recursive(reject_non_english, false, 2, 4, 32, 256)) + .expect("Failed to parse template config."); + let root = value.as_table_mut().expect("Template config must be a table."); + let search = root + .get_mut("search") + .and_then(Value::as_table_mut) + .expect("Template config must include [search]."); + let cache = search + .get_mut("cache") + .and_then(Value::as_table_mut) + .expect("Template config must include [search.cache]."); + + cache.insert("enabled".to_string(), Value::Boolean(cache_enabled)); + cache.insert("expansion_ttl_days".to_string(), Value::Integer(expansion_ttl_days)); + cache.insert("rerank_ttl_days".to_string(), Value::Integer(rerank_ttl_days)); + + toml::to_string(&value).expect("Failed to render template config.") } fn write_temp_config(payload: String) -> PathBuf { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("System time must be valid.") .as_nanos(); + let ordinal = COUNTER.fetch_add(1, Ordering::SeqCst); + let pid = process::id(); let mut path = env::temp_dir(); - path.push(format!("elf_config_test_{nanos}.toml")); + + path.push(format!("elf_config_test_{nanos}_{pid}_{ordinal}.toml")); + fs::write(&path, payload).expect("Failed to write test config."); + path } -fn base_config() -> elf_config::Config { +fn remove_required_config_key(payload: &str, path: &[&str]) -> String { + assert!(!path.is_empty(), "Config path must not be empty."); + + let mut value: Value = toml::from_str(payload).expect("Failed to parse test config."); + let mut table = value.as_table_mut().expect("Template config must be a table."); + + for segment in &path[..path.len() - 1] { + table = table + .get_mut(*segment) + .and_then(Value::as_table_mut) + .unwrap_or_else(|| panic!("Template config must include [{}].", segment)); + } + + let field = path[path.len() - 1]; + let removed = table.remove(field); + + assert!(removed.is_some(), "Template config must include {}.", path.join(".")); + + toml::to_string(&value).expect("Failed to render template config.") +} + +fn assert_missing_field_error(result: Result<Config, Error>, field: &str) { + let err = result.expect_err("Expected missing required field parse error."); + let message = match err { + Error::ParseConfig { source, .. } => source.to_string(), + err => panic!("Expected parse config error, got {err}"), + }; + + assert!(message.contains(&format!("missing field `{field}`")), "Unexpected error: {message}"); +} + +fn base_config() -> Config { let payload = sample_toml(true); + toml::from_str(&payload).expect("Failed to parse test config.") } #[test] -fn reject_cjk_must_be_true() { +fn required_config_fields_must_be_explicit() { + let cases = [ + (&["storage", "qdrant", "docs_collection"][..], "docs_collection"), + (&["memory", "policy"][..], "policy"), + (&["search", "recursive"][..], "recursive"), + (&["search", "graph_context"][..], "graph_context"), + (&["security", "auth_keys"][..], "auth_keys"), + ]; + + for (path, field) in cases { + let payload = remove_required_config_key(&sample_toml(true), path); + let config_path = write_temp_config(payload); + let result = elf_config::load(&config_path); + + fs::remove_file(&config_path).expect("Failed to remove test config."); + + assert_missing_field_error(result, field); + } +} + +#[test] +fn docker_local_config_is_strict_valid() { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../config/local/elf.docker.toml"); + let cfg = elf_config::load(path.as_path()).expect("Docker local config must load."); + + assert_eq!( + cfg.storage.postgres.dsn, + "postgres://elf_dev:elf_dev_password@127.0.0.1:51888/elf_local" + ); + assert_eq!(cfg.storage.qdrant.url, "http://127.0.0.1:51890"); + assert_eq!(cfg.storage.qdrant.collection, "elf_local_notes"); + assert_eq!(cfg.storage.qdrant.docs_collection, "elf_local_doc_chunks"); + assert_eq!(cfg.providers.embedding.provider_id, "local"); + assert_eq!(cfg.providers.rerank.provider_id, "local"); + assert_eq!(cfg.search.expansion.mode, "off"); +} + +#[test] +fn reject_non_english_must_be_true() { let payload = sample_toml(false); let path = write_temp_config(payload); - let result = elf_config::load(&path); + fs::remove_file(&path).expect("Failed to remove test config."); - let err = result.expect_err("Expected reject_cjk validation error."); + let err = result.expect_err("Expected reject_non_english validation error."); let message = err.to_string(); + assert!( - message.contains("security.reject_cjk must be true."), + message.contains("security.reject_non_english must be true."), "Unexpected error message: {message}" ); } #[test] fn cache_ttl_must_be_positive() { - let payload = sample_toml_with_cache(true, 0, 7, true, "v1", "v1"); + let payload = sample_toml_with_cache(true, 0, 7, true); let path = write_temp_config(payload); - let result = elf_config::load(&path); + fs::remove_file(&path).expect("Failed to remove test config."); let err = result.expect_err("Expected cache TTL validation error."); + assert!( err.to_string().contains("search.cache.expansion_ttl_days must be greater than zero."), "Unexpected error: {err}" ); } +#[test] +fn recursive_search_settings_can_be_valid() { + let mut cfg = base_config(); + + cfg.search.recursive.enabled = true; + cfg.search.recursive.max_depth = 4; + cfg.search.recursive.max_children_per_node = 12; + cfg.search.recursive.max_nodes_per_scope = 64; + cfg.search.recursive.max_total_nodes = 120; + + assert!(elf_config::validate(&cfg).is_ok()); +} + +#[test] +fn recursive_search_settings_require_valid_depth_bounds() { + let mut cfg = base_config(); + + cfg.search.recursive.enabled = true; + cfg.search.recursive.max_depth = 0; + + let err = + elf_config::validate(&cfg).expect_err("Expected recursive max_depth validation error."); + + assert!( + err.to_string().contains("search.recursive.max_depth must be greater than zero."), + "Unexpected error: {err}" + ); +} + +#[test] +fn recursive_search_settings_require_reasonable_bounds() { + let mut cfg = base_config(); + + cfg.search.recursive.enabled = true; + cfg.search.recursive.max_children_per_node = 0; + + let err = + elf_config::validate(&cfg).expect_err("Expected recursive branch factor validation error."); + + assert!( + err.to_string() + .contains("search.recursive.max_children_per_node must be greater than zero."), + "Unexpected error: {err}" + ); + + cfg = base_config(); + cfg.search.recursive.enabled = true; + cfg.search.recursive.max_total_nodes = 8; + cfg.search.recursive.max_nodes_per_scope = 12; + + let err = elf_config::validate(&cfg) + .expect_err("Expected recursive max_total_nodes lower-bound validation error."); + + assert!( + err.to_string().contains( + "search.recursive.max_total_nodes must be at least search.recursive.max_nodes_per_scope." + ), + "Unexpected error: {err}" + ); +} + +#[test] +fn graph_context_settings_max_facts_per_item_must_be_positive_when_enabled() { + let mut cfg = base_config(); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_facts_per_item = 0; + + let err = elf_config::validate(&cfg) + .expect_err("Expected graph_context max_facts_per_item validation error."); + + assert!( + err.to_string() + .contains("search.graph_context.max_facts_per_item must be greater than zero."), + "Unexpected error: {err}" + ); +} + +#[test] +fn graph_context_settings_max_evidence_notes_per_fact_must_be_positive_when_enabled() { + let mut cfg = base_config(); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_evidence_notes_per_fact = 0; + + let err = elf_config::validate(&cfg) + .expect_err("Expected graph_context max_evidence_notes_per_fact validation error."); + + assert!( + err.to_string().contains( + "search.graph_context.max_evidence_notes_per_fact must be greater than zero." + ), + "Unexpected error: {err}" + ); +} + +#[test] +fn graph_context_settings_max_facts_per_item_cannot_exceed_hard_limit() { + let mut cfg = base_config(); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_facts_per_item = 1_001; + + let err = elf_config::validate(&cfg) + .expect_err("Expected graph_context max_facts_per_item upper-bound validation error."); + + assert!( + err.to_string().contains("search.graph_context.max_facts_per_item must be 1,000 or less."), + "Unexpected error: {err}" + ); +} + +#[test] +fn graph_context_settings_max_evidence_notes_per_fact_cannot_exceed_hard_limit() { + let mut cfg = base_config(); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_evidence_notes_per_fact = 1_001; + + let err = elf_config::validate(&cfg).expect_err( + "Expected graph_context max_evidence_notes_per_fact upper-bound validation error.", + ); + + assert!( + err.to_string() + .contains("search.graph_context.max_evidence_notes_per_fact must be 1,000 or less."), + "Unexpected error: {err}" + ); +} + #[test] fn chunking_config_requires_valid_bounds() { let mut cfg = base_config(); + cfg.chunking.max_tokens = 0; + assert!(elf_config::validate(&cfg).is_err()); cfg = base_config(); cfg.chunking.overlap_tokens = cfg.chunking.max_tokens; + assert!(elf_config::validate(&cfg).is_err()); } #[test] -fn chunking_tokenizer_repo_can_inherit_from_embedding_model() { +fn chunking_tokenizer_repo_cannot_be_empty_or_whitespace() { + let mut payload = sample_toml(true); + + payload = payload.replace("tokenizer_repo = \"REPLACE_ME\"", "tokenizer_repo = \" \""); + + let path = write_temp_config(payload); + let err = elf_config::load(&path).expect_err("Expected tokenizer validation error."); + + fs::remove_file(&path).expect("Failed to remove test config."); + + assert!(err.to_string().contains("chunking.tokenizer_repo must be a non-empty string.")); +} + +#[test] +fn chunking_tokenizer_repo_is_required() { + let mut payload = sample_toml(true); + + payload = payload.replace("tokenizer_repo = \"REPLACE_ME\"\n", ""); + + let path = write_temp_config(payload); + let err = elf_config::load(&path).expect_err("Expected missing tokenizer_repo parse error."); + + fs::remove_file(&path).expect("Failed to remove test config."); + + let message = match err { + Error::ParseConfig { source, .. } => source.to_string(), + err => panic!("Expected parse config error, got {err}"), + }; + + assert!( + message.contains("missing field `tokenizer_repo`") + || message.contains("missing field `tokenizer repo`"), + "Unexpected error: {message}" + ); +} + +#[test] +fn context_scope_boost_weight_requires_scope_descriptions_when_enabled() { let mut cfg = base_config(); - cfg.chunking.tokenizer_repo = None; + + cfg.context = Some(Context { + project_descriptions: None, + scope_descriptions: None, + scope_boost_weight: Some(0.1), + }); + + let err = elf_config::validate(&cfg).expect_err("Expected context validation error."); + + assert!( + err.to_string().contains( + "context.scope_descriptions must be non-empty when context.scope_boost_weight is greater than zero." + ), + "Unexpected error: {err}" + ); +} + +#[test] +fn context_scope_boost_weight_accepts_zero_without_descriptions() { + let mut cfg = base_config(); + + cfg.context = Some(Context { + project_descriptions: None, + scope_descriptions: None, + scope_boost_weight: Some(0.0), + }); + assert!(elf_config::validate(&cfg).is_ok()); } #[test] -fn chunking_tokenizer_repo_empty_string_normalizes_to_none() { - let payload = sample_toml(true); - let path = write_temp_config(payload); +fn context_scope_boost_weight_must_be_finite() { + let mut cfg = base_config(); + let mut scope_descriptions = HashMap::new(); + + scope_descriptions.insert("project_shared".to_string(), "Project notes.".to_string()); - let cfg = elf_config::load(&path).expect("Expected config to load."); + cfg.context = Some(Context { + project_descriptions: None, + scope_descriptions: Some(scope_descriptions), + scope_boost_weight: Some(f32::NAN), + }); + + let err = elf_config::validate(&cfg).expect_err("Expected context validation error."); + + assert!( + err.to_string().contains("context.scope_boost_weight must be a finite number."), + "Unexpected error: {err}" + ); +} + +#[test] +fn context_scope_boost_weight_must_be_in_range() { + let mut cfg = base_config(); + let mut scope_descriptions = HashMap::new(); + + scope_descriptions.insert("project_shared".to_string(), "Project notes.".to_string()); + + cfg.context = Some(Context { + project_descriptions: None, + scope_descriptions: Some(scope_descriptions.clone()), + scope_boost_weight: Some(-0.01), + }); + + let err = elf_config::validate(&cfg).expect_err("Expected context validation error."); + + assert!( + err.to_string().contains("context.scope_boost_weight must be zero or greater."), + "Unexpected error: {err}" + ); + + cfg.context = Some(Context { + project_descriptions: None, + scope_descriptions: Some(scope_descriptions), + scope_boost_weight: Some(1.01), + }); + + let err = elf_config::validate(&cfg).expect_err("Expected context validation error."); + + assert!( + err.to_string().contains("context.scope_boost_weight must be 1.0 or less."), + "Unexpected error: {err}" + ); +} + +#[test] +fn elf_example_toml_is_valid() { + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + + path.push("../../elf.example.toml"); + + elf_config::load(&path).expect("Expected elf.example.toml to be a valid config."); +} + +#[test] +fn trace_gate_fixture_toml_is_valid() { + let path = write_temp_config(TRACE_GATE_CONFIG_TOML.to_string()); + + elf_config::load(&path).expect("Expected trace gate fixture config to be valid."); fs::remove_file(&path).expect("Failed to remove test config."); +} + +#[test] +fn retrieval_source_weights_must_be_non_negative() { + let mut cfg = base_config(); + + cfg.ranking.retrieval_sources.fusion_weight = -0.1; + + let err = + elf_config::validate(&cfg).expect_err("Expected retrieval source weight validation error."); + + assert!( + err.to_string() + .contains("ranking.retrieval_sources.fusion_weight must be zero or greater."), + "Unexpected error: {err}" + ); +} + +#[test] +fn retrieval_source_weights_require_at_least_one_positive() { + let mut cfg = base_config(); + + cfg.ranking.retrieval_sources.fusion_weight = 0.0; + cfg.ranking.retrieval_sources.structured_field_weight = 0.0; + + let err = elf_config::validate(&cfg) + .expect_err("Expected retrieval source at-least-one-positive validation error."); + + assert!( + err.to_string().contains("At least one retrieval source weight must be greater than zero."), + "Unexpected error: {err}" + ); +} + +#[test] +fn security_auth_keys_require_unique_token_ids() { + let mut cfg = base_config(); + + cfg.security.auth_mode = "static_keys".to_string(); + cfg.security.auth_keys = vec![ + elf_config::SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret-1".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: elf_config::SecurityAuthRole::User, + }, + elf_config::SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret-2".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "private_plus_project".to_string(), + role: elf_config::SecurityAuthRole::Admin, + }, + ]; + + let err = + elf_config::validate(&cfg).expect_err("Expected duplicate token_id validation error."); + + assert!( + err.to_string().contains("token_id must be unique across security.auth_keys."), + "Unexpected error: {err}" + ); +} - assert!(cfg.chunking.tokenizer_repo.is_none()); +#[test] +fn security_auth_keys_require_known_read_profile() { + let mut cfg = base_config(); + + cfg.security.auth_mode = "static_keys".to_string(); + cfg.security.auth_keys = vec![elf_config::SecurityAuthKey { + token_id: "k1".to_string(), + token: "secret-1".to_string(), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: Some("a".to_string()), + read_profile: "unknown".to_string(), + role: elf_config::SecurityAuthRole::User, + }]; + + let err = + elf_config::validate(&cfg).expect_err("Expected auth key read_profile validation error."); + + assert!( + err.to_string().contains( + "read_profile must be one of private_only, private_plus_project, or all_scopes." + ), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_min_confidence_must_be_finite() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { min_confidence: Some(f32::NAN), ..Default::default() }); + + let err = elf_config::validate(&cfg).expect_err("Expected min_confidence validation error."); + + assert!( + err.to_string().contains("memory.policy.rules[1].min_confidence must be a finite number."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_min_confidence_must_be_in_range() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { min_confidence: Some(1.01), ..Default::default() }); + + let err = + elf_config::validate(&cfg).expect_err("Expected min_confidence range validation error."); + + assert!( + err.to_string() + .contains("memory.policy.rules[1].min_confidence must be between 0.0 and 1.0."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_min_importance_must_be_finite() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { min_importance: Some(f32::INFINITY), ..Default::default() }); + + let err = elf_config::validate(&cfg).expect_err("Expected min_importance validation error."); + + assert!( + err.to_string().contains("memory.policy.rules[1].min_importance must be a finite number."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_min_importance_must_be_in_range() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { min_importance: Some(-0.01), ..Default::default() }); + + let err = + elf_config::validate(&cfg).expect_err("Expected min_importance range validation error."); + + assert!( + err.to_string() + .contains("memory.policy.rules[1].min_importance must be between 0.0 and 1.0."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_note_type_must_be_known_value() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { note_type: Some("unknown".to_string()), ..Default::default() }); + + let err = elf_config::validate(&cfg).expect_err("Expected note_type validation error."); + + assert!( + err.to_string().contains( + "memory.policy.rules[1].note_type must be one of preference, constraint, decision, profile, fact, or plan." + ), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_scope_must_be_allowed() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { scope: Some("invalid_scope".to_string()), ..Default::default() }); + + let err = elf_config::validate(&cfg).expect_err("Expected scope validation error."); + + assert!( + err.to_string().contains("memory.policy.rules[1].scope must be one of allowed scopes."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_rule_pairs_must_be_unique() { + let mut cfg = base_config(); + + cfg.memory.policy.rules.push(Default::default()); + cfg.memory.policy.rules.push(Default::default()); + + let err = elf_config::validate(&cfg).expect_err("Expected duplicate rule validation error."); + + assert!( + err.to_string() + .contains("memory.policy.rules[2] has a duplicate note_type and scope pair."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_note_type_must_not_be_whitespace_only() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { note_type: Some(" ".to_string()), ..Default::default() }); + + let err = + elf_config::validate(&cfg).expect_err("Expected whitespace note_type validation error."); + + assert!( + err.to_string() + .contains("memory.policy.rules[1].note_type cannot be blank or whitespace-only."), + "Unexpected error: {err}" + ); +} + +#[test] +fn memory_policy_scope_must_not_be_whitespace_only() { + let mut cfg = base_config(); + + cfg.memory + .policy + .rules + .push(MemoryPolicyRule { scope: Some(" ".to_string()), ..Default::default() }); + + let err = elf_config::validate(&cfg).expect_err("Expected whitespace scope validation error."); + + assert!( + err.to_string() + .contains("memory.policy.rules[1].scope cannot be blank or whitespace-only."), + "Unexpected error: {err}" + ); } diff --git a/packages/elf-config/tests/fixtures/sample_config.template.toml b/packages/elf-config/tests/fixtures/sample_config.template.toml new file mode 100644 index 00000000..ec15e713 --- /dev/null +++ b/packages/elf-config/tests/fixtures/sample_config.template.toml @@ -0,0 +1,196 @@ +[service] +admin_bind = "127.0.0.1:8081" +http_bind = "127.0.0.1:8080" +log_level = "info" +mcp_bind = "127.0.0.1:9090" + +[storage.postgres] +dsn = "postgres://user:pass@127.0.0.1:5432/elf" +pool_max_conns = 5 + +[storage.qdrant] +collection = "mem_notes_v2" +docs_collection = "doc_chunks_v1" +url = "http://127.0.0.1:6334" +vector_dim = 4_096 + +[providers.embedding] +api_base = "http://localhost" +api_key = "key" +default_headers = {} +dimensions = 4_096 +model = "model" +path = "/embeddings" +provider_id = "embed" +timeout_ms = 1_000 + +[providers.rerank] +api_base = "http://localhost" +api_key = "key" +default_headers = {} +model = "model" +path = "/rerank" +provider_id = "rerank" +timeout_ms = 1_000 + +[providers.llm_extractor] +api_base = "http://localhost" +api_key = "key" +default_headers = {} +model = "model" +path = "/chat/completions" +provider_id = "llm" +temperature = 0.1 +timeout_ms = 1_000 + +[scopes] +allowed = ["agent_private"] + +[scopes.read_profiles] +all_scopes = ["agent_private"] +private_only = ["agent_private"] +private_plus_project = ["agent_private"] + +[scopes.precedence] +agent_private = 30 +org_shared = 10 +project_shared = 20 + +[scopes.write_allowed] +agent_private = true +org_shared = true +project_shared = true + +[memory] +candidate_k = 60 +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = 12 +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.9 +min_importance = 0.75 +note_type = "preference" +scope = "agent_private" + +[chunking] +enabled = true +max_tokens = 512 +overlap_tokens = 128 +tokenizer_repo = "REPLACE_ME" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "dynamic" + +[search.dynamic] +min_candidates = 10 +min_top_score = 0.12 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = true +expansion_ttl_days = 7 +max_payload_bytes = 262_144 +rerank_ttl_days = 7 + +[search.explain] +candidate_retention_days = 2 +capture_candidates = false +retention_days = 7 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 60.0 +tie_breaker_weight = 0.1 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.05 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.05 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.05 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.5 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_keys = [] +auth_mode = "off" +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true diff --git a/packages/elf-domain/Cargo.toml b/packages/elf-domain/Cargo.toml index 45b9ca06..25c4d732 100644 --- a/packages/elf-domain/Cargo.toml +++ b/packages/elf-domain/Cargo.toml @@ -1,10 +1,19 @@ [package] edition = "2024" name = "elf-domain" -version = "0.1.0" +version = "0.2.0" [dependencies] -elf-config = { path = "../elf-config" } -regex = { version = "1.0" } +regex = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +unicode-normalization = { workspace = true } +unicode-script = { workspace = true } +uuid = { workspace = true } +whatlang = { workspace = true } + +elf-config = { workspace = true } + +[dev-dependencies] serde_json = { workspace = true } -time = { workspace = true } diff --git a/packages/elf-domain/src/cjk.rs b/packages/elf-domain/src/cjk.rs deleted file mode 100644 index 736d7789..00000000 --- a/packages/elf-domain/src/cjk.rs +++ /dev/null @@ -1,13 +0,0 @@ -pub fn contains_cjk(input: &str) -> bool { - input.chars().any(|c| { - let code = c as u32; - matches!( - code, - 0x3000..=0x303F - | 0x3040..=0x309F - | 0x30A0..=0x30FF - | 0x4E00..=0x9FFF - | 0xAC00..=0xD7AF - ) - }) -} diff --git a/packages/elf-domain/src/consolidation.rs b/packages/elf-domain/src/consolidation.rs new file mode 100644 index 00000000..e9af2075 --- /dev/null +++ b/packages/elf-domain/src/consolidation.rs @@ -0,0 +1,615 @@ +//! Consolidation proposal contract validation. + +use std::{ + error::Error, + fmt::{Display, Formatter}, +}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::OffsetDateTime; +use uuid::Uuid; + +/// Current consolidation contract schema identifier. +pub const CONSOLIDATION_CONTRACT_SCHEMA_V1: &str = "elf.consolidation/v1"; + +const FORBIDDEN_DIFF_KEYS: [&str; 7] = [ + "delete_source", + "delete_sources", + "source_delete", + "source_mutation", + "source_mutations", + "source_note_updates", + "overwrite_source", +]; + +/// Error returned by consolidation contract validation. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ConsolidationValidationError { + /// A required source reference list was empty. + MissingSourceRefs, + /// A source snapshot did not include any immutable freshness guard. + MissingSourceSnapshot, + /// A JSON field was not the required object shape. + InvalidJsonObject { + /// Name of the invalid field. + field: &'static str, + }, + /// A required text field was empty. + EmptyText { + /// Name of the invalid field. + field: &'static str, + }, + /// A confidence value was outside the inclusive range 0.0 to 1.0. + InvalidConfidence, + /// The proposal diff included a source mutation key. + DestructiveDiff, + /// A proposal review transition is not allowed by the lifecycle. + InvalidReviewTransition { + /// Current review state. + from: ConsolidationReviewState, + /// Requested review state. + to: ConsolidationReviewState, + }, + /// A run state transition is not allowed by the job lifecycle. + InvalidRunTransition { + /// Current run state. + from: ConsolidationRunState, + /// Requested run state. + to: ConsolidationRunState, + }, + /// A stored state string is not part of the contract. + UnknownState { + /// Name of the invalid field. + field: &'static str, + }, + /// The queued contract schema did not match the consolidation v1 contract. + InvalidContractSchema, +} +impl Display for ConsolidationValidationError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::MissingSourceRefs => write!(f, "source_refs must not be empty"), + Self::MissingSourceSnapshot => + write!(f, "source snapshot must include at least one freshness guard"), + Self::InvalidJsonObject { field } => write!(f, "{field} must be a JSON object"), + Self::EmptyText { field } => write!(f, "{field} must not be empty"), + Self::InvalidConfidence => write!(f, "confidence must be in the range 0.0..=1.0"), + Self::DestructiveDiff => write!(f, "proposal diff must not mutate source memory"), + Self::InvalidReviewTransition { from, to } => + write!(f, "invalid proposal review transition from {from:?} to {to:?}"), + Self::InvalidRunTransition { from, to } => + write!(f, "invalid consolidation run transition from {from:?} to {to:?}"), + Self::UnknownState { field } => write!(f, "{field} is not a known state"), + Self::InvalidContractSchema => + write!(f, "contract_schema must be elf.consolidation/v1"), + } + } +} +impl Error for ConsolidationValidationError {} + +/// Source artifact kind accepted by consolidation input references. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationSourceKind { + /// Memory note evidence. + Note, + /// Event ingestion source. + Event, + /// Search trace source. + Trace, + /// Search trace item source. + TraceItem, + /// Document extension source. + Doc, + /// Document chunk source. + DocChunk, +} +impl ConsolidationSourceKind { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Note => "note", + Self::Event => "event", + Self::Trace => "trace", + Self::TraceItem => "trace_item", + Self::Doc => "doc", + Self::DocChunk => "doc_chunk", + } + } +} + +/// Immutable source snapshot guard captured before a proposal is stored. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationSourceSnapshot { + /// Source lifecycle status observed by the consolidation run. + pub status: Option<String>, + /// Source last-update timestamp observed by the consolidation run. + pub updated_at: Option<OffsetDateTime>, + /// Source content or payload hash, when available. + pub content_hash: Option<String>, + /// Source embedding version, when relevant. + pub embedding_version: Option<String>, + /// Trace schema or trace version, when the source is a trace. + pub trace_version: Option<i32>, + #[serde(default)] + /// Opaque source reference copied from the authoritative source. + pub source_ref: Value, + #[serde(default)] + /// Additional snapshot metadata used for replay or review. + pub metadata: Value, +} +impl ConsolidationSourceSnapshot { + /// Validates snapshot shape and immutable freshness guards. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + validate_json_object("source_ref", &self.source_ref)?; + validate_json_object("metadata", &self.metadata)?; + + let has_hash = self.content_hash.as_ref().is_some_and(|hash| !hash.trim().is_empty()); + let has_embedding = + self.embedding_version.as_ref().is_some_and(|version| !version.trim().is_empty()); + let has_status = self.status.as_ref().is_some_and(|status| !status.trim().is_empty()); + let has_source_ref = non_empty_object(&self.source_ref); + let has_metadata = non_empty_object(&self.metadata); + let has_guard = self.updated_at.is_some() + || self.trace_version.is_some() + || has_hash + || has_embedding + || has_status + || has_source_ref + || has_metadata; + + if has_guard { Ok(()) } else { Err(ConsolidationValidationError::MissingSourceSnapshot) } + } +} + +/// Stable pointer to one immutable consolidation input. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationInputRef { + /// Kind of source artifact being referenced. + pub kind: ConsolidationSourceKind, + /// Identifier of the source artifact. + pub id: Uuid, + /// Snapshot metadata captured before proposal generation. + pub snapshot: ConsolidationSourceSnapshot, +} +impl ConsolidationInputRef { + /// Validates the input reference and its snapshot guard. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + self.snapshot.validate() + } +} + +/// Confidence or honesty marker severity. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationMarkerSeverity { + /// Low-severity marker. + Low, + /// Medium-severity marker. + Medium, + /// High-severity marker. + High, +} + +/// One contradiction or staleness marker attached to a proposal. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationMarker { + /// Marker severity. + pub severity: ConsolidationMarkerSeverity, + /// Human-readable marker text. + pub message: String, + /// Optional source that triggered the marker. + pub source: Option<ConsolidationInputRef>, +} +impl ConsolidationMarker { + /// Validates marker content and optional source evidence. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + if self.message.trim().is_empty() { + return Err(ConsolidationValidationError::EmptyText { field: "marker.message" }); + } + + if let Some(source) = &self.source { + source.validate()?; + } + + Ok(()) + } +} + +/// Contradiction and staleness markers attached to a proposal. +#[derive(Clone, Debug, Default, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationMarkers { + #[serde(default)] + /// Contradiction markers that a reviewer must inspect. + pub contradictions: Vec<ConsolidationMarker>, + #[serde(default)] + /// Staleness markers that a reviewer must inspect. + pub staleness: Vec<ConsolidationMarker>, +} +impl ConsolidationMarkers { + /// Validates all marker payloads. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + for marker in self.contradictions.iter().chain(self.staleness.iter()) { + marker.validate()?; + } + + Ok(()) + } +} + +/// Unsupported-claim marker attached to a proposal for reviewer inspection. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationUnsupportedClaimFlag { + /// Stable claim identifier when the source fixture or worker supplies one. + pub claim_id: Option<String>, + /// Human-readable unsupported-claim description. + pub message: String, + /// Optional source that demonstrates why the claim is unsupported. + pub source: Option<ConsolidationInputRef>, +} +impl ConsolidationUnsupportedClaimFlag { + /// Validates unsupported-claim marker content and optional source evidence. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + if self.message.trim().is_empty() { + return Err(ConsolidationValidationError::EmptyText { + field: "unsupported_claim_flags.message", + }); + } + + if let Some(claim_id) = &self.claim_id + && claim_id.trim().is_empty() + { + return Err(ConsolidationValidationError::EmptyText { + field: "unsupported_claim_flags.claim_id", + }); + } + if let Some(source) = &self.source { + source.validate()?; + } + + Ok(()) + } +} + +/// Derived-output apply intent for a reviewable proposal. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationApplyIntent { + /// Create a new derived memory note after review. + CreateDerivedNote, + /// Update an existing derived memory note after review. + UpdateDerivedNote, + /// Create a derived knowledge page after review. + CreateDerivedKnowledgePage, + /// Update a derived knowledge page after review. + UpdateDerivedKnowledgePage, + /// Create or refresh a derived graph view after review. + CreateDerivedGraphView, + /// Store the proposal for review without applying a downstream derived artifact. + NoOp, +} +impl ConsolidationApplyIntent { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::CreateDerivedNote => "create_derived_note", + Self::UpdateDerivedNote => "update_derived_note", + Self::CreateDerivedKnowledgePage => "create_derived_knowledge_page", + Self::UpdateDerivedKnowledgePage => "update_derived_knowledge_page", + Self::CreateDerivedGraphView => "create_derived_graph_view", + Self::NoOp => "no_op", + } + } +} + +/// Reviewer action requested for a consolidation proposal. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationReviewAction { + /// Approve a proposal for later application. + Approve, + /// Apply an approved proposal to a derived target. + Apply, + /// Discard a proposal as rejected. + Discard, + /// Defer a proposal by archiving it for later audit. + Defer, +} +impl ConsolidationReviewAction { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Approve => "approve", + Self::Apply => "apply", + Self::Discard => "discard", + Self::Defer => "defer", + } + } +} + +/// Review lifecycle for a consolidation proposal. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationReviewState { + /// Proposal is awaiting review. + Proposed, + /// Proposal has been approved for downstream derived-output application. + Approved, + /// Proposal was rejected by a reviewer. + Rejected, + /// Proposal was approved and marked applied to the derived target. + Applied, + /// Proposal is retained but no longer active for review. + Archived, +} +impl ConsolidationReviewState { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Proposed => "proposed", + Self::Approved => "approved", + Self::Rejected => "rejected", + Self::Applied => "applied", + Self::Archived => "archived", + } + } + + /// Parses a canonical storage string. + pub fn parse(raw: &str) -> Option<Self> { + match raw { + "proposed" => Some(Self::Proposed), + "approved" => Some(Self::Approved), + "rejected" => Some(Self::Rejected), + "applied" => Some(Self::Applied), + "archived" => Some(Self::Archived), + _ => None, + } + } + + /// Validates a review lifecycle transition. + pub fn validate_transition(self, to: Self) -> Result<(), ConsolidationValidationError> { + let allowed = match self { + Self::Proposed => matches!(to, Self::Approved | Self::Rejected | Self::Archived), + Self::Approved => matches!(to, Self::Applied | Self::Rejected | Self::Archived), + Self::Rejected | Self::Applied | Self::Archived => false, + }; + + if allowed { + Ok(()) + } else { + Err(ConsolidationValidationError::InvalidReviewTransition { from: self, to }) + } + } +} + +/// Consolidation job lifecycle. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ConsolidationRunState { + /// Job has been registered but has not started. + Pending, + /// Job is actively generating fixture or future provider-backed proposals. + Running, + /// Job completed proposal generation. + Completed, + /// Job failed before completion. + Failed, + /// Job was cancelled by an operator. + Cancelled, +} +impl ConsolidationRunState { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Pending => "pending", + Self::Running => "running", + Self::Completed => "completed", + Self::Failed => "failed", + Self::Cancelled => "cancelled", + } + } + + /// Parses a canonical storage string. + pub fn parse(raw: &str) -> Option<Self> { + match raw { + "pending" => Some(Self::Pending), + "running" => Some(Self::Running), + "completed" => Some(Self::Completed), + "failed" => Some(Self::Failed), + "cancelled" => Some(Self::Cancelled), + _ => None, + } + } + + /// Validates a job lifecycle transition. + pub fn validate_transition(self, to: Self) -> Result<(), ConsolidationValidationError> { + let allowed = match self { + Self::Pending => matches!(to, Self::Running | Self::Cancelled), + Self::Running => matches!(to, Self::Completed | Self::Failed | Self::Cancelled), + Self::Completed | Self::Failed | Self::Cancelled => false, + }; + + if allowed { + Ok(()) + } else { + Err(ConsolidationValidationError::InvalidRunTransition { from: self, to }) + } + } +} + +/// Reviewable diff between prior derived output and proposed derived output. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationProposalDiff { + /// Human-readable diff summary. + pub summary: String, + #[serde(default)] + /// Previous derived output snapshot, or an empty object for creates. + pub before: Value, + #[serde(default)] + /// Proposed derived output snapshot. + pub after: Value, +} +impl ConsolidationProposalDiff { + /// Validates diff shape and rejects source-mutation payloads. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + if self.summary.trim().is_empty() { + return Err(ConsolidationValidationError::EmptyText { field: "diff.summary" }); + } + + validate_json_object("diff.before", &self.before)?; + validate_json_object("diff.after", &self.after)?; + + if contains_forbidden_diff_key(&self.before) || contains_forbidden_diff_key(&self.after) { + return Err(ConsolidationValidationError::DestructiveDiff); + } + + Ok(()) + } +} + +/// Source lineage for one consolidation proposal. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationLineage { + /// Source references directly supporting the proposal. + pub source_refs: Vec<ConsolidationInputRef>, + /// Parent consolidation run, when this proposal is derived from an earlier run. + pub parent_run_id: Option<Uuid>, + #[serde(default)] + /// Parent proposals used as lineage inputs. + pub parent_proposal_ids: Vec<Uuid>, +} +impl ConsolidationLineage { + /// Validates source lineage references. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + validate_source_refs(&self.source_refs) + } +} + +/// Full reviewable consolidation proposal contract. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationProposalContract { + /// Proposal kind, such as `derived_note` or `knowledge_page`. + pub proposal_kind: String, + /// Derived-output apply intent. + pub apply_intent: ConsolidationApplyIntent, + /// Source references directly supporting the proposal. + pub source_refs: Vec<ConsolidationInputRef>, + #[serde(default)] + /// Aggregate source snapshot metadata for reviewer inspection. + pub source_snapshot: Value, + /// Proposal lineage. + pub lineage: ConsolidationLineage, + /// Model or fixture confidence in the proposal. + pub confidence: f32, + #[serde(default)] + /// Unsupported claims that the reviewer must inspect before accepting a proposal. + pub unsupported_claim_flags: Vec<ConsolidationUnsupportedClaimFlag>, + /// Review markers for contradiction and staleness checks. + pub markers: ConsolidationMarkers, + /// Reviewable derived-output diff. + pub diff: ConsolidationProposalDiff, + #[serde(default)] + /// Derived target reference, when the target already exists. + pub target_ref: Value, + #[serde(default)] + /// Proposed derived output payload. + pub proposed_payload: Value, +} +impl ConsolidationProposalContract { + /// Validates a proposal contract before persistence. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + if self.proposal_kind.trim().is_empty() { + return Err(ConsolidationValidationError::EmptyText { field: "proposal_kind" }); + } + + validate_source_refs(&self.source_refs)?; + validate_json_object("source_snapshot", &self.source_snapshot)?; + + self.lineage.validate()?; + + if !self.confidence.is_finite() || !(0.0..=1.0).contains(&self.confidence) { + return Err(ConsolidationValidationError::InvalidConfidence); + } + + self.markers.validate()?; + + for flag in &self.unsupported_claim_flags { + flag.validate()?; + } + + self.diff.validate()?; + + validate_json_object("target_ref", &self.target_ref)?; + validate_json_object("proposed_payload", &self.proposed_payload)?; + + Ok(()) + } +} + +/// Worker payload for materializing one consolidation run. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConsolidationJobPayload { + /// Versioned consolidation contract schema. + pub contract_schema: String, + #[serde(default)] + /// Proposals to persist for review. + pub proposals: Vec<ConsolidationProposalContract>, +} +impl ConsolidationJobPayload { + /// Validates the queued worker payload and all proposal contracts. + pub fn validate(&self) -> Result<(), ConsolidationValidationError> { + if self.contract_schema != CONSOLIDATION_CONTRACT_SCHEMA_V1 { + return Err(ConsolidationValidationError::InvalidContractSchema); + } + + for proposal in &self.proposals { + proposal.validate()?; + } + + Ok(()) + } +} + +/// Validates a source reference list. +pub fn validate_source_refs( + source_refs: &[ConsolidationInputRef], +) -> Result<(), ConsolidationValidationError> { + if source_refs.is_empty() { + return Err(ConsolidationValidationError::MissingSourceRefs); + } + + for source_ref in source_refs { + source_ref.validate()?; + } + + Ok(()) +} + +fn validate_json_object( + field: &'static str, + value: &Value, +) -> Result<(), ConsolidationValidationError> { + if matches!(value, Value::Object(_)) { + Ok(()) + } else { + Err(ConsolidationValidationError::InvalidJsonObject { field }) + } +} + +fn non_empty_object(value: &Value) -> bool { + match value { + Value::Object(map) => !map.is_empty(), + _ => false, + } +} + +fn contains_forbidden_diff_key(value: &Value) -> bool { + match value { + Value::Object(map) => map.iter().any(|(key, nested)| { + FORBIDDEN_DIFF_KEYS.contains(&key.as_str()) || contains_forbidden_diff_key(nested) + }), + Value::Array(items) => items.iter().any(contains_forbidden_diff_key), + _ => false, + } +} diff --git a/packages/elf-domain/src/english_gate.rs b/packages/elf-domain/src/english_gate.rs new file mode 100644 index 00000000..5c0d559c --- /dev/null +++ b/packages/elf-domain/src/english_gate.rs @@ -0,0 +1,218 @@ +//! English-gate helpers for request text and identifiers. + +use unicode_normalization::UnicodeNormalization; +use unicode_script::{Script, UnicodeScript}; +use whatlang::Lang; + +/// English-gate input classes that determine which checks apply. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum EnglishGateKind { + /// Natural-language text that is expected to be English prose. + NaturalLanguage, + /// Structured identifiers (keys, URLs, ids). No language identification is applied. + Identifier, +} + +/// Reasons the English gate rejected an input string. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum EnglishGateRejectReason { + /// The input contains a disallowed control character. + DisallowedControlChar, + /// The input contains a disallowed zero-width character. + DisallowedZeroWidthChar, + /// The input contains characters from disallowed scripts. + DisallowedScript, + /// Language identification reported a confident non-English result. + LanguageIdNonEnglish, +} + +/// Applies ELF's English gate to an input string. +pub fn english_gate(input: &str, kind: EnglishGateKind) -> Result<(), EnglishGateRejectReason> { + let normalized: String = input.nfkc().collect(); + + if contains_disallowed_controls(normalized.as_str()) { + return Err(EnglishGateRejectReason::DisallowedControlChar); + } + if contains_disallowed_zero_width(normalized.as_str()) { + return Err(EnglishGateRejectReason::DisallowedZeroWidthChar); + } + if contains_disallowed_scripts(normalized.as_str()) { + return Err(EnglishGateRejectReason::DisallowedScript); + } + if kind == EnglishGateKind::NaturalLanguage + && should_apply_lid(normalized.as_str()) + && is_confidently_non_english(normalized.as_str()) + { + return Err(EnglishGateRejectReason::LanguageIdNonEnglish); + } + + Ok(()) +} + +/// Returns `true` when natural-language input passes the English gate. +pub fn is_english_natural_language(input: &str) -> bool { + english_gate(input, EnglishGateKind::NaturalLanguage).is_ok() +} + +/// Returns `true` when identifier-like input passes the English gate. +pub fn is_english_identifier(input: &str) -> bool { + english_gate(input, EnglishGateKind::Identifier).is_ok() +} + +fn contains_disallowed_controls(input: &str) -> bool { + for ch in input.chars() { + if !ch.is_control() { + continue; + } + // Allow common whitespace controls used in code/docs. + if matches!(ch, '\n' | '\r' | '\t') { + continue; + } + + return true; + } + + false +} + +fn contains_disallowed_zero_width(input: &str) -> bool { + for ch in input.chars() { + if matches!( + ch, + '\u{00AD}' // soft hyphen + | '\u{034F}' // combining grapheme joiner + | '\u{061C}' // arabic letter mark + | '\u{180E}' // mongolian vowel separator (deprecated) + | '\u{200B}' // zero width space + | '\u{200C}' // zero width non-joiner + | '\u{200D}' // zero width joiner + | '\u{2060}' // word joiner + | '\u{FEFF}' // zero width no-break space + ) { + return true; + } + } + + false +} + +fn contains_disallowed_scripts(input: &str) -> bool { + for ch in input.chars() { + if ch.is_ascii() { + continue; + } + if ch.is_whitespace() { + continue; + } + + // Allow only Latin + neutral scripts for punctuation/symbols/emoji. + match ch.script() { + Script::Latin | Script::Common | Script::Inherited => {}, + _ => return true, + } + } + + false +} + +fn should_apply_lid(input: &str) -> bool { + let mut letters = 0_usize; + let mut non_space = 0_usize; + let mut whitespace = 0_usize; + + for ch in input.chars() { + if ch.is_whitespace() { + whitespace += 1; + + continue; + } + + non_space += 1; + + if ch.is_alphabetic() { + letters += 1; + } + } + + // Skip short strings (too noisy for LID) and single-token identifiers. + if letters < 32 || non_space < 64 || whitespace == 0 { + return false; + } + + let density = letters as f32 / non_space as f32; + + density >= 0.60 +} + +fn is_confidently_non_english(input: &str) -> bool { + let Some(info) = whatlang::detect(input) else { + return false; + }; + + // Be conservative: only reject when the detector is confident. + if !info.is_reliable() { + return false; + } + if info.confidence() < 0.85 { + return false; + } + + info.lang() != Lang::Eng +} + +#[cfg(test)] +mod tests { + use crate::english_gate::{self, EnglishGateKind}; + + #[test] + fn accepts_basic_english() { + assert!(english_gate::is_english_natural_language("Preference: Use English.")); + } + + #[test] + fn rejects_cyrillic_script() { + assert!(!english_gate::is_english_natural_language("Привет мир")); + } + + #[test] + fn rejects_zero_width_chars() { + assert!(!english_gate::is_english_natural_language("hello\u{200B}world")); + } + + #[test] + fn rejects_disallowed_control_chars() { + assert!(!english_gate::is_english_natural_language("hello\u{0007}world")); + } + + #[test] + fn nfkc_normalization_allows_fullwidth_latin() { + assert!(english_gate::is_english_natural_language( + "Fullwidth latin letters should normalize." + )); + } + + #[test] + fn identifier_gate_skips_lid_but_still_rejects_disallowed_script() { + assert!(english_gate::is_english_identifier("preferred_language")); + + assert!(!english_gate::is_english_identifier("ключ")); // Cyrillic + } + + #[test] + fn lid_is_applied_only_for_long_letter_dense_text() { + let short_french = "Bonjour."; + + assert!(english_gate::english_gate(short_french, EnglishGateKind::NaturalLanguage).is_ok()); + + let long_french = "Bonjour, je veux m'assurer que ce texte est suffisamment long et riche en lettres pour declencher la detection de langue. Merci beaucoup."; + + assert!(english_gate::english_gate(long_french, EnglishGateKind::NaturalLanguage).is_err()); + } + + #[test] + fn code_like_text_is_not_rejected_by_lid_thresholds() { + let codeish = "Error: expected `foo::bar()`; got `foo::baz()` at line 12."; + + assert!(english_gate::is_english_natural_language(codeish)); + } +} diff --git a/packages/elf-domain/src/evidence.rs b/packages/elf-domain/src/evidence.rs index f1afc75b..f84b4d2f 100644 --- a/packages/elf-domain/src/evidence.rs +++ b/packages/elf-domain/src/evidence.rs @@ -1,3 +1,10 @@ +//! Evidence-binding helpers for verbatim quote checks. + +/// Returns whether `quote` appears verbatim in `messages[index]`. pub fn evidence_matches(messages: &[String], index: usize, quote: &str) -> bool { + if quote.trim().is_empty() { + return false; + } + messages.get(index).map(|msg| msg.contains(quote)).unwrap_or(false) } diff --git a/packages/elf-domain/src/knowledge.rs b/packages/elf-domain/src/knowledge.rs new file mode 100644 index 00000000..ce933b42 --- /dev/null +++ b/packages/elf-domain/src/knowledge.rs @@ -0,0 +1,86 @@ +//! Derived knowledge page contract identifiers and storage enums. + +use serde::{Deserialize, Serialize}; + +/// Current derived knowledge page contract schema identifier. +pub const KNOWLEDGE_PAGE_CONTRACT_SCHEMA_V1: &str = "elf.knowledge_page/v1"; +/// Current deterministic rebuild metadata schema identifier. +pub const KNOWLEDGE_PAGE_REBUILD_SCHEMA_V1: &str = "elf.knowledge_page.rebuild/v1"; +/// Current source coverage metadata schema identifier. +pub const KNOWLEDGE_PAGE_SOURCE_COVERAGE_SCHEMA_V1: &str = "elf.knowledge_page.source_coverage/v1"; + +/// Derived knowledge page category. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum KnowledgePageKind { + /// Project overview page. + Project, + /// Entity dossier page. + Entity, + /// Concept page. + Concept, + /// Issue timeline or issue dossier page. + Issue, + /// Decision page. + Decision, +} +impl KnowledgePageKind { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Project => "project", + Self::Entity => "entity", + Self::Concept => "concept", + Self::Issue => "issue", + Self::Decision => "decision", + } + } + + /// Parses a canonical storage string. + pub fn parse(raw: &str) -> Option<Self> { + match raw { + "project" => Some(Self::Project), + "entity" => Some(Self::Entity), + "concept" => Some(Self::Concept), + "issue" => Some(Self::Issue), + "decision" => Some(Self::Decision), + _ => None, + } + } +} + +/// Authoritative source kind used by a derived page citation. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum KnowledgeSourceKind { + /// Memory note source. + Note, + /// Event source reserved for future durable event rows. + Event, + /// Graph relation fact source. + Relation, + /// Reviewed consolidation proposal source. + Proposal, +} +impl KnowledgeSourceKind { + /// Returns the canonical storage string. + pub fn as_str(self) -> &'static str { + match self { + Self::Note => "note", + Self::Event => "event", + Self::Relation => "relation", + Self::Proposal => "proposal", + } + } + + /// Parses a canonical storage string. + pub fn parse(raw: &str) -> Option<Self> { + match raw { + "note" => Some(Self::Note), + "event" => Some(Self::Event), + "relation" => Some(Self::Relation), + "proposal" => Some(Self::Proposal), + _ => None, + } + } +} diff --git a/packages/elf-domain/src/lib.rs b/packages/elf-domain/src/lib.rs index 358699cd..9e9747b8 100644 --- a/packages/elf-domain/src/lib.rs +++ b/packages/elf-domain/src/lib.rs @@ -1,4 +1,9 @@ -pub mod cjk; +//! Domain-level validation and policy helpers shared across ELF services. + +pub mod consolidation; +pub mod english_gate; pub mod evidence; +pub mod knowledge; +pub mod memory_policy; pub mod ttl; pub mod writegate; diff --git a/packages/elf-domain/src/memory_policy.rs b/packages/elf-domain/src/memory_policy.rs new file mode 100644 index 00000000..cafe3aef --- /dev/null +++ b/packages/elf-domain/src/memory_policy.rs @@ -0,0 +1,534 @@ +//! Memory-policy evaluation helpers. + +use serde::{Deserialize, Serialize}; + +use elf_config::{Config, MemoryPolicyRule}; + +/// Base memory decision after policy evaluation. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum MemoryPolicyDecision { + /// Persist the note as a new memory item. + Remember, + /// Update an existing memory item. + Update, + /// Ignore the note without persisting it. + Ignore, + /// Reject the note entirely. + Reject, +} + +/// Result of evaluating memory-policy rules for one note candidate. +#[derive(Debug)] +pub struct MemoryPolicyEvaluation<'a> { + /// Final decision after any downgrade rules are applied. + pub decision: MemoryPolicyDecision, + /// Rule that matched the note, if any. + pub matched_rule: Option<&'a MemoryPolicyRule>, +} + +/// Evaluates memory-policy downgrade rules for a note candidate. +pub fn evaluate_memory_policy<'a>( + cfg: &'a Config, + note_type: &str, + scope: &str, + confidence: f64, + importance: f64, + base_decision: MemoryPolicyDecision, +) -> MemoryPolicyEvaluation<'a> { + let matched_rule = select_memory_policy_rule(cfg, note_type, scope); + let decision = + if matches!(base_decision, MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update) + && should_downgrade(matched_rule, confidence, importance) + { + MemoryPolicyDecision::Ignore + } else { + base_decision + }; + + MemoryPolicyEvaluation { decision, matched_rule } +} + +fn select_memory_policy_rule<'a>( + cfg: &'a Config, + note_type: &str, + scope: &str, +) -> Option<&'a MemoryPolicyRule> { + let exact_match = + cfg.memory.policy.rules.iter().find(|rule| matches_exact(note_type, scope, rule)); + + if exact_match.is_some() { + return exact_match; + } + + let note_type_match = + cfg.memory.policy.rules.iter().find(|rule| matches_note_type(note_type, rule)); + + if note_type_match.is_some() { + return note_type_match; + } + + let scope_match = cfg.memory.policy.rules.iter().find(|rule| matches_scope(scope, rule)); + + if scope_match.is_some() { + return scope_match; + } + + cfg.memory.policy.rules.iter().find(|rule| rule.note_type.is_none() && rule.scope.is_none()) +} + +fn matches_exact(note_type: &str, scope: &str, rule: &MemoryPolicyRule) -> bool { + match (rule.note_type.as_deref(), rule.scope.as_deref()) { + (Some(rule_type), Some(rule_scope)) => rule_type == note_type && rule_scope == scope, + _ => false, + } +} + +fn matches_note_type(note_type: &str, rule: &MemoryPolicyRule) -> bool { + match (rule.note_type.as_deref(), rule.scope.as_deref()) { + (Some(rule_type), None) => rule_type == note_type, + _ => false, + } +} + +fn matches_scope(scope: &str, rule: &MemoryPolicyRule) -> bool { + match (rule.note_type.as_deref(), rule.scope.as_deref()) { + (None, Some(rule_scope)) => rule_scope == scope, + _ => false, + } +} + +fn should_downgrade( + matched_rule: Option<&MemoryPolicyRule>, + confidence: f64, + importance: f64, +) -> bool { + let Some(rule) = matched_rule else { + return false; + }; + + if let Some(min_confidence) = rule.min_confidence + && (!confidence.is_finite() || confidence < f64::from(min_confidence)) + { + return true; + } + if let Some(min_importance) = rule.min_importance + && (!importance.is_finite() || importance < f64::from(min_importance)) + { + return true; + } + + false +} + +#[cfg(test)] +mod tests { + use crate::memory_policy::{self, MemoryPolicyDecision, MemoryPolicyEvaluation}; + use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, + MemoryPolicy, MemoryPolicyRule, Postgres, ProviderConfig, Providers, Qdrant, Ranking, + RankingBlend, RankingBlendSegment, RankingDeterministic, RankingDeterministicDecay, + RankingDeterministicHits, RankingDeterministicLexical, RankingDiversity, + RankingRetrievalSources, ReadProfiles, ScopePrecedence, ScopeWriteAllowed, Scopes, Search, + SearchCache, SearchDynamic, SearchExpansion, SearchExplain, SearchGraphContext, + SearchPrefilter, SearchRecursive, Security, Service, Storage, TtlDays, + }; + + fn test_config(policy: MemoryPolicy) -> Config { + let mut cfg = test_default_config(); + + cfg.memory.policy = policy; + + cfg + } + + fn test_default_config() -> Config { + Config { + service: test_service_config(), + storage: test_storage_config(), + providers: test_providers_config(), + scopes: test_scopes_config(), + memory: test_memory_config(), + search: test_search_config(), + ranking: test_ranking_config(), + lifecycle: test_lifecycle_config(), + security: test_security_config(), + chunking: test_chunking_config(), + context: None, + mcp: None, + } + } + + fn test_service_config() -> Service { + Service { + http_bind: "127.0.0.1:8080".to_string(), + mcp_bind: "127.0.0.1:8082".to_string(), + admin_bind: "127.0.0.1:8081".to_string(), + log_level: "info".to_string(), + } + } + + fn test_storage_config() -> Storage { + Storage { + postgres: Postgres { + dsn: "postgres://user:pass@localhost/db".to_string(), + pool_max_conns: 1, + }, + qdrant: Qdrant { + url: "http://localhost".to_string(), + collection: "mem_notes_v2".to_string(), + docs_collection: "doc_chunks_v1".to_string(), + vector_dim: 4_096, + }, + } + } + + fn test_providers_config() -> Providers { + Providers { + embedding: test_embedding_provider_config(), + rerank: test_rerank_provider_config(), + llm_extractor: test_llm_extractor_provider_config(), + } + } + + fn test_embedding_provider_config() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + dimensions: 3, + timeout_ms: 1_000, + default_headers: Default::default(), + } + } + + fn test_rerank_provider_config() -> ProviderConfig { + ProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + timeout_ms: 1_000, + default_headers: Default::default(), + } + } + + fn test_llm_extractor_provider_config() -> LlmProviderConfig { + LlmProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + temperature: 0.1, + timeout_ms: 1_000, + default_headers: Default::default(), + } + } + + fn test_scopes_config() -> Scopes { + Scopes { + allowed: vec!["agent_private".to_string()], + read_profiles: test_read_profiles_config(), + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { + agent_private: true, + project_shared: true, + org_shared: true, + }, + } + } + + fn test_read_profiles_config() -> ReadProfiles { + ReadProfiles { + private_only: vec!["agent_private".to_string()], + private_plus_project: vec!["agent_private".to_string()], + all_scopes: vec!["agent_private".to_string()], + } + } + + fn test_memory_config() -> Memory { + Memory { + max_notes_per_add_event: 3, + max_note_chars: 240, + dup_sim_threshold: 0.92, + update_sim_threshold: 0.85, + candidate_k: 60, + top_k: 12, + policy: MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: Some(0.1), + }, + MemoryPolicyRule { + note_type: Some("preference".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.75), + min_importance: None, + }, + MemoryPolicyRule { + note_type: Some("preference".to_string()), + scope: None, + min_confidence: Some(0.6), + min_importance: None, + }, + MemoryPolicyRule { + note_type: None, + scope: None, + min_confidence: None, + min_importance: None, + }, + ], + }, + } + } + + fn test_search_config() -> Search { + Search { + expansion: SearchExpansion { + mode: "off".to_string(), + max_queries: 4, + include_original: true, + }, + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { + enabled: true, + expansion_ttl_days: 7, + rerank_ttl_days: 7, + max_payload_bytes: Some(262_144), + }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, + } + } + + fn test_ranking_config() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: test_ranking_deterministic_config(), + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } + } + + fn test_ranking_deterministic_config() -> RankingDeterministic { + RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + } + } + + fn test_lifecycle_config() -> Lifecycle { + Lifecycle { + ttl_days: TtlDays { + plan: 14, + fact: 180, + preference: 0, + constraint: 0, + decision: 0, + profile: 0, + }, + purge_deleted_after_days: 30, + purge_deprecated_after_days: 180, + } + } + + fn test_security_config() -> Security { + Security { + bind_localhost_only: true, + reject_non_english: true, + redact_secrets_on_write: true, + evidence_min_quotes: 1, + evidence_max_quotes: 2, + evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], + } + } + + fn test_chunking_config() -> Chunking { + Chunking { + enabled: true, + max_tokens: 512, + overlap_tokens: 128, + tokenizer_repo: "REPLACE_ME".to_string(), + } + } + #[test] + fn policy_precedence_prefers_note_type_and_scope_over_note_type_only() { + let cfg = test_config(MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: None, + min_confidence: Some(0.05), + min_importance: None, + }, + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.95), + min_importance: None, + }, + MemoryPolicyRule { + note_type: None, + scope: Some("agent_private".to_string()), + min_confidence: Some(0.40), + min_importance: None, + }, + ], + }); + let MemoryPolicyEvaluation { decision, matched_rule } = + memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(decision, MemoryPolicyDecision::Ignore); + + let rule = matched_rule.expect("expected policy match"); + + assert_eq!(rule.note_type.as_deref(), Some("fact")); + assert_eq!(rule.scope.as_deref(), Some("agent_private")); + assert_eq!(rule.min_confidence, Some(0.95)); + assert_eq!(rule.min_importance, None); + } + + #[test] + fn evaluate_downgrades_base_remember_update_only() { + let cfg = test_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: Some(0.5), + }], + }); + let remember = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.95, + 0.4, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(remember.decision, MemoryPolicyDecision::Ignore); + + let update = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + f64::NAN, + f64::NAN, + MemoryPolicyDecision::Update, + ); + + assert_eq!(update.decision, MemoryPolicyDecision::Ignore); + + let ignore = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.1, + 0.1, + MemoryPolicyDecision::Ignore, + ); + + assert_eq!(ignore.decision, MemoryPolicyDecision::Ignore); + + let reject = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.1, + 0.1, + MemoryPolicyDecision::Reject, + ); + + assert_eq!(reject.decision, MemoryPolicyDecision::Reject); + } + + #[test] + fn evaluate_without_matching_threshold_leaves_base_unchanged() { + let cfg = test_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: None, + min_importance: None, + }], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.0, + 0.0, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); + } +} diff --git a/packages/elf-domain/src/ttl.rs b/packages/elf-domain/src/ttl.rs index 4f4f2a5b..f04a13f3 100644 --- a/packages/elf-domain/src/ttl.rs +++ b/packages/elf-domain/src/ttl.rs @@ -1,10 +1,14 @@ -// crates.io +//! TTL helpers derived from lifecycle configuration. + use time::{Duration, OffsetDateTime}; +use elf_config::Config; + +/// Computes the note expiration timestamp from an explicit TTL or configured defaults. pub fn compute_expires_at( ttl_days: Option<i64>, note_type: &str, - cfg: &elf_config::Config, + cfg: &Config, now: OffsetDateTime, ) -> Option<OffsetDateTime> { let days = if let Some(value) = ttl_days.filter(|days| *days > 0) { diff --git a/packages/elf-domain/src/writegate.rs b/packages/elf-domain/src/writegate.rs index 177c88d2..3d66dcc4 100644 --- a/packages/elf-domain/src/writegate.rs +++ b/packages/elf-domain/src/writegate.rs @@ -1,31 +1,213 @@ -// crates.io +//! Writegate validation and redaction helpers. + use regex::Regex; +use serde::{Deserialize, Serialize}; -// self -use crate::cjk; +use crate::english_gate; +use elf_config::Config; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Reasons a note can be rejected by the write gate. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum RejectCode { - RejectCjk, + /// The note text failed the English gate. + RejectNonEnglish, + /// The note text exceeded the configured length limit. RejectTooLong, + /// The note text appears to contain secret material. RejectSecret, + /// The note type is not one of the allowed values. RejectInvalidType, + /// The note scope is not allowed or not writable. RejectScopeDenied, + /// The note text is empty after trimming. RejectEmpty, } +/// One write-policy redaction operation. +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum WriteRedaction { + /// Replaces the target span with a literal string. + Replace { + /// Span to replace before persistence. + span: WriteSpan, + /// Literal replacement text to insert for the span. + replacement: String, + }, + /// Removes the target span entirely. + Remove { + /// Span to remove before persistence. + span: WriteSpan, + }, +} + +/// Errors returned while validating write-policy spans. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum WritePolicyError { + /// A span was out of bounds or not aligned to char boundaries. + InvalidSpan, + /// Two exclusions/redactions overlapped. + OverlappingOps, +} + +#[derive(Clone, Debug)] +enum WriteOpKind { + Exclude, + Redact(String), +} + +/// Half-open byte span within input text. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub struct WriteSpan { + /// Inclusive start byte offset. + pub start: usize, + /// Exclusive end byte offset. + pub end: usize, +} + +/// Optional write-policy transform applied before note ingestion. +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub struct WritePolicy { + /// Spans that should be removed before persistence. + #[serde(default)] + pub exclusions: Vec<WriteSpan>, + /// Redactions that should be applied before persistence. + #[serde(default)] + pub redactions: Vec<WriteRedaction>, +} + +/// Result of applying a write policy to one note body. +#[derive(Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +pub struct WritePolicyResult { + /// Transformed note text after exclusions and redactions. + pub transformed: String, + /// Audit data describing which operations were applied. + pub audit: WritePolicyAudit, +} + +/// Audit payload emitted when a write policy is applied. +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub struct WritePolicyAudit { + /// Exclusion spans that were applied. + pub exclusions: Vec<WriteSpan>, + /// Redactions that were applied. + pub redactions: Vec<WriteRedactionResult>, +} + +/// One redaction entry in write-policy audit output. +#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub struct WriteRedactionResult { + /// Span that was removed or replaced. + pub span: WriteSpan, + /// Replacement text that was applied. + pub replacement: String, +} + +/// Normalized note input passed through `writegate`. pub struct NoteInput { + /// Requested note type. pub note_type: String, + /// Requested write scope. pub scope: String, + /// Note text after request decoding. pub text: String, } -pub fn writegate(note: &NoteInput, cfg: &elf_config::Config) -> Result<(), RejectCode> { +#[derive(Clone, Debug)] +struct WriteOp { + span: WriteSpan, + kind: WriteOpKind, +} + +/// Applies an optional write policy to note text and returns the transformed output. +pub fn apply_write_policy( + text: &str, + policy: Option<&WritePolicy>, +) -> Result<WritePolicyResult, WritePolicyError> { + let policy = match policy { + Some(policy) => policy, + None => { + return Ok(WritePolicyResult { + transformed: text.to_string(), + audit: WritePolicyAudit::default(), + }); + }, + }; + let mut exclusions = policy.exclusions.clone(); + let mut redactions = policy.redactions.clone(); + + if exclusions.is_empty() && redactions.is_empty() { + return Ok(WritePolicyResult { + transformed: text.to_string(), + audit: WritePolicyAudit::default(), + }); + } + + exclusions.sort_by_key(|span| (span.start, span.end)); + redactions.sort_by_key(|r| match r { + WriteRedaction::Replace { span, .. } => (span.start, span.end), + WriteRedaction::Remove { span } => (span.start, span.end), + }); + + let mut ops = Vec::with_capacity(exclusions.len() + redactions.len()); + let mut audit = WritePolicyAudit::default(); + + for span in &exclusions { + validate_span(text, span)?; + + ops.push(WriteOp { span: *span, kind: WriteOpKind::Exclude }); + audit.exclusions.push(*span); + } + for redaction in &redactions { + match redaction { + WriteRedaction::Remove { span } => { + validate_span(text, span)?; + + ops.push(WriteOp { span: *span, kind: WriteOpKind::Redact(String::new()) }); + audit + .redactions + .push(WriteRedactionResult { span: *span, replacement: String::new() }); + }, + + WriteRedaction::Replace { span, replacement } => { + validate_span(text, span)?; + + ops.push(WriteOp { span: *span, kind: WriteOpKind::Redact(replacement.clone()) }); + audit + .redactions + .push(WriteRedactionResult { span: *span, replacement: replacement.clone() }); + }, + } + } + + ops.sort_by_key(|op| (op.span.start, op.span.end)); + + validate_non_overlapping_ops(&ops)?; + + let mut transformed = text.to_string(); + + for op in ops.iter().rev() { + match &op.kind { + WriteOpKind::Exclude => transformed.replace_range(op.span.start..op.span.end, ""), + WriteOpKind::Redact(replacement) => + transformed.replace_range(op.span.start..op.span.end, replacement.as_str()), + } + } + + Ok(WritePolicyResult { transformed, audit }) +} + +/// Validates note content and metadata against ELF write-gate rules. +pub fn writegate(note: &NoteInput, cfg: &Config) -> Result<(), RejectCode> { if note.text.trim().is_empty() { return Err(RejectCode::RejectEmpty); } - if cjk::contains_cjk(¬e.text) { - return Err(RejectCode::RejectCjk); + if !english_gate::is_english_natural_language(note.text.as_str()) { + return Err(RejectCode::RejectNonEnglish); } if note.text.chars().count() as u32 > cfg.memory.max_note_chars { return Err(RejectCode::RejectTooLong); @@ -42,23 +224,12 @@ pub fn writegate(note: &NoteInput, cfg: &elf_config::Config) -> Result<(), Rejec if contains_secrets(¬e.text) { return Err(RejectCode::RejectSecret); } - Ok(()) -} - -fn scope_write_allowed(cfg: &elf_config::Config, scope: &str) -> bool { - match scope { - "agent_private" => cfg.scopes.write_allowed.agent_private, - "project_shared" => cfg.scopes.write_allowed.project_shared, - "org_shared" => cfg.scopes.write_allowed.org_shared, - _ => false, - } -} -fn is_allowed_type(note_type: &str) -> bool { - matches!(note_type, "preference" | "constraint" | "decision" | "profile" | "fact" | "plan") + Ok(()) } -fn contains_secrets(text: &str) -> bool { +/// Returns whether the input appears to contain secret material. +pub fn contains_secrets(text: &str) -> bool { let patterns = [ r"(?i)-----BEGIN (RSA|OPENSSH|EC|DSA) PRIVATE KEY-----", r"(?i)ssh-rsa", @@ -79,81 +250,197 @@ fn contains_secrets(text: &str) -> bool { false } +fn validate_span(text: &str, span: &WriteSpan) -> Result<(), WritePolicyError> { + if span.end < span.start { + return Err(WritePolicyError::InvalidSpan); + } + if span.end > text.len() { + return Err(WritePolicyError::InvalidSpan); + } + if !text.is_char_boundary(span.start) || !text.is_char_boundary(span.end) { + return Err(WritePolicyError::InvalidSpan); + } + + Ok(()) +} + +fn validate_non_overlapping_ops(ops: &[WriteOp]) -> Result<(), WritePolicyError> { + let mut last_end = 0_usize; + + for op in ops { + if op.span.start < last_end { + return Err(WritePolicyError::OverlappingOps); + } + + last_end = op.span.end; + } + + Ok(()) +} + +fn scope_write_allowed(cfg: &Config, scope: &str) -> bool { + match scope { + "agent_private" => cfg.scopes.write_allowed.agent_private, + "project_shared" => cfg.scopes.write_allowed.project_shared, + "org_shared" => cfg.scopes.write_allowed.org_shared, + _ => false, + } +} + +fn is_allowed_type(note_type: &str) -> bool { + matches!(note_type, "preference" | "constraint" | "decision" | "profile" | "fact" | "plan") +} + #[cfg(test)] mod tests { - use super::*; + use crate::writegate::{ + self, NoteInput, RejectCode, WritePolicy, WritePolicyResult, WriteRedaction, + WriteRedactionResult, + }; + use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, + MemoryPolicy, Postgres, ProviderConfig, Providers, Qdrant, Ranking, RankingBlend, + RankingBlendSegment, RankingDeterministic, RankingDeterministicDecay, + RankingDeterministicHits, RankingDeterministicLexical, RankingDiversity, + RankingRetrievalSources, ReadProfiles, ScopePrecedence, ScopeWriteAllowed, Scopes, Search, + SearchCache, SearchDynamic, SearchExpansion, SearchExplain, SearchGraphContext, + SearchPrefilter, SearchRecursive, Security, Service, Storage, TtlDays, + }; + + fn test_ranking() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + }, + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } + } - fn config() -> elf_config::Config { - elf_config::Config { - service: elf_config::Service { + fn config() -> Config { + Config { + service: Service { http_bind: "127.0.0.1:8080".to_string(), mcp_bind: "127.0.0.1:8082".to_string(), admin_bind: "127.0.0.1:8081".to_string(), log_level: "info".to_string(), }, - storage: elf_config::Storage { - postgres: elf_config::Postgres { + storage: Storage { + postgres: Postgres { dsn: "postgres://user:pass@localhost/db".to_string(), pool_max_conns: 1, }, - qdrant: elf_config::Qdrant { + qdrant: Qdrant { url: "http://localhost".to_string(), - collection: "mem_notes_v1".to_string(), - vector_dim: 3, + collection: "mem_notes_v2".to_string(), + docs_collection: "doc_chunks_v1".to_string(), + vector_dim: 4_096, }, }, - providers: elf_config::Providers { + providers: Providers { embedding: dummy_embedding_provider(), rerank: dummy_provider(), llm_extractor: dummy_llm_provider(), }, - scopes: elf_config::Scopes { + scopes: Scopes { allowed: vec!["agent_private".to_string()], - read_profiles: elf_config::ReadProfiles { + read_profiles: ReadProfiles { private_only: vec!["agent_private".to_string()], private_plus_project: vec!["agent_private".to_string()], all_scopes: vec!["agent_private".to_string()], }, - precedence: elf_config::ScopePrecedence { + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10, }, - write_allowed: elf_config::ScopeWriteAllowed { + write_allowed: ScopeWriteAllowed { agent_private: true, project_shared: true, org_shared: true, }, }, - memory: elf_config::Memory { + memory: Memory { max_notes_per_add_event: 3, max_note_chars: 10, dup_sim_threshold: 0.9, update_sim_threshold: 0.8, candidate_k: 10, top_k: 5, + policy: MemoryPolicy { rules: vec![] }, }, - search: elf_config::Search { - expansion: elf_config::SearchExpansion { + search: Search { + expansion: SearchExpansion { mode: "off".to_string(), max_queries: 4, include_original: true, }, - dynamic: elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, - prefilter: elf_config::SearchPrefilter { max_candidates: 0 }, - cache: elf_config::SearchCache { + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { enabled: true, expansion_ttl_days: 7, rerank_ttl_days: 7, max_payload_bytes: Some(262_144), - expansion_version: "v1".to_string(), - rerank_version: "v1".to_string(), }, - explain: elf_config::SearchExplain { retention_days: 7 }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, }, - ranking: elf_config::Ranking { recency_tau_days: 60.0, tie_breaker_weight: 0.1 }, - lifecycle: elf_config::Lifecycle { - ttl_days: elf_config::TtlDays { + ranking: test_ranking(), + lifecycle: Lifecycle { + ttl_days: TtlDays { plan: 1, fact: 2, preference: 0, @@ -164,57 +451,61 @@ mod tests { purge_deleted_after_days: 30, purge_deprecated_after_days: 180, }, - security: elf_config::Security { + security: Security { bind_localhost_only: true, - reject_cjk: true, + reject_non_english: true, redact_secrets_on_write: true, evidence_min_quotes: 1, evidence_max_quotes: 2, evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], }, - chunking: elf_config::Chunking { + chunking: Chunking { enabled: true, max_tokens: 512, overlap_tokens: 128, - tokenizer_repo: None, + tokenizer_repo: "REPLACE_ME".to_string(), }, + context: None, + mcp: None, } } - fn dummy_embedding_provider() -> elf_config::EmbeddingProviderConfig { - elf_config::EmbeddingProviderConfig { + fn dummy_embedding_provider() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "m".to_string(), dimensions: 3, - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: serde_json::Map::new(), } } - fn dummy_provider() -> elf_config::ProviderConfig { - elf_config::ProviderConfig { + fn dummy_provider() -> ProviderConfig { + ProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "m".to_string(), - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: serde_json::Map::new(), } } - fn dummy_llm_provider() -> elf_config::LlmProviderConfig { - elf_config::LlmProviderConfig { + fn dummy_llm_provider() -> LlmProviderConfig { + LlmProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "m".to_string(), temperature: 0.1, - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: serde_json::Map::new(), } } @@ -227,7 +518,8 @@ mod tests { scope: "agent_private".to_string(), text: "12345678901".to_string(), }; - assert_eq!(writegate(¬e, &cfg), Err(RejectCode::RejectTooLong)); + + assert_eq!(writegate::writegate(¬e, &cfg), Err(RejectCode::RejectTooLong)); } #[test] @@ -238,11 +530,62 @@ mod tests { scope: "agent_private".to_string(), text: "hello".to_string(), }; - assert_eq!(writegate(¬e, &cfg), Err(RejectCode::RejectInvalidType)); + + assert_eq!(writegate::writegate(¬e, &cfg), Err(RejectCode::RejectInvalidType)); } #[test] fn detects_secret_patterns() { - assert!(contains_secrets("password: hunter2")); + assert!(writegate::contains_secrets("password: hunter2")); + } + + #[test] + fn applies_empty_policy_as_noop() { + let policy = WritePolicy::default(); + + assert_eq!( + writegate::apply_write_policy("keep this", Some(&policy)), + Ok(WritePolicyResult { + transformed: "keep this".to_string(), + ..WritePolicyResult::default() + }) + ); + } + + #[test] + fn applies_exclusion_span() { + let policy = WritePolicy { + exclusions: vec![crate::writegate::WriteSpan { start: 4, end: 9 }], + redactions: vec![], + }; + let actual = writegate::apply_write_policy("hello world", Some(&policy)) + .expect("policy apply should succeed"); + + assert_eq!(actual.transformed, "hellld"); + assert_eq!(actual.audit.exclusions, vec![crate::writegate::WriteSpan { start: 4, end: 9 }]); + assert!(actual.audit.redactions.is_empty()); + } + + #[test] + fn applies_simple_replacement_redaction() { + let policy = WritePolicy { + exclusions: vec![], + redactions: vec![WriteRedaction::Replace { + span: crate::writegate::WriteSpan { start: 4, end: 5 }, + replacement: "***".to_string(), + }], + }; + let actual = writegate::apply_write_policy("secret", Some(&policy)) + .expect("policy apply should succeed"); + + assert_eq!(actual.transformed, "secr***t"); + assert_eq!( + actual.audit.redactions, + vec![WriteRedactionResult { + span: crate::writegate::WriteSpan { start: 4, end: 5 }, + replacement: "***".to_string(), + }] + ); + assert!(actual.audit.exclusions.is_empty()); } } diff --git a/packages/elf-domain/tests/consolidation.rs b/packages/elf-domain/tests/consolidation.rs new file mode 100644 index 00000000..65828267 --- /dev/null +++ b/packages/elf-domain/tests/consolidation.rs @@ -0,0 +1,204 @@ +#![allow(unused_crate_dependencies)] + +//! Integration tests for consolidation proposal contract validation. + +use time::OffsetDateTime; +use uuid::Uuid; + +use elf_domain::consolidation::{ + CONSOLIDATION_CONTRACT_SCHEMA_V1, ConsolidationApplyIntent, ConsolidationInputRef, + ConsolidationJobPayload, ConsolidationLineage, ConsolidationMarkers, + ConsolidationProposalContract, ConsolidationProposalDiff, ConsolidationReviewAction, + ConsolidationReviewState, ConsolidationRunState, ConsolidationSourceKind, + ConsolidationSourceSnapshot, ConsolidationUnsupportedClaimFlag, ConsolidationValidationError, +}; + +#[test] +fn proposal_contract_accepts_reviewable_derived_output() { + let source = source_ref(); + let proposal = proposal_contract(source); + + assert!(proposal.validate().is_ok()); +} + +#[test] +fn source_refs_require_immutable_snapshot_guards() { + let mut source = source_ref(); + + source.snapshot = ConsolidationSourceSnapshot { + status: None, + updated_at: None, + content_hash: None, + embedding_version: None, + trace_version: None, + source_ref: serde_json::json!({}), + metadata: serde_json::json!({}), + }; + + assert_eq!(source.validate(), Err(ConsolidationValidationError::MissingSourceSnapshot)); +} + +#[test] +fn proposal_contract_requires_lineage_source_refs() { + let source = source_ref(); + let mut proposal = proposal_contract(source); + + proposal.lineage.source_refs = Vec::new(); + + assert_eq!(proposal.validate(), Err(ConsolidationValidationError::MissingSourceRefs)); +} + +#[test] +fn proposal_contract_rejects_destructive_diff_payloads() { + let source = source_ref(); + let mut proposal = proposal_contract(source); + + proposal.diff.after = serde_json::json!({ + "summary": "Replace stale source facts.", + "source_mutations": [ + { "kind": "note", "op": "delete" } + ] + }); + + assert_eq!(proposal.validate(), Err(ConsolidationValidationError::DestructiveDiff)); +} + +#[test] +fn unsupported_claim_flags_require_reviewer_text() { + let source = source_ref(); + let mut proposal = proposal_contract(source.clone()); + + proposal.unsupported_claim_flags = vec![ConsolidationUnsupportedClaimFlag { + claim_id: Some("unsupported-worker-claim".to_string()), + message: " ".to_string(), + source: Some(source), + }]; + + assert_eq!( + proposal.validate(), + Err(ConsolidationValidationError::EmptyText { field: "unsupported_claim_flags.message" }) + ); +} + +#[test] +fn destructive_apply_intents_are_not_part_of_the_contract() { + let parsed = + serde_json::from_value::<ConsolidationApplyIntent>(serde_json::json!("delete_source_note")); + + assert!(parsed.is_err()); +} + +#[test] +fn review_actions_use_explicit_operator_vocabulary() { + let action = serde_json::from_value::<ConsolidationReviewAction>(serde_json::json!("defer")) + .expect("review action should parse"); + + assert_eq!(action.as_str(), "defer"); + + let parsed = + serde_json::from_value::<ConsolidationReviewAction>(serde_json::json!("silently_apply")); + + assert!(parsed.is_err()); +} + +#[test] +fn proposal_lifecycle_requires_approval_before_apply() { + assert!( + ConsolidationReviewState::Proposed + .validate_transition(ConsolidationReviewState::Applied) + .is_err() + ); + assert!( + ConsolidationReviewState::Proposed + .validate_transition(ConsolidationReviewState::Approved) + .is_ok() + ); + assert!( + ConsolidationReviewState::Approved + .validate_transition(ConsolidationReviewState::Applied) + .is_ok() + ); + assert!( + ConsolidationReviewState::Applied + .validate_transition(ConsolidationReviewState::Rejected) + .is_err() + ); +} + +#[test] +fn run_lifecycle_rejects_skipping_generation_state() { + assert!( + ConsolidationRunState::Pending + .validate_transition(ConsolidationRunState::Completed) + .is_err() + ); + assert!( + ConsolidationRunState::Pending.validate_transition(ConsolidationRunState::Running).is_ok() + ); + assert!( + ConsolidationRunState::Running + .validate_transition(ConsolidationRunState::Completed) + .is_ok() + ); +} + +#[test] +fn queued_payload_requires_consolidation_contract_schema() { + let source = source_ref(); + let mut payload = ConsolidationJobPayload { + contract_schema: CONSOLIDATION_CONTRACT_SCHEMA_V1.to_string(), + proposals: vec![proposal_contract(source)], + }; + + assert!(payload.validate().is_ok()); + + payload.contract_schema = "elf.consolidation/v0".to_string(); + + assert_eq!(payload.validate(), Err(ConsolidationValidationError::InvalidContractSchema)); +} + +fn proposal_contract(source: ConsolidationInputRef) -> ConsolidationProposalContract { + let lineage = ConsolidationLineage { + source_refs: vec![source.clone()], + parent_run_id: None, + parent_proposal_ids: Vec::new(), + }; + + ConsolidationProposalContract { + proposal_kind: "derived_note".to_string(), + apply_intent: ConsolidationApplyIntent::CreateDerivedNote, + source_refs: vec![source], + source_snapshot: serde_json::json!({ "window": "fixture" }), + lineage, + confidence: 0.85, + unsupported_claim_flags: Vec::new(), + markers: ConsolidationMarkers::default(), + diff: ConsolidationProposalDiff { + summary: "Create one derived note from stable evidence.".to_string(), + before: serde_json::json!({}), + after: serde_json::json!({ "text": "Fact: The project keeps consolidation output reviewable." }), + }, + target_ref: serde_json::json!({}), + proposed_payload: serde_json::json!({ + "type": "fact", + "text": "Fact: The project keeps consolidation output reviewable." + }), + } +} + +fn source_ref() -> ConsolidationInputRef { + ConsolidationInputRef { + kind: ConsolidationSourceKind::Note, + id: Uuid::parse_str("11111111-1111-1111-1111-111111111111") + .expect("test UUID must be valid"), + snapshot: ConsolidationSourceSnapshot { + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::UNIX_EPOCH), + content_hash: Some("blake3:fixture".to_string()), + embedding_version: Some("fixture:model:4".to_string()), + trace_version: None, + source_ref: serde_json::json!({ "schema": "source_ref/v1", "resolver": "fixture" }), + metadata: serde_json::json!({}), + }, + } +} diff --git a/packages/elf-domain/tests/domain.rs b/packages/elf-domain/tests/domain.rs index cdd5a582..db3dfbc9 100644 --- a/packages/elf-domain/tests/domain.rs +++ b/packages/elf-domain/tests/domain.rs @@ -1,95 +1,189 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Integration tests for domain-layer helpers. + use serde_json::Map; use time::OffsetDateTime; -// self -use elf_domain::{cjk, evidence, ttl}; +use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, MemoryPolicy, + Postgres, ProviderConfig, Providers, Qdrant, Ranking, RankingBlend, RankingBlendSegment, + RankingDeterministic, RankingDeterministicDecay, RankingDeterministicHits, + RankingDeterministicLexical, RankingDiversity, RankingRetrievalSources, ReadProfiles, + ScopePrecedence, ScopeWriteAllowed, Scopes, Search, SearchCache, SearchDynamic, + SearchExpansion, SearchExplain, SearchGraphContext, SearchPrefilter, SearchRecursive, Security, + Service, Storage, TtlDays, +}; +use elf_domain::{evidence, ttl}; -#[test] -fn detects_cjk() { - assert!(cjk::contains_cjk("\u{4F60}\u{597D}")); - assert!(!cjk::contains_cjk("hello")); +fn dummy_embedding_provider() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + dimensions: 3, + timeout_ms: 1_000, + default_headers: Map::new(), + } } -#[test] -fn evidence_requires_substring() { - let messages = vec!["Hello world".to_string()]; - assert!(evidence::evidence_matches(&messages, 0, "world")); - assert!(!evidence::evidence_matches(&messages, 0, "missing")); +fn dummy_provider() -> ProviderConfig { + ProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + timeout_ms: 1_000, + default_headers: Map::new(), + } } -#[test] -fn computes_ttl_from_defaults() { - let cfg = elf_config::Config { - service: elf_config::Service { +fn dummy_llm_provider() -> LlmProviderConfig { + LlmProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + temperature: 0.1, + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +fn test_ranking() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + }, + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } +} + +fn base_config() -> Config { + Config { + service: Service { http_bind: "127.0.0.1:8080".to_string(), mcp_bind: "127.0.0.1:8082".to_string(), admin_bind: "127.0.0.1:8081".to_string(), log_level: "info".to_string(), }, - storage: elf_config::Storage { - postgres: elf_config::Postgres { + storage: Storage { + postgres: Postgres { dsn: "postgres://user:pass@localhost/db".to_string(), pool_max_conns: 1, }, - qdrant: elf_config::Qdrant { + qdrant: Qdrant { url: "http://localhost".to_string(), - collection: "mem_notes_v1".to_string(), - vector_dim: 3, + collection: "mem_notes_v2".to_string(), + docs_collection: "doc_chunks_v1".to_string(), + vector_dim: 4_096, }, }, - providers: elf_config::Providers { + providers: Providers { embedding: dummy_embedding_provider(), rerank: dummy_provider(), llm_extractor: dummy_llm_provider(), }, - scopes: elf_config::Scopes { + scopes: Scopes { allowed: vec!["agent_private".to_string()], - read_profiles: elf_config::ReadProfiles { + read_profiles: ReadProfiles { private_only: vec!["agent_private".to_string()], private_plus_project: vec!["agent_private".to_string()], all_scopes: vec!["agent_private".to_string()], }, - precedence: elf_config::ScopePrecedence { - agent_private: 30, - project_shared: 20, - org_shared: 10, - }, - write_allowed: elf_config::ScopeWriteAllowed { + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { agent_private: true, project_shared: true, org_shared: true, }, }, - memory: elf_config::Memory { + memory: Memory { max_notes_per_add_event: 3, max_note_chars: 240, dup_sim_threshold: 0.92, update_sim_threshold: 0.85, candidate_k: 60, top_k: 12, + policy: MemoryPolicy { rules: vec![] }, }, - search: elf_config::Search { - expansion: elf_config::SearchExpansion { + search: Search { + expansion: SearchExpansion { mode: "off".to_string(), max_queries: 4, include_original: true, }, - dynamic: elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, - prefilter: elf_config::SearchPrefilter { max_candidates: 0 }, - cache: elf_config::SearchCache { + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { enabled: true, expansion_ttl_days: 7, rerank_ttl_days: 7, max_payload_bytes: Some(262_144), - expansion_version: "v1".to_string(), - rerank_version: "v1".to_string(), }, - explain: elf_config::SearchExplain { retention_days: 7 }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, }, - ranking: elf_config::Ranking { recency_tau_days: 60.0, tie_breaker_weight: 0.1 }, - lifecycle: elf_config::Lifecycle { - ttl_days: elf_config::TtlDays { + ranking: test_ranking(), + lifecycle: Lifecycle { + ttl_days: TtlDays { plan: 14, fact: 180, preference: 0, @@ -100,61 +194,48 @@ fn computes_ttl_from_defaults() { purge_deleted_after_days: 30, purge_deprecated_after_days: 180, }, - security: elf_config::Security { + security: Security { bind_localhost_only: true, - reject_cjk: true, + reject_non_english: true, redact_secrets_on_write: true, evidence_min_quotes: 1, evidence_max_quotes: 2, evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], }, - chunking: elf_config::Chunking { + chunking: Chunking { enabled: true, max_tokens: 512, overlap_tokens: 128, - tokenizer_repo: None, + tokenizer_repo: "REPLACE_ME".to_string(), }, - }; - - let now = OffsetDateTime::now_utc(); - let expires = ttl::compute_expires_at(None, "plan", &cfg, now).expect("TTL missing"); - assert!(expires > now); + context: None, + mcp: None, + } } -fn dummy_embedding_provider() -> elf_config::EmbeddingProviderConfig { - elf_config::EmbeddingProviderConfig { - provider_id: "p".to_string(), - api_base: "http://localhost".to_string(), - api_key: "key".to_string(), - path: "/".to_string(), - model: "m".to_string(), - dimensions: 3, - timeout_ms: 1000, - default_headers: Map::new(), - } +#[test] +fn evidence_requires_substring() { + let messages = vec!["Hello world".to_string()]; + + assert!(evidence::evidence_matches(&messages, 0, "world")); + assert!(!evidence::evidence_matches(&messages, 0, "missing")); } -fn dummy_provider() -> elf_config::ProviderConfig { - elf_config::ProviderConfig { - provider_id: "p".to_string(), - api_base: "http://localhost".to_string(), - api_key: "key".to_string(), - path: "/".to_string(), - model: "m".to_string(), - timeout_ms: 1000, - default_headers: Map::new(), - } +#[test] +fn evidence_rejects_empty_quote() { + let messages = vec!["Hello world".to_string()]; + + assert!(!evidence::evidence_matches(&messages, 0, "")); + assert!(!evidence::evidence_matches(&messages, 0, " ")); } -fn dummy_llm_provider() -> elf_config::LlmProviderConfig { - elf_config::LlmProviderConfig { - provider_id: "p".to_string(), - api_base: "http://localhost".to_string(), - api_key: "key".to_string(), - path: "/".to_string(), - model: "m".to_string(), - temperature: 0.1, - timeout_ms: 1000, - default_headers: Map::new(), - } +#[test] +fn computes_ttl_from_defaults() { + let cfg = base_config(); + let now = OffsetDateTime::now_utc(); + let expires = ttl::compute_expires_at(None, "plan", &cfg, now).expect("TTL missing"); + + assert!(expires > now); } diff --git a/packages/elf-domain/tests/memory_policy.rs b/packages/elf-domain/tests/memory_policy.rs new file mode 100644 index 00000000..18261e00 --- /dev/null +++ b/packages/elf-domain/tests/memory_policy.rs @@ -0,0 +1,528 @@ +#![allow(unused_crate_dependencies)] + +//! Integration tests for memory-policy evaluation. + +use serde_json::Map; + +use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, MemoryPolicy, + MemoryPolicyRule, Postgres, ProviderConfig, Providers, Qdrant, Ranking, RankingBlend, + RankingBlendSegment, RankingDeterministic, RankingDeterministicDecay, RankingDeterministicHits, + RankingDeterministicLexical, RankingDiversity, RankingRetrievalSources, ReadProfiles, + ScopePrecedence, ScopeWriteAllowed, Scopes, Search, SearchCache, SearchDynamic, + SearchExpansion, SearchExplain, SearchGraphContext, SearchPrefilter, SearchRecursive, Security, + Service, Storage, TtlDays, +}; +use elf_domain::memory_policy::{self, MemoryPolicyDecision, MemoryPolicyEvaluation}; + +fn memory_policy_config(policy: MemoryPolicy) -> Config { + let mut cfg = memory_policy_default_config(); + + cfg.memory.policy = policy; + + cfg +} + +fn memory_policy_default_config() -> Config { + Config { + service: memory_policy_service_config(), + storage: memory_policy_storage_config(), + providers: memory_policy_providers_config(), + scopes: memory_policy_scopes_config(), + memory: memory_policy_memory_config(), + search: memory_policy_search_config(), + ranking: memory_policy_ranking_config(), + lifecycle: memory_policy_lifecycle_config(), + security: memory_policy_security_config(), + chunking: memory_policy_chunking_config(), + context: None, + mcp: None, + } +} + +fn memory_policy_service_config() -> Service { + Service { + http_bind: "127.0.0.1:8080".to_string(), + mcp_bind: "127.0.0.1:8082".to_string(), + admin_bind: "127.0.0.1:8081".to_string(), + log_level: "info".to_string(), + } +} + +fn memory_policy_storage_config() -> Storage { + Storage { + postgres: Postgres { + dsn: "postgres://user:pass@localhost/db".to_string(), + pool_max_conns: 1, + }, + qdrant: Qdrant { + url: "http://localhost".to_string(), + collection: "mem_notes_v2".to_string(), + docs_collection: "doc_chunks_v1".to_string(), + vector_dim: 4_096, + }, + } +} + +fn memory_policy_providers_config() -> Providers { + Providers { + embedding: embedding_provider_config(), + rerank: rerank_provider_config(), + llm_extractor: llm_extractor_provider_config(), + } +} + +fn embedding_provider_config() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + dimensions: 3, + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +fn rerank_provider_config() -> ProviderConfig { + ProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +fn llm_extractor_provider_config() -> LlmProviderConfig { + LlmProviderConfig { + provider_id: "p".to_string(), + api_base: "http://localhost".to_string(), + api_key: "key".to_string(), + path: "/".to_string(), + model: "m".to_string(), + temperature: 0.1, + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +fn memory_policy_scopes_config() -> Scopes { + Scopes { + allowed: vec!["agent_private".to_string()], + read_profiles: ReadProfiles { + private_only: vec!["agent_private".to_string()], + private_plus_project: vec!["agent_private".to_string()], + all_scopes: vec!["agent_private".to_string()], + }, + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { + agent_private: true, + project_shared: true, + org_shared: true, + }, + } +} + +fn memory_policy_memory_config() -> Memory { + Memory { + max_notes_per_add_event: 3, + max_note_chars: 240, + dup_sim_threshold: 0.92, + update_sim_threshold: 0.85, + candidate_k: 60, + top_k: 12, + policy: MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: Some(0.1), + }, + MemoryPolicyRule { + note_type: Some("preference".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.75), + min_importance: None, + }, + MemoryPolicyRule { + note_type: Some("preference".to_string()), + scope: None, + min_confidence: Some(0.6), + min_importance: None, + }, + MemoryPolicyRule { + note_type: None, + scope: None, + min_confidence: None, + min_importance: None, + }, + ], + }, + } +} + +fn memory_policy_search_config() -> Search { + Search { + expansion: SearchExpansion { + mode: "off".to_string(), + max_queries: 4, + include_original: true, + }, + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { + enabled: true, + expansion_ttl_days: 7, + rerank_ttl_days: 7, + max_payload_bytes: Some(262_144), + }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, + } +} + +fn memory_policy_ranking_config() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + }, + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } +} + +fn memory_policy_lifecycle_config() -> Lifecycle { + Lifecycle { + ttl_days: TtlDays { + plan: 14, + fact: 180, + preference: 0, + constraint: 0, + decision: 0, + profile: 0, + }, + purge_deleted_after_days: 30, + purge_deprecated_after_days: 180, + } +} + +fn memory_policy_security_config() -> Security { + Security { + bind_localhost_only: true, + reject_non_english: true, + redact_secrets_on_write: true, + evidence_min_quotes: 1, + evidence_max_quotes: 2, + evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], + } +} + +fn memory_policy_chunking_config() -> Chunking { + Chunking { + enabled: true, + max_tokens: 512, + overlap_tokens: 128, + tokenizer_repo: "REPLACE_ME".to_string(), + } +} +#[test] +fn selects_note_type_and_scope_rule_before_note_type() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: None, + min_confidence: Some(0.2), + min_importance: None, + }, + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: None, + }, + MemoryPolicyRule { + note_type: None, + scope: Some("agent_private".to_string()), + min_confidence: Some(0.0), + min_importance: None, + }, + ], + }); + let MemoryPolicyEvaluation { decision, matched_rule } = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(decision, MemoryPolicyDecision::Ignore); + assert!(matched_rule.is_some()); + assert_eq!(matched_rule.unwrap().note_type.as_deref(), Some("fact")); + assert_eq!(matched_rule.unwrap().scope.as_deref(), Some("agent_private")); + assert_eq!(matched_rule.unwrap().min_confidence, Some(0.9)); +} + +#[test] +fn downgrades_only_remember_or_update() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: None, + }], + }); + let remember = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(remember.decision, MemoryPolicyDecision::Ignore); + + let update = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Update, + ); + + assert_eq!(update.decision, MemoryPolicyDecision::Ignore); + + let ignored = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Ignore, + ); + + assert_eq!(ignored.decision, MemoryPolicyDecision::Ignore); + + let rejected = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.5, + MemoryPolicyDecision::Reject, + ); + + assert_eq!(rejected.decision, MemoryPolicyDecision::Reject); +} + +#[test] +fn note_type_only_beats_scope_only() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: None, + scope: Some("agent_private".to_string()), + min_confidence: Some(0.1), + min_importance: None, + }, + MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: None, + min_confidence: Some(0.1), + min_importance: None, + }, + ], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.2, + 0.0, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); + assert_eq!(output.matched_rule.and_then(|rule| rule.note_type.as_deref()), Some("fact")); + assert_eq!(output.matched_rule.and_then(|rule| rule.scope.as_deref()), None); +} + +#[test] +fn scope_only_beats_fallback_none() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![ + MemoryPolicyRule { + note_type: None, + scope: None, + min_confidence: Some(0.1), + min_importance: None, + }, + MemoryPolicyRule { + note_type: None, + scope: Some("agent_private".to_string()), + min_confidence: Some(0.1), + min_importance: None, + }, + ], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.2, + 0.0, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); + assert_eq!(output.matched_rule.and_then(|rule| rule.note_type.as_deref()), None); + assert_eq!(output.matched_rule.and_then(|rule| rule.scope.as_deref()), Some("agent_private")); +} + +#[test] +fn confidence_meets_minimum_is_not_a_downgrade() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.5), + min_importance: None, + }], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.5, + 0.0, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); +} + +#[test] +fn importance_meets_minimum_is_not_a_downgrade() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: None, + min_importance: Some(0.7), + }], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.0, + 0.7, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); +} + +#[test] +fn non_finite_metrics_fail_threshold() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: Some(0.9), + min_importance: None, + }], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + f64::NAN, + 0.5, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Ignore); +} + +#[test] +fn missing_threshold_does_not_change_decision() { + let cfg = memory_policy_config(MemoryPolicy { + rules: vec![MemoryPolicyRule { + note_type: Some("fact".to_string()), + scope: Some("agent_private".to_string()), + min_confidence: None, + min_importance: None, + }], + }); + let output = memory_policy::evaluate_memory_policy( + &cfg, + "fact", + "agent_private", + 0.0, + 0.0, + MemoryPolicyDecision::Remember, + ); + + assert_eq!(output.decision, MemoryPolicyDecision::Remember); +} diff --git a/packages/elf-providers/Cargo.toml b/packages/elf-providers/Cargo.toml index 1b4e46e1..3e4dea34 100644 --- a/packages/elf-providers/Cargo.toml +++ b/packages/elf-providers/Cargo.toml @@ -1,12 +1,12 @@ [package] edition = "2024" name = "elf-providers" -version = "0.1.0" +version = "0.2.0" [dependencies] -color-eyre = { workspace = true } -elf-config = { path = "../elf-config" } +blake3 = { workspace = true } reqwest = { workspace = true } -serde = { workspace = true } serde_json = { workspace = true } -tokio = { workspace = true } +thiserror = { workspace = true } + +elf-config = { workspace = true } diff --git a/packages/elf-providers/src/embedding.rs b/packages/elf-providers/src/embedding.rs index be1c0f13..5c7cf50e 100644 --- a/packages/elf-providers/src/embedding.rs +++ b/packages/elf-providers/src/embedding.rs @@ -1,16 +1,22 @@ -// std -use std::time::Duration as StdDuration; +//! Embedding-provider client helpers. + +use std::time::Duration; -// crates.io -use color_eyre::{Result, eyre}; use reqwest::Client; use serde_json::Value; -pub async fn embed( - cfg: &elf_config::EmbeddingProviderConfig, - texts: &[String], -) -> Result<Vec<Vec<f32>>> { - let client = Client::builder().timeout(StdDuration::from_millis(cfg.timeout_ms)).build()?; +use crate::{Error, Result}; +use elf_config::EmbeddingProviderConfig; + +/// Embeds texts with the configured provider or local fallback implementation. +pub async fn embed(cfg: &EmbeddingProviderConfig, texts: &[String]) -> Result<Vec<Vec<f32>>> { + if cfg.provider_id == "local" { + let dim = cfg.dimensions as usize; + + return Ok(texts.iter().map(|text| local_embed(dim, text)).collect()); + } + + let client = Client::builder().timeout(Duration::from_millis(cfg.timeout_ms)).build()?; let url = format!("{}{}", cfg.api_base, cfg.path); let body = serde_json::json!({ "model": cfg.model, @@ -24,42 +30,115 @@ pub async fn embed( .send() .await?; let json: Value = res.error_for_status()?.json().await?; + parse_embedding_response(json) } -fn parse_embedding_response(json: Value) -> Result<Vec<Vec<f32>>> { - let data = json - .get("data") - .and_then(|v| v.as_array()) - .ok_or_else(|| eyre::eyre!("Embedding response is missing data array."))?; +fn local_embed(dim: usize, text: &str) -> Vec<f32> { + let mut vec = vec![0.0_f32; dim]; + + if dim == 0 { + return vec; + } + + let normalized = normalize_ascii_alnum_lowercase(text); + + for token in normalized.split_whitespace() { + if token.len() < 2 { + continue; + } + + let hash = blake3::hash(token.as_bytes()); + let bytes = hash.as_bytes(); + let index = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + let sign = if bytes[4] & 1 == 0 { 1.0 } else { -1.0 }; + + vec[index] += sign; + } + + if vec.iter().all(|value| *value == 0.0) { + let hash = blake3::hash(text.as_bytes()); + let bytes = hash.as_bytes(); + let index = (u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize) % dim; + + vec[index] = 1.0; + } + + l2_normalize(&mut vec); + + vec +} + +fn normalize_ascii_alnum_lowercase(text: &str) -> String { + let mut normalized = String::with_capacity(text.len()); + + for ch in text.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + normalized +} +fn l2_normalize(vec: &mut [f32]) { + let mut norm = 0.0_f32; + + for value in vec.iter() { + norm += value * value; + } + + if norm <= 0.0 { + return; + } + + let inv = 1.0 / norm.sqrt(); + + for value in vec.iter_mut() { + *value *= inv; + } +} + +fn parse_embedding_response(json: Value) -> Result<Vec<Vec<f32>>> { + let data = json.get("data").and_then(|v| v.as_array()).ok_or_else(|| { + Error::InvalidResponse { message: "Embedding response is missing data array.".to_string() } + })?; let mut indexed: Vec<(usize, Vec<f32>)> = Vec::with_capacity(data.len()); + for (fallback_index, item) in data.iter().enumerate() { let index = item .get("index") .and_then(|v| v.as_u64()) .map(|v| v as usize) .unwrap_or(fallback_index); - let embedding = item - .get("embedding") - .and_then(|v| v.as_array()) - .ok_or_else(|| eyre::eyre!("Embedding item missing embedding array."))?; + let embedding = item.get("embedding").and_then(|v| v.as_array()).ok_or_else(|| { + Error::InvalidResponse { + message: "Embedding item missing embedding array.".to_string(), + } + })?; let mut vec = Vec::with_capacity(embedding.len()); + for value in embedding { - let number = - value.as_f64().ok_or_else(|| eyre::eyre!("Embedding value must be numeric."))?; + let number = value.as_f64().ok_or_else(|| Error::InvalidResponse { + message: "Embedding value must be numeric.".to_string(), + })?; + vec.push(number as f32); } + indexed.push((index, vec)); } indexed.sort_by_key(|(index, _)| *index); + Ok(indexed.into_iter().map(|(_, vec)| vec).collect()) } #[cfg(test)] mod tests { - use super::*; + use crate::embedding::{self}; #[test] fn parses_embeddings_in_index_order() { @@ -69,9 +148,37 @@ mod tests { { "index": 0, "embedding": [0.5, 1.5] } ] }); - let parsed = parse_embedding_response(json).expect("parse failed"); + let parsed = embedding::parse_embedding_response(json).expect("parse failed"); + assert_eq!(parsed.len(), 2); assert_eq!(parsed[0], vec![0.5, 1.5]); assert_eq!(parsed[1], vec![2.0, 3.0]); } + + #[test] + fn local_embedding_is_deterministic_and_has_expected_dimension() { + let a = embedding::local_embed(64, "Embeddings are stored in Postgres."); + let b = embedding::local_embed(64, "Embeddings are stored in Postgres."); + + assert_eq!(a.len(), 64); + assert_eq!(a, b); + } + + #[test] + fn local_embedding_is_more_similar_for_shared_tokens() { + let a = embedding::local_embed(512, "alpha beta"); + let b = embedding::local_embed(512, "alpha gamma"); + let c = embedding::local_embed(512, "delta epsilon"); + let sim_ab = dot(&a, &b); + let sim_ac = dot(&a, &c); + + assert!( + sim_ab > sim_ac, + "Expected shared-token similarity to be higher. sim_ab={sim_ab}, sim_ac={sim_ac}" + ); + } + + fn dot(a: &[f32], b: &[f32]) -> f32 { + a.iter().zip(b.iter()).map(|(x, y)| x * y).sum() + } } diff --git a/packages/elf-providers/src/error.rs b/packages/elf-providers/src/error.rs new file mode 100644 index 00000000..f42b52dc --- /dev/null +++ b/packages/elf-providers/src/error.rs @@ -0,0 +1,31 @@ +/// Result alias for provider adapters. +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Errors returned by provider adapters. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// HTTP transport or response decoding error from `reqwest`. + #[error(transparent)] + Reqwest(#[from] reqwest::Error), + /// JSON encode or decode failure. + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + /// Invalid HTTP header name in provider config. + #[error(transparent)] + InvalidHeaderName(#[from] reqwest::header::InvalidHeaderName), + /// Invalid HTTP header value in provider config. + #[error(transparent)] + InvalidHeaderValue(#[from] reqwest::header::InvalidHeaderValue), + /// Local provider configuration was invalid. + #[error("{message}")] + InvalidConfig { + /// Human-readable configuration error. + message: String, + }, + /// Provider response shape was invalid. + #[error("{message}")] + InvalidResponse { + /// Human-readable response validation error. + message: String, + }, +} diff --git a/packages/elf-providers/src/extractor.rs b/packages/elf-providers/src/extractor.rs index 98542c79..905382c1 100644 --- a/packages/elf-providers/src/extractor.rs +++ b/packages/elf-providers/src/extractor.rs @@ -1,13 +1,16 @@ -// std -use std::time::Duration as StdDuration; +//! LLM extraction-provider client helpers. + +use std::time::Duration; -// crates.io -use color_eyre::{Result, eyre}; use reqwest::Client; use serde_json::Value; -pub async fn extract(cfg: &elf_config::LlmProviderConfig, messages: &[Value]) -> Result<Value> { - let client = Client::builder().timeout(StdDuration::from_millis(cfg.timeout_ms)).build()?; +use crate::{Error, Result}; +use elf_config::LlmProviderConfig; + +/// Calls the configured extractor provider and returns parsed JSON content. +pub async fn extract(cfg: &LlmProviderConfig, messages: &[Value]) -> Result<Value> { + let client = Client::builder().timeout(Duration::from_millis(cfg.timeout_ms)).build()?; let url = format!("{}{}", cfg.api_base, cfg.path); for _ in 0..3 { @@ -23,12 +26,13 @@ pub async fn extract(cfg: &elf_config::LlmProviderConfig, messages: &[Value]) -> .send() .await?; let json: Value = res.error_for_status()?.json().await?; + if let Ok(parsed) = parse_extractor_json(json) { return Ok(parsed); } } - Err(eyre::eyre!("Extractor response is not valid JSON.")) + Err(Error::InvalidResponse { message: "Extractor response is not valid JSON.".to_string() }) } fn parse_extractor_json(json: Value) -> Result<Value> { @@ -40,8 +44,10 @@ fn parse_extractor_json(json: Value) -> Result<Value> { .and_then(|msg| msg.get("content")) .and_then(|c| c.as_str()) { - let parsed: Value = serde_json::from_str(content) - .map_err(|_| eyre::eyre!("Extractor content is not valid JSON."))?; + let parsed: Value = serde_json::from_str(content).map_err(|_| Error::InvalidResponse { + message: "Extractor content is not valid JSON.".to_string(), + })?; + return Ok(parsed); } @@ -49,12 +55,14 @@ fn parse_extractor_json(json: Value) -> Result<Value> { return Ok(json); } - Err(eyre::eyre!("Extractor response is missing JSON content.")) + Err(Error::InvalidResponse { + message: "Extractor response is missing JSON content.".to_string(), + }) } #[cfg(test)] mod tests { - use super::*; + use crate::extractor; #[test] fn parses_choice_content_json() { @@ -63,7 +71,8 @@ mod tests { { "message": { "content": "{\"notes\": []}" } } ] }); - let parsed = parse_extractor_json(json).expect("parse failed"); + let parsed = extractor::parse_extractor_json(json).expect("parse failed"); + assert!(parsed.get("notes").is_some()); } } diff --git a/packages/elf-providers/src/lib.rs b/packages/elf-providers/src/lib.rs index ece51866..a8adbf90 100644 --- a/packages/elf-providers/src/lib.rs +++ b/packages/elf-providers/src/lib.rs @@ -1,20 +1,32 @@ +//! Provider adapters for embedding, rerank, and extraction requests. + pub mod embedding; pub mod extractor; pub mod rerank; -// crates.io -use color_eyre::{Result, eyre}; -use reqwest::header::{AUTHORIZATION, HeaderMap, HeaderName}; +mod error; + +pub use error::{Error, Result}; + +use reqwest::header::{ACCEPT_ENCODING, AUTHORIZATION, HeaderMap, HeaderName, HeaderValue}; use serde_json::{Map, Value}; +/// Builds authenticated request headers for provider API calls. pub fn auth_headers(api_key: &str, default_headers: &Map<String, Value>) -> Result<HeaderMap> { let mut headers = HeaderMap::new(); + headers.insert(AUTHORIZATION, format!("Bearer {api_key}").parse()?); + headers.insert(ACCEPT_ENCODING, HeaderValue::from_static("identity")); + for (key, value) in default_headers { let Some(raw) = value.as_str() else { - return Err(eyre::eyre!("Default header values must be strings.")); + return Err(Error::InvalidConfig { + message: "Default header values must be strings.".to_string(), + }); }; + headers.insert(HeaderName::from_bytes(key.as_bytes())?, raw.parse()?); } + Ok(headers) } diff --git a/packages/elf-providers/src/rerank.rs b/packages/elf-providers/src/rerank.rs index 9e64d63c..652abe09 100644 --- a/packages/elf-providers/src/rerank.rs +++ b/packages/elf-providers/src/rerank.rs @@ -1,17 +1,55 @@ -// std -use std::time::Duration as StdDuration; +//! Rerank-provider client helpers. + +use std::{ + collections::HashSet, + sync::atomic::{AtomicU64, Ordering}, + time::Duration, +}; -// crates.io -use color_eyre::{Result, eyre}; use reqwest::Client; use serde_json::Value; -pub async fn rerank( - cfg: &elf_config::ProviderConfig, - query: &str, - docs: &[String], -) -> Result<Vec<f32>> { - let client = Client::builder().timeout(StdDuration::from_millis(cfg.timeout_ms)).build()?; +use crate::{Error, Result}; +use elf_config::ProviderConfig; + +static LOCAL_NOISE_CALL_COUNTER: AtomicU64 = AtomicU64::new(0); + +struct XorShift64 { + state: u64, +} +impl XorShift64 { + fn new(seed: u64) -> Self { + let state = if seed == 0 { 0x4D59_5DF4_D0F3_3173 } else { seed }; + + Self { state } + } + + fn next_u64(&mut self) -> u64 { + let mut x = self.state; + + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + self.state = x; + + x + } + + fn next_f32(&mut self) -> f32 { + // Map to [0, 1). Keep 24 bits of precision for a stable f32. + let bits = (self.next_u64() >> 40) as u32; + + (bits as f32) / ((1_u32 << 24) as f32) + } +} + +/// Reranks documents with the configured provider or local fallback implementation. +pub async fn rerank(cfg: &ProviderConfig, query: &str, docs: &[String]) -> Result<Vec<f32>> { + if cfg.provider_id == "local" { + return Ok(local_rerank_dispatch(cfg.model.as_str(), query, docs)); + } + + let client = Client::builder().timeout(Duration::from_millis(cfg.timeout_ms)).build()?; let url = format!("{}{}", cfg.api_base, cfg.path); let body = serde_json::json!({ "model": cfg.model, "query": query, "documents": docs }); let res = client @@ -21,27 +59,123 @@ pub async fn rerank( .send() .await?; let json: Value = res.error_for_status()?.json().await?; + parse_rerank_response(json, docs.len()) } +fn local_rerank_dispatch(model: &str, query: &str, docs: &[String]) -> Vec<f32> { + if let Some(noise_std) = parse_local_noisy_model(model) { + return local_rerank_noisy(query, docs, noise_std); + } + + local_rerank(query, docs) +} + +fn parse_local_noisy_model(model: &str) -> Option<f32> { + let prefix = "local-token-overlap-noisy@"; + let rest = model.strip_prefix(prefix)?; + let std: f32 = rest.parse().ok()?; + + Some(std.max(0.0)) +} + +fn local_rerank(query: &str, docs: &[String]) -> Vec<f32> { + let query_tokens = tokenize_ascii_alnum(query); + + if query_tokens.is_empty() { + return vec![0.0; docs.len()]; + } + + let denom = query_tokens.len() as f32; + let mut scores = Vec::with_capacity(docs.len()); + + for doc in docs { + let doc_tokens = tokenize_ascii_alnum(doc); + let matched = query_tokens.intersection(&doc_tokens).count() as f32; + + scores.push(matched / denom); + } + + scores +} + +fn local_rerank_noisy(query: &str, docs: &[String], noise_std: f32) -> Vec<f32> { + let base = local_rerank(query, docs); + + if noise_std <= 0.0 { + return base; + } + + let query_hash = blake3::hash(query.as_bytes()); + let mut seed_bytes = [0_u8; 8]; + + seed_bytes.copy_from_slice(&query_hash.as_bytes()[..8]); + // Vary the noise across calls to simulate reranker instability. + + let call_idx = LOCAL_NOISE_CALL_COUNTER.fetch_add(1, Ordering::Relaxed); + let mut seed = u64::from_le_bytes(seed_bytes); + + seed ^= call_idx.wrapping_mul(0x9E37_79B9_7F4A_7C15); + + let mut out = Vec::with_capacity(base.len()); + + for (i, score) in base.into_iter().enumerate() { + let mut rng = XorShift64::new(seed ^ (i as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)); + let u = rng.next_f32(); + let signed = (u * 2.0) - 1.0; + let noisy = score + signed * noise_std; + + out.push(noisy.clamp(0.0, 1.0)); + } + + out +} + +fn tokenize_ascii_alnum(text: &str) -> HashSet<String> { + let mut normalized = String::with_capacity(text.len()); + + for ch in text.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + let mut out = HashSet::new(); + + for token in normalized.split_whitespace() { + if token.len() < 2 { + continue; + } + + out.insert(token.to_string()); + } + + out +} + fn parse_rerank_response(json: Value, doc_count: usize) -> Result<Vec<f32>> { - let mut scores = vec![0.0f32; doc_count]; - let results = json - .get("results") - .or_else(|| json.get("data")) - .and_then(|v| v.as_array()) - .ok_or_else(|| eyre::eyre!("Rerank response is missing results array."))?; + let results = + json.get("results").or_else(|| json.get("data")).and_then(|v| v.as_array()).ok_or_else( + || Error::InvalidResponse { + message: "Rerank response is missing results array.".to_string(), + }, + )?; + let mut scores = vec![0.0_f32; doc_count]; for item in results { - let index = item - .get("index") - .and_then(|v| v.as_u64()) - .ok_or_else(|| eyre::eyre!("Rerank result missing index."))? as usize; + let index = item.get("index").and_then(|v| v.as_u64()).ok_or_else(|| { + Error::InvalidResponse { message: "Rerank result missing index.".to_string() } + })? as usize; let score = item .get("relevance_score") .or_else(|| item.get("score")) .and_then(|v| v.as_f64()) - .ok_or_else(|| eyre::eyre!("Rerank result missing score."))? as f32; + .ok_or_else(|| Error::InvalidResponse { + message: "Rerank result missing score.".to_string(), + })? as f32; + if index < scores.len() { scores[index] = score; } @@ -52,7 +186,7 @@ fn parse_rerank_response(json: Value, doc_count: usize) -> Result<Vec<f32>> { #[cfg(test)] mod tests { - use super::*; + use crate::rerank::{self}; #[test] fn aligns_scores_by_index() { @@ -62,7 +196,54 @@ mod tests { { "index": 0, "relevance_score": 0.9 } ] }); - let scores = parse_rerank_response(json, 2).expect("parse failed"); + let scores = rerank::parse_rerank_response(json, 2) + .expect("Rerank response parsing must succeed for the valid JSON fixture."); + assert_eq!(scores, vec![0.9, 0.2]); } + + #[test] + fn local_rerank_scores_match_token_overlap_fraction() { + let scores = + rerank::local_rerank("alpha beta", &[String::from("alpha"), String::from("gamma")]); + + assert_eq!(scores.len(), 2); + assert!((scores[0] - 0.5).abs() < 1e-6, "Unexpected score: {}", scores[0]); + assert_eq!(scores[1], 0.0); + } + + #[test] + fn local_noisy_model_is_detected_and_nonnegative() { + assert_eq!(rerank::parse_local_noisy_model("local-token-overlap"), None); + assert_eq!(rerank::parse_local_noisy_model("local-token-overlap-noisy@0.02"), Some(0.02)); + assert_eq!(rerank::parse_local_noisy_model("local-token-overlap-noisy@-1"), Some(0.0)); + } + + #[test] + fn local_rerank_noisy_varies_across_calls() { + // Use a base score away from 0 and 1 so clamping does not mask noise. + let docs = [String::from("alpha"), String::from("alpha")]; + let first = + rerank::local_rerank_dispatch("local-token-overlap-noisy@0.1", "alpha beta", &docs); + + assert!(first.iter().all(|v| (0.0..=1.0).contains(v))); + + let mut varied = false; + + for _ in 0..32 { + let next = + rerank::local_rerank_dispatch("local-token-overlap-noisy@0.1", "alpha beta", &docs); + + assert_eq!(first.len(), next.len()); + assert!(next.iter().all(|v| (0.0..=1.0).contains(v))); + + if next != first { + varied = true; + + break; + } + } + + assert!(varied, "Expected noisy rerank to vary across calls."); + } } diff --git a/packages/elf-providers/tests/providers.rs b/packages/elf-providers/tests/providers.rs index d10e265e..4838f60f 100644 --- a/packages/elf-providers/tests/providers.rs +++ b/packages/elf-providers/tests/providers.rs @@ -1,4 +1,7 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Integration checks for provider-facing helpers. + use reqwest::header::AUTHORIZATION; use serde_json::Map; @@ -7,5 +10,6 @@ fn builds_bearer_auth_header() { let headers = elf_providers::auth_headers("secret", &Map::new()).expect("Failed to build headers."); let value = headers.get(AUTHORIZATION).expect("Missing authorization header."); + assert_eq!(value, "Bearer secret"); } diff --git a/packages/elf-service/Cargo.toml b/packages/elf-service/Cargo.toml index 8bfd9f29..87c4744a 100644 --- a/packages/elf-service/Cargo.toml +++ b/packages/elf-service/Cargo.toml @@ -1,27 +1,30 @@ [package] edition = "2024" name = "elf-service" -version = "0.1.0" +version = "0.2.0" [dependencies] blake3 = { workspace = true } -color-eyre = { workspace = true } -elf-config = { path = "../elf-config" } -elf-domain = { path = "../elf-domain" } -elf-providers = { path = "../elf-providers" } -elf-storage = { path = "../elf-storage" } qdrant-client = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlx = { workspace = true } +thiserror = { workspace = true } time = { workspace = true } +tokenizers = { workspace = true } tracing = { workspace = true } uuid = { workspace = true } +elf-chunking = { workspace = true } +elf-config = { workspace = true } +elf-domain = { workspace = true } +elf-providers = { workspace = true } +elf-storage = { workspace = true } + [dev-dependencies] -axum = { workspace = true } -elf-chunking = { path = "../elf-chunking" } -elf-testkit = { path = "../elf-testkit" } -tokenizers = { workspace = true } -tokio = { workspace = true } -unicode-segmentation = { workspace = true } +ahash = { workspace = true } +axum = { workspace = true } +tokio = { workspace = true } + +elf-testkit = { workspace = true } +elf-worker = { workspace = true } diff --git a/packages/elf-service/src/access.rs b/packages/elf-service/src/access.rs new file mode 100644 index 00000000..9de99062 --- /dev/null +++ b/packages/elf-service/src/access.rs @@ -0,0 +1,184 @@ +use std::collections::HashSet; + +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::Result; +use elf_storage::models::MemoryNote; + +pub(crate) const ORG_PROJECT_ID: &str = "__org__"; + +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub(crate) struct SharedSpaceGrantKey { + pub(crate) scope: String, + pub(crate) space_owner_agent_id: String, +} + +pub(crate) fn note_read_allowed( + note: &MemoryNote, + requester_agent_id: &str, + allowed_scopes: &[String], + shared_grants: &HashSet<SharedSpaceGrantKey>, + now: OffsetDateTime, +) -> bool { + if note.status != "active" { + return false; + } + if note.expires_at.map(|expires_at| expires_at <= now).unwrap_or(false) { + return false; + } + if !allowed_scopes.iter().any(|scope| scope == ¬e.scope) { + return false; + } + if note.scope == "agent_private" { + return note.agent_id == requester_agent_id; + } + if !is_shared_scope(note.scope.as_str()) { + return false; + } + if note.agent_id == requester_agent_id { + return true; + } + + shared_grants.contains(&SharedSpaceGrantKey { + scope: note.scope.clone(), + space_owner_agent_id: note.agent_id.clone(), + }) +} + +pub(crate) async fn load_shared_read_grants<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + grantee_agent_id: &str, +) -> Result<HashSet<SharedSpaceGrantKey>> +where + E: PgExecutor<'e>, +{ + let rows: Vec<(String, String)> = sqlx::query_as( + "\ +SELECT scope, space_owner_agent_id +FROM memory_space_grants +WHERE tenant_id = $1 + AND project_id = $2 + AND revoked_at IS NULL + AND scope IN ('project_shared', 'org_shared') + AND ( + grantee_kind = 'project' + OR (grantee_kind = 'agent' AND grantee_agent_id = $3) + )", + ) + .bind(tenant_id) + .bind(project_id) + .bind(grantee_agent_id) + .fetch_all(executor) + .await?; + let mut grants = HashSet::with_capacity(rows.len()); + + for (scope, space_owner_agent_id) in rows { + grants.insert(SharedSpaceGrantKey { scope, space_owner_agent_id }); + } + + Ok(grants) +} + +pub(crate) async fn load_shared_read_grants_with_org_shared<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + grantee_agent_id: &str, + org_shared_allowed: bool, +) -> Result<HashSet<SharedSpaceGrantKey>> +where + E: PgExecutor<'e>, +{ + if !org_shared_allowed { + return load_shared_read_grants(executor, tenant_id, project_id, grantee_agent_id).await; + } + + let rows: Vec<(String, String)> = sqlx::query_as( + "\ +SELECT scope, space_owner_agent_id +FROM memory_space_grants +WHERE tenant_id = $1 + AND revoked_at IS NULL + AND ( + (project_id = $2 AND scope = 'project_shared') + OR (scope = 'org_shared' AND project_id = $4) + ) + AND ( + grantee_kind = 'project' + OR (grantee_kind = 'agent' AND grantee_agent_id = $3) + )", + ) + .bind(tenant_id) + .bind(project_id) + .bind(grantee_agent_id) + .bind(ORG_PROJECT_ID) + .fetch_all(executor) + .await?; + let mut grants = HashSet::with_capacity(rows.len()); + + for (scope, space_owner_agent_id) in rows { + grants.insert(SharedSpaceGrantKey { scope, space_owner_agent_id }); + } + + Ok(grants) +} + +pub(crate) async fn ensure_active_project_scope_grant<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + scope: &str, + space_owner_agent_id: &str, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + if !is_shared_scope(scope) { + return Ok(()); + } + + sqlx::query( + "\ +INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id, + granted_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) +ON CONFLICT (tenant_id, project_id, scope, space_owner_agent_id) +WHERE revoked_at IS NULL AND grantee_kind='project' +DO UPDATE +SET + granted_by_agent_id = EXCLUDED.granted_by_agent_id, + granted_at = EXCLUDED.granted_at, + revoked_at = NULL, + revoked_by_agent_id = NULL", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(scope) + .bind(space_owner_agent_id) + .bind("project") + .bind::<Option<&str>>(None) + .bind(space_owner_agent_id) + .bind(OffsetDateTime::now_utc()) + .execute(executor) + .await?; + + Ok(()) +} + +fn is_shared_scope(scope: &str) -> bool { + matches!(scope, "project_shared" | "org_shared") +} diff --git a/packages/elf-service/src/add_event.rs b/packages/elf-service/src/add_event.rs index 846fb61f..a6eb0b80 100644 --- a/packages/elf-service/src/add_event.rs +++ b/packages/elf-service/src/add_event.rs @@ -1,60 +1,111 @@ -// crates.io +//! Event ingestion APIs. + +use serde::{Deserialize, Serialize}; use serde_json::Value; -use time::OffsetDateTime; +use sqlx::{PgConnection, Postgres, Transaction}; +use time::{Duration, OffsetDateTime}; use uuid::Uuid; -// self -use elf_domain::{cjk, evidence, ttl, writegate}; -use elf_storage::models::MemoryNote; - use crate::{ - ElfService, InsertVersionArgs, NoteOp, REJECT_EVIDENCE_MISMATCH, ResolveUpdateArgs, - ServiceError, ServiceResult, UpdateDecision, + ElfService, Error, InsertVersionArgs, NoteOp, REJECT_EVIDENCE_MISMATCH, + REJECT_WRITE_POLICY_MISMATCH, ResolveUpdateArgs, Result, UpdateDecision, + access::{self, ORG_PROJECT_ID}, + graph_ingestion, + ingest_audit::{self, IngestAuditArgs}, + ingestion_profiles::{self, IngestionProfileRef, IngestionProfileSelector}, + structured_fields::{self, StructuredFields}, +}; +use elf_config::Config; +use elf_domain::{ + english_gate, evidence, + memory_policy::{self, MemoryPolicyDecision}, + ttl, + writegate::{self, NoteInput, WritePolicy, WritePolicyAudit, WritePolicyError}, }; +use elf_storage::models::MemoryNote; + +type ProcessedEventOutput = (Vec<EventMessage>, Vec<bool>, Option<Vec<WritePolicyAudit>>); +type AddEventPersistOutput = (AddEventResult, Option<Uuid>); -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +const REJECT_STRUCTURED_INVALID: &str = "REJECT_STRUCTURED_INVALID"; +const IGNORE_DUPLICATE: &str = "IGNORE_DUPLICATE"; +const IGNORE_POLICY_THRESHOLD: &str = "IGNORE_POLICY_THRESHOLD"; + +/// One chat or event message passed to the event extractor. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct EventMessage { + /// Speaker or message role. pub role: String, + /// Message body content. pub content: String, + /// Optional source timestamp string. pub ts: Option<String>, + /// Optional message identifier from the upstream source. pub msg_id: Option<String>, + /// Optional write policy applied before extraction. + pub write_policy: Option<WritePolicy>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Request payload for event-driven note extraction. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddEventRequest { + /// Tenant that owns the request. pub tenant_id: String, + /// Project that owns the request. pub project_id: String, + /// Agent that emitted the event batch. pub agent_id: String, + /// Optional explicit scope override for extracted notes. pub scope: Option<String>, + /// When true, performs validation and extraction without persisting notes. pub dry_run: Option<bool>, + /// Optional ingestion profile selector. + pub ingestion_profile: Option<IngestionProfileSelector>, + /// Source messages to extract notes from. pub messages: Vec<EventMessage>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Per-note outcome for an `add_event` request. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddEventResult { + /// Note identifier when one was created or updated. pub note_id: Option<Uuid>, + /// Persistence operation chosen for the extracted note. pub op: NoteOp, + /// Memory-policy decision applied to the extracted note. + pub policy_decision: MemoryPolicyDecision, + /// Machine-readable rejection or ignore code, if any. pub reason_code: Option<String>, + /// Human-readable rejection or ignore message, if any. pub reason: Option<String>, + /// Field path associated with a validation failure, if any. + pub field_path: Option<String>, + /// Per-message write-policy audits when write policies were applied. + pub write_policy_audits: Option<Vec<WritePolicyAudit>>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for event-driven note extraction. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddEventResponse { + /// Raw structured extractor output after normalization. pub extracted: Value, + /// One result per extracted note. pub results: Vec<AddEventResult>, + /// Resolved ingestion profile used for the request. + pub ingestion_profile: Option<IngestionProfileRef>, } -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct ExtractorOutput { pub notes: Vec<ExtractedNote>, } -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct ExtractedNote { - #[serde(rename = "type")] - pub note_type: Option<String>, + pub r#type: Option<String>, pub key: Option<String>, pub text: Option<String>, + pub structured: Option<StructuredFields>, pub importance: Option<f32>, pub confidence: Option<f32>, pub ttl_days: Option<i64>, @@ -63,372 +114,1340 @@ struct ExtractedNote { pub reason: Option<String>, } -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct EvidenceQuote { pub message_index: usize, pub quote: String, } -impl ElfService { - pub async fn add_event(&self, req: AddEventRequest) -> ServiceResult<AddEventResponse> { - if req.messages.is_empty() { - return Err(ServiceError::InvalidRequest { - message: "Messages list is empty.".to_string(), - }); - } - if req.tenant_id.trim().is_empty() - || req.project_id.trim().is_empty() - || req.agent_id.trim().is_empty() - { - return Err(ServiceError::InvalidRequest { - message: "tenant_id, project_id, and agent_id are required.".to_string(), - }); - } - if let Some(scope) = req.scope.as_ref() - && scope.trim().is_empty() - { - return Err(ServiceError::InvalidRequest { - message: "scope must not be empty when provided.".to_string(), - }); - } +struct NoteProcessingData { + note_type: String, + text: String, + structured: Option<StructuredFields>, + importance: f32, + confidence: f32, + reason: Option<String>, + ttl_days: Option<i64>, + scope: String, + evidence: Vec<EvidenceQuote>, + structured_present: bool, + graph_present: bool, +} +impl NoteProcessingData { + fn from_request_and_note(req: &AddEventRequest, note: &ExtractedNote) -> Self { + let note_type = note.r#type.clone().unwrap_or_default(); + let text = note.text.clone().unwrap_or_default(); + let structured = note.structured.clone(); + let structured_present = + structured.as_ref().is_some_and(|value| !value.is_effectively_empty()); + let graph_present = structured.as_ref().is_some_and(StructuredFields::has_graph_fields); - for (idx, msg) in req.messages.iter().enumerate() { - if cjk::contains_cjk(&msg.content) { - return Err(ServiceError::NonEnglishInput { - field: format!("$.messages[{idx}].content"), - }); - } + Self { + note_type, + text, + structured, + importance: note.importance.unwrap_or(0.0), + confidence: note.confidence.unwrap_or(0.0), + reason: note.reason.clone(), + ttl_days: note.ttl_days, + scope: req.scope.clone().or(note.scope_suggestion.clone()).unwrap_or_default(), + evidence: note.evidence.clone().unwrap_or_default(), + structured_present, + graph_present, } + } +} + +struct PersistExtractedNoteArgs<'a> { + req: &'a AddEventRequest, + project_id: &'a str, + structured: Option<&'a StructuredFields>, + key: Option<&'a str>, + reason: Option<&'a String>, + note_type: &'a str, + text: &'a str, + scope: &'a str, + importance: f32, + confidence: f32, + expires_at: Option<OffsetDateTime>, + source_ref: Value, + now: OffsetDateTime, + embed_version: &'a str, +} + +struct AddEventContext<'a> { + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + scope: &'a str, + now: OffsetDateTime, +} - let messages_json = build_extractor_messages( - &req.messages, +impl ElfService { + /// Extracts notes from an event transcript and optionally persists the accepted results. + pub async fn add_event(&self, req: AddEventRequest) -> Result<AddEventResponse> { + validate_add_event_request(&req)?; + + let resolved_profile = ingestion_profiles::resolve_add_event_profile( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.ingestion_profile.as_ref(), + ) + .await?; + let (messages, message_policy_applied, write_policy_audits) = + apply_write_policies_to_messages(req.messages.as_slice())?; + let message_texts: Vec<String> = + messages.iter().map(|message| message.content.clone()).collect(); + let messages_json = + serde_json::to_string(&messages).map_err(|_| Error::InvalidRequest { + message: "Failed to serialize messages for extractor.".to_string(), + })?; + let extractor_messages = resolved_profile.build_extractor_messages( + &messages_json, self.cfg.memory.max_notes_per_add_event, self.cfg.memory.max_note_chars, )?; - - let extracted_raw = self - .providers - .extractor - .extract(&self.cfg.providers.llm_extractor, &messages_json) - .await?; - + let llm_cfg = resolved_profile.resolved_llm_config(&self.cfg.providers.llm_extractor); + let extracted_raw = self.providers.extractor.extract(&llm_cfg, &extractor_messages).await?; + let max_notes = self.cfg.memory.max_notes_per_add_event as usize; let mut extracted: ExtractorOutput = serde_json::from_value(extracted_raw.clone()) - .map_err(|_| ServiceError::InvalidRequest { + .map_err(|_| Error::InvalidRequest { message: "Extractor output is missing notes array.".to_string(), })?; - let max_notes = self.cfg.memory.max_notes_per_add_event as usize; if extracted.notes.len() > max_notes { extracted.notes.truncate(max_notes); } - let extracted_json = - serde_json::to_value(&extracted).map_err(|_| ServiceError::InvalidRequest { - message: "Failed to serialize extracted notes.".to_string(), - })?; - - let now = OffsetDateTime::now_utc(); + let extracted_json = serde_json::to_value(&extracted).map_err(|_| { + Error::InvalidRequest { message: "Failed to serialize extracted notes.".to_string() } + })?; + let base_now = OffsetDateTime::now_utc(); let embed_version = crate::embedding_version(&self.cfg); let dry_run = req.dry_run.unwrap_or(false); let mut results = Vec::with_capacity(extracted.notes.len()); - let message_texts: Vec<String> = req.messages.iter().map(|m| m.content.clone()).collect(); - - for note in extracted.notes { - let note_type = note.note_type.unwrap_or_default(); - let text = note.text.unwrap_or_default(); - let importance = note.importance.unwrap_or(0.0); - let confidence = note.confidence.unwrap_or(0.0); - let ttl_days = note.ttl_days; - let scope = req.scope.clone().or(note.scope_suggestion.clone()).unwrap_or_default(); - let evidence = note.evidence.unwrap_or_default(); - - if evidence.is_empty() - || evidence.len() < self.cfg.security.evidence_min_quotes as usize - || evidence.len() > self.cfg.security.evidence_max_quotes as usize - { - results.push(AddEventResult { - note_id: None, - op: NoteOp::Rejected, - reason_code: Some(REJECT_EVIDENCE_MISMATCH.to_string()), - reason: note.reason.clone(), - }); - continue; - } - let mut evidence_ok = true; - for quote in &evidence { - if quote.quote.len() > self.cfg.security.evidence_max_quote_chars as usize { - evidence_ok = false; - break; - } - if !evidence::evidence_matches(&message_texts, quote.message_index, "e.quote) { - evidence_ok = false; - break; - } - } + for (note_idx, note) in extracted.notes.into_iter().enumerate() { + let now = base_now + Duration::microseconds(note_idx as i64); - if !evidence_ok { - results.push(AddEventResult { - note_id: None, - op: NoteOp::Rejected, - reason_code: Some(REJECT_EVIDENCE_MISMATCH.to_string()), - reason: note.reason.clone(), - }); - continue; - } + results.push( + self.process_extracted_note( + &req, + &resolved_profile.profile_ref, + &message_texts, + &message_policy_applied, + write_policy_audits.as_ref(), + note, + now, + embed_version.as_str(), + dry_run, + ) + .await?, + ); + } - let gate_input = writegate::NoteInput { - note_type: note_type.clone(), - scope: scope.clone(), - text: text.clone(), - }; - if let Err(code) = writegate::writegate(&gate_input, &self.cfg) { - results.push(AddEventResult { - note_id: None, - op: NoteOp::Rejected, - reason_code: Some(crate::writegate_reason_code(code).to_string()), - reason: note.reason.clone(), - }); - continue; - } + Ok(AddEventResponse { + extracted: extracted_json, + results, + ingestion_profile: Some(resolved_profile.profile_ref), + }) + } + + #[allow(clippy::too_many_arguments)] + async fn process_extracted_note( + &self, + req: &AddEventRequest, + ingestion_profile: &IngestionProfileRef, + message_texts: &[String], + message_policy_applied: &[bool], + write_policy_audits: Option<&Vec<WritePolicyAudit>>, + note: ExtractedNote, + now: OffsetDateTime, + embed_version: &str, + dry_run: bool, + ) -> Result<AddEventResult> { + let note_data = NoteProcessingData::from_request_and_note(req, ¬e); + let effective_project_id = if note_data.scope.trim() == "org_shared" { + ORG_PROJECT_ID + } else { + req.project_id.as_str() + }; + let ctx = AddEventContext { + tenant_id: req.tenant_id.as_str(), + project_id: effective_project_id, + agent_id: req.agent_id.as_str(), + scope: note_data.scope.as_str(), + now, + }; + let mut tx = self.db.pool.begin().await?; + + if let Some(result) = self + .record_extracted_note_rejections( + &mut tx, + &ctx, + ingestion_profile, + ¬e, + ¬e_data, + message_texts, + message_policy_applied, + write_policy_audits, + ) + .await? + { + tx.commit().await?; + + return Ok(result); + } - let expires_at = ttl::compute_expires_at(ttl_days, ¬e_type, &self.cfg, now); - let mut tx = self.db.pool.begin().await?; - let decision = crate::resolve_update( + let result = self + .apply_extracted_note_decision( + req, + ingestion_profile, &mut tx, - ResolveUpdateArgs { - cfg: &self.cfg, - providers: &self.providers, - tenant_id: &req.tenant_id, - project_id: &req.project_id, - agent_id: &req.agent_id, - scope: &scope, - note_type: ¬e_type, - key: note.key.as_deref(), - text: &text, + &ctx, + ¬e, + ¬e_data, + note_data.note_type.as_str(), + effective_project_id, + now, + embed_version, + dry_run, + write_policy_audits, + ) + .await?; + + tx.commit().await?; + + Ok(result) + } + + #[allow(clippy::too_many_arguments)] + async fn apply_extracted_note_decision( + &self, + req: &AddEventRequest, + ingestion_profile: &IngestionProfileRef, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddEventContext<'_>, + note: &ExtractedNote, + note_data: &NoteProcessingData, + note_type: &str, + project_id: &str, + now: OffsetDateTime, + embed_version: &str, + dry_run: bool, + write_policy_audits: Option<&Vec<WritePolicyAudit>>, + ) -> Result<AddEventResult> { + let decision = self.resolve_extracted_note_update(note, req, note_data, tx, now).await?; + let metadata = decision.metadata(); + let base_decision = base_decision_for_update( + &decision, + note_data.structured_present, + note_data.graph_present, + ); + let (policy_decision, decision_policy_rule, min_confidence, min_importance) = + resolve_policy_for_update(&self.cfg, note_data, base_decision); + let ignore_reason_code = + ignore_reason_code_for_policy(base_decision, policy_decision, metadata.matched_dup); + let should_apply = matches!( + policy_decision, + MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update + ); + let mut result = build_result_from_decision( + &decision, + policy_decision, + note_data.reason.clone(), + note_data.structured_present || note_data.graph_present, + ); + + apply_policy_ignore_adjustments( + &mut result, + &decision, + policy_decision, + ignore_reason_code, + ); + + let mut note_version_id = None; + + if should_apply && !dry_run { + let persist_args = PersistExtractedNoteArgs { + req, + project_id, + structured: note_data.structured.as_ref(), + key: note.key.as_deref(), + reason: note.reason.as_ref(), + note_type, + text: note_data.text.as_str(), + scope: note_data.scope.as_str(), + importance: note_data.importance, + confidence: note_data.confidence, + expires_at: ttl::compute_expires_at( + note_data.ttl_days, + note_data.note_type.as_str(), + &self.cfg, now, - }, + ), + source_ref: serde_json::json!({ + "evidence": note_data.evidence.clone(), + "reason": note_data.reason.clone().unwrap_or_default(), + "ingestion_profile": serde_json::json!({ + "id": ingestion_profile.id, + "version": ingestion_profile.version, + }), + }), + now, + embed_version, + }; + let persisted = self + .persist_extracted_note_decision(tx, persist_args, decision, policy_decision) + .await?; + + result = persisted.0; + note_version_id = persisted.1; + } + + result.write_policy_audits = write_policy_audits.cloned(); + + record_ingest_decision( + tx, + &self.cfg, + ctx, + note, + note_data.note_type.as_str(), + result.note_id, + note_version_id, + base_decision, + policy_decision, + result.op, + result.reason_code.as_deref(), + decision_policy_rule.as_deref(), + metadata.similarity_best, + metadata.key_match, + metadata.matched_dup, + min_confidence, + min_importance, + Some(ingestion_profile.id.as_str()), + Some(ingestion_profile.version), + note_data.structured_present, + note_data.graph_present, + write_policy_audits.cloned(), + ) + .await?; + + Ok(result) + } + + #[allow(clippy::too_many_arguments)] + async fn record_extracted_note_rejections( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddEventContext<'_>, + ingestion_profile: &IngestionProfileRef, + note: &ExtractedNote, + note_data: &NoteProcessingData, + message_texts: &[String], + message_policy_applied: &[bool], + write_policy_audits: Option<&Vec<WritePolicyAudit>>, + ) -> Result<Option<AddEventResult>> { + if let Some(result) = reject_extracted_note_if_evidence_invalid( + &self.cfg, + note.reason.as_ref(), + ¬e_data.evidence, + message_texts, + message_policy_applied, + ) { + let mut result = result; + + result.write_policy_audits = write_policy_audits.cloned(); + + record_ingest_decision( + tx, + &self.cfg, + ctx, + note, + note_data.note_type.as_str(), + None, + None, + MemoryPolicyDecision::Reject, + MemoryPolicyDecision::Reject, + NoteOp::Rejected, + result.reason_code.as_deref(), + None, + None, + false, + false, + None, + None, + Some(ingestion_profile.id.as_str()), + Some(ingestion_profile.version), + note_data.structured_present, + note_data.graph_present, + write_policy_audits.cloned(), ) .await?; - if dry_run { - tx.commit().await?; - let (note_id, op) = match decision { - UpdateDecision::Add { note_id } => (Some(note_id), NoteOp::Add), - UpdateDecision::Update { note_id } => (Some(note_id), NoteOp::Update), - UpdateDecision::None { note_id } => (Some(note_id), NoteOp::None), - }; - results.push(AddEventResult { - note_id, - op, - reason_code: None, - reason: note.reason.clone(), - }); - continue; - } + return Ok(Some(result)); + } else if let Some(result) = reject_extracted_note_if_structured_invalid( + note_data.structured.as_ref(), + note_data.text.as_str(), + ¬e_data.evidence, + note.reason.as_ref(), + ) { + let mut result = result; - let source_ref = serde_json::json!({ - "evidence": evidence, - "reason": note.reason.clone().unwrap_or_default(), - }); + result.write_policy_audits = write_policy_audits.cloned(); - match decision { - UpdateDecision::Add { note_id } => { - let memory_note = MemoryNote { - note_id, - tenant_id: req.tenant_id.clone(), - project_id: req.project_id.clone(), - agent_id: req.agent_id.clone(), - scope: scope.clone(), - r#type: note_type.clone(), - key: note.key.clone(), - text: text.clone(), - importance, - confidence, - status: "active".to_string(), - created_at: now, - updated_at: now, - expires_at, - embedding_version: embed_version.clone(), - source_ref, - hit_count: 0, - last_hit_at: None, - }; - - sqlx::query( - "INSERT INTO memory_notes \ - (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", - ) - .bind(memory_note.note_id) - .bind(&memory_note.tenant_id) - .bind(&memory_note.project_id) - .bind(&memory_note.agent_id) - .bind(&memory_note.scope) - .bind(&memory_note.r#type) - .bind(&memory_note.key) - .bind(&memory_note.text) - .bind(memory_note.importance) - .bind(memory_note.confidence) - .bind(&memory_note.status) - .bind(memory_note.created_at) - .bind(memory_note.updated_at) - .bind(memory_note.expires_at) - .bind(&memory_note.embedding_version) - .bind(&memory_note.source_ref) - .bind(memory_note.hit_count) - .bind(memory_note.last_hit_at) - .execute(&mut *tx) - .await?; - - crate::insert_version( - &mut tx, - InsertVersionArgs { - note_id: memory_note.note_id, - op: "ADD", - prev_snapshot: None, - new_snapshot: Some(crate::note_snapshot(&memory_note)), - reason: "add_event", - actor: "add_event", - ts: now, - }, - ) - .await?; - crate::enqueue_outbox_tx( - &mut tx, - memory_note.note_id, - "UPSERT", - &memory_note.embedding_version, - now, - ) - .await?; - tx.commit().await?; - - results.push(AddEventResult { - note_id: Some(note_id), - op: NoteOp::Add, - reason_code: None, - reason: note.reason.clone(), - }); + record_ingest_decision( + tx, + &self.cfg, + ctx, + note, + note_data.note_type.as_str(), + None, + None, + MemoryPolicyDecision::Reject, + MemoryPolicyDecision::Reject, + NoteOp::Rejected, + Some(REJECT_STRUCTURED_INVALID), + None, + None, + false, + false, + None, + None, + Some(ingestion_profile.id.as_str()), + Some(ingestion_profile.version), + note_data.structured_present, + note_data.graph_present, + write_policy_audits.cloned(), + ) + .await?; + + return Ok(Some(result)); + } else if let Some(result) = reject_extracted_note_if_writegate_rejects( + &self.cfg, + note.reason.as_ref(), + note_data.note_type.as_str(), + note_data.scope.as_str(), + note_data.text.as_str(), + ) { + let mut result = result; + + result.write_policy_audits = write_policy_audits.cloned(); + + record_ingest_decision( + tx, + &self.cfg, + ctx, + note, + note_data.note_type.as_str(), + None, + None, + MemoryPolicyDecision::Reject, + MemoryPolicyDecision::Reject, + NoteOp::Rejected, + result.reason_code.as_deref(), + None, + None, + false, + false, + None, + None, + Some(ingestion_profile.id.as_str()), + Some(ingestion_profile.version), + note_data.structured_present, + note_data.graph_present, + write_policy_audits.cloned(), + ) + .await?; + + return Ok(Some(result)); + } + + Ok(None) + } + + async fn resolve_extracted_note_update( + &self, + note: &ExtractedNote, + req: &AddEventRequest, + note_data: &NoteProcessingData, + tx: &mut PgConnection, + now: OffsetDateTime, + ) -> Result<UpdateDecision> { + crate::resolve_update( + tx, + ResolveUpdateArgs { + cfg: &self.cfg, + providers: &self.providers, + tenant_id: req.tenant_id.as_str(), + project_id: if note_data.scope.trim() == "org_shared" { + ORG_PROJECT_ID + } else { + req.project_id.as_str() }, - UpdateDecision::Update { note_id } => { - let mut existing: MemoryNote = - sqlx::query_as("SELECT * FROM memory_notes WHERE note_id = $1 FOR UPDATE") - .bind(note_id) - .fetch_one(&mut *tx) - .await?; - let prev_snapshot = crate::note_snapshot(&existing); - - existing.text = text.clone(); - existing.importance = importance; - existing.confidence = confidence; - existing.updated_at = now; - existing.expires_at = expires_at; - existing.source_ref = source_ref; - - sqlx::query( - "UPDATE memory_notes SET text = $1, importance = $2, confidence = $3, updated_at = $4, expires_at = $5, source_ref = $6 WHERE note_id = $7", - ) - .bind(&existing.text) - .bind(existing.importance) - .bind(existing.confidence) - .bind(existing.updated_at) - .bind(existing.expires_at) - .bind(&existing.source_ref) - .bind(existing.note_id) - .execute(&mut *tx) - .await?; - - crate::insert_version( - &mut tx, - InsertVersionArgs { - note_id: existing.note_id, - op: "UPDATE", - prev_snapshot: Some(prev_snapshot), - new_snapshot: Some(crate::note_snapshot(&existing)), - reason: "add_event", - actor: "add_event", - ts: now, - }, - ) - .await?; - crate::enqueue_outbox_tx( - &mut tx, - existing.note_id, - "UPSERT", - &existing.embedding_version, - now, - ) + agent_id: req.agent_id.as_str(), + scope: note_data.scope.as_str(), + note_type: note_data.note_type.as_str(), + key: note.key.as_deref(), + text: note_data.text.as_str(), + now, + }, + ) + .await + } + + async fn persist_extracted_note_decision( + &self, + tx: &mut Transaction<'_, Postgres>, + args: PersistExtractedNoteArgs<'_>, + decision: UpdateDecision, + policy_decision: MemoryPolicyDecision, + ) -> Result<AddEventPersistOutput> { + match (decision, args) { + (UpdateDecision::Add { note_id, .. }, args) => + self.persist_extracted_note_add(tx, args, note_id, policy_decision).await, + (UpdateDecision::Update { note_id, .. }, args) => + self.persist_extracted_note_update(tx, args, note_id, policy_decision).await, + (UpdateDecision::None { note_id, .. }, args) => + self.persist_extracted_note_none(tx, args, note_id, policy_decision).await, + } + } + + async fn persist_extracted_note_add( + &self, + tx: &mut Transaction<'_, Postgres>, + args: PersistExtractedNoteArgs<'_>, + note_id: Uuid, + policy_decision: MemoryPolicyDecision, + ) -> Result<AddEventPersistOutput> { + access::ensure_active_project_scope_grant( + &mut **tx, + args.req.tenant_id.as_str(), + args.project_id, + args.scope, + args.req.agent_id.as_str(), + ) + .await?; + + let memory_note = MemoryNote { + note_id, + tenant_id: args.req.tenant_id.clone(), + project_id: args.project_id.to_string(), + agent_id: args.req.agent_id.clone(), + scope: args.scope.to_string(), + r#type: args.note_type.to_string(), + key: args.key.map(ToString::to_string), + text: args.text.to_string(), + importance: args.importance, + confidence: args.confidence, + status: "active".to_string(), + created_at: args.now, + updated_at: args.now, + expires_at: args.expires_at, + embedding_version: args.embed_version.to_string(), + source_ref: args.source_ref, + hit_count: 0, + last_hit_at: None, + }; + + insert_memory_note_tx(tx, &memory_note).await?; + + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id: memory_note.note_id, + op: "ADD", + prev_snapshot: None, + new_snapshot: Some(crate::note_snapshot(&memory_note)), + reason: "add_event", + actor: args.req.agent_id.as_str(), + ts: args.now, + }, + ) + .await?; + + crate::enqueue_outbox_tx( + &mut **tx, + memory_note.note_id, + "UPSERT", + args.embed_version, + args.now, + ) + .await?; + + upsert_structured_fields_tx(tx, args.structured, memory_note.note_id, args.now).await?; + + if let Some(structured) = args.structured + && structured.has_graph_fields() + { + graph_ingestion::persist_graph_fields_tx( + tx, + args.req.tenant_id.as_str(), + args.project_id, + args.req.agent_id.as_str(), + args.scope, + memory_note.note_id, + structured, + args.now, + ) + .await?; + } + + Ok(( + AddEventResult { + note_id: Some(note_id), + op: NoteOp::Add, + policy_decision, + reason_code: None, + reason: args.reason.cloned(), + field_path: None, + write_policy_audits: None, + }, + Some(note_version_id), + )) + } + + async fn persist_extracted_note_update( + &self, + tx: &mut Transaction<'_, Postgres>, + args: PersistExtractedNoteArgs<'_>, + note_id: Uuid, + policy_decision: MemoryPolicyDecision, + ) -> Result<AddEventPersistOutput> { + let mut existing: MemoryNote = sqlx::query_as::<_, MemoryNote>( + "SELECT * FROM memory_notes WHERE note_id = $1 FOR UPDATE", + ) + .bind(note_id) + .fetch_one(&mut **tx) + .await?; + + access::ensure_active_project_scope_grant( + &mut **tx, + existing.tenant_id.as_str(), + existing.project_id.as_str(), + existing.scope.as_str(), + existing.agent_id.as_str(), + ) + .await?; + + let prev_snapshot = crate::note_snapshot(&existing); + + existing.text = args.text.to_string(); + existing.importance = args.importance; + existing.confidence = args.confidence; + existing.updated_at = args.now; + existing.expires_at = args.expires_at; + existing.source_ref = args.source_ref; + + update_memory_note_tx(tx, &existing).await?; + + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id: existing.note_id, + op: "UPDATE", + prev_snapshot: Some(prev_snapshot), + new_snapshot: Some(crate::note_snapshot(&existing)), + reason: "add_event", + actor: args.req.agent_id.as_str(), + ts: args.now, + }, + ) + .await?; + + crate::enqueue_outbox_tx( + &mut **tx, + existing.note_id, + "UPSERT", + existing.embedding_version.as_str(), + args.now, + ) + .await?; + + upsert_structured_fields_tx(tx, args.structured, existing.note_id, args.now).await?; + + if let Some(structured) = args.structured + && structured.has_graph_fields() + { + graph_ingestion::persist_graph_fields_tx( + tx, + args.req.tenant_id.as_str(), + existing.project_id.as_str(), + args.req.agent_id.as_str(), + args.scope, + existing.note_id, + structured, + args.now, + ) + .await?; + } + + Ok(( + AddEventResult { + note_id: Some(note_id), + op: NoteOp::Update, + policy_decision, + reason_code: None, + reason: args.reason.cloned(), + field_path: None, + write_policy_audits: None, + }, + Some(note_version_id), + )) + } + + async fn persist_extracted_note_none( + &self, + tx: &mut Transaction<'_, Postgres>, + args: PersistExtractedNoteArgs<'_>, + note_id: Uuid, + policy_decision: MemoryPolicyDecision, + ) -> Result<AddEventPersistOutput> { + let mut did_update = false; + + if let Some(structured) = args.structured + && !structured.is_effectively_empty() + { + structured_fields::upsert_structured_fields_tx(tx, note_id, structured, args.now) + .await?; + crate::enqueue_outbox_tx(&mut **tx, note_id, "UPSERT", args.embed_version, args.now) + .await?; + + did_update = true; + } + if let Some(structured) = args.structured + && structured.has_graph_fields() + { + graph_ingestion::persist_graph_fields_tx( + tx, + args.req.tenant_id.as_str(), + args.project_id, + args.req.agent_id.as_str(), + args.scope, + note_id, + structured, + args.now, + ) + .await?; + + did_update = true; + } + + if did_update { + let note_row: MemoryNote = + sqlx::query_as("SELECT * FROM memory_notes WHERE note_id = $1") + .bind(note_id) + .fetch_one(&mut **tx) .await?; - tx.commit().await?; - - results.push(AddEventResult { - note_id: Some(note_id), - op: NoteOp::Update, - reason_code: None, - reason: note.reason.clone(), - }); - }, - UpdateDecision::None { note_id } => { - tx.commit().await?; - results.push(AddEventResult { - note_id: Some(note_id), - op: NoteOp::None, - reason_code: None, - reason: note.reason.clone(), - }); + let snapshot = crate::note_snapshot(¬e_row); + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id, + op: "UPDATE", + prev_snapshot: Some(snapshot.clone()), + new_snapshot: Some(snapshot), + reason: "add_event_structured", + actor: args.req.agent_id.as_str(), + ts: args.now, }, + ) + .await?; + + if matches!(args.scope, "project_shared" | "org_shared") { + access::ensure_active_project_scope_grant( + &mut **tx, + args.req.tenant_id.as_str(), + args.project_id, + args.scope, + args.req.agent_id.as_str(), + ) + .await?; } + + return Ok(( + AddEventResult { + note_id: Some(note_id), + op: NoteOp::Update, + policy_decision, + reason_code: None, + reason: args.reason.cloned(), + field_path: None, + write_policy_audits: None, + }, + Some(note_version_id), + )); } - Ok(AddEventResponse { extracted: extracted_json, results }) + Ok(( + AddEventResult { + note_id: Some(note_id), + op: NoteOp::None, + policy_decision, + reason_code: None, + reason: args.reason.cloned(), + field_path: None, + write_policy_audits: None, + }, + None, + )) } } -fn build_extractor_messages( - messages: &[EventMessage], - max_notes: u32, - max_note_chars: u32, -) -> ServiceResult<Vec<Value>> { - let schema = serde_json::json!({ - "notes": [ - { - "type": "preference|constraint|decision|profile|fact|plan", - "key": "string|null", - "text": "English-only sentence <= MAX_NOTE_CHARS", - "importance": 0.0, - "confidence": 0.0, - "ttl_days": "number|null", - "scope_suggestion": "agent_private|project_shared|org_shared|null", - "evidence": [ - { "message_index": "number", "quote": "string" } - ], - "reason": "string" - } - ] - }); - - let system_prompt = "You are a memory extraction engine for an agent memory system. \ -Output must be valid JSON only and must match the provided schema exactly. \ -Extract at most MAX_NOTES high-signal, cross-session reusable memory notes from the given messages. \ -Each note must be one English sentence and must not contain any CJK characters. \ -Preserve numbers, dates, percentages, currency amounts, tickers, URLs, and code snippets exactly. \ -Never store secrets or PII: API keys, tokens, private keys, seed phrases, passwords, bank IDs, personal addresses. \ -For every note, provide 1 to 2 evidence quotes copied verbatim from the input messages and include the message_index. \ -If you cannot provide verbatim evidence, omit the note. \ -If content is ephemeral or not useful long-term, return an empty notes array."; - - let messages_json = - serde_json::to_string(messages).map_err(|_| ServiceError::InvalidRequest { - message: "Failed to serialize messages for extractor.".to_string(), - })?; +fn resolve_policy_for_update( + cfg: &Config, + note_data: &NoteProcessingData, + base_decision: MemoryPolicyDecision, +) -> (MemoryPolicyDecision, Option<String>, Option<f32>, Option<f32>) { + if matches!(base_decision, MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update) { + let policy_eval = memory_policy::evaluate_memory_policy( + cfg, + note_data.note_type.as_str(), + note_data.scope.as_str(), + note_data.confidence as f64, + note_data.importance as f64, + base_decision, + ); + let decision_policy_rule = policy_eval + .matched_rule + .and_then(|rule| policy_rule_id(rule.note_type.as_deref(), rule.scope.as_deref())); + let min_confidence = policy_eval.matched_rule.and_then(|rule| rule.min_confidence); + let min_importance = policy_eval.matched_rule.and_then(|rule| rule.min_importance); + + (policy_eval.decision, decision_policy_rule, min_confidence, min_importance) + } else { + (MemoryPolicyDecision::Ignore, None, None, None) + } +} + +fn ignore_reason_code_for_policy( + base_decision: MemoryPolicyDecision, + policy_decision: MemoryPolicyDecision, + matched_duplicate: bool, +) -> Option<&'static str> { + if !matches!(policy_decision, MemoryPolicyDecision::Ignore) { + return None; + } + + match base_decision { + MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update => + Some(IGNORE_POLICY_THRESHOLD), + MemoryPolicyDecision::Ignore if matched_duplicate => Some(IGNORE_DUPLICATE), + _ => None, + } +} + +fn build_result_from_decision( + decision: &UpdateDecision, + policy_decision: MemoryPolicyDecision, + reason: Option<String>, + structured_present: bool, +) -> AddEventResult { + match decision { + UpdateDecision::Add { note_id, .. } => AddEventResult { + note_id: Some(*note_id), + op: NoteOp::Add, + policy_decision, + reason_code: None, + reason, + field_path: None, + write_policy_audits: None, + }, + UpdateDecision::Update { note_id, .. } => AddEventResult { + note_id: Some(*note_id), + op: NoteOp::Update, + policy_decision, + reason_code: None, + reason, + field_path: None, + write_policy_audits: None, + }, + UpdateDecision::None { note_id, .. } => AddEventResult { + note_id: Some(*note_id), + op: if structured_present { NoteOp::Update } else { NoteOp::None }, + policy_decision, + reason_code: None, + reason, + field_path: None, + write_policy_audits: None, + }, + } +} + +fn apply_policy_ignore_adjustments( + result: &mut AddEventResult, + decision: &UpdateDecision, + policy_decision: MemoryPolicyDecision, + ignore_reason_code: Option<&str>, +) { + if !matches!(policy_decision, MemoryPolicyDecision::Ignore) { + return; + } + + if let UpdateDecision::Add { .. } = decision { + result.note_id = None; + } + + result.op = NoteOp::None; + result.reason_code = ignore_reason_code.map(str::to_string); +} + +fn validate_add_event_request(req: &AddEventRequest) -> Result<()> { + if req.messages.is_empty() { + return Err(Error::InvalidRequest { message: "Messages list is empty.".to_string() }); + } + if req.tenant_id.trim().is_empty() + || req.project_id.trim().is_empty() + || req.agent_id.trim().is_empty() + { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + if let Some(scope) = req.scope.as_ref() + && scope.trim().is_empty() + { + return Err(Error::InvalidRequest { + message: "scope must not be empty when provided.".to_string(), + }); + } + if let Some(profile) = req.ingestion_profile.as_ref() { + if profile.id.trim().is_empty() { + return Err(Error::InvalidRequest { + message: "ingestion_profile.id must not be empty.".to_string(), + }); + } + + if let Some(version) = profile.version + && version <= 0 + { + return Err(Error::InvalidRequest { + message: "ingestion_profile.version must be greater than zero.".to_string(), + }); + } + } + + for (idx, msg) in req.messages.iter().enumerate() { + if !english_gate::is_english_natural_language(msg.content.as_str()) { + return Err(Error::NonEnglishInput { field: format!("$.messages[{idx}].content") }); + } + } + + Ok(()) +} + +fn apply_write_policies_to_messages(messages: &[EventMessage]) -> Result<ProcessedEventOutput> { + let mut message_policy_applied = Vec::with_capacity(messages.len()); + let mut write_policy_audits = Vec::new(); + let mut transformed_messages = Vec::with_capacity(messages.len()); + + for message in messages { + let (transformed_message, audit) = apply_write_policy_to_message(message)?; + + message_policy_applied.push(audit.is_some()); + + if let Some(audit) = audit { + write_policy_audits.push(audit); + } + + transformed_messages.push(transformed_message); + } + + Ok(( + transformed_messages, + message_policy_applied, + if write_policy_audits.is_empty() { None } else { Some(write_policy_audits) }, + )) +} + +fn apply_write_policy_to_message( + message: &EventMessage, +) -> Result<(EventMessage, Option<WritePolicyAudit>)> { + let result = + writegate::apply_write_policy(message.content.as_str(), message.write_policy.as_ref()) + .map_err(|err| { + let message = match err { + WritePolicyError::InvalidSpan => "Invalid write_policy span provided.", + WritePolicyError::OverlappingOps => "Overlapping write_policy spans provided.", + }; + + Error::InvalidRequest { message: message.to_string() } + })?; + let has_policy = message.write_policy.is_some(); + let mut transformed = message.clone(); + + transformed.content = result.transformed; + + Ok((transformed, if has_policy { Some(result.audit) } else { None })) +} + +fn reject_extracted_note_if_evidence_invalid( + cfg: &Config, + reason: Option<&String>, + evidence: &[EvidenceQuote], + message_texts: &[String], + message_policy_applied: &[bool], +) -> Option<AddEventResult> { + if evidence.is_empty() + || evidence.len() < cfg.security.evidence_min_quotes as usize + || evidence.len() > cfg.security.evidence_max_quotes as usize + { + return Some(AddEventResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(REJECT_EVIDENCE_MISMATCH.to_string()), + reason: reason.cloned(), + field_path: None, + write_policy_audits: None, + }); + } + + for quote in evidence { + if quote.quote.len() > cfg.security.evidence_max_quote_chars as usize { + return Some(AddEventResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(REJECT_EVIDENCE_MISMATCH.to_string()), + reason: reason.cloned(), + field_path: None, + write_policy_audits: None, + }); + } + if !evidence::evidence_matches(message_texts, quote.message_index, quote.quote.as_str()) { + let reason_code = + message_policy_applied.get(quote.message_index).is_some_and(|applied| *applied); + + return Some(AddEventResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(if reason_code { + REJECT_WRITE_POLICY_MISMATCH.to_string() + } else { + REJECT_EVIDENCE_MISMATCH.to_string() + }), + reason: reason.cloned(), + field_path: None, + write_policy_audits: None, + }); + } + } + + None +} + +fn reject_extracted_note_if_structured_invalid( + structured: Option<&StructuredFields>, + text: &str, + evidence: &[EvidenceQuote], + reason: Option<&String>, +) -> Option<AddEventResult> { + let structured = structured?; + + if structured.is_effectively_empty() { + return None; + } + + let event_evidence: Vec<(usize, String)> = + evidence.iter().map(|q| (q.message_index, q.quote.clone())).collect(); + + if let Err(err) = structured_fields::validate_structured_fields( + structured, + text, + &serde_json::json!({}), + Some(event_evidence.as_slice()), + ) { + tracing::info!(error = %err, "Rejecting extracted note due to invalid structured fields."); + + let field_path = extract_structured_rejection_field_path(&err); + + return Some(AddEventResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(REJECT_STRUCTURED_INVALID.to_string()), + reason: reason.cloned(), + field_path, + write_policy_audits: None, + }); + } + + None +} - let user_prompt = format!( - "Return JSON matching this exact schema:\n{schema}\nConstraints:\n- MAX_NOTES = {max_notes}\n- MAX_NOTE_CHARS = {max_note_chars}\nHere are the messages as JSON:\n{messages_json}" - ); +fn reject_extracted_note_if_writegate_rejects( + cfg: &Config, + reason: Option<&String>, + note_type: &str, + scope: &str, + text: &str, +) -> Option<AddEventResult> { + let gate_input = NoteInput { + note_type: note_type.to_string(), + scope: scope.to_string(), + text: text.to_string(), + }; - Ok(vec![ - serde_json::json!({ "role": "system", "content": system_prompt }), - serde_json::json!({ "role": "user", "content": user_prompt }), - ]) + if let Err(code) = writegate::writegate(&gate_input, cfg) { + return Some(AddEventResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(crate::writegate_reason_code(code).to_string()), + reason: reason.cloned(), + field_path: None, + write_policy_audits: None, + }); + } + + None +} + +fn extract_structured_rejection_field_path(err: &Error) -> Option<String> { + match err { + Error::NonEnglishInput { field } => Some(field.clone()), + Error::InvalidRequest { message } if message.starts_with("structured.") => + message.split_whitespace().next().map(ToString::to_string), + _ => None, + } +} + +fn base_decision_for_update( + decision: &UpdateDecision, + structured_present: bool, + graph_present: bool, +) -> MemoryPolicyDecision { + match decision { + UpdateDecision::Update { .. } => MemoryPolicyDecision::Update, + UpdateDecision::Add { .. } => MemoryPolicyDecision::Remember, + UpdateDecision::None { .. } => + if structured_present || graph_present { + MemoryPolicyDecision::Update + } else { + MemoryPolicyDecision::Ignore + }, + } +} + +fn policy_rule_id(note_type: Option<&str>, scope: Option<&str>) -> Option<String> { + match (note_type, scope) { + (Some(note_type), Some(scope)) => Some(format!("note_type={note_type},scope={scope}")), + (Some(note_type), None) => Some(format!("note_type={note_type}")), + (None, Some(scope)) => Some(format!("scope={scope}")), + (None, None) => None, + } +} + +#[allow(clippy::too_many_arguments)] +async fn record_ingest_decision( + tx: &mut Transaction<'_, Postgres>, + cfg: &Config, + ctx: &AddEventContext<'_>, + note: &ExtractedNote, + note_type: &str, + note_id: Option<Uuid>, + note_version_id: Option<Uuid>, + base_decision: MemoryPolicyDecision, + policy_decision: MemoryPolicyDecision, + note_op: NoteOp, + reason_code: Option<&str>, + policy_rule: Option<&str>, + similarity_best: Option<f32>, + key_match: bool, + matched_dup: bool, + min_confidence: Option<f32>, + min_importance: Option<f32>, + ingestion_profile_id: Option<&str>, + ingestion_profile_version: Option<i32>, + structured_present: bool, + graph_present: bool, + write_policy_audits: Option<Vec<WritePolicyAudit>>, +) -> Result<()> { + let args = IngestAuditArgs { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: ctx.scope, + pipeline: "add_event", + note_type, + note_key: note.key.as_deref(), + note_id, + note_version_id, + base_decision, + policy_decision, + note_op, + reason_code, + similarity_best, + key_match, + matched_dup, + dup_sim_threshold: cfg.memory.dup_sim_threshold, + update_sim_threshold: cfg.memory.update_sim_threshold, + confidence: note.confidence.unwrap_or(0.0), + importance: note.importance.unwrap_or(0.0), + structured_present, + graph_present, + policy_rule, + min_confidence, + min_importance, + ingestion_profile_id, + ingestion_profile_version, + write_policy_audits, + ts: ctx.now, + }; + + ingest_audit::insert_ingest_decision(tx, args).await +} + +async fn update_memory_note_tx( + tx: &mut Transaction<'_, Postgres>, + memory_note: &MemoryNote, +) -> Result<()> { + sqlx::query( + "\ +UPDATE memory_notes +SET + text = $1, + importance = $2, + confidence = $3, + updated_at = $4, + expires_at = $5, + source_ref = $6 +WHERE note_id = $7", + ) + .bind(memory_note.text.as_str()) + .bind(memory_note.importance) + .bind(memory_note.confidence) + .bind(memory_note.updated_at) + .bind(memory_note.expires_at) + .bind(&memory_note.source_ref) + .bind(memory_note.note_id) + .execute(&mut **tx) + .await?; + + Ok(()) +} + +async fn insert_memory_note_tx( + tx: &mut Transaction<'_, Postgres>, + memory_note: &MemoryNote, +) -> Result<()> { + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(memory_note.note_id) + .bind(memory_note.tenant_id.as_str()) + .bind(memory_note.project_id.as_str()) + .bind(memory_note.agent_id.as_str()) + .bind(memory_note.scope.as_str()) + .bind(memory_note.r#type.as_str()) + .bind(memory_note.key.as_deref()) + .bind(memory_note.text.as_str()) + .bind(memory_note.importance) + .bind(memory_note.confidence) + .bind(memory_note.status.as_str()) + .bind(memory_note.created_at) + .bind(memory_note.updated_at) + .bind(memory_note.expires_at) + .bind(memory_note.embedding_version.as_str()) + .bind(&memory_note.source_ref) + .bind(memory_note.hit_count) + .bind(memory_note.last_hit_at) + .execute(&mut **tx) + .await?; + + Ok(()) +} + +async fn upsert_structured_fields_tx( + tx: &mut Transaction<'_, Postgres>, + structured: Option<&StructuredFields>, + note_id: Uuid, + now: OffsetDateTime, +) -> Result<()> { + if let Some(structured) = structured + && !structured.is_effectively_empty() + { + structured_fields::upsert_structured_fields_tx(tx, note_id, structured, now).await?; + } + + Ok(()) +} + +#[cfg(test)] +mod english_gate_tests { + use crate::{ + Error, + add_event::{self, AddEventRequest, EventMessage}, + }; + + #[test] + fn rejects_long_non_english_message_content() { + let req = AddEventRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: None, + dry_run: None, + ingestion_profile: None, + messages: vec![EventMessage { + role: "user".to_string(), + content: "Bonjour, je veux m'assurer que ce texte est suffisamment long et riche en lettres pour declencher la detection de langue. Merci beaucoup." + .to_string(), + ts: None, + msg_id: None, + write_policy: None, + }], + }; + let err = add_event::validate_add_event_request(&req) + .expect_err("Expected English gate rejection."); + + assert!(matches!( + err, + Error::NonEnglishInput { field } if field == "$.messages[0].content" + )); + } } diff --git a/packages/elf-service/src/add_note.rs b/packages/elf-service/src/add_note.rs index fa8ea353..4a67401c 100644 --- a/packages/elf-service/src/add_note.rs +++ b/packages/elf-service/src/add_note.rs @@ -1,312 +1,1113 @@ -// crates.io +//! Direct note ingestion APIs. + +use serde::{Deserialize, Serialize}; use serde_json::Value; -use time::OffsetDateTime; +use sqlx::{Postgres, Transaction}; +use time::{Duration, OffsetDateTime}; use uuid::Uuid; -// self -use elf_domain::{cjk, ttl, writegate}; -use elf_storage::models::MemoryNote; - use crate::{ - ElfService, InsertVersionArgs, NoteOp, ResolveUpdateArgs, ServiceError, ServiceResult, - UpdateDecision, + ElfService, Error, InsertVersionArgs, NoteOp, ResolveUpdateArgs, Result, UpdateDecision, + UpdateDecisionMetadata, + access::{self, ORG_PROJECT_ID}, + graph_ingestion, + ingest_audit::{self, IngestAuditArgs}, + structured_fields::{self, StructuredFields}, }; +use elf_config::Config; +use elf_domain::{ + english_gate, + memory_policy::{self, MemoryPolicyDecision}, + ttl, + writegate::{self, NoteInput, WritePolicy, WritePolicyAudit, WritePolicyError}, +}; +use elf_storage::models::MemoryNote; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +type AddNoteApplyOutput = (AddNoteResult, NoteOp, Option<Uuid>); + +const REJECT_STRUCTURED_INVALID: &str = "REJECT_STRUCTURED_INVALID"; +const IGNORE_DUPLICATE: &str = "IGNORE_DUPLICATE"; +const IGNORE_POLICY_THRESHOLD: &str = "IGNORE_POLICY_THRESHOLD"; + +/// Request payload for direct note ingestion. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddNoteRequest { + /// Tenant that owns the request. pub tenant_id: String, + /// Project that owns the request. pub project_id: String, + /// Agent that is writing the notes. pub agent_id: String, + /// Scope to apply to all notes in the batch. pub scope: String, + /// Notes to validate and persist. pub notes: Vec<AddNoteInput>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// One note supplied to `add_note`. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddNoteInput { - #[serde(rename = "type")] - pub note_type: String, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key for deduplication or lookup. pub key: Option<String>, + /// Note body text. pub text: String, + /// Optional structured extraction payload to persist alongside the note. + pub structured: Option<StructuredFields>, + /// Importance score for ranking and retention. pub importance: f32, + /// Confidence score for ranking and retention. pub confidence: f32, + /// Optional TTL override in days. pub ttl_days: Option<i64>, + #[serde(default = "default_source_ref")] + /// Structured source reference metadata. pub source_ref: Value, + /// Optional write policy applied before validation and persistence. + pub write_policy: Option<WritePolicy>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Per-note outcome for an `add_note` request. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddNoteResult { + /// Note identifier when one was created or updated. pub note_id: Option<Uuid>, + /// Persistence operation chosen for the note. pub op: NoteOp, + /// Memory-policy decision applied to the note. + pub policy_decision: MemoryPolicyDecision, + /// Machine-readable rejection or ignore code, if any. pub reason_code: Option<String>, + /// Field path associated with a validation failure, if any. + pub field_path: Option<String>, + /// Write-policy audit emitted for this note, if any. + pub write_policy_audit: Option<WritePolicyAudit>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for direct note ingestion. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct AddNoteResponse { + /// One result per requested note. pub results: Vec<AddNoteResult>, } +struct AddNoteContext<'a> { + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + scope: &'a str, + now: OffsetDateTime, + embed_version: &'a str, +} + impl ElfService { - pub async fn add_note(&self, req: AddNoteRequest) -> ServiceResult<AddNoteResponse> { - if req.notes.is_empty() { - return Err(ServiceError::InvalidRequest { - message: "Notes list is empty.".to_string(), - }); - } - if req.tenant_id.trim().is_empty() - || req.project_id.trim().is_empty() - || req.agent_id.trim().is_empty() - || req.scope.trim().is_empty() - { - return Err(ServiceError::InvalidRequest { - message: "tenant_id, project_id, agent_id, and scope are required.".to_string(), - }); - } + /// Validates and persists notes supplied directly by the caller. + pub async fn add_note(&self, req: AddNoteRequest) -> Result<AddNoteResponse> { + let req = normalize_add_note_request(req); - for (idx, note) in req.notes.iter().enumerate() { - if cjk::contains_cjk(¬e.text) { - return Err(ServiceError::NonEnglishInput { - field: format!("$.notes[{idx}].text"), - }); - } - if let Some(key) = ¬e.key - && cjk::contains_cjk(key) - { - return Err(ServiceError::NonEnglishInput { field: format!("$.notes[{idx}].key") }); - } - if let Some(path) = - find_cjk_path(¬e.source_ref, &format!("$.notes[{idx}].source_ref")) - { - return Err(ServiceError::NonEnglishInput { field: path }); - } - } + validate_add_note_request(&req)?; - let now = OffsetDateTime::now_utc(); + let base_now = OffsetDateTime::now_utc(); let embed_version = crate::embedding_version(&self.cfg); - let mut results = Vec::with_capacity(req.notes.len()); + let AddNoteRequest { tenant_id, project_id, agent_id, scope, notes } = req; + let effective_project_id = + if scope.trim() == "org_shared" { ORG_PROJECT_ID } else { project_id.as_str() }; + let mut results = Vec::with_capacity(notes.len()); - for note in req.notes { - let gate_input = writegate::NoteInput { - note_type: note.note_type.clone(), - scope: req.scope.clone(), - text: note.text.clone(), + for (note_idx, note) in notes.into_iter().enumerate() { + let now = base_now + Duration::microseconds(note_idx as i64); + let ctx = AddNoteContext { + tenant_id: tenant_id.as_str(), + project_id: effective_project_id, + agent_id: agent_id.as_str(), + scope: scope.as_str(), + now, + embed_version: embed_version.as_str(), }; - if let Err(code) = writegate::writegate(&gate_input, &self.cfg) { - results.push(AddNoteResult { - note_id: None, - op: NoteOp::Rejected, - reason_code: Some(crate::writegate_reason_code(code).to_string()), - }); - continue; - } - let mut tx = self.db.pool.begin().await?; - let decision = crate::resolve_update( + results.push(self.process_add_note_input(&ctx, note).await?); + } + + Ok(AddNoteResponse { results }) + } + + async fn process_add_note_input( + &self, + ctx: &AddNoteContext<'_>, + note: AddNoteInput, + ) -> Result<AddNoteResult> { + let mut note = note; + let (transformed, write_policy_audit) = + apply_write_policy_to_note(note.write_policy.as_ref(), note.text.as_str())?; + + note.text = transformed; + + let (structured_present, graph_present) = + Self::structured_and_graph_present(note.structured.as_ref()); + let mut tx = self.db.pool.begin().await?; + + if let Some(result) = + self.handle_rejection_paths(&mut tx, ctx, ¬e, write_policy_audit.as_ref()).await? + { + tx.commit().await?; + + return Ok(result); + } + + let (decision, metadata) = self.resolve_update_decision(&mut tx, ctx, ¬e).await?; + let base_decision = + Self::base_decision_for_update(&decision, structured_present, graph_present); + let (policy_decision, decision_policy_rule, min_confidence, min_importance) = + self.decide_policy_decision(ctx.scope, ¬e, base_decision); + let note_id = decision.note_id(); + let ignore_reason_code = + Self::ignore_reason_code(policy_decision, base_decision, metadata.matched_dup); + let (result, note_op, note_version_id) = self + .apply_policy_result( &mut tx, - ResolveUpdateArgs { - cfg: &self.cfg, - providers: &self.providers, - tenant_id: &req.tenant_id, - project_id: &req.project_id, - agent_id: &req.agent_id, - scope: &req.scope, - note_type: ¬e.note_type, - key: note.key.as_deref(), - text: ¬e.text, - now, - }, + &decision, + ctx, + ¬e, + note_id, + policy_decision, + ignore_reason_code, ) .await?; + let mut result = result; - match decision { - UpdateDecision::Add { note_id } => { - let expires_at = - ttl::compute_expires_at(note.ttl_days, ¬e.note_type, &self.cfg, now); - let memory_note = MemoryNote { - note_id, - tenant_id: req.tenant_id.clone(), - project_id: req.project_id.clone(), - agent_id: req.agent_id.clone(), - scope: req.scope.clone(), - r#type: note.note_type.clone(), - key: note.key.clone(), - text: note.text.clone(), - importance: note.importance, - confidence: note.confidence, - status: "active".to_string(), - created_at: now, - updated_at: now, - expires_at, - embedding_version: embed_version.clone(), - source_ref: note.source_ref.clone(), - hit_count: 0, - last_hit_at: None, - }; - - sqlx::query( - "INSERT INTO memory_notes \ - (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", - ) - .bind(memory_note.note_id) - .bind(&memory_note.tenant_id) - .bind(&memory_note.project_id) - .bind(&memory_note.agent_id) - .bind(&memory_note.scope) - .bind(&memory_note.r#type) - .bind(&memory_note.key) - .bind(&memory_note.text) - .bind(memory_note.importance) - .bind(memory_note.confidence) - .bind(&memory_note.status) - .bind(memory_note.created_at) - .bind(memory_note.updated_at) - .bind(memory_note.expires_at) - .bind(&memory_note.embedding_version) - .bind(&memory_note.source_ref) - .bind(memory_note.hit_count) - .bind(memory_note.last_hit_at) - .execute(&mut *tx) - .await?; - - crate::insert_version( - &mut tx, - InsertVersionArgs { - note_id: memory_note.note_id, - op: "ADD", - prev_snapshot: None, - new_snapshot: Some(crate::note_snapshot(&memory_note)), - reason: "add_note", - actor: "add_note", - ts: now, - }, - ) - .await?; - crate::enqueue_outbox_tx( - &mut tx, - memory_note.note_id, - "UPSERT", - &memory_note.embedding_version, - now, - ) - .await?; - tx.commit().await?; + result.write_policy_audit = write_policy_audit.clone(); - results.push(AddNoteResult { - note_id: Some(note_id), - op: NoteOp::Add, - reason_code: None, - }); - }, - UpdateDecision::Update { note_id } => { - let mut existing: MemoryNote = - sqlx::query_as("SELECT * FROM memory_notes WHERE note_id = $1 FOR UPDATE") - .bind(note_id) - .fetch_one(&mut *tx) - .await?; - let prev_snapshot = crate::note_snapshot(&existing); - - let requested_ttl = note.ttl_days.filter(|days| *days > 0); - let expires_at = match requested_ttl { - Some(ttl) => - ttl::compute_expires_at(Some(ttl), ¬e.note_type, &self.cfg, now), - None => existing.expires_at, - }; - - let expires_match = if let Some(ttl_days) = requested_ttl { - match existing.expires_at { - Some(existing_expires_at) => { - let existing_ttl = - (existing_expires_at - existing.updated_at).whole_days() as i64; - existing_ttl == ttl_days - }, - None => false, - } - } else { - existing.expires_at == expires_at - }; - let unchanged = existing.text == note.text - && (existing.importance - note.importance).abs() <= f32::EPSILON - && (existing.confidence - note.confidence).abs() <= f32::EPSILON - && expires_match && existing.source_ref == note.source_ref; - - if unchanged { - tx.commit().await?; - results.push(AddNoteResult { + self.record_ingest_decision( + &mut tx, + ctx, + ¬e, + result.note_id, + note_version_id, + base_decision, + result.policy_decision, + note_op, + result.reason_code.as_deref(), + decision_policy_rule.as_deref(), + metadata.similarity_best, + metadata.key_match, + metadata.matched_dup, + min_confidence, + min_importance, + write_policy_audit, + ) + .await?; + tx.commit().await?; + + Ok(result) + } + + fn structured_and_graph_present(structured: Option<&StructuredFields>) -> (bool, bool) { + let structured_present = structured.is_some_and(|s| !s.is_effectively_empty()); + let graph_present = structured.is_some_and(StructuredFields::has_graph_fields); + + (structured_present, graph_present) + } + + async fn handle_rejection_paths( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + write_policy_audit: Option<&WritePolicyAudit>, + ) -> Result<Option<AddNoteResult>> { + if let Some(result) = reject_note_if_structured_invalid(note) { + let mut result = result; + + result.write_policy_audit = write_policy_audit.cloned(); + + self.record_ingest_decision( + tx, + ctx, + note, + None, + None, + MemoryPolicyDecision::Reject, + MemoryPolicyDecision::Reject, + NoteOp::Rejected, + result.reason_code.as_deref(), + None, + None, + false, + false, + None, + None, + write_policy_audit.cloned(), + ) + .await?; + + return Ok(Some(result)); + } + if let Some(result) = reject_note_if_writegate_rejects(&self.cfg, ctx.scope, note) { + let mut result = result; + + result.write_policy_audit = write_policy_audit.cloned(); + + self.record_ingest_decision( + tx, + ctx, + note, + None, + None, + MemoryPolicyDecision::Reject, + MemoryPolicyDecision::Reject, + NoteOp::Rejected, + result.reason_code.as_deref(), + None, + None, + false, + false, + None, + None, + write_policy_audit.cloned(), + ) + .await?; + + return Ok(Some(result)); + } + + Ok(None) + } + + async fn resolve_update_decision( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + ) -> Result<(UpdateDecision, UpdateDecisionMetadata)> { + let decision = crate::resolve_update( + &mut **tx, + ResolveUpdateArgs { + cfg: &self.cfg, + providers: &self.providers, + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: ctx.scope, + note_type: note.r#type.as_str(), + key: note.key.as_deref(), + text: note.text.as_str(), + now: ctx.now, + }, + ) + .await?; + let metadata = decision.metadata(); + + Ok((decision, metadata)) + } + + fn decide_policy_decision( + &self, + scope: &str, + note: &AddNoteInput, + base_decision: MemoryPolicyDecision, + ) -> (MemoryPolicyDecision, Option<String>, Option<f32>, Option<f32>) { + if matches!(base_decision, MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update) { + let policy_eval = memory_policy::evaluate_memory_policy( + &self.cfg, + note.r#type.as_str(), + scope, + f64::from(note.confidence), + f64::from(note.importance), + base_decision, + ); + let decision_policy_rule = policy_eval.matched_rule.and_then(|rule| { + Self::policy_rule_id(rule.note_type.as_deref(), rule.scope.as_deref()) + }); + let min_confidence = policy_eval.matched_rule.and_then(|rule| rule.min_confidence); + let min_importance = policy_eval.matched_rule.and_then(|rule| rule.min_importance); + + (policy_eval.decision, decision_policy_rule, min_confidence, min_importance) + } else { + (MemoryPolicyDecision::Ignore, None, None, None) + } + } + + fn ignore_reason_code( + policy_decision: MemoryPolicyDecision, + base_decision: MemoryPolicyDecision, + matched_dup: bool, + ) -> Option<&'static str> { + if !matches!(policy_decision, MemoryPolicyDecision::Ignore) { + return None; + } + + match base_decision { + MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update => + Some(IGNORE_POLICY_THRESHOLD), + MemoryPolicyDecision::Ignore if matched_dup => Some(IGNORE_DUPLICATE), + _ => None, + } + } + + #[allow(clippy::too_many_arguments)] + async fn apply_policy_result( + &self, + tx: &mut Transaction<'_, Postgres>, + decision: &UpdateDecision, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + note_id: Uuid, + policy_decision: MemoryPolicyDecision, + ignore_reason_code: Option<&'static str>, + ) -> Result<AddNoteApplyOutput> { + let should_apply = matches!( + policy_decision, + MemoryPolicyDecision::Remember | MemoryPolicyDecision::Update + ); + + if should_apply { + let (result, note_version_id) = match decision { + UpdateDecision::Add { .. } => { + let note_version_id = self.handle_add_note_add(tx, ctx, note, note_id).await?; + + ( + AddNoteResult { note_id: Some(note_id), - op: NoteOp::None, + op: NoteOp::Add, + policy_decision, reason_code: None, - }); - continue; - } - - existing.text = note.text.clone(); - existing.importance = note.importance; - existing.confidence = note.confidence; - existing.updated_at = now; - existing.expires_at = expires_at; - existing.source_ref = note.source_ref.clone(); - - sqlx::query( - "UPDATE memory_notes SET text = $1, importance = $2, confidence = $3, updated_at = $4, expires_at = $5, source_ref = $6 WHERE note_id = $7", - ) - .bind(&existing.text) - .bind(existing.importance) - .bind(existing.confidence) - .bind(existing.updated_at) - .bind(existing.expires_at) - .bind(&existing.source_ref) - .bind(existing.note_id) - .execute(&mut *tx) - .await?; - - crate::insert_version( - &mut tx, - InsertVersionArgs { - note_id: existing.note_id, - op: "UPDATE", - prev_snapshot: Some(prev_snapshot), - new_snapshot: Some(crate::note_snapshot(&existing)), - reason: "add_note", - actor: "add_note", - ts: now, + field_path: None, + write_policy_audit: None, }, + Some(note_version_id), ) - .await?; - crate::enqueue_outbox_tx( - &mut tx, - existing.note_id, - "UPSERT", - &existing.embedding_version, - now, + }, + UpdateDecision::Update { .. } => + self.handle_add_note_update( + tx, + note, + note_id, + ctx.agent_id, + ctx.now, + policy_decision, ) + .await?, + UpdateDecision::None { .. } => { + let (mut none_result, note_version_id) = self + .handle_add_note_none( + tx, + ctx, + note, + note_id, + ctx.now, + ctx.embed_version, + policy_decision, + ) + .await?; + + none_result.policy_decision = policy_decision; + + (none_result, note_version_id) + }, + }; + let note_op = result.op; + + Ok((result, note_op, note_version_id)) + } else { + let mut result = AddNoteResult { + note_id: Some(note_id), + op: NoteOp::None, + policy_decision, + reason_code: ignore_reason_code.map(str::to_string), + field_path: None, + write_policy_audit: None, + }; + + match decision { + UpdateDecision::Add { .. } => { + result.note_id = None; + }, + UpdateDecision::Update { .. } | UpdateDecision::None { .. } => {}, + } + + Ok((result, NoteOp::None, None)) + } + } + + #[allow(clippy::too_many_arguments)] + async fn record_ingest_decision( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + note_id: Option<Uuid>, + note_version_id: Option<Uuid>, + base_decision: MemoryPolicyDecision, + policy_decision: MemoryPolicyDecision, + note_op: NoteOp, + reason_code: Option<&str>, + policy_rule: Option<&str>, + similarity_best: Option<f32>, + key_match: bool, + matched_dup: bool, + min_confidence: Option<f32>, + min_importance: Option<f32>, + write_policy_audit: Option<WritePolicyAudit>, + ) -> Result<()> { + let decision = IngestAuditArgs { + tenant_id: ctx.tenant_id, + project_id: ctx.project_id, + agent_id: ctx.agent_id, + scope: ctx.scope, + pipeline: "add_note", + note_type: note.r#type.as_str(), + note_key: note.key.as_deref(), + note_id, + note_version_id, + base_decision, + policy_decision, + note_op, + reason_code, + similarity_best, + key_match, + matched_dup, + dup_sim_threshold: self.cfg.memory.dup_sim_threshold, + update_sim_threshold: self.cfg.memory.update_sim_threshold, + confidence: note.confidence, + importance: note.importance, + structured_present: note.structured.as_ref().is_some_and(|s| !s.is_effectively_empty()), + graph_present: note.structured.as_ref().is_some_and(StructuredFields::has_graph_fields), + policy_rule, + min_confidence, + min_importance, + write_policy_audits: write_policy_audit.map(|audit| vec![audit]), + ingestion_profile_id: None, + ingestion_profile_version: None, + ts: ctx.now, + }; + + ingest_audit::insert_ingest_decision(tx, decision).await + } + + fn base_decision_for_update( + decision: &UpdateDecision, + structured_present: bool, + graph_present: bool, + ) -> MemoryPolicyDecision { + match decision { + UpdateDecision::Update { .. } => MemoryPolicyDecision::Update, + UpdateDecision::Add { .. } => MemoryPolicyDecision::Remember, + UpdateDecision::None { .. } => + if structured_present || graph_present { + MemoryPolicyDecision::Update + } else { + MemoryPolicyDecision::Ignore + }, + } + } + + fn policy_rule_id(note_type: Option<&str>, scope: Option<&str>) -> Option<String> { + match (note_type, scope) { + (Some(note_type), Some(scope)) => Some(format!("note_type={note_type},scope={scope}")), + (Some(note_type), None) => Some(format!("note_type={note_type}")), + (None, Some(scope)) => Some(format!("scope={scope}")), + (None, None) => None, + } + } + + async fn handle_add_note_add( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + note_id: Uuid, + ) -> Result<Uuid> { + access::ensure_active_project_scope_grant( + &mut **tx, + ctx.tenant_id, + ctx.project_id, + ctx.scope, + ctx.agent_id, + ) + .await?; + + let expires_at = + ttl::compute_expires_at(note.ttl_days, note.r#type.as_str(), &self.cfg, ctx.now); + let memory_note = MemoryNote { + note_id, + tenant_id: ctx.tenant_id.to_string(), + project_id: ctx.project_id.to_string(), + agent_id: ctx.agent_id.to_string(), + scope: ctx.scope.to_string(), + r#type: note.r#type.clone(), + key: note.key.clone(), + text: note.text.clone(), + importance: note.importance, + confidence: note.confidence, + status: "active".to_string(), + created_at: ctx.now, + updated_at: ctx.now, + expires_at, + embedding_version: ctx.embed_version.to_string(), + source_ref: note.source_ref.clone(), + hit_count: 0, + last_hit_at: None, + }; + + insert_memory_note_tx(tx, &memory_note).await?; + + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id: memory_note.note_id, + op: "ADD", + prev_snapshot: None, + new_snapshot: Some(crate::note_snapshot(&memory_note)), + reason: "add_note", + actor: ctx.agent_id, + ts: ctx.now, + }, + ) + .await?; + + self.upsert_structured_and_enqueue_outbox( + tx, + note, + memory_note.note_id, + ctx.embed_version, + ctx.now, + ) + .await?; + self.persist_graph_fields_if_present( + tx, + ctx.tenant_id, + ctx.project_id, + ctx.agent_id, + ctx.scope, + memory_note.note_id, + ctx.now, + note.structured.as_ref(), + ) + .await?; + + Ok(note_version_id) + } + + async fn handle_add_note_update( + &self, + tx: &mut Transaction<'_, Postgres>, + note: &AddNoteInput, + note_id: Uuid, + agent_id: &str, + now: OffsetDateTime, + policy_decision: MemoryPolicyDecision, + ) -> Result<(AddNoteResult, Option<Uuid>)> { + let mut existing: MemoryNote = sqlx::query_as::<_, MemoryNote>( + "SELECT * FROM memory_notes WHERE note_id = $1 FOR UPDATE", + ) + .bind(note_id) + .fetch_one(&mut **tx) + .await?; + let prev_snapshot = crate::note_snapshot(&existing); + let requested_ttl = note.ttl_days.filter(|days| *days > 0); + let expires_at = match requested_ttl { + Some(ttl) => ttl::compute_expires_at(Some(ttl), note.r#type.as_str(), &self.cfg, now), + None => existing.expires_at, + }; + let expires_match = requested_ttl.map_or(existing.expires_at == expires_at, |ttl_days| { + match existing.expires_at { + Some(existing_expires_at) => { + let existing_ttl = + (existing_expires_at - existing.updated_at).whole_days() as i64; + + existing_ttl == ttl_days + }, + None => false, + } + }); + let float_eps = 1e-6_f32; + let unchanged = existing.text == note.text + && (existing.importance - note.importance).abs() <= float_eps + && (existing.confidence - note.confidence).abs() <= float_eps + && expires_match + && existing.source_ref == note.source_ref; + + if unchanged { + return Ok(( + AddNoteResult { + note_id: Some(note_id), + op: NoteOp::None, + policy_decision: MemoryPolicyDecision::Ignore, + reason_code: None, + field_path: None, + write_policy_audit: None, + }, + None, + )); + } + + access::ensure_active_project_scope_grant( + &mut **tx, + existing.tenant_id.as_str(), + existing.project_id.as_str(), + existing.scope.as_str(), + existing.agent_id.as_str(), + ) + .await?; + + existing.text = note.text.clone(); + existing.importance = note.importance; + existing.confidence = note.confidence; + existing.updated_at = now; + existing.expires_at = expires_at; + existing.source_ref = note.source_ref.clone(); + + update_memory_note_tx(tx, &existing).await?; + + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id: existing.note_id, + op: "UPDATE", + prev_snapshot: Some(prev_snapshot), + new_snapshot: Some(crate::note_snapshot(&existing)), + reason: "add_note", + actor: agent_id, + ts: now, + }, + ) + .await?; + + self.persist_graph_fields_if_present( + tx, + existing.tenant_id.as_str(), + existing.project_id.as_str(), + existing.agent_id.as_str(), + existing.scope.as_str(), + existing.note_id, + now, + note.structured.as_ref(), + ) + .await?; + self.upsert_structured_and_enqueue_outbox( + tx, + note, + existing.note_id, + existing.embedding_version.as_str(), + now, + ) + .await?; + + Ok(( + AddNoteResult { + note_id: Some(note_id), + op: NoteOp::Update, + policy_decision, + reason_code: None, + field_path: None, + write_policy_audit: None, + }, + Some(note_version_id), + )) + } + + #[allow(clippy::too_many_arguments)] + async fn handle_add_note_none( + &self, + tx: &mut Transaction<'_, Postgres>, + ctx: &AddNoteContext<'_>, + note: &AddNoteInput, + note_id: Uuid, + now: OffsetDateTime, + embed_version: &str, + policy_decision: MemoryPolicyDecision, + ) -> Result<(AddNoteResult, Option<Uuid>)> { + let mut should_update = false; + + if let Some(structured) = note.structured.as_ref() { + if !structured.is_effectively_empty() { + structured_fields::upsert_structured_fields_tx(tx, note_id, structured, now) .await?; - tx.commit().await?; + crate::enqueue_outbox_tx(&mut **tx, note_id, "UPSERT", embed_version, now).await?; - results.push(AddNoteResult { - note_id: Some(note_id), - op: NoteOp::Update, - reason_code: None, - }); + should_update = true; + } + if structured.has_graph_fields() { + self.persist_graph_fields_if_present( + tx, + ctx.tenant_id, + ctx.project_id, + ctx.agent_id, + ctx.scope, + note_id, + now, + Some(structured), + ) + .await?; + + should_update = true; + } + } + + if should_update { + let note_row: MemoryNote = + sqlx::query_as("SELECT * FROM memory_notes WHERE note_id = $1") + .bind(note_id) + .fetch_one(&mut **tx) + .await?; + let snapshot = crate::note_snapshot(¬e_row); + let note_version_id = crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id, + op: "UPDATE", + prev_snapshot: Some(snapshot.clone()), + new_snapshot: Some(snapshot), + reason: "add_note_structured", + actor: ctx.agent_id, + ts: now, }, - UpdateDecision::None { note_id } => { - tx.commit().await?; - results.push(AddNoteResult { - note_id: Some(note_id), - op: NoteOp::None, - reason_code: None, - }); + ) + .await?; + + if matches!(ctx.scope, "project_shared" | "org_shared") { + access::ensure_active_project_scope_grant( + &mut **tx, + ctx.tenant_id, + ctx.project_id, + ctx.scope, + ctx.agent_id, + ) + .await?; + } + + return Ok(( + AddNoteResult { + note_id: Some(note_id), + op: NoteOp::Update, + policy_decision, + reason_code: None, + field_path: None, + write_policy_audit: None, }, + Some(note_version_id), + )); + } + + Ok(( + AddNoteResult { + note_id: Some(note_id), + op: NoteOp::None, + policy_decision, + reason_code: None, + field_path: None, + write_policy_audit: None, + }, + None, + )) + } + + #[allow(clippy::too_many_arguments)] + async fn persist_graph_fields_if_present( + &self, + tx: &mut Transaction<'_, Postgres>, + tenant_id: &str, + project_id: &str, + agent_id: &str, + scope: &str, + note_id: Uuid, + now: OffsetDateTime, + structured: Option<&StructuredFields>, + ) -> Result<()> { + let Some(structured) = structured else { + return Ok(()); + }; + + if !structured.has_graph_fields() { + return Ok(()); + } + + graph_ingestion::persist_graph_fields_tx( + tx, tenant_id, project_id, agent_id, scope, note_id, structured, now, + ) + .await?; + + Ok(()) + } + + async fn upsert_structured_and_enqueue_outbox( + &self, + tx: &mut Transaction<'_, Postgres>, + note: &AddNoteInput, + note_id: Uuid, + embed_version: &str, + now: OffsetDateTime, + ) -> Result<()> { + if let Some(structured) = note.structured.as_ref() + && !structured.is_effectively_empty() + { + structured_fields::upsert_structured_fields_tx(tx, note_id, structured, now).await?; + } + + crate::enqueue_outbox_tx(&mut **tx, note_id, "UPSERT", embed_version, now).await?; + + Ok(()) + } +} + +fn default_source_ref() -> Value { + Value::Object(Default::default()) +} + +fn normalize_add_note_request(mut req: AddNoteRequest) -> AddNoteRequest { + for note in &mut req.notes { + if note.source_ref.is_null() { + note.source_ref = default_source_ref(); + } + } + + req +} + +fn validate_add_note_request(req: &AddNoteRequest) -> Result<()> { + if req.notes.is_empty() { + return Err(Error::InvalidRequest { message: "Notes list is empty.".to_string() }); + } + if req.tenant_id.trim().is_empty() + || req.project_id.trim().is_empty() + || req.agent_id.trim().is_empty() + || req.scope.trim().is_empty() + { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, agent_id, and scope are required.".to_string(), + }); + } + + for (idx, note) in req.notes.iter().enumerate() { + if !note.source_ref.is_object() { + return Err(Error::InvalidRequest { + message: "source_ref must be a JSON object.".to_string(), + }); + } + if !english_gate::is_english_natural_language(note.text.as_str()) { + return Err(Error::NonEnglishInput { field: format!("$.notes[{idx}].text") }); + } + + if let Some(key) = note.key.as_ref() + && !english_gate::is_english_identifier(key) + { + return Err(Error::NonEnglishInput { field: format!("$.notes[{idx}].key") }); + } + if let Some(path) = find_non_english_path_in_structured( + note.structured.as_ref(), + &format!("$.notes[{idx}].structured"), + ) { + return Err(Error::NonEnglishInput { field: path }); + } + if let Some(path) = + find_non_english_path(¬e.source_ref, &format!("$.notes[{idx}].source_ref")) + { + return Err(Error::NonEnglishInput { field: path }); + } + } + + Ok(()) +} + +fn reject_note_if_structured_invalid(note: &AddNoteInput) -> Option<AddNoteResult> { + if let Some(structured) = note.structured.as_ref() + && let Err(err) = structured_fields::validate_structured_fields( + structured, + note.text.as_str(), + ¬e.source_ref, + None, + ) { + tracing::info!(error = %err, "Rejecting note due to invalid structured fields."); + + let field_path = extract_structured_rejection_field_path(&err); + + return Some(AddNoteResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(REJECT_STRUCTURED_INVALID.to_string()), + field_path, + write_policy_audit: None, + }); + } + + None +} + +fn reject_note_if_writegate_rejects( + cfg: &Config, + scope: &str, + note: &AddNoteInput, +) -> Option<AddNoteResult> { + let gate_input = NoteInput { + note_type: note.r#type.clone(), + scope: scope.to_string(), + text: note.text.clone(), + }; + + if let Err(code) = writegate::writegate(&gate_input, cfg) { + return Some(AddNoteResult { + note_id: None, + op: NoteOp::Rejected, + policy_decision: MemoryPolicyDecision::Reject, + reason_code: Some(crate::writegate_reason_code(code).to_string()), + field_path: None, + write_policy_audit: None, + }); + } + + None +} + +fn apply_write_policy_to_note( + policy: Option<&WritePolicy>, + text: &str, +) -> Result<(String, Option<WritePolicyAudit>)> { + let result = writegate::apply_write_policy(text, policy).map_err(|err| { + let message = match err { + WritePolicyError::InvalidSpan => "Invalid write_policy span provided.", + WritePolicyError::OverlappingOps => "Overlapping write_policy spans provided.", + }; + + Error::InvalidRequest { message: message.to_string() } + })?; + + Ok((result.transformed, policy.is_some().then_some(result.audit))) +} + +fn find_non_english_path_in_structured( + structured: Option<&StructuredFields>, + base: &str, +) -> Option<String> { + let structured = structured?; + + if let Some(summary) = structured.summary.as_ref() + && !english_gate::is_english_natural_language(summary) + { + return Some(format!("{base}.summary")); + } + if let Some(items) = structured.facts.as_ref() { + for (idx, item) in items.iter().enumerate() { + if !english_gate::is_english_natural_language(item) { + return Some(format!("{base}.facts[{idx}]")); + } + } + } + if let Some(items) = structured.concepts.as_ref() { + for (idx, item) in items.iter().enumerate() { + if !english_gate::is_english_natural_language(item) { + return Some(format!("{base}.concepts[{idx}]")); } } + } + if let Some(items) = structured.entities.as_ref() { + for (idx, entity) in items.iter().enumerate() { + let base = format!("{base}.entities[{idx}]"); - Ok(AddNoteResponse { results }) + if let Some(canonical) = entity.canonical.as_ref() + && !english_gate::is_english_natural_language(canonical) + { + return Some(format!("{base}.canonical")); + } + if let Some(kind) = entity.kind.as_ref() + && !english_gate::is_english_natural_language(kind) + { + return Some(format!("{base}.kind")); + } + if let Some(aliases) = entity.aliases.as_ref() { + for (alias_idx, alias) in aliases.iter().enumerate() { + if !english_gate::is_english_natural_language(alias) { + return Some(format!("{base}.aliases[{alias_idx}]")); + } + } + } + } + } + if let Some(items) = structured.relations.as_ref() { + for (idx, relation) in items.iter().enumerate() { + let base = format!("{base}.relations[{idx}]"); + + if let Some(subject) = relation.subject.as_ref() { + let subject_base = format!("{base}.subject"); + + if let Some(canonical) = subject.canonical.as_ref() + && !english_gate::is_english_natural_language(canonical) + { + return Some(format!("{subject_base}.canonical")); + } + if let Some(kind) = subject.kind.as_ref() + && !english_gate::is_english_natural_language(kind) + { + return Some(format!("{subject_base}.kind")); + } + if let Some(aliases) = subject.aliases.as_ref() { + for (alias_idx, alias) in aliases.iter().enumerate() { + if !english_gate::is_english_natural_language(alias) { + return Some(format!("{subject_base}.aliases[{alias_idx}]")); + } + } + } + } + if let Some(predicate) = relation.predicate.as_ref() + && !english_gate::is_english_natural_language(predicate) + { + return Some(format!("{base}.predicate")); + } + if let Some(object) = relation.object.as_ref() { + if let Some(entity) = object.entity.as_ref() { + let object_base = format!("{base}.object.entity"); + + if let Some(canonical) = entity.canonical.as_ref() + && !english_gate::is_english_natural_language(canonical) + { + return Some(format!("{object_base}.canonical")); + } + if let Some(kind) = entity.kind.as_ref() + && !english_gate::is_english_natural_language(kind) + { + return Some(format!("{object_base}.kind")); + } + if let Some(aliases) = entity.aliases.as_ref() { + for (alias_idx, alias) in aliases.iter().enumerate() { + if !english_gate::is_english_natural_language(alias) { + return Some(format!("{object_base}.aliases[{alias_idx}]")); + } + } + } + } + if let Some(value) = object.value.as_ref() + && !english_gate::is_english_natural_language(value) + { + return Some(format!("{base}.object.value")); + } + } + } } + + None } -fn find_cjk_path(value: &Value, path: &str) -> Option<String> { +fn find_non_english_path(value: &Value, path: &str) -> Option<String> { + find_non_english_path_inner(value, path, true) +} + +fn find_non_english_path_inner( + value: &Value, + path: &str, + is_identifier_lane: bool, +) -> Option<String> { + fn has_english_gate(text: &str, is_identifier_lane: bool) -> bool { + if is_identifier_lane { + return english_gate::is_english_identifier(text); + } + + english_gate::is_english_natural_language(text) + } + match value { Value::String(text) => - if cjk::contains_cjk(text) { + if !has_english_gate(text, is_identifier_lane) { Some(path.to_string()) } else { None @@ -314,19 +1115,29 @@ fn find_cjk_path(value: &Value, path: &str) -> Option<String> { Value::Array(items) => { for (idx, item) in items.iter().enumerate() { let child_path = format!("{path}[{idx}]"); - if let Some(found) = find_cjk_path(item, &child_path) { + + if let Some(found) = + find_non_english_path_inner(item, &child_path, is_identifier_lane) + { return Some(found); } } + None }, Value::Object(map) => { for (key, value) in map.iter() { + let identifier_lane = is_identifier_lane + || matches!(key.as_str(), "ref" | "schema" | "resolver" | "hashes" | "state"); let child_path = format!("{path}[\"{}\"]", escape_json_path_key(key)); - if let Some(found) = find_cjk_path(value, &child_path) { + + if let Some(found) = + find_non_english_path_inner(value, &child_path, identifier_lane) + { return Some(found); } } + None }, _ => None, @@ -336,3 +1147,285 @@ fn find_cjk_path(value: &Value, path: &str) -> Option<String> { fn escape_json_path_key(key: &str) -> String { key.replace('\\', "\\\\").replace('"', "\\\"") } + +fn extract_structured_rejection_field_path(err: &Error) -> Option<String> { + match err { + Error::NonEnglishInput { field } => Some(field.clone()), + Error::InvalidRequest { message } if message.starts_with("structured.") => + message.split_whitespace().next().map(ToString::to_string), + _ => None, + } +} + +async fn insert_memory_note_tx( + tx: &mut Transaction<'_, Postgres>, + memory_note: &MemoryNote, +) -> Result<()> { + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(memory_note.note_id) + .bind(memory_note.tenant_id.as_str()) + .bind(memory_note.project_id.as_str()) + .bind(memory_note.agent_id.as_str()) + .bind(memory_note.scope.as_str()) + .bind(memory_note.r#type.as_str()) + .bind(memory_note.key.as_deref()) + .bind(memory_note.text.as_str()) + .bind(memory_note.importance) + .bind(memory_note.confidence) + .bind(memory_note.status.as_str()) + .bind(memory_note.created_at) + .bind(memory_note.updated_at) + .bind(memory_note.expires_at) + .bind(memory_note.embedding_version.as_str()) + .bind(&memory_note.source_ref) + .bind(memory_note.hit_count) + .bind(memory_note.last_hit_at) + .execute(&mut **tx) + .await?; + + Ok(()) +} + +async fn update_memory_note_tx( + tx: &mut Transaction<'_, Postgres>, + memory_note: &MemoryNote, +) -> Result<()> { + sqlx::query( + "\ +UPDATE memory_notes +SET + text = $1, + importance = $2, + confidence = $3, + updated_at = $4, + expires_at = $5, + source_ref = $6 +WHERE note_id = $7", + ) + .bind(memory_note.text.as_str()) + .bind(memory_note.importance) + .bind(memory_note.confidence) + .bind(memory_note.updated_at) + .bind(memory_note.expires_at) + .bind(&memory_note.source_ref) + .bind(memory_note.note_id) + .execute(&mut **tx) + .await?; + + Ok(()) +} + +#[cfg(test)] +mod english_gate_tests { + use serde_json; + + use crate::{ + Error, + add_note::{self, AddNoteInput, AddNoteRequest}, + }; + + #[test] + fn accepts_identifier_like_source_ref_ref_field() { + add_note::validate_add_note_request(&AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("test_key".to_string()), + text: "English text".to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({"ref": "packages/elf-service/src/docs.rs:661"}), + write_policy: None, + }], + }) + .expect("Expected identifier-like source_ref to be accepted."); + } + + #[test] + fn rejects_non_english_source_ref_hints_quote() { + let req = AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("test_key".to_string()), + text: "English text".to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({"hints": {"quote": "\u{4f60}\u{597d}\u{4e16}\u{754c}"}}), + write_policy: None, + }], + }; + let err = add_note::validate_add_note_request(&req).expect_err( + "Expected non-English free-text under source_ref.hints.quote to be rejected.", + ); + + match err { + Error::NonEnglishInput { field } => { + assert_eq!(field, "$.notes[0].source_ref[\"hints\"][\"quote\"]") + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn rejects_long_non_english_note_text() { + let req = AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("test_key".to_string()), + text: "Bonjour, je veux m'assurer que ce texte est suffisamment long et riche en lettres pour declencher la detection de langue. Merci beaucoup." + .to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }], + }; + let err = add_note::validate_add_note_request(&req) + .expect_err("Expected English gate rejection."); + + assert!(matches!( + err, + Error::NonEnglishInput { field } if field == "$.notes[0].text" + )); + } + + #[test] + fn accepts_missing_source_ref_and_defaults_to_empty_object() { + let req: AddNoteRequest = serde_json::from_value(serde_json::json!({ + "tenant_id": "t", + "project_id": "p", + "agent_id": "a", + "scope": "agent_private", + "notes": [ + { + "type": "fact", + "text": "English text.", + "importance": 0.5, + "confidence": 0.9 + } + ] + })) + .expect("Expected request to deserialize with default source_ref."); + + assert_eq!(req.notes[0].source_ref, serde_json::json!({})); + + add_note::validate_add_note_request(&req) + .expect("Expected missing source_ref to be accepted."); + } + + #[test] + fn accepts_null_source_ref_and_normalizes_to_empty_object() { + let req = AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("test_key".to_string()), + text: "English text.".to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!(null), + write_policy: None, + }], + }; + let req = super::normalize_add_note_request(req); + + assert_eq!(req.notes[0].source_ref, serde_json::json!({})); + + add_note::validate_add_note_request(&req) + .expect("Expected null source_ref to be accepted."); + } + + #[test] + fn rejects_non_object_source_ref() { + let req = AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("test_key".to_string()), + text: "English text.".to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!("legacy-shape"), + write_policy: None, + }], + }; + let err = add_note::validate_add_note_request(&req) + .expect_err("Expected non-object source_ref rejection."); + + match err { + Error::InvalidRequest { message } => { + assert_eq!(message, "source_ref must be a JSON object."); + }, + other => panic!("Expected InvalidRequest for non-object source_ref, got {other:?}"), + } + } +} diff --git a/packages/elf-service/src/admin.rs b/packages/elf-service/src/admin.rs index 0a6ea4c1..8b3d976a 100644 --- a/packages/elf-service/src/admin.rs +++ b/packages/elf-service/src/admin.rs @@ -1,43 +1,48 @@ -// std +//! Administrative maintenance APIs. + use std::collections::HashMap; -// crates.io use qdrant_client::{ - client::Payload, + Payload, qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, }; +use serde::{Deserialize, Serialize}; use serde_json::Value; -use time::OffsetDateTime; +use sqlx::FromRow; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use uuid::Uuid; -// self -use crate::{ElfService, ServiceError, ServiceResult}; +use crate::{ElfService, Error, Result}; use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Summary of one Qdrant rebuild run. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct RebuildReport { + /// Number of chunks successfully rebuilt into Qdrant. pub rebuilt_count: u64, + /// Number of chunks skipped because no embedding vector was present. pub missing_vector_count: u64, + /// Number of chunks skipped because rebuild failed. pub error_count: u64, } -#[derive(sqlx::FromRow)] +#[derive(FromRow)] struct RebuildRow { - chunk_id: uuid::Uuid, + chunk_id: Uuid, chunk_index: i32, start_offset: i32, end_offset: i32, chunk_text: String, - note_id: uuid::Uuid, + note_id: Uuid, tenant_id: String, project_id: String, agent_id: String, scope: String, - #[sqlx(rename = "type")] - note_type: String, + r#type: String, key: Option<String>, status: String, - updated_at: time::OffsetDateTime, - expires_at: Option<time::OffsetDateTime>, + updated_at: OffsetDateTime, + expires_at: Option<OffsetDateTime>, importance: f32, confidence: f32, embedding_version: String, @@ -45,45 +50,67 @@ struct RebuildRow { } impl ElfService { - pub async fn rebuild_qdrant(&self) -> ServiceResult<RebuildReport> { + /// Rebuilds Qdrant note points from persisted Postgres chunks and embeddings. + pub async fn rebuild_qdrant(&self) -> Result<RebuildReport> { let now = OffsetDateTime::now_utc(); - let rows: Vec<RebuildRow> = sqlx::query_as( - "SELECT c.chunk_id, c.chunk_index, c.start_offset, c.end_offset, c.text AS chunk_text, \ - n.note_id, n.tenant_id, n.project_id, n.agent_id, n.scope, n.type, n.key, n.status, \ - n.updated_at, n.expires_at, n.importance, n.confidence, c.embedding_version, \ - e.vec::text AS vec_text \ - FROM memory_note_chunks c \ - JOIN memory_notes n ON n.note_id = c.note_id \ - LEFT JOIN note_chunk_embeddings e \ - ON e.chunk_id = c.chunk_id AND e.embedding_version = c.embedding_version \ - WHERE n.status = 'active' AND (n.expires_at IS NULL OR n.expires_at > $1)", + let rows: Vec<RebuildRow> = sqlx::query_as::<_, RebuildRow>( + "\ +SELECT + c.chunk_id, + c.chunk_index, + c.start_offset, + c.end_offset, + c.text AS chunk_text, + n.note_id, + n.tenant_id, + n.project_id, + n.agent_id, + n.scope, + n.type AS \"type\", + n.key, + n.status, + n.updated_at, + n.expires_at, + n.importance, + n.confidence, + c.embedding_version, + e.vec::text AS vec_text +FROM memory_note_chunks c +JOIN memory_notes n ON n.note_id = c.note_id +LEFT JOIN note_chunk_embeddings e + ON e.chunk_id = c.chunk_id AND e.embedding_version = c.embedding_version + WHERE n.status = 'active' AND (n.expires_at IS NULL OR n.expires_at > $1)", ) .bind(now) .fetch_all(&self.db.pool) .await?; - - let mut rebuilt_count = 0u64; - let mut missing_vector_count = 0u64; - let mut error_count = 0u64; + let mut rebuilt_count = 0_u64; + let mut missing_vector_count = 0_u64; + let mut error_count = 0_u64; for row in rows { let Some(vec_text) = row.vec_text else { missing_vector_count += 1; + continue; }; let vec = match crate::parse_pg_vector(&vec_text) { Ok(vec) => vec, Err(_) => { error_count += 1; + continue; }, }; + if vec.len() != self.cfg.storage.qdrant.vector_dim as usize { error_count += 1; + continue; } let mut payload = Payload::new(); + payload.insert("note_id", row.note_id.to_string()); payload.insert("chunk_id", row.chunk_id.to_string()); payload.insert("chunk_index", Value::from(row.chunk_index)); @@ -93,25 +120,29 @@ impl ElfService { payload.insert("project_id", row.project_id); payload.insert("agent_id", row.agent_id); payload.insert("scope", row.scope); - payload.insert("type", row.note_type); + payload.insert("type", row.r#type); payload.insert("key", row.key.map(Value::String).unwrap_or(Value::Null)); payload.insert("status", row.status); payload.insert("updated_at", Value::String(format_timestamp(row.updated_at)?)); + let expires_value = match row.expires_at { Some(ts) => Value::String(format_timestamp(ts)?), None => Value::Null, }; + payload.insert("expires_at", expires_value); payload.insert("importance", Value::from(row.importance as f64)); payload.insert("confidence", Value::from(row.confidence as f64)); payload.insert("embedding_version", row.embedding_version.clone()); let mut vectors = HashMap::new(); + vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec)); vectors.insert( BM25_VECTOR_NAME.to_string(), Vector::from(Document::new(row.chunk_text, BM25_MODEL)), ); + let point = PointStruct::new(row.chunk_id.to_string(), vectors, payload); let result = self .qdrant @@ -124,6 +155,7 @@ impl ElfService { if result.is_err() { error_count += 1; + continue; } @@ -134,9 +166,7 @@ impl ElfService { } } -fn format_timestamp(ts: OffsetDateTime) -> ServiceResult<String> { - use time::format_description::well_known::Rfc3339; - ts.format(&Rfc3339).map_err(|_| ServiceError::InvalidRequest { - message: "Failed to format timestamp.".to_string(), - }) +fn format_timestamp(ts: OffsetDateTime) -> Result<String> { + ts.format(&Rfc3339) + .map_err(|_| Error::InvalidRequest { message: "Failed to format timestamp.".to_string() }) } diff --git a/packages/elf-service/src/admin_graph_predicates.rs b/packages/elf-service/src/admin_graph_predicates.rs new file mode 100644 index 00000000..b451c571 --- /dev/null +++ b/packages/elf-service/src/admin_graph_predicates.rs @@ -0,0 +1,495 @@ +//! Administrative graph-predicate APIs. + +use serde::Serialize; +use sqlx::PgConnection; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ElfService, Result}; +use elf_config::SecurityAuthRole; +use elf_storage::{ + graph, + models::{GraphPredicate, GraphPredicateAlias}, +}; + +const GRAPH_PREDICATE_SCOPE_GLOBAL: &str = "__global__"; +const GRAPH_PREDICATE_SCOPE_PROJECT_PREFIX: &str = "__project__:"; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum AdminGraphPredicateScope { + TenantProject, + Project, + Global, + All, +} +impl AdminGraphPredicateScope { + fn parse(raw: &str) -> Option<Self> { + match raw.trim() { + "tenant_project" => Some(Self::TenantProject), + "project" => Some(Self::Project), + "global" => Some(Self::Global), + "all" => Some(Self::All), + _ => None, + } + } +} + +/// Request payload for listing graph predicates visible in admin scope. +#[derive(Clone, Debug)] +pub struct AdminGraphPredicatesListRequest { + /// Tenant to query within. + pub tenant_id: String, + /// Project to query within. + pub project_id: String, + /// Agent requesting the list. + pub agent_id: String, + /// Optional admin scope filter. + pub scope: Option<String>, +} + +/// Request payload for patching a graph predicate. +#[derive(Clone, Debug)] +pub struct AdminGraphPredicatePatchRequest { + /// Tenant to query within. + pub tenant_id: String, + /// Project to query within. + pub project_id: String, + /// Agent requesting the mutation. + pub agent_id: String, + /// Optional auth token identifier used for super-admin checks. + pub token_id: Option<String>, + /// Predicate identifier to mutate. + pub predicate_id: Uuid, + /// Optional new predicate status. + pub status: Option<String>, + /// Optional new cardinality value. + pub cardinality: Option<String>, +} + +/// Request payload for adding a graph predicate alias. +#[derive(Clone, Debug)] +pub struct AdminGraphPredicateAliasAddRequest { + /// Tenant to query within. + pub tenant_id: String, + /// Project to query within. + pub project_id: String, + /// Agent requesting the mutation. + pub agent_id: String, + /// Optional auth token identifier used for super-admin checks. + pub token_id: Option<String>, + /// Predicate identifier to extend. + pub predicate_id: Uuid, + /// Alias surface to add. + pub alias: String, +} + +/// Request payload for listing graph predicate aliases. +#[derive(Clone, Debug)] +pub struct AdminGraphPredicateAliasesListRequest { + /// Tenant to query within. + pub tenant_id: String, + /// Project to query within. + pub project_id: String, + /// Agent requesting the list. + pub agent_id: String, + /// Predicate identifier to inspect. + pub predicate_id: Uuid, +} + +/// Serialized graph predicate returned by admin APIs. +#[derive(Clone, Debug, Serialize)] +pub struct AdminGraphPredicateResponse { + /// Predicate identifier. + pub predicate_id: Uuid, + /// Predicate scope key. + pub scope_key: String, + /// Tenant scope when tenant-specific. + pub tenant_id: Option<String>, + /// Project scope when project-specific. + pub project_id: Option<String>, + /// Canonical predicate surface. + pub canonical: String, + /// Normalized canonical predicate surface. + pub canonical_norm: String, + /// Cardinality policy. + pub cardinality: String, + /// Lifecycle status. + pub status: String, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Serialized graph predicate alias returned by admin APIs. +#[derive(Clone, Debug, Serialize)] +pub struct AdminGraphPredicateAliasResponse { + /// Alias identifier. + pub alias_id: Uuid, + /// Predicate identifier that owns the alias. + pub predicate_id: Uuid, + /// Scope key where the alias resolves. + pub scope_key: String, + /// Alias surface. + pub alias: String, + /// Normalized alias surface. + pub alias_norm: String, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Response payload for listing graph predicates. +#[derive(Clone, Debug, Serialize)] +pub struct AdminGraphPredicatesListResponse { + /// Returned predicates. + pub predicates: Vec<AdminGraphPredicateResponse>, +} + +/// Response payload for graph predicate alias operations. +#[derive(Clone, Debug, Serialize)] +pub struct AdminGraphPredicateAliasesResponse { + /// Predicate identifier. + pub predicate_id: Uuid, + /// Returned aliases. + pub aliases: Vec<AdminGraphPredicateAliasResponse>, +} + +impl ElfService { + fn is_super_admin_token_id(&self, token_id: Option<&str>) -> bool { + if self.cfg.security.auth_mode.trim() != "static_keys" { + return false; + } + + let Some(token_id) = token_id.map(str::trim).filter(|value| !value.is_empty()) else { + return false; + }; + + self.cfg + .security + .auth_keys + .iter() + .any(|key| key.token_id == token_id && matches!(key.role, SecurityAuthRole::SuperAdmin)) + } + + /// Lists graph predicates visible to the caller's admin context. + pub async fn admin_graph_predicates_list( + &self, + req: AdminGraphPredicatesListRequest, + ) -> Result<AdminGraphPredicatesListResponse> { + let raw = req.scope.as_deref().unwrap_or("all"); + let scope = + AdminGraphPredicateScope::parse(raw).ok_or_else(|| crate::Error::InvalidRequest { + message: "scope must be one of tenant_project|project|global|all".to_string(), + })?; + let scope_keys = + graph_predicate_scope_keys(req.tenant_id.as_str(), req.project_id.as_str(), scope); + let mut conn = self.db.pool.acquire().await?; + let predicates = graph::list_predicates_by_scope_keys(&mut conn, &scope_keys) + .await + .map_err(map_storage_error)?; + let predicates = predicates.into_iter().map(to_predicate_response).collect(); + + Ok(AdminGraphPredicatesListResponse { predicates }) + } + + /// Updates a mutable graph predicate field inside the allowed admin scope. + pub async fn admin_graph_predicate_patch( + &self, + req: AdminGraphPredicatePatchRequest, + ) -> Result<AdminGraphPredicateResponse> { + if req.status.is_none() && req.cardinality.is_none() { + return Err(crate::Error::InvalidRequest { + message: "At least one of status or cardinality is required.".to_string(), + }); + } + + let status = req.status.as_deref().map(str::trim); + + if status.is_some_and(str::is_empty) { + return Err(crate::Error::InvalidRequest { + message: "status must be non-empty.".to_string(), + }); + } + + let cardinality = req.cardinality.as_deref().map(str::trim); + + if cardinality.is_some_and(str::is_empty) { + return Err(crate::Error::InvalidRequest { + message: "cardinality must be non-empty.".to_string(), + }); + } + + let allow_global_mutation = self.is_super_admin_token_id(req.token_id.as_deref()); + let mut conn = self.db.pool.acquire().await?; + let existing = load_predicate_in_context( + &mut conn, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.predicate_id, + PredicateAccess::Mutate, + allow_global_mutation, + ) + .await?; + let old_status = existing.status.clone(); + let old_cardinality = existing.cardinality.clone(); + + if old_status == "deprecated" { + return Err(crate::Error::Conflict { + message: "graph predicate is deprecated and cannot be modified.".to_string(), + }); + } + + let new_status = match status { + None => None, + Some(raw) => { + let raw = raw.to_string(); + + if !matches!(raw.as_str(), "pending" | "active" | "deprecated") { + return Err(crate::Error::InvalidRequest { + message: "status must be one of pending|active|deprecated.".to_string(), + }); + } + if raw != old_status + && !predicate_status_transition_allowed(old_status.as_str(), raw.as_str()) + { + return Err(crate::Error::Conflict { + message: format!( + "Invalid graph predicate status transition; from={old_status} to={raw}.", + ), + }); + } + + Some(raw) + }, + }; + let new_cardinality = match cardinality { + None => None, + Some(raw) => { + let raw = raw.to_string(); + + if !matches!(raw.as_str(), "single" | "multi") { + return Err(crate::Error::InvalidRequest { + message: "cardinality must be one of single|multi.".to_string(), + }); + } + + Some(raw) + }, + }; + let updated = graph::update_predicate_guarded( + &mut conn, + req.predicate_id, + old_status.as_str(), + old_cardinality.as_str(), + new_status.as_deref(), + new_cardinality.as_deref(), + ) + .await + .map_err(map_storage_error)?; + + tracing::info!( + actor_agent_id = %req.agent_id, + predicate_id = %req.predicate_id, + old_status = %old_status, + new_status = %updated.status, + old_cardinality = %old_cardinality, + new_cardinality = %updated.cardinality, + "Admin graph predicate patched." + ); + + Ok(to_predicate_response(updated)) + } + + /// Adds an alias to a mutable graph predicate. + pub async fn admin_graph_predicate_alias_add( + &self, + req: AdminGraphPredicateAliasAddRequest, + ) -> Result<AdminGraphPredicateAliasesResponse> { + let alias = req.alias.trim(); + + if alias.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "alias must be non-empty.".to_string(), + }); + } + + let allow_global_mutation = self.is_super_admin_token_id(req.token_id.as_deref()); + let mut conn = self.db.pool.acquire().await?; + let predicate = load_predicate_in_context( + &mut conn, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.predicate_id, + PredicateAccess::Mutate, + allow_global_mutation, + ) + .await?; + + if predicate.status == "deprecated" { + return Err(crate::Error::Conflict { + message: "graph predicate is deprecated and cannot be modified.".to_string(), + }); + } + + graph::add_predicate_alias(&mut conn, req.predicate_id, alias) + .await + .map_err(map_storage_error)?; + + tracing::info!( + actor_agent_id = %req.agent_id, + predicate_id = %req.predicate_id, + alias = %alias, + "Admin graph predicate alias added." + ); + + let mut aliases = graph::list_predicate_aliases(&mut conn, req.predicate_id) + .await + .map_err(map_storage_error)?; + + stable_sort_aliases(&mut aliases); + + let aliases = aliases.into_iter().map(to_alias_response).collect(); + + Ok(AdminGraphPredicateAliasesResponse { predicate_id: req.predicate_id, aliases }) + } + + /// Lists aliases for a graph predicate visible in admin scope. + pub async fn admin_graph_predicate_aliases_list( + &self, + req: AdminGraphPredicateAliasesListRequest, + ) -> Result<AdminGraphPredicateAliasesResponse> { + let mut conn = self.db.pool.acquire().await?; + + load_predicate_in_context( + &mut conn, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.predicate_id, + PredicateAccess::Read, + false, + ) + .await?; + + let mut aliases = graph::list_predicate_aliases(&mut conn, req.predicate_id) + .await + .map_err(map_storage_error)?; + + stable_sort_aliases(&mut aliases); + + let aliases = aliases.into_iter().map(to_alias_response).collect(); + + Ok(AdminGraphPredicateAliasesResponse { predicate_id: req.predicate_id, aliases }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum PredicateAccess { + Read, + Mutate, +} + +fn graph_predicate_scope_keys( + tenant_id: &str, + project_id: &str, + scope: AdminGraphPredicateScope, +) -> Vec<String> { + let tenant_project_key = format!("{tenant_id}:{project_id}"); + let project_key = format!("{GRAPH_PREDICATE_SCOPE_PROJECT_PREFIX}{project_id}"); + let global_key = GRAPH_PREDICATE_SCOPE_GLOBAL.to_string(); + + match scope { + AdminGraphPredicateScope::TenantProject => vec![tenant_project_key], + AdminGraphPredicateScope::Project => vec![project_key], + AdminGraphPredicateScope::Global => vec![global_key], + AdminGraphPredicateScope::All => vec![tenant_project_key, project_key, global_key], + } +} + +fn predicate_status_transition_allowed(old: &str, new: &str) -> bool { + matches!( + (old, new), + ("pending", "active") | ("pending", "deprecated") | ("active", "deprecated") + ) +} + +fn stable_sort_aliases(aliases: &mut [GraphPredicateAlias]) { + aliases.sort_by(|a, b| { + a.created_at + .cmp(&b.created_at) + .then_with(|| a.alias_norm.cmp(&b.alias_norm)) + .then_with(|| a.alias.cmp(&b.alias)) + }); +} + +fn to_predicate_response(predicate: GraphPredicate) -> AdminGraphPredicateResponse { + AdminGraphPredicateResponse { + predicate_id: predicate.predicate_id, + scope_key: predicate.scope_key, + tenant_id: predicate.tenant_id, + project_id: predicate.project_id, + canonical: predicate.canonical, + canonical_norm: predicate.canonical_norm, + cardinality: predicate.cardinality, + status: predicate.status, + created_at: predicate.created_at, + updated_at: predicate.updated_at, + } +} + +fn to_alias_response(alias: GraphPredicateAlias) -> AdminGraphPredicateAliasResponse { + AdminGraphPredicateAliasResponse { + alias_id: alias.alias_id, + predicate_id: alias.predicate_id, + scope_key: alias.scope_key, + alias: alias.alias, + alias_norm: alias.alias_norm, + created_at: alias.created_at, + } +} + +fn map_storage_error(err: elf_storage::Error) -> crate::Error { + match err { + elf_storage::Error::InvalidArgument(message) => crate::Error::InvalidRequest { message }, + elf_storage::Error::NotFound(message) => crate::Error::NotFound { message }, + elf_storage::Error::Conflict(message) => crate::Error::Conflict { message }, + elf_storage::Error::Sqlx(err) => crate::Error::Storage { message: err.to_string() }, + elf_storage::Error::Qdrant(err) => crate::Error::Qdrant { message: err.to_string() }, + } +} + +async fn load_predicate_in_context( + conn: &mut PgConnection, + tenant_id: &str, + project_id: &str, + predicate_id: Uuid, + access: PredicateAccess, + allow_global_mutation: bool, +) -> Result<GraphPredicate> { + let predicate = graph::get_predicate_by_id(conn, predicate_id) + .await + .map_err(map_storage_error)? + .ok_or_else(|| crate::Error::NotFound { + message: format!("graph predicate not found; predicate_id={predicate_id}"), + })?; + let tenant_project_key = format!("{tenant_id}:{project_id}"); + let project_key = format!("{GRAPH_PREDICATE_SCOPE_PROJECT_PREFIX}{project_id}"); + let is_in_context = + predicate.scope_key == tenant_project_key || predicate.scope_key == project_key; + let is_global = predicate.scope_key == GRAPH_PREDICATE_SCOPE_GLOBAL; + + if !is_in_context && !is_global { + return Err(crate::Error::NotFound { + message: format!("graph predicate not found; predicate_id={predicate_id}"), + }); + } + if access == PredicateAccess::Mutate && is_global && !allow_global_mutation { + return Err(crate::Error::ScopeDenied { + message: "Super-admin token required to modify global graph predicates.".to_string(), + }); + } + + Ok(predicate) +} diff --git a/packages/elf-service/src/consolidation.rs b/packages/elf-service/src/consolidation.rs new file mode 100644 index 00000000..9ac2a32f --- /dev/null +++ b/packages/elf-service/src/consolidation.rs @@ -0,0 +1,735 @@ +//! Fixture-driven consolidation run and proposal service APIs. + +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::{ElfService, Error, Result}; +use elf_domain::consolidation::{ + self, CONSOLIDATION_CONTRACT_SCHEMA_V1, ConsolidationApplyIntent, ConsolidationInputRef, + ConsolidationJobPayload, ConsolidationLineage, ConsolidationMarkers, + ConsolidationProposalContract, ConsolidationProposalDiff, ConsolidationReviewAction, + ConsolidationReviewState, ConsolidationRunState, ConsolidationUnsupportedClaimFlag, + ConsolidationValidationError, +}; +use elf_storage::{ + consolidation::{ + ConsolidationProposalReviewEventInsert, ConsolidationProposalReviewUpdate, + ConsolidationRunJobInsert, + }, + models::{ConsolidationProposal, ConsolidationProposalReviewEvent, ConsolidationRun}, +}; + +const DEFAULT_LIST_LIMIT: i64 = 50; +const MAX_LIST_LIMIT: i64 = 200; + +/// Request to create a fixture-backed consolidation run. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationRunCreateRequest { + /// Tenant that owns the run. + pub tenant_id: String, + /// Project that owns the run. + pub project_id: String, + /// Agent registering the run. + pub agent_id: String, + /// Job kind, such as `fixture` or `manual`. + pub job_kind: String, + /// Input references considered by the run. + pub input_refs: Vec<ConsolidationInputRef>, + #[serde(default = "empty_object")] + /// Aggregate source snapshot metadata for the run. + pub source_snapshot: Value, + /// Run lineage. + pub lineage: ConsolidationLineage, + #[serde(default)] + /// Fixture-generated proposals to persist with this run. + pub proposals: Vec<ConsolidationProposalInput>, +} + +/// Fixture proposal input for a consolidation run. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationProposalInput { + /// Proposal kind, such as `derived_note` or `knowledge_page`. + pub proposal_kind: String, + /// Derived-output apply intent. + pub apply_intent: ConsolidationApplyIntent, + /// Source references directly supporting the proposal. + pub source_refs: Vec<ConsolidationInputRef>, + #[serde(default = "empty_object")] + /// Aggregate source snapshot metadata for reviewer inspection. + pub source_snapshot: Value, + /// Proposal lineage. + pub lineage: ConsolidationLineage, + /// Fixture confidence in the proposal. + pub confidence: f32, + #[serde(default)] + /// Unsupported claims reviewers must inspect before accepting the proposal. + pub unsupported_claim_flags: Vec<ConsolidationUnsupportedClaimFlag>, + #[serde(default)] + /// Review markers for contradiction and staleness checks. + pub markers: ConsolidationMarkers, + /// Reviewable derived-output diff. + pub diff: ConsolidationProposalDiff, + #[serde(default = "empty_object")] + /// Derived target reference, when the target already exists. + pub target_ref: Value, + #[serde(default = "empty_object")] + /// Proposed derived output payload. + pub proposed_payload: Value, +} +impl ConsolidationProposalInput { + fn into_contract(self) -> ConsolidationProposalContract { + ConsolidationProposalContract { + proposal_kind: self.proposal_kind, + apply_intent: self.apply_intent, + source_refs: self.source_refs, + source_snapshot: self.source_snapshot, + lineage: self.lineage, + confidence: self.confidence, + unsupported_claim_flags: self.unsupported_claim_flags, + markers: self.markers, + diff: self.diff, + target_ref: self.target_ref, + proposed_payload: self.proposed_payload, + } + } +} + +/// Response returned after creating one consolidation run. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationRunCreateResponse { + /// Created run. + pub run: ConsolidationRunResponse, + /// Enqueued worker job identifier. + pub job_id: Uuid, + /// Proposals stored with the run. + pub proposals: Vec<ConsolidationProposalResponse>, +} + +/// Request to get one consolidation run. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationRunGetRequest { + /// Tenant that owns the run. + pub tenant_id: String, + /// Project that owns the run. + pub project_id: String, + /// Run identifier. + pub run_id: Uuid, +} + +/// Request to list consolidation runs. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationRunsListRequest { + /// Tenant that owns the runs. + pub tenant_id: String, + /// Project that owns the runs. + pub project_id: String, + /// Maximum number of runs to return. + pub limit: Option<u32>, +} + +/// Response returned by consolidation run listing. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationRunsListResponse { + /// Returned runs. + pub runs: Vec<ConsolidationRunResponse>, +} + +/// Public consolidation run DTO. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationRunResponse { + /// Consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the run. + pub tenant_id: String, + /// Project that owns the run. + pub project_id: String, + /// Agent that registered the run. + pub agent_id: String, + /// Versioned consolidation contract schema. + pub contract_schema: String, + /// Job kind, such as fixture or manual. + pub job_kind: String, + /// Current run state. + pub status: String, + /// Serialized input references. + pub input_refs: Value, + /// Aggregate source snapshot metadata. + pub source_snapshot: Value, + /// Serialized run lineage. + pub lineage: Value, + /// Structured error payload for failed runs. + pub error: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Completion timestamp for terminal runs. + pub completed_at: Option<OffsetDateTime>, +} +impl From<ConsolidationRun> for ConsolidationRunResponse { + fn from(run: ConsolidationRun) -> Self { + Self { + run_id: run.run_id, + tenant_id: run.tenant_id, + project_id: run.project_id, + agent_id: run.agent_id, + contract_schema: run.contract_schema, + job_kind: run.job_kind, + status: run.status, + input_refs: run.input_refs, + source_snapshot: run.source_snapshot, + lineage: run.lineage, + error: run.error, + created_at: run.created_at, + updated_at: run.updated_at, + completed_at: run.completed_at, + } + } +} + +/// Request to get one consolidation proposal. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationProposalGetRequest { + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Proposal identifier. + pub proposal_id: Uuid, +} + +/// Request to list consolidation proposals. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationProposalsListRequest { + /// Tenant that owns the proposals. + pub tenant_id: String, + /// Project that owns the proposals. + pub project_id: String, + /// Optional run filter. + pub run_id: Option<Uuid>, + /// Optional review-state filter. + pub review_state: Option<ConsolidationReviewState>, + /// Maximum number of proposals to return. + pub limit: Option<u32>, +} + +/// Response returned by consolidation proposal listing. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationProposalsListResponse { + /// Returned proposals. + pub proposals: Vec<ConsolidationProposalResponse>, +} + +/// Request to apply one proposal review action. +#[derive(Clone, Debug, Deserialize)] +pub struct ConsolidationProposalReviewRequest { + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Agent performing the review action. + pub reviewer_agent_id: String, + /// Proposal identifier. + pub proposal_id: Uuid, + /// Requested review action. + pub review_action: ConsolidationReviewAction, + /// Optional reviewer comment. + pub review_comment: Option<String>, +} + +/// Public consolidation proposal review audit DTO. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationProposalReviewEventResponse { + /// Review event identifier. + pub review_id: Uuid, + /// Reviewed proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Agent that performed the review action. + pub reviewer_agent_id: String, + /// Review action requested by the reviewer. + pub action: String, + /// Review state before the transition. + pub from_review_state: String, + /// Review state after the transition. + pub to_review_state: String, + /// Optional reviewer comment. + pub review_comment: Option<String>, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} +impl From<ConsolidationProposalReviewEvent> for ConsolidationProposalReviewEventResponse { + fn from(event: ConsolidationProposalReviewEvent) -> Self { + Self { + review_id: event.review_id, + proposal_id: event.proposal_id, + run_id: event.run_id, + tenant_id: event.tenant_id, + project_id: event.project_id, + reviewer_agent_id: event.reviewer_agent_id, + action: event.action, + from_review_state: event.from_review_state, + to_review_state: event.to_review_state, + review_comment: event.review_comment, + created_at: event.created_at, + } + } +} + +/// Public consolidation proposal DTO. +#[derive(Clone, Debug, Serialize)] +pub struct ConsolidationProposalResponse { + /// Consolidation proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Agent that registered the proposal. + pub agent_id: String, + /// Versioned consolidation contract schema. + pub contract_schema: String, + /// Proposal kind, such as derived_note or knowledge_page. + pub proposal_kind: String, + /// Derived-output apply intent. + pub apply_intent: String, + /// Current review state. + pub review_state: String, + /// Serialized source references. + pub source_refs: Value, + /// Aggregate source snapshot metadata. + pub source_snapshot: Value, + /// Serialized proposal lineage. + pub lineage: Value, + /// Serialized reviewable diff. + pub diff: Value, + /// Proposal confidence score. + pub confidence: f32, + /// Serialized unsupported-claim flags. + pub unsupported_claim_flags: Value, + /// Serialized contradiction markers. + pub contradiction_markers: Value, + /// Serialized staleness markers. + pub staleness_markers: Value, + /// Serialized derived target reference. + pub target_ref: Value, + /// Serialized proposed derived output payload. + pub proposed_payload: Value, + /// Agent that last reviewed the proposal. + pub reviewer_agent_id: Option<String>, + /// Optional reviewer comment. + pub review_comment: Option<String>, + /// Timestamp of the last review transition. + pub reviewed_at: Option<OffsetDateTime>, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Append-only review events for detail readback. + pub review_events: Vec<ConsolidationProposalReviewEventResponse>, +} +impl From<ConsolidationProposal> for ConsolidationProposalResponse { + fn from(proposal: ConsolidationProposal) -> Self { + Self { + proposal_id: proposal.proposal_id, + run_id: proposal.run_id, + tenant_id: proposal.tenant_id, + project_id: proposal.project_id, + agent_id: proposal.agent_id, + contract_schema: proposal.contract_schema, + proposal_kind: proposal.proposal_kind, + apply_intent: proposal.apply_intent, + review_state: proposal.review_state, + source_refs: proposal.source_refs, + source_snapshot: proposal.source_snapshot, + lineage: proposal.lineage, + diff: proposal.diff, + confidence: proposal.confidence, + unsupported_claim_flags: proposal.unsupported_claim_flags, + contradiction_markers: proposal.contradiction_markers, + staleness_markers: proposal.staleness_markers, + target_ref: proposal.target_ref, + proposed_payload: proposal.proposed_payload, + reviewer_agent_id: proposal.reviewer_agent_id, + review_comment: proposal.review_comment, + reviewed_at: proposal.reviewed_at, + created_at: proposal.created_at, + updated_at: proposal.updated_at, + review_events: Vec::new(), + } + } +} + +impl ElfService { + /// Creates a fixture-backed consolidation run and optional proposals. + pub async fn consolidation_run_create( + &self, + req: ConsolidationRunCreateRequest, + ) -> Result<ConsolidationRunCreateResponse> { + validate_context(req.tenant_id.as_str(), req.project_id.as_str(), req.agent_id.as_str())?; + validate_job_kind(req.job_kind.as_str())?; + + consolidation::validate_source_refs(&req.input_refs).map_err(validation_error)?; + + validate_object("source_snapshot", &req.source_snapshot)?; + + req.lineage.validate().map_err(validation_error)?; + + let proposal_contracts = + req.proposals.into_iter().map(ConsolidationProposalInput::into_contract).collect(); + let payload = ConsolidationJobPayload { + contract_schema: CONSOLIDATION_CONTRACT_SCHEMA_V1.to_string(), + proposals: proposal_contracts, + }; + + payload.validate().map_err(validation_error)?; + + let now = OffsetDateTime::now_utc(); + let run_state = ConsolidationRunState::Pending; + let run_id = Uuid::new_v4(); + let job_id = Uuid::new_v4(); + let run = ConsolidationRun { + run_id, + tenant_id: req.tenant_id.clone(), + project_id: req.project_id.clone(), + agent_id: req.agent_id.clone(), + contract_schema: CONSOLIDATION_CONTRACT_SCHEMA_V1.to_string(), + job_kind: req.job_kind.clone(), + status: run_state.as_str().to_string(), + input_refs: to_value(&req.input_refs)?, + source_snapshot: req.source_snapshot, + lineage: to_value(&req.lineage)?, + error: empty_object(), + created_at: now, + updated_at: now, + completed_at: terminal_time(run_state, now), + }; + let payload_value = to_value(&payload)?; + let mut tx = self.db.pool.begin().await?; + + elf_storage::consolidation::insert_consolidation_run(&mut *tx, &run).await?; + elf_storage::consolidation::insert_consolidation_run_job( + &mut *tx, + ConsolidationRunJobInsert { + job_id, + run_id, + tenant_id: req.tenant_id.as_str(), + project_id: req.project_id.as_str(), + agent_id: req.agent_id.as_str(), + job_kind: req.job_kind.as_str(), + payload: &payload_value, + now, + }, + ) + .await?; + + tx.commit().await?; + + Ok(ConsolidationRunCreateResponse { + run: ConsolidationRunResponse::from(run), + job_id, + proposals: Vec::new(), + }) + } + + /// Fetches one consolidation run. + pub async fn consolidation_run_get( + &self, + req: ConsolidationRunGetRequest, + ) -> Result<ConsolidationRunResponse> { + let run = elf_storage::consolidation::get_consolidation_run( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.run_id, + ) + .await? + .ok_or_else(|| Error::NotFound { message: "consolidation run not found".to_string() })?; + + Ok(ConsolidationRunResponse::from(run)) + } + + /// Lists consolidation runs. + pub async fn consolidation_runs_list( + &self, + req: ConsolidationRunsListRequest, + ) -> Result<ConsolidationRunsListResponse> { + let limit = bounded_limit(req.limit); + let rows = elf_storage::consolidation::list_consolidation_runs( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + limit, + ) + .await?; + let runs = rows.into_iter().map(ConsolidationRunResponse::from).collect(); + + Ok(ConsolidationRunsListResponse { runs }) + } + + /// Fetches one consolidation proposal. + pub async fn consolidation_proposal_get( + &self, + req: ConsolidationProposalGetRequest, + ) -> Result<ConsolidationProposalResponse> { + let proposal = elf_storage::consolidation::get_consolidation_proposal( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.proposal_id, + ) + .await? + .ok_or_else(|| Error::NotFound { + message: "consolidation proposal not found".to_string(), + })?; + let review_events = self + .consolidation_proposal_review_events( + req.tenant_id.as_str(), + req.project_id.as_str(), + req.proposal_id, + ) + .await?; + let mut response = ConsolidationProposalResponse::from(proposal); + + response.review_events = review_events; + + Ok(response) + } + + /// Lists consolidation proposals. + pub async fn consolidation_proposals_list( + &self, + req: ConsolidationProposalsListRequest, + ) -> Result<ConsolidationProposalsListResponse> { + let limit = bounded_limit(req.limit); + let review_state = req.review_state.map(ConsolidationReviewState::as_str); + let rows = elf_storage::consolidation::list_consolidation_proposals( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.run_id, + review_state, + limit, + ) + .await?; + let proposals = rows.into_iter().map(ConsolidationProposalResponse::from).collect(); + + Ok(ConsolidationProposalsListResponse { proposals }) + } + + /// Applies one allowed proposal review action. + pub async fn consolidation_proposal_review( + &self, + req: ConsolidationProposalReviewRequest, + ) -> Result<ConsolidationProposalResponse> { + validate_context( + req.tenant_id.as_str(), + req.project_id.as_str(), + req.reviewer_agent_id.as_str(), + )?; + + let existing = elf_storage::consolidation::get_consolidation_proposal( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.proposal_id, + ) + .await? + .ok_or_else(|| Error::NotFound { + message: "consolidation proposal not found".to_string(), + })?; + let current = + ConsolidationReviewState::parse(existing.review_state.as_str()).ok_or_else(|| { + Error::InvalidRequest { + message: "stored proposal review_state is invalid".to_string(), + } + })?; + let now = OffsetDateTime::now_utc(); + let steps = review_steps(current, req.review_action)?; + let mut tx = self.db.pool.begin().await?; + let mut last_state = current; + let mut updated = existing; + + for (step_index, (action, next_state)) in steps.into_iter().enumerate() { + last_state.validate_transition(next_state).map_err(validation_error)?; + + let transition_time = now.saturating_add(Duration::milliseconds(step_index as i64)); + + elf_storage::consolidation::insert_consolidation_proposal_review_event( + &mut *tx, + ConsolidationProposalReviewEventInsert { + review_id: Uuid::new_v4(), + proposal_id: req.proposal_id, + run_id: updated.run_id, + tenant_id: req.tenant_id.as_str(), + project_id: req.project_id.as_str(), + reviewer_agent_id: req.reviewer_agent_id.as_str(), + action: action.as_str(), + from_review_state: last_state.as_str(), + to_review_state: next_state.as_str(), + review_comment: req.review_comment.as_deref(), + created_at: transition_time, + }, + ) + .await?; + + updated = elf_storage::consolidation::update_consolidation_proposal_review( + &mut *tx, + ConsolidationProposalReviewUpdate { + tenant_id: req.tenant_id.as_str(), + project_id: req.project_id.as_str(), + proposal_id: req.proposal_id, + review_state: next_state.as_str(), + reviewer_agent_id: req.reviewer_agent_id.as_str(), + review_comment: req.review_comment.as_deref(), + now: transition_time, + }, + ) + .await? + .ok_or_else(|| Error::NotFound { + message: "consolidation proposal not found".to_string(), + })?; + last_state = next_state; + } + + tx.commit().await?; + + let review_events = self + .consolidation_proposal_review_events( + req.tenant_id.as_str(), + req.project_id.as_str(), + req.proposal_id, + ) + .await?; + let mut response = ConsolidationProposalResponse::from(updated); + + response.review_events = review_events; + + Ok(response) + } + + async fn consolidation_proposal_review_events( + &self, + tenant_id: &str, + project_id: &str, + proposal_id: Uuid, + ) -> Result<Vec<ConsolidationProposalReviewEventResponse>> { + let events = elf_storage::consolidation::list_consolidation_proposal_review_events( + &self.db.pool, + tenant_id, + project_id, + proposal_id, + ) + .await?; + + Ok(events.into_iter().map(ConsolidationProposalReviewEventResponse::from).collect()) + } +} + +fn validate_context(tenant_id: &str, project_id: &str, agent_id: &str) -> Result<()> { + validate_non_empty("tenant_id", tenant_id)?; + validate_non_empty("project_id", project_id)?; + + validate_non_empty("agent_id", agent_id) +} + +fn validate_job_kind(job_kind: &str) -> Result<()> { + validate_non_empty("job_kind", job_kind)?; + + match job_kind { + "fixture" | "manual" => Ok(()), + _ => Err(Error::InvalidRequest { + message: "job_kind must be fixture or manual for consolidation v1.".to_string(), + }), + } +} + +fn validate_non_empty(field: &'static str, value: &str) -> Result<()> { + if value.trim().is_empty() { + return Err(Error::InvalidRequest { message: format!("{field} must not be empty.") }); + } + + Ok(()) +} + +fn validate_object(field: &str, value: &Value) -> Result<()> { + if matches!(value, Value::Object(_)) { + Ok(()) + } else { + Err(Error::InvalidRequest { message: format!("{field} must be a JSON object.") }) + } +} + +fn validation_error(err: ConsolidationValidationError) -> Error { + Error::InvalidRequest { message: err.to_string() } +} + +fn review_steps( + current: ConsolidationReviewState, + action: ConsolidationReviewAction, +) -> Result<Vec<(ConsolidationReviewAction, ConsolidationReviewState)>> { + let steps = match action { + ConsolidationReviewAction::Approve => + vec![(ConsolidationReviewAction::Approve, ConsolidationReviewState::Approved)], + ConsolidationReviewAction::Apply => match current { + ConsolidationReviewState::Proposed => vec![ + (ConsolidationReviewAction::Approve, ConsolidationReviewState::Approved), + (ConsolidationReviewAction::Apply, ConsolidationReviewState::Applied), + ], + ConsolidationReviewState::Approved => + vec![(ConsolidationReviewAction::Apply, ConsolidationReviewState::Applied)], + ConsolidationReviewState::Rejected + | ConsolidationReviewState::Applied + | ConsolidationReviewState::Archived => + vec![(ConsolidationReviewAction::Apply, ConsolidationReviewState::Applied)], + }, + ConsolidationReviewAction::Discard => + vec![(ConsolidationReviewAction::Discard, ConsolidationReviewState::Rejected)], + ConsolidationReviewAction::Defer => + vec![(ConsolidationReviewAction::Defer, ConsolidationReviewState::Archived)], + }; + let mut state = current; + + for (_, next_state) in &steps { + state.validate_transition(*next_state).map_err(validation_error)?; + + state = *next_state; + } + + Ok(steps) +} + +fn bounded_limit(limit: Option<u32>) -> i64 { + limit.map(i64::from).unwrap_or(DEFAULT_LIST_LIMIT).clamp(1, MAX_LIST_LIMIT) +} + +fn to_value<T>(value: &T) -> Result<Value> +where + T: Serialize, +{ + serde_json::to_value(value).map_err(|err| Error::InvalidRequest { + message: format!("failed to serialize consolidation contract: {err}"), + }) +} + +fn empty_object() -> Value { + Value::Object(Map::new()) +} + +fn terminal_time(state: ConsolidationRunState, now: OffsetDateTime) -> Option<OffsetDateTime> { + match state { + ConsolidationRunState::Completed + | ConsolidationRunState::Failed + | ConsolidationRunState::Cancelled => Some(now), + ConsolidationRunState::Pending | ConsolidationRunState::Running => None, + } +} diff --git a/packages/elf-service/src/core_blocks.rs b/packages/elf-service/src/core_blocks.rs new file mode 100644 index 00000000..3ff42bf9 --- /dev/null +++ b/packages/elf-service/src/core_blocks.rs @@ -0,0 +1,1230 @@ +//! Scoped core memory block APIs. + +use std::collections::{HashMap, HashSet}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{FromRow, PgExecutor, Postgres, Transaction}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + ElfService, Error, Result, + access::{self, ORG_PROJECT_ID}, + search, +}; +use elf_config::Config; +use elf_domain::english_gate::{self, EnglishGateKind}; + +/// Core memory blocks response schema identifier. +pub const ELF_CORE_MEMORY_BLOCKS_SCHEMA_V1: &str = "elf.core_memory_blocks/v1"; + +const MAX_CORE_BLOCK_CONTENT_CHARS: usize = 2_000; + +/// Request payload for attached core block readback. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlocksGetRequest { + /// Tenant that owns the request. + pub tenant_id: String, + /// Project context for attachment lookup. + pub project_id: String, + /// Agent requesting attached blocks. + pub agent_id: String, + /// Read profile whose exact attachments should be returned. + pub read_profile: String, +} + +/// Response payload for attached core block readback. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlocksResponse { + /// Response schema identifier. + pub schema: String, + /// Tenant that owns the request. + pub tenant_id: String, + /// Project context for attachment lookup. + pub project_id: String, + /// Agent requesting attached blocks. + pub agent_id: String, + /// Read profile used for attachment lookup. + pub read_profile: String, + /// Attached core blocks visible to the caller. + pub items: Vec<CoreBlockItem>, +} + +/// One attached core memory block. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockItem { + /// Core block identifier. + pub block_id: Uuid, + /// Active attachment identifier that made the block visible. + pub attachment_id: Uuid, + /// Tenant that owns the block. + pub tenant_id: String, + /// Project that owns the block. + pub project_id: String, + /// Agent that owns the block's scope. + pub agent_id: String, + /// Scope key for the block. + pub scope: String, + /// Stable block key. + pub key: String, + /// Human-readable block title. + pub title: String, + /// Small always-attached context payload. + pub content: String, + /// Structured source/provenance metadata for the block. + pub source_ref: Value, + /// Lifecycle status for the block. + pub status: String, + #[serde(with = "crate::time_serde")] + /// Last block update timestamp. + pub updated_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Attachment creation timestamp. + pub attached_at: OffsetDateTime, + /// Agent that created the attachment. + pub attached_by_agent_id: String, + /// Append-only block and attachment audit events. + pub audit_history: Vec<CoreBlockAuditEvent>, +} + +/// One core block audit event. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockAuditEvent { + /// Audit event identifier. + pub event_id: Uuid, + /// Block identifier affected by the event. + pub block_id: Uuid, + /// Attachment identifier affected by the event, when applicable. + pub attachment_id: Option<Uuid>, + /// Agent that performed the event. + pub actor_agent_id: String, + /// Event type. + pub event_type: String, + /// Attachment target agent, when applicable. + pub target_agent_id: Option<String>, + /// Attachment read profile, when applicable. + pub read_profile: Option<String>, + /// Optional previous state snapshot. + pub prev_snapshot: Option<Value>, + /// Optional new state snapshot. + pub new_snapshot: Option<Value>, + /// Human-readable event reason. + pub reason: String, + #[serde(with = "crate::time_serde")] + /// Event timestamp. + pub ts: OffsetDateTime, +} + +/// Request payload for creating or updating a core block through admin APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockUpsertRequest { + /// Tenant that owns the request. + pub tenant_id: String, + /// Project context for the block. + pub project_id: String, + /// Agent creating or updating the block. + pub agent_id: String, + /// Existing block id to update. Omit to create. + pub block_id: Option<Uuid>, + /// Scope key for the block. + pub scope: String, + /// Stable block key. + pub key: String, + /// Human-readable block title. + pub title: String, + /// Small always-attached context payload. + pub content: String, + /// Structured source/provenance metadata for the block. + pub source_ref: Value, + /// Optional audit reason. + pub reason: Option<String>, +} + +/// Response payload for core block creation or update. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockUpsertResponse { + /// Stored block record. + pub block: CoreBlockRecord, +} + +/// Core block record returned by admin mutation APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockRecord { + /// Core block identifier. + pub block_id: Uuid, + /// Tenant that owns the block. + pub tenant_id: String, + /// Project that owns the block. + pub project_id: String, + /// Agent that owns the block's scope. + pub agent_id: String, + /// Scope key for the block. + pub scope: String, + /// Stable block key. + pub key: String, + /// Human-readable block title. + pub title: String, + /// Small always-attached context payload. + pub content: String, + /// Structured source/provenance metadata for the block. + pub source_ref: Value, + /// Lifecycle status for the block. + pub status: String, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Request payload for attaching a block to an agent/read-profile pair. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockAttachRequest { + /// Tenant that owns the request. + pub tenant_id: String, + /// Project context for the attachment. + pub project_id: String, + /// Agent creating the attachment. + pub agent_id: String, + /// Block to attach. + pub block_id: Uuid, + /// Target agent that should receive the block. + pub target_agent_id: String, + /// Exact read profile for the attachment. + pub read_profile: String, + /// Optional audit reason. + pub reason: Option<String>, +} + +/// Response payload for attaching a core block. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockAttachResponse { + /// Attachment identifier. + pub attachment_id: Uuid, + /// Block identifier. + pub block_id: Uuid, + /// Target agent for the attachment. + pub target_agent_id: String, + /// Exact read profile for the attachment. + pub read_profile: String, + /// Agent that created the attachment. + pub attached_by_agent_id: String, + #[serde(with = "crate::time_serde")] + /// Attachment timestamp. + pub attached_at: OffsetDateTime, +} + +/// Request payload for detaching a block attachment. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockDetachRequest { + /// Tenant that owns the request. + pub tenant_id: String, + /// Project context for the attachment. + pub project_id: String, + /// Agent detaching the block. + pub agent_id: String, + /// Attachment to detach. + pub attachment_id: Uuid, + /// Optional audit reason. + pub reason: Option<String>, +} + +/// Response payload for detaching a core block. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CoreBlockDetachResponse { + /// Attachment identifier. + pub attachment_id: Uuid, + /// Whether an active attachment was detached. + pub detached: bool, +} + +#[derive(Clone, Debug, FromRow)] +struct CoreBlockRow { + block_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + scope: String, + key: String, + title: String, + content: String, + source_ref: Value, + status: String, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, +} +impl CoreBlockRow { + fn into_record(self) -> CoreBlockRecord { + CoreBlockRecord { + block_id: self.block_id, + tenant_id: self.tenant_id, + project_id: self.project_id, + agent_id: self.agent_id, + scope: self.scope, + key: self.key, + title: self.title, + content: self.content, + source_ref: self.source_ref, + status: self.status, + created_at: self.created_at, + updated_at: self.updated_at, + } + } +} + +#[derive(Clone, Debug, FromRow)] +struct CoreBlockAttachmentRow { + attachment_id: Uuid, + block_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + attached_by_agent_id: String, + attached_at: OffsetDateTime, + detached_by_agent_id: Option<String>, + detached_at: Option<OffsetDateTime>, +} + +#[derive(Clone, Debug, FromRow)] +struct CoreBlockJoinedRow { + attachment_id: Uuid, + attachment_agent_id: String, + attached_by_agent_id: String, + attached_at: OffsetDateTime, + block_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + scope: String, + key: String, + title: String, + content: String, + source_ref: Value, + status: String, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, +} +impl CoreBlockJoinedRow { + fn into_item(self, audit_by_block: &HashMap<Uuid, Vec<CoreBlockAuditEvent>>) -> CoreBlockItem { + let audit_history = audit_by_block.get(&self.block_id).cloned().unwrap_or_else(Vec::new); + + CoreBlockItem { + block_id: self.block_id, + attachment_id: self.attachment_id, + tenant_id: self.tenant_id, + project_id: self.project_id, + agent_id: self.agent_id, + scope: self.scope, + key: self.key, + title: self.title, + content: self.content, + source_ref: self.source_ref, + status: self.status, + updated_at: self.updated_at, + attached_at: self.attached_at, + attached_by_agent_id: self.attached_by_agent_id, + audit_history, + } + } +} + +#[derive(Clone, Debug, FromRow)] +struct CoreBlockEventRow { + event_id: Uuid, + block_id: Uuid, + attachment_id: Option<Uuid>, + actor_agent_id: String, + event_type: String, + target_agent_id: Option<String>, + read_profile: Option<String>, + prev_snapshot: Option<Value>, + new_snapshot: Option<Value>, + reason: String, + ts: OffsetDateTime, +} + +struct PreparedGetRequest { + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + allowed_scopes: Vec<String>, +} + +struct PreparedUpsertRequest { + tenant_id: String, + project_id: String, + agent_id: String, + block_id: Option<Uuid>, + scope: String, + key: String, + title: String, + content: String, + source_ref: Value, + reason: String, +} + +struct PreparedAttachRequest { + tenant_id: String, + project_id: String, + agent_id: String, + block_id: Uuid, + target_agent_id: String, + read_profile: String, + allowed_scopes: Vec<String>, + reason: String, +} + +struct PreparedDetachRequest { + tenant_id: String, + project_id: String, + agent_id: String, + attachment_id: Uuid, + reason: String, +} + +struct CoreBlockEventInput<'a> { + block_id: Uuid, + attachment_id: Option<Uuid>, + tenant_id: &'a str, + project_id: &'a str, + actor_agent_id: &'a str, + event_type: &'a str, + target_agent_id: Option<&'a str>, + read_profile: Option<&'a str>, + prev_snapshot: Option<Value>, + new_snapshot: Option<Value>, + reason: &'a str, + ts: OffsetDateTime, +} + +impl ElfService { + /// Returns core memory blocks explicitly attached for one agent/read-profile pair. + pub async fn core_blocks_get(&self, req: CoreBlocksGetRequest) -> Result<CoreBlocksResponse> { + let prepared = prepare_get_request(&self.cfg, req)?; + let rows = fetch_attached_block_rows( + &self.db.pool, + prepared.tenant_id.as_str(), + prepared.project_id.as_str(), + prepared.agent_id.as_str(), + prepared.read_profile.as_str(), + ) + .await?; + let shared_grants = access::load_shared_read_grants_with_org_shared( + &self.db.pool, + prepared.tenant_id.as_str(), + prepared.project_id.as_str(), + prepared.agent_id.as_str(), + prepared.allowed_scopes.iter().any(|scope| scope == "org_shared"), + ) + .await?; + let visible_rows = filter_visible_rows(rows, &prepared.allowed_scopes, &shared_grants); + let block_ids = visible_rows.iter().map(|row| row.block_id).collect::<Vec<_>>(); + let audit_by_block = fetch_audit_history(&self.db.pool, &block_ids).await?; + let items = + visible_rows.into_iter().map(|row| row.into_item(&audit_by_block)).collect::<Vec<_>>(); + + Ok(CoreBlocksResponse { + schema: ELF_CORE_MEMORY_BLOCKS_SCHEMA_V1.to_string(), + tenant_id: prepared.tenant_id, + project_id: prepared.project_id, + agent_id: prepared.agent_id, + read_profile: prepared.read_profile, + items, + }) + } + + /// Creates or updates a core memory block and records append-only audit history. + pub async fn core_block_upsert( + &self, + req: CoreBlockUpsertRequest, + ) -> Result<CoreBlockUpsertResponse> { + let prepared = prepare_upsert_request(&self.cfg, req)?; + let now = OffsetDateTime::now_utc(); + let mut tx = self.db.pool.begin().await?; + let (row, prev_snapshot) = match prepared.block_id { + Some(block_id) => update_core_block(&mut tx, &prepared, block_id, now).await?, + None => (insert_core_block(&mut tx, &prepared, now).await?, None), + }; + + insert_core_block_event( + &mut tx, + CoreBlockEventInput { + block_id: row.block_id, + attachment_id: None, + tenant_id: prepared.tenant_id.as_str(), + project_id: prepared.project_id.as_str(), + actor_agent_id: prepared.agent_id.as_str(), + event_type: if prepared.block_id.is_some() { + "block_updated" + } else { + "block_created" + }, + target_agent_id: None, + read_profile: None, + prev_snapshot, + new_snapshot: Some(block_snapshot(&row)), + reason: prepared.reason.as_str(), + ts: now, + }, + ) + .await?; + + tx.commit().await?; + + Ok(CoreBlockUpsertResponse { block: row.into_record() }) + } + + /// Attaches an active core block to one exact agent/read-profile pair. + pub async fn core_block_attach( + &self, + req: CoreBlockAttachRequest, + ) -> Result<CoreBlockAttachResponse> { + let prepared = prepare_attach_request(&self.cfg, req)?; + let now = OffsetDateTime::now_utc(); + let mut tx = self.db.pool.begin().await?; + let block = fetch_active_block_for_attachment(&mut tx, &prepared).await?; + let shared_grants = access::load_shared_read_grants_with_org_shared( + &mut *tx, + prepared.tenant_id.as_str(), + prepared.project_id.as_str(), + prepared.target_agent_id.as_str(), + prepared.allowed_scopes.iter().any(|scope| scope == "org_shared"), + ) + .await?; + + if !block_read_allowed( + &block, + prepared.target_agent_id.as_str(), + &prepared.allowed_scopes, + &shared_grants, + ) { + return Err(Error::ScopeDenied { + message: "Block scope is not allowed for this attachment.".to_string(), + }); + } + + let attachment = upsert_core_block_attachment(&mut tx, &prepared, now).await?; + + insert_core_block_event( + &mut tx, + CoreBlockEventInput { + block_id: attachment.block_id, + attachment_id: Some(attachment.attachment_id), + tenant_id: prepared.tenant_id.as_str(), + project_id: prepared.project_id.as_str(), + actor_agent_id: prepared.agent_id.as_str(), + event_type: "attachment_added", + target_agent_id: Some(prepared.target_agent_id.as_str()), + read_profile: Some(prepared.read_profile.as_str()), + prev_snapshot: None, + new_snapshot: Some(attachment_snapshot(&attachment)), + reason: prepared.reason.as_str(), + ts: now, + }, + ) + .await?; + + tx.commit().await?; + + Ok(CoreBlockAttachResponse { + attachment_id: attachment.attachment_id, + block_id: attachment.block_id, + target_agent_id: attachment.agent_id, + read_profile: attachment.read_profile, + attached_by_agent_id: attachment.attached_by_agent_id, + attached_at: attachment.attached_at, + }) + } + + /// Detaches an active core block attachment and records an audit event. + pub async fn core_block_detach( + &self, + req: CoreBlockDetachRequest, + ) -> Result<CoreBlockDetachResponse> { + let prepared = prepare_detach_request(req)?; + let now = OffsetDateTime::now_utc(); + let mut tx = self.db.pool.begin().await?; + let Some(prev) = fetch_active_attachment_for_update(&mut tx, &prepared).await? else { + tx.commit().await?; + + return Ok(CoreBlockDetachResponse { + attachment_id: prepared.attachment_id, + detached: false, + }); + }; + let updated = detach_core_block_attachment(&mut tx, &prepared, now).await?; + + insert_core_block_event( + &mut tx, + CoreBlockEventInput { + block_id: updated.block_id, + attachment_id: Some(updated.attachment_id), + tenant_id: prepared.tenant_id.as_str(), + project_id: prepared.project_id.as_str(), + actor_agent_id: prepared.agent_id.as_str(), + event_type: "attachment_removed", + target_agent_id: Some(updated.agent_id.as_str()), + read_profile: Some(updated.read_profile.as_str()), + prev_snapshot: Some(attachment_snapshot(&prev)), + new_snapshot: Some(attachment_snapshot(&updated)), + reason: prepared.reason.as_str(), + ts: now, + }, + ) + .await?; + + tx.commit().await?; + + Ok(CoreBlockDetachResponse { attachment_id: updated.attachment_id, detached: true }) + } +} + +fn prepare_get_request(cfg: &Config, req: CoreBlocksGetRequest) -> Result<PreparedGetRequest> { + let tenant_id = normalize_required(req.tenant_id.as_str(), "tenant_id")?; + let project_id = normalize_required(req.project_id.as_str(), "project_id")?; + let agent_id = normalize_required(req.agent_id.as_str(), "agent_id")?; + let read_profile = normalize_required(req.read_profile.as_str(), "read_profile")?; + let allowed_scopes = search::resolve_read_profile_scopes(cfg, read_profile.as_str())?; + + Ok(PreparedGetRequest { tenant_id, project_id, agent_id, read_profile, allowed_scopes }) +} + +fn prepare_upsert_request( + cfg: &Config, + req: CoreBlockUpsertRequest, +) -> Result<PreparedUpsertRequest> { + let tenant_id = normalize_required(req.tenant_id.as_str(), "tenant_id")?; + let requested_project_id = normalize_required(req.project_id.as_str(), "project_id")?; + let agent_id = normalize_required(req.agent_id.as_str(), "agent_id")?; + let scope = normalize_required(req.scope.as_str(), "scope")?; + let key = normalize_required(req.key.as_str(), "key")?; + let title = normalize_required(req.title.as_str(), "title")?; + let content = normalize_required(req.content.as_str(), "content")?; + let reason = req + .reason + .as_deref() + .map(|value| normalize_required(value, "reason")) + .transpose()? + .unwrap_or_else(|| "core block upsert".to_string()); + let project_id = + if scope == "org_shared" { ORG_PROJECT_ID.to_string() } else { requested_project_id }; + + validate_write_scope(cfg, scope.as_str())?; + validate_english(key.as_str(), EnglishGateKind::Identifier, "$.key")?; + validate_english(title.as_str(), EnglishGateKind::NaturalLanguage, "$.title")?; + validate_english(content.as_str(), EnglishGateKind::NaturalLanguage, "$.content")?; + validate_source_ref(&req.source_ref)?; + + if content.chars().count() > MAX_CORE_BLOCK_CONTENT_CHARS { + return Err(Error::InvalidRequest { message: "content is too long.".to_string() }); + } + + Ok(PreparedUpsertRequest { + tenant_id, + project_id, + agent_id, + block_id: req.block_id, + scope, + key, + title, + content, + source_ref: req.source_ref, + reason, + }) +} + +fn prepare_attach_request( + cfg: &Config, + req: CoreBlockAttachRequest, +) -> Result<PreparedAttachRequest> { + let tenant_id = normalize_required(req.tenant_id.as_str(), "tenant_id")?; + let project_id = normalize_required(req.project_id.as_str(), "project_id")?; + let agent_id = normalize_required(req.agent_id.as_str(), "agent_id")?; + let target_agent_id = normalize_required(req.target_agent_id.as_str(), "target_agent_id")?; + let read_profile = normalize_required(req.read_profile.as_str(), "read_profile")?; + let allowed_scopes = search::resolve_read_profile_scopes(cfg, read_profile.as_str())?; + let reason = req + .reason + .as_deref() + .map(|value| normalize_required(value, "reason")) + .transpose()? + .unwrap_or_else(|| "core block attachment".to_string()); + + validate_english(target_agent_id.as_str(), EnglishGateKind::Identifier, "$.target_agent_id")?; + + Ok(PreparedAttachRequest { + tenant_id, + project_id, + agent_id, + block_id: req.block_id, + target_agent_id, + read_profile, + allowed_scopes, + reason, + }) +} + +fn prepare_detach_request(req: CoreBlockDetachRequest) -> Result<PreparedDetachRequest> { + let tenant_id = normalize_required(req.tenant_id.as_str(), "tenant_id")?; + let project_id = normalize_required(req.project_id.as_str(), "project_id")?; + let agent_id = normalize_required(req.agent_id.as_str(), "agent_id")?; + let reason = req + .reason + .as_deref() + .map(|value| normalize_required(value, "reason")) + .transpose()? + .unwrap_or_else(|| "core block detach".to_string()); + + Ok(PreparedDetachRequest { + tenant_id, + project_id, + agent_id, + attachment_id: req.attachment_id, + reason, + }) +} + +fn filter_visible_rows( + rows: Vec<CoreBlockJoinedRow>, + allowed_scopes: &[String], + shared_grants: &HashSet<access::SharedSpaceGrantKey>, +) -> Vec<CoreBlockJoinedRow> { + rows.into_iter() + .filter(|row| { + let block = CoreBlockRow { + block_id: row.block_id, + tenant_id: row.tenant_id.clone(), + project_id: row.project_id.clone(), + agent_id: row.agent_id.clone(), + scope: row.scope.clone(), + key: row.key.clone(), + title: row.title.clone(), + content: row.content.clone(), + source_ref: row.source_ref.clone(), + status: row.status.clone(), + created_at: row.created_at, + updated_at: row.updated_at, + }; + + block_read_allowed( + &block, + row.attachment_agent_id.as_str(), + allowed_scopes, + shared_grants, + ) + }) + .collect() +} + +fn block_read_allowed( + block: &CoreBlockRow, + requester_agent_id: &str, + allowed_scopes: &[String], + shared_grants: &HashSet<access::SharedSpaceGrantKey>, +) -> bool { + if block.status != "active" { + return false; + } + if !allowed_scopes.iter().any(|scope| scope == &block.scope) { + return false; + } + if block.scope == "agent_private" { + return block.agent_id == requester_agent_id; + } + if !matches!(block.scope.as_str(), "project_shared" | "org_shared") { + return false; + } + if block.agent_id == requester_agent_id { + return true; + } + + shared_grants.contains(&access::SharedSpaceGrantKey { + scope: block.scope.clone(), + space_owner_agent_id: block.agent_id.clone(), + }) +} + +fn block_snapshot(block: &CoreBlockRow) -> Value { + serde_json::json!({ + "block_id": block.block_id, + "tenant_id": block.tenant_id, + "project_id": block.project_id, + "agent_id": block.agent_id, + "scope": block.scope, + "key": block.key, + "title": block.title, + "content": block.content, + "source_ref": block.source_ref, + "status": block.status, + "created_at": block.created_at, + "updated_at": block.updated_at, + }) +} + +fn attachment_snapshot(attachment: &CoreBlockAttachmentRow) -> Value { + serde_json::json!({ + "attachment_id": attachment.attachment_id, + "block_id": attachment.block_id, + "tenant_id": attachment.tenant_id, + "project_id": attachment.project_id, + "agent_id": attachment.agent_id, + "read_profile": attachment.read_profile, + "attached_by_agent_id": attachment.attached_by_agent_id, + "attached_at": attachment.attached_at, + "detached_by_agent_id": attachment.detached_by_agent_id, + "detached_at": attachment.detached_at, + }) +} + +fn normalize_required(raw: &str, field: &str) -> Result<String> { + let trimmed = raw.trim(); + + if trimmed.is_empty() { + return Err(Error::InvalidRequest { message: format!("{field} is required.") }); + } + + Ok(trimmed.to_string()) +} + +fn validate_write_scope(cfg: &Config, scope: &str) -> Result<()> { + if !cfg.scopes.allowed.iter().any(|allowed| allowed == scope) { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + + let write_allowed = match scope { + "agent_private" => cfg.scopes.write_allowed.agent_private, + "project_shared" => cfg.scopes.write_allowed.project_shared, + "org_shared" => cfg.scopes.write_allowed.org_shared, + _ => false, + }; + + if !write_allowed { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + + Ok(()) +} + +fn validate_english(input: &str, kind: EnglishGateKind, field: &str) -> Result<()> { + english_gate::english_gate(input, kind) + .map_err(|_| Error::NonEnglishInput { field: field.to_string() }) +} + +fn validate_source_ref(source_ref: &Value) -> Result<()> { + if !source_ref.is_object() { + return Err(Error::InvalidRequest { + message: "source_ref must be a JSON object.".to_string(), + }); + } + + Ok(()) +} + +async fn insert_core_block( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedUpsertRequest, + now: OffsetDateTime, +) -> Result<CoreBlockRow> { + ensure_no_active_key_conflict(tx, req, None).await?; + + sqlx::query_as::<_, CoreBlockRow>( + "\ +INSERT INTO core_memory_blocks ( + block_id, + tenant_id, + project_id, + agent_id, + scope, + key, + title, + content, + source_ref, + status, + created_at, + updated_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, 'active', $10, $10) +RETURNING *", + ) + .bind(Uuid::new_v4()) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.agent_id.as_str()) + .bind(req.scope.as_str()) + .bind(req.key.as_str()) + .bind(req.title.as_str()) + .bind(req.content.as_str()) + .bind(&req.source_ref) + .bind(now) + .fetch_one(&mut **tx) + .await + .map_err(Into::into) +} + +async fn update_core_block( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedUpsertRequest, + block_id: Uuid, + now: OffsetDateTime, +) -> Result<(CoreBlockRow, Option<Value>)> { + let prev = fetch_owned_block_for_update(tx, req, block_id).await?; + let prev_snapshot = Some(block_snapshot(&prev)); + + ensure_no_active_key_conflict(tx, req, Some(block_id)).await?; + + let row = sqlx::query_as::<_, CoreBlockRow>( + "\ +UPDATE core_memory_blocks +SET + key = $6, + title = $7, + content = $8, + source_ref = $9, + updated_at = $10 +WHERE block_id = $1 + AND tenant_id = $2 + AND project_id = $3 + AND agent_id = $4 + AND scope = $5 + AND status = 'active' +RETURNING *", + ) + .bind(block_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.agent_id.as_str()) + .bind(req.scope.as_str()) + .bind(req.key.as_str()) + .bind(req.title.as_str()) + .bind(req.content.as_str()) + .bind(&req.source_ref) + .bind(now) + .fetch_optional(&mut **tx) + .await? + .ok_or_else(|| Error::NotFound { message: "Core block not found.".to_string() })?; + + Ok((row, prev_snapshot)) +} + +async fn fetch_owned_block_for_update( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedUpsertRequest, + block_id: Uuid, +) -> Result<CoreBlockRow> { + sqlx::query_as::<_, CoreBlockRow>( + "\ +SELECT * +FROM core_memory_blocks +WHERE block_id = $1 + AND tenant_id = $2 + AND project_id = $3 + AND agent_id = $4 + AND scope = $5 + AND status = 'active' +FOR UPDATE", + ) + .bind(block_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.agent_id.as_str()) + .bind(req.scope.as_str()) + .fetch_optional(&mut **tx) + .await? + .ok_or_else(|| Error::NotFound { message: "Core block not found.".to_string() }) +} + +async fn ensure_no_active_key_conflict( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedUpsertRequest, + block_id: Option<Uuid>, +) -> Result<()> { + let conflict: Option<Uuid> = sqlx::query_scalar( + "\ +SELECT block_id +FROM core_memory_blocks +WHERE tenant_id = $1 + AND project_id = $2 + AND agent_id = $3 + AND scope = $4 + AND key = $5 + AND status = 'active' + AND ($6::uuid IS NULL OR block_id <> $6) +LIMIT 1", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.agent_id.as_str()) + .bind(req.scope.as_str()) + .bind(req.key.as_str()) + .bind(block_id) + .fetch_optional(&mut **tx) + .await?; + + if conflict.is_some() { + return Err(Error::Conflict { message: "Core block key already exists.".to_string() }); + } + + Ok(()) +} + +async fn fetch_active_block_for_attachment( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedAttachRequest, +) -> Result<CoreBlockRow> { + sqlx::query_as::<_, CoreBlockRow>( + "\ +SELECT * +FROM core_memory_blocks +WHERE block_id = $1 + AND tenant_id = $2 + AND status = 'active' + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + )", + ) + .bind(req.block_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ORG_PROJECT_ID) + .fetch_optional(&mut **tx) + .await? + .ok_or_else(|| Error::NotFound { message: "Core block not found.".to_string() }) +} + +async fn upsert_core_block_attachment( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedAttachRequest, + now: OffsetDateTime, +) -> Result<CoreBlockAttachmentRow> { + sqlx::query_as::<_, CoreBlockAttachmentRow>( + "\ +INSERT INTO core_memory_block_attachments ( + attachment_id, + block_id, + tenant_id, + project_id, + agent_id, + read_profile, + attached_by_agent_id, + attached_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8) +ON CONFLICT (tenant_id, project_id, agent_id, read_profile, block_id) +WHERE detached_at IS NULL +DO UPDATE +SET + attached_by_agent_id = EXCLUDED.attached_by_agent_id, + attached_at = EXCLUDED.attached_at, + detached_by_agent_id = NULL, + detached_at = NULL +RETURNING *", + ) + .bind(Uuid::new_v4()) + .bind(req.block_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.target_agent_id.as_str()) + .bind(req.read_profile.as_str()) + .bind(req.agent_id.as_str()) + .bind(now) + .fetch_one(&mut **tx) + .await + .map_err(Into::into) +} + +async fn fetch_active_attachment_for_update( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedDetachRequest, +) -> Result<Option<CoreBlockAttachmentRow>> { + sqlx::query_as::<_, CoreBlockAttachmentRow>( + "\ +SELECT * +FROM core_memory_block_attachments +WHERE attachment_id = $1 + AND tenant_id = $2 + AND project_id = $3 + AND detached_at IS NULL +FOR UPDATE", + ) + .bind(req.attachment_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .fetch_optional(&mut **tx) + .await + .map_err(Into::into) +} + +async fn detach_core_block_attachment( + tx: &mut Transaction<'_, Postgres>, + req: &PreparedDetachRequest, + now: OffsetDateTime, +) -> Result<CoreBlockAttachmentRow> { + sqlx::query_as::<_, CoreBlockAttachmentRow>( + "\ +UPDATE core_memory_block_attachments +SET + detached_by_agent_id = $4, + detached_at = $5 +WHERE attachment_id = $1 + AND tenant_id = $2 + AND project_id = $3 + AND detached_at IS NULL +RETURNING *", + ) + .bind(req.attachment_id) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(req.agent_id.as_str()) + .bind(now) + .fetch_one(&mut **tx) + .await + .map_err(Into::into) +} + +async fn fetch_attached_block_rows<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + agent_id: &str, + read_profile: &str, +) -> Result<Vec<CoreBlockJoinedRow>> +where + E: PgExecutor<'e>, +{ + sqlx::query_as::<_, CoreBlockJoinedRow>( + "\ +SELECT + a.attachment_id, + a.agent_id AS attachment_agent_id, + a.attached_by_agent_id, + a.attached_at, + b.block_id, + b.tenant_id, + b.project_id, + b.agent_id, + b.scope, + b.key, + b.title, + b.content, + b.source_ref, + b.status, + b.created_at, + b.updated_at +FROM core_memory_block_attachments a +JOIN core_memory_blocks b ON b.block_id = a.block_id +WHERE a.tenant_id = $1 + AND a.project_id = $2 + AND a.agent_id = $3 + AND a.read_profile = $4 + AND a.detached_at IS NULL + AND b.status = 'active' +ORDER BY a.attached_at ASC, b.key ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(read_profile) + .fetch_all(executor) + .await + .map_err(Into::into) +} + +async fn fetch_audit_history<'e, E>( + executor: E, + block_ids: &[Uuid], +) -> Result<HashMap<Uuid, Vec<CoreBlockAuditEvent>>> +where + E: PgExecutor<'e>, +{ + if block_ids.is_empty() { + return Ok(HashMap::new()); + } + + let rows = sqlx::query_as::<_, CoreBlockEventRow>( + "\ +SELECT + event_id, + block_id, + attachment_id, + actor_agent_id, + event_type, + target_agent_id, + read_profile, + prev_snapshot, + new_snapshot, + reason, + ts +FROM core_memory_block_events +WHERE block_id = ANY($1) +ORDER BY ts ASC, event_id ASC", + ) + .bind(block_ids) + .fetch_all(executor) + .await?; + let mut by_block: HashMap<Uuid, Vec<CoreBlockAuditEvent>> = HashMap::new(); + + for row in rows { + by_block.entry(row.block_id).or_default().push(CoreBlockAuditEvent { + event_id: row.event_id, + block_id: row.block_id, + attachment_id: row.attachment_id, + actor_agent_id: row.actor_agent_id, + event_type: row.event_type, + target_agent_id: row.target_agent_id, + read_profile: row.read_profile, + prev_snapshot: row.prev_snapshot, + new_snapshot: row.new_snapshot, + reason: row.reason, + ts: row.ts, + }); + } + + Ok(by_block) +} + +async fn insert_core_block_event( + tx: &mut Transaction<'_, Postgres>, + event: CoreBlockEventInput<'_>, +) -> Result<()> { + sqlx::query( + "\ +INSERT INTO core_memory_block_events ( + event_id, + block_id, + attachment_id, + tenant_id, + project_id, + actor_agent_id, + event_type, + target_agent_id, + read_profile, + prev_snapshot, + new_snapshot, + reason, + ts +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)", + ) + .bind(Uuid::new_v4()) + .bind(event.block_id) + .bind(event.attachment_id) + .bind(event.tenant_id) + .bind(event.project_id) + .bind(event.actor_agent_id) + .bind(event.event_type) + .bind(event.target_agent_id) + .bind(event.read_profile) + .bind(event.prev_snapshot) + .bind(event.new_snapshot) + .bind(event.reason) + .bind(event.ts) + .execute(&mut **tx) + .await?; + + Ok(()) +} diff --git a/packages/elf-service/src/delete.rs b/packages/elf-service/src/delete.rs index 0f89b776..34b2fc7f 100644 --- a/packages/elf-service/src/delete.rs +++ b/packages/elf-service/src/delete.rs @@ -1,50 +1,67 @@ -// crates.io +//! Note deletion APIs. + +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use uuid::Uuid; -// self +use crate::{ElfService, Error, InsertVersionArgs, NoteOp, Result, access::ORG_PROJECT_ID}; use elf_storage::models::MemoryNote; -use crate::{ElfService, InsertVersionArgs, NoteOp, ServiceError, ServiceResult}; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Request payload for note deletion. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct DeleteRequest { + /// Tenant that owns the note. pub tenant_id: String, + /// Project that owns the note. pub project_id: String, + /// Agent requesting the deletion. pub agent_id: String, + /// Identifier of the note to delete. pub note_id: Uuid, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for note deletion. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct DeleteResponse { + /// Identifier of the affected note. pub note_id: Uuid, + /// Operation that was applied. pub op: NoteOp, } impl ElfService { - pub async fn delete(&self, req: DeleteRequest) -> ServiceResult<DeleteResponse> { + /// Soft-deletes one note when the caller owns it and the scope is writable. + pub async fn delete(&self, req: DeleteRequest) -> Result<DeleteResponse> { let now = OffsetDateTime::now_utc(); let tenant_id = req.tenant_id.trim(); let project_id = req.project_id.trim(); let agent_id = req.agent_id.trim(); + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { - return Err(ServiceError::InvalidRequest { + return Err(Error::InvalidRequest { message: "tenant_id, project_id, and agent_id are required.".to_string(), }); } + let mut tx = self.db.pool.begin().await?; - let mut note: MemoryNote = sqlx::query_as( - "SELECT * FROM memory_notes \ - WHERE note_id = $1 AND tenant_id = $2 AND project_id = $3 AND agent_id = $4 \ - FOR UPDATE", + let mut note: MemoryNote = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 AND tenant_id = $2 AND project_id IN ($3, $4) +FOR UPDATE", ) .bind(req.note_id) .bind(tenant_id) .bind(project_id) - .bind(agent_id) + .bind(ORG_PROJECT_ID) .fetch_optional(&mut *tx) .await? - .ok_or_else(|| ServiceError::InvalidRequest { message: "Note not found.".to_string() })?; + .ok_or_else(|| Error::InvalidRequest { message: "Note not found.".to_string() })?; + + if note.agent_id != agent_id { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } let scope_allowed = self.cfg.scopes.allowed.iter().any(|scope| scope == ¬e.scope); let write_allowed = match note.scope.as_str() { @@ -53,40 +70,41 @@ impl ElfService { "org_shared" => self.cfg.scopes.write_allowed.org_shared, _ => false, }; + if !scope_allowed || !write_allowed { - return Err(ServiceError::ScopeDenied { message: "Scope is not allowed.".to_string() }); + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); } - if note.status == "deleted" { tx.commit().await?; + return Ok(DeleteResponse { note_id: note.note_id, op: NoteOp::None }); } let prev_snapshot = crate::note_snapshot(¬e); + note.status = "deleted".to_string(); note.updated_at = now; sqlx::query("UPDATE memory_notes SET status = $1, updated_at = $2 WHERE note_id = $3") - .bind(¬e.status) + .bind(note.status.as_str()) .bind(note.updated_at) .bind(note.note_id) .execute(&mut *tx) .await?; - crate::insert_version( - &mut tx, + &mut *tx, InsertVersionArgs { note_id: note.note_id, op: "DELETE", prev_snapshot: Some(prev_snapshot), new_snapshot: Some(crate::note_snapshot(¬e)), reason: "delete", - actor: "delete", + actor: agent_id, ts: now, }, ) .await?; - crate::enqueue_outbox_tx(&mut tx, note.note_id, "DELETE", ¬e.embedding_version, now) + crate::enqueue_outbox_tx(&mut *tx, note.note_id, "DELETE", ¬e.embedding_version, now) .await?; tx.commit().await?; diff --git a/packages/elf-service/src/docs.rs b/packages/elf-service/src/docs.rs new file mode 100644 index 00000000..ec9b652b --- /dev/null +++ b/packages/elf-service/src/docs.rs @@ -0,0 +1,3198 @@ +//! Document ingestion and retrieval APIs. + +use std::{ + collections::{HashMap, HashSet}, + slice, +}; + +use qdrant_client::{ + Qdrant, + qdrant::{ + Condition, DatetimeRange, Document, Filter, Fusion, MinShould, PrefetchQueryBuilder, Query, + QueryPointsBuilder, ScoredPoint, Timestamp, point_id::PointIdOptions, + }, +}; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use sqlx::{FromRow, PgExecutor, PgPool}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use tokenizers::Tokenizer; +use uuid::Uuid; + +use crate::{ + ElfService, Error, Result, + access::{self, ORG_PROJECT_ID, SharedSpaceGrantKey}, + search, +}; +use elf_config::Config; +use elf_domain::{ + english_gate, + writegate::{self, WritePolicy, WritePolicyAudit}, +}; +use elf_storage::{ + doc_outbox, docs, + models::{DocChunk, DocDocument}, + qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}, +}; + +const MAX_TOP_K: u32 = 32; +const MAX_CANDIDATE_K: u32 = 1_024; +const DEFAULT_DOC_MAX_BYTES: usize = 4 * 1_024 * 1_024; +const DEFAULT_MAX_CHUNKS_PER_DOC: usize = 4_096; +const DEFAULT_L0_MAX_BYTES: usize = 256; +const DEFAULT_L1_MAX_BYTES: usize = 8 * 1_024; +const DEFAULT_L2_MAX_BYTES: usize = 32 * 1_024; +const DOC_RETRIEVAL_TRAJECTORY_SCHEMA_V1: &str = "doc_retrieval_trajectory/v1"; +const DOC_SOURCE_REF_SCHEMA_V1: &str = "source_ref/v1"; +const DOC_SOURCE_REF_RESOLVER_V1: &str = "elf_doc_ext/v1"; +const DOC_STATUSES: [&str; 2] = ["active", "deleted"]; + +/// Document classification used for persistence and retrieval filters. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum DocType { + /// Long-lived knowledge-base material. + Knowledge, + /// Chat transcripts or conversational context. + Chat, + /// Search-produced reference material. + Search, + /// Development-oriented artifacts such as code or plans. + Dev, +} +impl DocType { + /// Returns the canonical storage and API string for this document type. + pub fn as_str(self) -> &'static str { + match self { + Self::Knowledge => "knowledge", + Self::Chat => "chat", + Self::Search => "search", + Self::Dev => "dev", + } + } + + /// Parses a canonical document-type string. + pub fn parse(raw_doc_type: &str) -> Result<Self> { + match raw_doc_type { + "knowledge" => Ok(Self::Knowledge), + "chat" => Ok(Self::Chat), + "search" => Ok(Self::Search), + "dev" => Ok(Self::Dev), + _ => Err(Error::InvalidRequest { + message: "doc_type must be one of: knowledge, chat, search, dev.".to_string(), + }), + } + } +} + +/// Request payload for document ingestion. +#[derive(Clone, Debug, Deserialize)] +pub struct DocsPutRequest { + /// Tenant that owns the document. + pub tenant_id: String, + /// Project that owns the document. + pub project_id: String, + /// Agent ingesting the document. + pub agent_id: String, + /// Scope to assign to the document. + pub scope: String, + /// Optional raw document-type string. + pub doc_type: Option<String>, + /// Optional display title for the document. + pub title: Option<String>, + /// Optional write policy applied before persistence. + pub write_policy: Option<WritePolicy>, + #[serde(default)] + /// Structured provenance metadata for the document. + pub source_ref: Value, + /// Full document body to store and chunk. + pub content: String, +} + +/// Response payload for document ingestion. +#[derive(Clone, Debug, Serialize)] +pub struct DocsPutResponse { + /// Identifier of the stored document. + pub doc_id: Uuid, + /// Number of persisted chunks generated from the content. + pub chunk_count: u32, + /// Byte length of the stored content. + pub content_bytes: u32, + /// Whole-document BLAKE3 hash. + pub content_hash: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Write-policy audit emitted for the stored document, when applicable. + pub write_policy_audit: Option<WritePolicyAudit>, +} + +/// Request payload for document metadata lookup. +#[derive(Clone, Debug, Deserialize)] +pub struct DocsGetRequest { + /// Tenant that owns the document. + pub tenant_id: String, + /// Project that owns the document. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Read profile that determines visible scopes. + pub read_profile: String, + /// Identifier of the document to fetch. + pub doc_id: Uuid, +} + +/// Response payload for document metadata lookup. +#[derive(Clone, Debug, Serialize)] +pub struct DocsGetResponse { + /// Document identifier. + pub doc_id: Uuid, + /// Tenant that owns the document. + pub tenant_id: String, + /// Project that owns the document. + pub project_id: String, + /// Agent that ingested the document. + pub agent_id: String, + /// Scope key for the document. + pub scope: String, + /// Stored document type. + pub doc_type: String, + /// Lifecycle status for the document. + pub status: String, + /// Optional document title. + pub title: Option<String>, + /// Structured provenance metadata. + pub source_ref: Value, + /// Byte length of the stored content. + pub content_bytes: u32, + /// Whole-document BLAKE3 hash. + pub content_hash: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Request payload for L0 document retrieval. +#[derive(Clone, Debug, Deserialize)] +pub struct DocsSearchL0Request { + /// Tenant to search within. + pub tenant_id: String, + /// Project to search within. + pub project_id: String, + /// Agent used for access-control checks. + pub caller_agent_id: String, + /// Read profile that determines visible scopes. + pub read_profile: String, + /// Search query text. + pub query: String, + /// Optional scope filter. + pub scope: Option<String>, + /// Optional status filter. + pub status: Option<String>, + /// Optional document-type filter. + pub doc_type: Option<String>, + /// Sparse-retrieval mode override. + pub sparse_mode: Option<String>, + /// Optional domain filter from source metadata. + pub domain: Option<String>, + /// Optional repository filter from source metadata. + pub repo: Option<String>, + /// Optional agent filter. + pub agent_id: Option<String>, + /// Optional thread filter. + pub thread_id: Option<String>, + /// Optional lower bound for `updated_at`. + pub updated_after: Option<String>, + /// Optional upper bound for `updated_at`. + pub updated_before: Option<String>, + /// Optional lower bound for source timestamp metadata. + pub ts_gte: Option<String>, + /// Optional upper bound for source timestamp metadata. + pub ts_lte: Option<String>, + /// Maximum number of returned items. + pub top_k: Option<u32>, + /// Retrieval breadth before deduplication and projection. + pub candidate_k: Option<u32>, + /// When true, includes retrieval trajectory output. + pub explain: Option<bool>, +} + +/// One chunk-level hit returned by `docs_search_l0`. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0Item { + /// Document identifier. + pub doc_id: Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, + /// Stable pointer bundle for later excerpt or resolution workflows. + pub pointer: DocsSearchL0ItemPointer, + /// Final score after retrieval and boosting. + pub score: f32, + /// Returned snippet text. + pub snippet: String, + /// Scope key for the document. + pub scope: String, + /// Stored document type. + pub doc_type: String, + /// Project that owns the document. + pub project_id: String, + /// Agent that ingested the document. + pub agent_id: String, + /// Last update timestamp for the document. + pub updated_at: OffsetDateTime, + /// Whole-document BLAKE3 hash. + pub content_hash: String, + /// Chunk-level BLAKE3 hash. + pub chunk_hash: String, +} + +/// Response payload for `docs_search_l0`. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0Response { + /// Retrieval trace identifier. + pub trace_id: Uuid, + /// Returned chunk hits. + pub items: Vec<DocsSearchL0Item>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional retrieval trajectory emitted in explain mode. + pub trajectory: Option<DocRetrievalTrajectory>, +} + +/// Stable pointer for a chunk hit returned by document search. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0ItemPointer { + /// Pointer schema identifier. + pub schema: String, + /// Pointer resolver identifier. + pub resolver: String, + #[serde(rename = "ref")] + /// Logical identifiers used by the resolver. + pub reference: DocsSearchL0ItemReference, + /// Freshness guard for the pointer target. + pub state: DocsSearchL0ItemState, +} + +/// Logical identifiers for a document-search hit. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0ItemReference { + /// Document identifier. + pub doc_id: Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, +} + +/// Freshness guard for a document-search hit. +#[derive(Clone, Debug, Serialize)] +pub struct DocsSearchL0ItemState { + /// Whole-document BLAKE3 hash. + pub content_hash: String, + /// Chunk-level BLAKE3 hash. + pub chunk_hash: String, + #[serde(with = "crate::time_serde")] + /// Last update timestamp for the document. + pub doc_updated_at: OffsetDateTime, +} + +/// Explain payload for a document retrieval run. +#[derive(Clone, Debug, Serialize)] +pub struct DocRetrievalTrajectory { + /// Trajectory schema identifier. + pub schema: String, + /// Ordered retrieval stages. + pub stages: Vec<DocRetrievalTrajectoryStage>, +} + +/// One stage in a document retrieval trajectory. +#[derive(Clone, Debug, Serialize)] +pub struct DocRetrievalTrajectoryStage { + /// Zero-based stage order. + pub stage_order: u32, + /// Stable stage name. + pub stage_name: String, + /// Free-form stage statistics. + pub stats: Value, +} + +/// Quote-based selector for excerpt extraction. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TextQuoteSelector { + /// Exact quote text to resolve. + pub exact: String, + /// Optional leading context used to disambiguate repeated quotes. + pub prefix: Option<String>, + /// Optional trailing context used to disambiguate repeated quotes. + pub suffix: Option<String>, +} + +/// Byte-position selector for excerpt extraction. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TextPositionSelector { + /// Inclusive start byte offset. + pub start: usize, + /// Exclusive end byte offset. + pub end: usize, +} + +/// Request payload for excerpt retrieval. +#[derive(Clone, Debug, Deserialize)] +pub struct DocsExcerptsGetRequest { + /// Tenant that owns the document. + pub tenant_id: String, + /// Project that owns the document. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Read profile that determines visible scopes. + pub read_profile: String, + /// Identifier of the source document. + pub doc_id: Uuid, + /// Excerpt budget level: `L0`, `L1`, or `L2`. + pub level: String, // "L0" | "L1" | "L2" + /// Optional chunk identifier when the caller already knows the chunk. + pub chunk_id: Option<Uuid>, + /// Optional quote-based selector. + pub quote: Option<TextQuoteSelector>, + /// Optional byte-position selector. + pub position: Option<TextPositionSelector>, + /// When true, includes retrieval trajectory output. + pub explain: Option<bool>, +} + +/// Verification metadata for one extracted excerpt. +#[derive(Clone, Debug, Serialize)] +pub struct DocsExcerptVerification { + /// Whether the excerpt selectors verified against current content. + pub verified: bool, + /// Verification failure codes. + pub verification_errors: Vec<String>, + /// Whole-document BLAKE3 hash. + pub content_hash: String, + /// BLAKE3 hash of the returned excerpt. + pub excerpt_hash: String, +} + +/// Response payload for excerpt retrieval. +#[derive(Clone, Debug, Serialize)] +pub struct DocsExcerptResponse { + /// Excerpt trace identifier. + pub trace_id: Uuid, + /// Identifier of the source document. + pub doc_id: Uuid, + /// Returned excerpt text. + pub excerpt: String, + /// Inclusive start offset of the returned window. + pub start_offset: usize, + /// Exclusive end offset of the returned window. + pub end_offset: usize, + /// Concrete selector resolution result. + pub locator: DocsExcerptLocator, + /// Verification metadata for the returned excerpt. + pub verification: DocsExcerptVerification, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional retrieval trajectory emitted in explain mode. + pub trajectory: Option<DocRetrievalTrajectory>, +} + +/// Selector resolution metadata for an excerpt. +#[derive(Clone, Debug, Serialize)] +pub struct DocsExcerptLocator { + /// Selector kind that produced the match. + pub selector_kind: String, + /// Inclusive start offset of the matched selector span. + pub match_start_offset: usize, + /// Exclusive end offset of the matched selector span. + pub match_end_offset: usize, + #[serde(skip_serializing_if = "Option::is_none")] + /// Matched chunk identifier, when known. + pub chunk_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Quote selector actually used for resolution. + pub quote: Option<TextQuoteSelector>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Position selector actually used for resolution. + pub position: Option<TextPositionSelector>, +} + +#[derive(Clone, Copy)] +struct DocExcerptMatch { + selector_kind: ExcerptsSelectorKind, + match_start_offset: usize, + match_end_offset: usize, +} + +struct DocExcerptRange { + selector_kind: ExcerptsSelectorKind, + match_start_offset: usize, + match_end_offset: usize, + start_offset: usize, + end_offset: usize, +} + +struct DocTrajectoryBuilder { + explain: bool, + stages: Vec<DocRetrievalTrajectoryStage>, + stage_order: u32, +} +impl DocTrajectoryBuilder { + fn new(explain: bool) -> Self { + Self { explain, stages: Vec::new(), stage_order: 0 } + } + + fn push(&mut self, stage_name: &str, stats: Value) { + if !self.explain { + return; + } + + self.stages.push(DocRetrievalTrajectoryStage { + stage_order: self.stage_order, + stage_name: stage_name.to_string(), + stats, + }); + + self.stage_order += 1; + } + + fn into_trajectory(self) -> Option<DocRetrievalTrajectory> { + if !self.explain { + return None; + } + + Some(DocRetrievalTrajectory { + schema: DOC_RETRIEVAL_TRAJECTORY_SCHEMA_V1.to_string(), + stages: self.stages, + }) + } +} + +#[derive(Clone, Debug)] +struct DocsSearchL0Filters { + scope: Option<String>, + status: String, + doc_type: Option<DocType>, + sparse_mode: DocsSparseMode, + domain: Option<String>, + repo: Option<String>, + agent_id: Option<String>, + thread_id: Option<String>, + updated_after: Option<OffsetDateTime>, + updated_before: Option<OffsetDateTime>, + ts_gte: Option<OffsetDateTime>, + ts_lte: Option<OffsetDateTime>, +} + +#[derive(Clone, Copy, Debug)] +struct DocChunkingProfile { + max_tokens: usize, + overlap_tokens: usize, + max_chunks: usize, +} + +#[derive(Clone, Debug)] +struct ByteChunk { + chunk_id: Uuid, + start_offset: usize, + end_offset: usize, + text: String, +} + +#[derive(Debug)] +struct ValidatedDocsPut { + doc_type: DocType, + content: String, + write_policy_audit: Option<WritePolicyAudit>, +} + +#[derive(Clone, Debug, FromRow)] +struct DocSearchRow { + chunk_id: Uuid, + doc_id: Uuid, + scope: String, + doc_type: String, + project_id: String, + agent_id: String, + updated_at: OffsetDateTime, + content_hash: String, + chunk_hash: String, + chunk_text: String, +} + +struct DocsSearchL0Prepared { + top_k: u32, + candidate_k: u32, + sparse_mode: DocsSparseMode, + sparse_enabled: bool, + now: OffsetDateTime, + trajectory: DocTrajectoryBuilder, + allowed_scopes: Vec<String>, + shared_grants: HashSet<SharedSpaceGrantKey>, + filter: Filter, + vector: Vec<f32>, + status: String, +} + +#[derive(Debug)] +struct DocsSearchL0FiltersParsed { + scope: Option<String>, + status: String, + doc_type: Option<DocType>, + sparse_mode: DocsSparseMode, + domain: Option<String>, + repo: Option<String>, + agent_id: Option<String>, + thread_id: Option<String>, +} + +#[derive(Debug)] +struct DocsSearchL0RangesParsed { + updated_after: Option<OffsetDateTime>, + updated_before: Option<OffsetDateTime>, + ts_gte: Option<OffsetDateTime>, + ts_lte: Option<OffsetDateTime>, +} + +impl ElfService { + /// Validates, chunks, stores, and enqueues a document for indexing. + pub async fn docs_put(&self, req: DocsPutRequest) -> Result<DocsPutResponse> { + let ValidatedDocsPut { doc_type, content, write_policy_audit } = validate_docs_put(&req)?; + let now = OffsetDateTime::now_utc(); + let embed_version = crate::embedding_version(&self.cfg); + let DocsPutRequest { tenant_id, project_id, agent_id, scope, title, source_ref, .. } = req; + let chunking_profile = resolve_doc_chunking_profile(doc_type); + let tokenizer = load_tokenizer(&self.cfg)?; + let effective_project_id = + if scope.trim() == "org_shared" { ORG_PROJECT_ID } else { project_id.as_str() }; + let content_bytes = content.len(); + let content_hash = blake3::hash(content.as_bytes()); + let doc_id = Uuid::new_v4(); + let chunks = split_tokens_by_offsets( + content.as_str(), + chunking_profile.max_tokens, + chunking_profile.overlap_tokens, + chunking_profile.max_chunks, + &tokenizer, + )?; + let doc_row = DocDocument { + doc_id, + tenant_id: tenant_id.clone(), + project_id: effective_project_id.to_string(), + agent_id: agent_id.clone(), + scope: scope.clone(), + doc_type: doc_type.as_str().to_string(), + status: "active".to_string(), + title, + source_ref: docs::normalize_source_ref(Some(source_ref)), + content, + content_bytes: content_bytes as i32, + content_hash: content_hash.to_hex().to_string(), + created_at: now, + updated_at: now, + }; + let mut tx = self.db.pool.begin().await?; + + docs::insert_doc_document(&mut *tx, &doc_row).await?; + + for (chunk_index, chunk) in chunks.iter().enumerate() { + let chunk_hash = blake3::hash(chunk.text.as_bytes()); + let chunk_row = DocChunk { + chunk_id: chunk.chunk_id, + doc_id, + chunk_index: chunk_index as i32, + start_offset: chunk.start_offset as i32, + end_offset: chunk.end_offset as i32, + chunk_text: chunk.text.clone(), + chunk_hash: chunk_hash.to_hex().to_string(), + created_at: now, + }; + + docs::insert_doc_chunk(&mut *tx, &chunk_row).await?; + doc_outbox::enqueue_doc_outbox( + &mut *tx, + doc_id, + chunk_row.chunk_id, + "UPSERT", + embed_version.as_str(), + ) + .await?; + } + + if scope.trim() != "agent_private" { + access::ensure_active_project_scope_grant( + &mut *tx, + tenant_id.as_str(), + effective_project_id, + scope.as_str(), + agent_id.as_str(), + ) + .await?; + } + + tx.commit().await?; + + Ok(DocsPutResponse { + doc_id, + chunk_count: chunks.len() as u32, + content_bytes: content_bytes as u32, + content_hash: content_hash.to_hex().to_string(), + write_policy_audit, + }) + } + + /// Loads document metadata when the caller can read the requested scope. + pub async fn docs_get(&self, req: DocsGetRequest) -> Result<DocsGetResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + let read_profile = req.read_profile.trim(); + + if tenant_id.is_empty() + || project_id.is_empty() + || agent_id.is_empty() + || read_profile.is_empty() + { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, agent_id, and read_profile are required." + .to_string(), + }); + } + + let allowed_scopes = search::resolve_read_profile_scopes(&self.cfg, read_profile)?; + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope == "org_shared"); + let row: Option<DocDocument> = sqlx::query_as::<_, DocDocument>( + "\ +SELECT + doc_id, + tenant_id, + project_id, + agent_id, + scope, + doc_type, + status, + title, + COALESCE(source_ref, '{}'::jsonb) AS source_ref, + content, + content_bytes, + content_hash, + created_at, + updated_at +FROM doc_documents +WHERE doc_id = $1 + AND tenant_id = $2 + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + ) +LIMIT 1", + ) + .bind(req.doc_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(&self.db.pool) + .await?; + let Some(row) = row else { + return Err(Error::NotFound { message: "Doc not found.".to_string() }); + }; + let shared_grants = if row.scope == "agent_private" { + HashSet::new() + } else { + access::load_shared_read_grants_with_org_shared( + &self.db.pool, + tenant_id, + project_id, + agent_id, + org_shared_allowed, + ) + .await? + }; + + if row.status != "active" + || !doc_read_allowed( + agent_id, + &allowed_scopes, + &shared_grants, + row.agent_id.as_str(), + row.scope.as_str(), + ) { + return Err(Error::NotFound { message: "Doc not found.".to_string() }); + } + + Ok(DocsGetResponse { + doc_id: row.doc_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + scope: row.scope, + doc_type: row.doc_type, + status: row.status, + title: row.title, + source_ref: row.source_ref, + content_bytes: row.content_bytes.max(0) as u32, + content_hash: row.content_hash, + created_at: row.created_at, + updated_at: row.updated_at, + }) + } + + /// Runs L0 document retrieval with access filtering and optional explain output. + pub async fn docs_search_l0(&self, req: DocsSearchL0Request) -> Result<DocsSearchL0Response> { + let trace_id = Uuid::new_v4(); + let filters = validate_docs_search_l0(&req)?; + let mut prepared = self.prepare_docs_search_l0_request(&req, &filters).await?; + let scored = run_doc_fusion_query( + &self.qdrant.client, + self.cfg.storage.qdrant.docs_collection.as_str(), + req.query.as_str(), + &prepared.vector, + &prepared.filter, + prepared.sparse_mode, + prepared.candidate_k, + ) + .await?; + + self.record_docs_search_l0_vector_stats( + &mut prepared.trajectory, + &scored, + prepared.sparse_enabled, + prepared.sparse_mode, + ); + + let scored_chunks = + docs_search_l0_deduplicated_chunks(&scored, prepared.candidate_k as usize)?; + let chunk_ids: Vec<Uuid> = scored_chunks.iter().map(|(chunk_id, _)| *chunk_id).collect(); + let rows = self + .load_doc_search_rows(&req, &prepared.status, &chunk_ids, &mut prepared.trajectory) + .await?; + let mut items = self.build_docs_search_l0_items( + &req, + &scored_chunks, + &rows, + &prepared.allowed_scopes, + &prepared.shared_grants, + &mut prepared.trajectory, + ); + + apply_doc_recency_boost( + &mut items, + prepared.now, + self.cfg.ranking.recency_tau_days, + self.cfg.ranking.tie_breaker_weight, + ); + + items.sort_by(|a, b| b.score.total_cmp(&a.score)); + items.truncate(prepared.top_k as usize); + + record_result_projection_stage( + &mut prepared.trajectory, + rows.len(), + items.len(), + self.cfg.ranking.recency_tau_days, + self.cfg.ranking.tie_breaker_weight, + ); + + Ok(DocsSearchL0Response { + trace_id, + items, + trajectory: prepared.trajectory.into_trajectory(), + }) + } + + async fn load_doc_search_rows( + &self, + req: &DocsSearchL0Request, + status: &str, + chunk_ids: &[Uuid], + trajectory: &mut DocTrajectoryBuilder, + ) -> Result<HashMap<Uuid, DocSearchRow>> { + let rows = load_doc_search_rows( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + status, + chunk_ids, + ) + .await?; + + trajectory.push( + "chunk_lookup", + serde_json::json!({ + "requested_chunks": chunk_ids.len(), + "loaded_chunks": rows.len(), + }), + ); + + Ok(rows) + } + + fn build_docs_search_l0_items( + &self, + req: &DocsSearchL0Request, + scored_chunks: &[(Uuid, f32)], + rows: &HashMap<Uuid, DocSearchRow>, + allowed_scopes: &[String], + shared_grants: &HashSet<SharedSpaceGrantKey>, + trajectory: &mut DocTrajectoryBuilder, + ) -> Vec<DocsSearchL0Item> { + let items = docs_search_l0_project_items( + scored_chunks, + rows, + req.caller_agent_id.as_str(), + allowed_scopes, + shared_grants, + ); + + trajectory.push( + "dedupe", + serde_json::json!({ + "raw_candidates": scored_chunks.len(), + "deduped_candidates": items.len(), + }), + ); + + items + } + + async fn prepare_docs_search_l0_request( + &self, + req: &DocsSearchL0Request, + filters: &DocsSearchL0Filters, + ) -> Result<DocsSearchL0Prepared> { + let explain = req.explain.unwrap_or(false); + let top_k = req.top_k.unwrap_or(12).min(MAX_TOP_K); + let candidate_k = req.candidate_k.unwrap_or(60).min(MAX_CANDIDATE_K); + let sparse_mode = filters.sparse_mode; + let sparse_enabled = docs_search_sparse_enabled(sparse_mode, req.query.as_str()); + let now = OffsetDateTime::now_utc(); + let mut trajectory = DocTrajectoryBuilder::new(explain); + + trajectory.push( + "request_validation", + serde_json::json!({ + "query_len": req.query.len(), + "top_k": top_k, + "candidate_k": candidate_k, + "sparse_mode": sparse_mode.as_str(), + "doc_type": filters + .doc_type + .as_ref() + .map(|doc_type| doc_type.as_str()) + .unwrap_or("<default>"), + "status": &filters.status, + }), + ); + + let allowed_scopes = + search::resolve_read_profile_scopes(&self.cfg, req.read_profile.as_str())?; + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope == "org_shared"); + let shared_grants = access::load_shared_read_grants_with_org_shared( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.caller_agent_id.as_str(), + org_shared_allowed, + ) + .await?; + let filter = build_doc_search_filter( + req.tenant_id.as_str(), + req.project_id.as_str(), + req.caller_agent_id.as_str(), + &allowed_scopes, + filters, + ); + let embedded = self + .providers + .embedding + .embed(&self.cfg.providers.embedding, slice::from_ref(&req.query)) + .await?; + + trajectory.push("query_embedding", serde_json::json!({ "provider": "embedding" })); + + let vector = embedded.first().ok_or_else(|| Error::Provider { + message: "Embedding provider returned no vectors.".to_string(), + })?; + + trajectory.push( + "vector_dimension_check", + serde_json::json!({ + "provided_dim": vector.len(), + "expected_dim": self.cfg.storage.qdrant.vector_dim as usize, + }), + ); + + if vector.len() != self.cfg.storage.qdrant.vector_dim as usize { + return Err(Error::Provider { + message: "Embedding vector dimension mismatch.".to_string(), + }); + } + + Ok(DocsSearchL0Prepared { + top_k, + candidate_k, + sparse_mode, + sparse_enabled, + now, + trajectory, + allowed_scopes, + shared_grants, + filter, + vector: vector.to_vec(), + status: filters.status.clone(), + }) + } + + fn record_docs_search_l0_vector_stats( + &self, + trajectory: &mut DocTrajectoryBuilder, + scored: &[ScoredPoint], + sparse_enabled: bool, + sparse_mode: DocsSparseMode, + ) { + let channels = if sparse_enabled { vec!["dense", "sparse"] } else { vec!["dense"] }; + + trajectory.push( + "vector_search", + serde_json::json!({ + "raw_points": scored.len(), + "sparse_mode": sparse_mode.as_str(), + "channels": channels, + }), + ); + } + + /// Resolves and verifies an excerpt window from quote, position, or chunk selectors. + pub async fn docs_excerpts_get( + &self, + req: DocsExcerptsGetRequest, + ) -> Result<DocsExcerptResponse> { + let explain = req.explain.unwrap_or(false); + let trace_id = Uuid::new_v4(); + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + let read_profile = req.read_profile.trim(); + let mut trajectory = DocTrajectoryBuilder::new(explain); + + trajectory.push( + "request_validation", + serde_json::json!({ + "doc_id": req.doc_id, + "read_profile": read_profile, + }), + ); + + validate_docs_excerpts_get( + tenant_id, + project_id, + agent_id, + read_profile, + req.quote.as_ref(), + )?; + + let doc = load_docs_excerpt_context( + &self.cfg, + &self.db.pool, + tenant_id, + project_id, + agent_id, + read_profile, + req.doc_id, + ) + .await?; + let level_max = excerpt_level_max(req.level.as_str())?; + + trajectory.push( + "level_selection", + serde_json::json!({ + "level": req.level, + "max_bytes": level_max, + }), + ); + + let mut verified = true; + let mut verification_errors = Vec::new(); + let DocExcerptRange { + selector_kind, + match_start_offset, + match_end_offset, + start_offset, + end_offset, + } = docs_excerpts_resolve_windowed_match( + &self.db.pool, + &doc, + &req, + level_max, + &mut trajectory, + &mut verified, + &mut verification_errors, + ) + .await?; + let excerpt = doc.content.get(start_offset..end_offset).unwrap_or("").to_string(); + + if excerpt.is_empty() { + verified = false; + + verification_errors.push("EMPTY_EXCERPT".to_string()); + } + + let excerpt_hash = blake3::hash(excerpt.as_bytes()).to_hex().to_string(); + + trajectory.push( + "verification", + serde_json::json!({ + "verified": verified, + "error_count": verification_errors.len(), + }), + ); + + Ok(DocsExcerptResponse { + trace_id, + doc_id: doc.doc_id, + excerpt, + start_offset, + end_offset, + locator: docs_excerpt_locator( + &req, + &selector_kind, + match_start_offset, + match_end_offset, + ), + verification: DocsExcerptVerification { + verified, + verification_errors, + content_hash: doc.content_hash.clone(), + excerpt_hash, + }, + trajectory: trajectory.into_trajectory(), + }) + } +} + +#[derive(Clone, Copy, Debug)] +enum DocsSparseMode { + Auto, + On, + Off, +} +impl DocsSparseMode { + fn as_str(self) -> &'static str { + match self { + Self::Auto => "auto", + Self::On => "on", + Self::Off => "off", + } + } +} + +#[derive(Clone, Copy)] +enum ExcerptsSelectorKind { + ChunkId, + Quote, + Position, +} +impl ExcerptsSelectorKind { + fn as_str(&self) -> &'static str { + match self { + Self::ChunkId => "chunk_id", + Self::Quote => "quote", + Self::Position => "position", + } + } +} + +fn docs_search_l0_deduplicated_chunks( + scored: &[ScoredPoint], + candidate_k: usize, +) -> Result<Vec<(Uuid, f32)>> { + let mut seen = HashSet::new(); + let mut chunks = Vec::new(); + + for point in scored.iter().take(candidate_k) { + let chunk_id = parse_scored_point_uuid_id(point)?; + + if seen.insert(chunk_id) { + chunks.push((chunk_id, point.score)); + } + } + + Ok(chunks) +} + +fn docs_search_l0_project_items( + scored_chunks: &[(Uuid, f32)], + rows: &HashMap<Uuid, DocSearchRow>, + caller_agent_id: &str, + allowed_scopes: &[String], + shared_grants: &HashSet<SharedSpaceGrantKey>, +) -> Vec<DocsSearchL0Item> { + let mut items = Vec::with_capacity(scored_chunks.len()); + + for (chunk_id, score) in scored_chunks { + let Some(row) = rows.get(chunk_id) else { continue }; + + if !doc_read_allowed( + caller_agent_id, + allowed_scopes, + shared_grants, + row.agent_id.as_str(), + row.scope.as_str(), + ) { + continue; + } + + items.push(DocsSearchL0Item { + doc_id: row.doc_id, + chunk_id: *chunk_id, + pointer: build_docs_l0_pointer(row, *chunk_id), + score: *score, + snippet: truncate_bytes(row.chunk_text.as_str(), DEFAULT_L0_MAX_BYTES), + scope: row.scope.clone(), + doc_type: row.doc_type.clone(), + project_id: row.project_id.clone(), + agent_id: row.agent_id.clone(), + updated_at: row.updated_at, + content_hash: row.content_hash.clone(), + chunk_hash: row.chunk_hash.clone(), + }); + } + + items +} + +fn apply_doc_recency_boost( + items: &mut [DocsSearchL0Item], + now: OffsetDateTime, + recency_tau_days: f32, + tie_breaker_weight: f32, +) { + if tie_breaker_weight <= 0.0 || items.is_empty() { + return; + } + + for item in items.iter_mut() { + let age_days = ((now - item.updated_at).as_seconds_f32() / 86_400.0).max(0.0); + let recency_decay = + if recency_tau_days > 0.0 { (-age_days / recency_tau_days).exp() } else { 1.0 }; + + item.score += tie_breaker_weight * recency_decay; + } +} + +fn record_result_projection_stage( + trajectory: &mut DocTrajectoryBuilder, + pre_authorization_candidates: usize, + returned_items: usize, + recency_tau_days: f32, + tie_breaker_weight: f32, +) { + trajectory.push( + "result_projection", + serde_json::json!({ + "pre_authorization_candidates": pre_authorization_candidates, + "returned_items": returned_items, + "recency_tau_days": recency_tau_days, + "tie_breaker_weight": tie_breaker_weight, + "recency_boost_applied": tie_breaker_weight > 0.0 && !pre_authorization_candidates.eq(&0), + }), + ) +} + +fn docs_excerpt_locator( + req: &DocsExcerptsGetRequest, + selector_kind: &ExcerptsSelectorKind, + match_start_offset: usize, + match_end_offset: usize, +) -> DocsExcerptLocator { + DocsExcerptLocator { + selector_kind: selector_kind.as_str().to_string(), + match_start_offset, + match_end_offset, + chunk_id: req.chunk_id, + quote: req.quote.clone(), + position: req.position.clone(), + } +} + +fn build_docs_l0_pointer(row: &DocSearchRow, chunk_id: Uuid) -> DocsSearchL0ItemPointer { + DocsSearchL0ItemPointer { + schema: DOC_SOURCE_REF_SCHEMA_V1.to_string(), + resolver: DOC_SOURCE_REF_RESOLVER_V1.to_string(), + reference: DocsSearchL0ItemReference { doc_id: row.doc_id, chunk_id }, + state: DocsSearchL0ItemState { + content_hash: row.content_hash.clone(), + chunk_hash: row.chunk_hash.clone(), + doc_updated_at: row.updated_at, + }, + } +} + +fn resolve_doc_chunking_profile(doc_type: DocType) -> DocChunkingProfile { + match doc_type { + DocType::Chat | DocType::Search => DocChunkingProfile { + max_tokens: 1_024, + overlap_tokens: 128, + max_chunks: DEFAULT_MAX_CHUNKS_PER_DOC, + }, + DocType::Knowledge | DocType::Dev => DocChunkingProfile { + max_tokens: 2_048, + overlap_tokens: 256, + max_chunks: DEFAULT_MAX_CHUNKS_PER_DOC, + }, + } +} + +fn validate_docs_excerpts_get( + tenant_id: &str, + project_id: &str, + agent_id: &str, + read_profile: &str, + quote: Option<&TextQuoteSelector>, +) -> Result<()> { + if tenant_id.is_empty() + || project_id.is_empty() + || agent_id.is_empty() + || read_profile.is_empty() + { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, agent_id, and read_profile are required.".to_string(), + }); + } + + if let Some(quote) = quote { + validate_quote_selector_english(quote)?; + } + + Ok(()) +} + +fn validate_quote_selector_english(quote: &TextQuoteSelector) -> Result<()> { + if !english_gate::is_english_natural_language(quote.exact.as_str()) { + return Err(Error::NonEnglishInput { field: "$.quote.exact".to_string() }); + } + + if let Some(prefix) = quote.prefix.as_ref() + && !english_gate::is_english_natural_language(prefix.as_str()) + { + return Err(Error::NonEnglishInput { field: "$.quote.prefix".to_string() }); + } + if let Some(suffix) = quote.suffix.as_ref() + && !english_gate::is_english_natural_language(suffix.as_str()) + { + return Err(Error::NonEnglishInput { field: "$.quote.suffix".to_string() }); + } + + Ok(()) +} + +fn excerpt_level_max(level: &str) -> Result<usize> { + match level { + "L0" => Ok(DEFAULT_L0_MAX_BYTES), + "L1" => Ok(DEFAULT_L1_MAX_BYTES), + "L2" => Ok(DEFAULT_L2_MAX_BYTES), + _ => Err(Error::InvalidRequest { message: "level must be L0, L1, or L2.".to_string() }), + } +} + +fn validate_docs_put(req: &DocsPutRequest) -> Result<ValidatedDocsPut> { + if req.content.trim().is_empty() { + return Err(Error::InvalidRequest { message: "content must be non-empty.".to_string() }); + } + if req.scope.trim().is_empty() { + return Err(Error::InvalidRequest { message: "scope must be non-empty.".to_string() }); + } + if !matches!(req.scope.as_str(), "agent_private" | "project_shared" | "org_shared") { + return Err(Error::InvalidRequest { message: "Unknown scope.".to_string() }); + } + + let source_ref = req.source_ref.as_object().ok_or_else(|| Error::InvalidRequest { + message: "source_ref must be a JSON object.".to_string(), + })?; + let source_ref_doc_type = + extract_source_ref_string(source_ref, "doc_type", "$.source_ref[\"doc_type\"]")?; + let source_ref_doc_type = DocType::parse(&source_ref_doc_type)?; + let source_ref_schema = + extract_source_ref_string(source_ref, "schema", "$.source_ref[\"schema\"]")?; + + if source_ref_schema != "doc_source_ref/v1" { + return Err(Error::InvalidRequest { + message: "source_ref.schema must be 'doc_source_ref/v1'.".to_string(), + }); + } + + let ts = extract_source_ref_string(source_ref, "ts", "$.source_ref[\"ts\"]")?; + + OffsetDateTime::parse(ts.as_str(), &Rfc3339).map_err(|_| Error::InvalidRequest { + message: "$.source_ref[\"ts\"] must be an RFC3339 datetime string.".to_string(), + })?; + + let doc_type = if let Some(doc_type) = req.doc_type.as_ref() { + let doc_type = DocType::parse(doc_type.as_str())?; + + if doc_type != source_ref_doc_type { + return Err(Error::InvalidRequest { + message: "doc_type must match source_ref.doc_type.".to_string(), + }); + } + + doc_type + } else { + source_ref_doc_type + }; + + validate_doc_source_ref_requirements(source_ref_doc_type.as_str(), source_ref)?; + + let write_policy = + writegate::apply_write_policy(req.content.as_str(), req.write_policy.as_ref()).map_err( + |err| Error::InvalidRequest { message: format!("write_policy is invalid: {err:?}") }, + )?; + let write_policy_audit = + if req.write_policy.is_some() { Some(write_policy.audit) } else { None }; + let content = write_policy.transformed; + + if content.trim().is_empty() { + return Err(Error::InvalidRequest { message: "content must be non-empty.".to_string() }); + } + if content.len() > DEFAULT_DOC_MAX_BYTES { + return Err(Error::InvalidRequest { + message: "content exceeds max_doc_bytes.".to_string(), + }); + } + if writegate::contains_secrets(content.as_str()) { + return Err(Error::InvalidRequest { message: "content contains secrets.".to_string() }); + } + + if let Some(found) = find_non_english_path(&req.source_ref, "$.source_ref") { + return Err(Error::NonEnglishInput { field: found }); + } + + if !english_gate::is_english_natural_language(content.as_str()) { + return Err(Error::NonEnglishInput { field: "$.content".to_string() }); + } + + if let Some(title) = req.title.as_ref() + && !english_gate::is_english_natural_language(title.as_str()) + { + return Err(Error::NonEnglishInput { field: "$.title".to_string() }); + } + + Ok(ValidatedDocsPut { doc_type, content, write_policy_audit }) +} + +fn extract_source_ref_string( + source_ref: &Map<String, Value>, + key: &str, + path: &str, +) -> Result<String> { + source_ref + .get(key) + .and_then(Value::as_str) + .map(|text| text.trim().to_string()) + .filter(|text| !text.is_empty()) + .ok_or_else(|| Error::InvalidRequest { message: format!("{path} is required.") }) +} + +fn validate_doc_source_ref_requirements( + source_doc_type: &str, + source_ref: &Map<String, Value>, +) -> Result<()> { + match source_doc_type { + "chat" => { + extract_source_ref_string(source_ref, "thread_id", "$.source_ref[\"thread_id\"]")?; + extract_source_ref_string(source_ref, "role", "$.source_ref[\"role\"]")?; + }, + "search" => { + extract_source_ref_string(source_ref, "query", "$.source_ref[\"query\"]")?; + extract_source_ref_string(source_ref, "url", "$.source_ref[\"url\"]")?; + extract_source_ref_string(source_ref, "domain", "$.source_ref[\"domain\"]")?; + }, + "dev" => { + extract_source_ref_string(source_ref, "repo", "$.source_ref[\"repo\"]")?; + + let commit_sha_present = source_ref + .get("commit_sha") + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + let pr_number_present = source_ref + .get("pr_number") + .is_some_and(|value| value.as_i64().is_some() || value.as_u64().is_some()); + let issue_number_present = source_ref + .get("issue_number") + .is_some_and(|value| value.as_i64().is_some() || value.as_u64().is_some()); + let present_count = + commit_sha_present as u8 + pr_number_present as u8 + issue_number_present as u8; + + if present_count != 1 { + return Err(Error::InvalidRequest { + message: + "For doc_type=dev, exactly one of commit_sha, pr_number, or issue_number is required." + .to_string(), + }); + } + }, + "knowledge" => {}, + _ => unreachable!(), + } + + Ok(()) +} + +fn validate_docs_search_l0(req: &DocsSearchL0Request) -> Result<DocsSearchL0Filters> { + validate_docs_search_l0_query(req)?; + + let filters = parse_docs_search_l0_filters(req)?; + let ranges = parse_docs_search_l0_ranges(req)?; + + validate_docs_search_l0_temporal_ranges( + ranges.updated_after.as_ref(), + ranges.updated_before.as_ref(), + ranges.ts_gte.as_ref(), + ranges.ts_lte.as_ref(), + )?; + + Ok(DocsSearchL0Filters { + scope: filters.scope, + status: filters.status, + doc_type: filters.doc_type, + sparse_mode: filters.sparse_mode, + domain: filters.domain, + repo: filters.repo, + agent_id: filters.agent_id, + thread_id: filters.thread_id, + updated_after: ranges.updated_after, + updated_before: ranges.updated_before, + ts_gte: ranges.ts_gte, + ts_lte: ranges.ts_lte, + }) +} + +fn validate_docs_search_l0_query(req: &DocsSearchL0Request) -> Result<()> { + if req.query.trim().is_empty() { + return Err(Error::InvalidRequest { message: "query must be non-empty.".to_string() }); + } + if !english_gate::is_english_natural_language(req.query.as_str()) { + return Err(Error::NonEnglishInput { field: "$.query".to_string() }); + } + + Ok(()) +} + +fn parse_docs_search_l0_filters(req: &DocsSearchL0Request) -> Result<DocsSearchL0FiltersParsed> { + let scope = if let Some(scope) = req.scope.as_ref() { + let scope = scope.trim(); + + if scope.is_empty() { + return Err(Error::InvalidRequest { message: "scope must be non-empty.".to_string() }); + } + if !matches!(scope, "agent_private" | "project_shared" | "org_shared") { + return Err(Error::InvalidRequest { message: "Unknown scope.".to_string() }); + } + + Some(scope.to_string()) + } else { + None + }; + let status = req + .status + .as_ref() + .map(|status| status.trim().to_string()) + .filter(|status| !status.is_empty()) + .unwrap_or_else(|| "active".to_string()) + .to_lowercase(); + let status = if DOC_STATUSES.contains(&status.as_str()) { + status + } else { + return Err(Error::InvalidRequest { + message: "status must be one of: active|deleted.".to_string(), + }); + }; + let sparse_mode = parse_sparse_mode(req.sparse_mode.as_ref())?; + let doc_type = if let Some(doc_type) = req.doc_type.as_ref() { + let doc_type = doc_type.trim(); + + if doc_type.is_empty() { + return Err(Error::InvalidRequest { + message: "doc_type must be non-empty.".to_string(), + }); + } + + Some(DocType::parse(doc_type)?) + } else { + None + }; + let domain = req + .domain + .as_ref() + .map(|domain| domain.trim().to_string()) + .filter(|domain| !domain.is_empty()); + let repo = + req.repo.as_ref().map(|repo| repo.trim().to_string()).filter(|repo| !repo.is_empty()); + + if domain.is_some() && doc_type != Some(DocType::Search) { + return Err(Error::InvalidRequest { + message: "domain requires doc_type=search.".to_string(), + }); + } + if repo.is_some() && doc_type != Some(DocType::Dev) { + return Err(Error::InvalidRequest { message: "repo requires doc_type=dev.".to_string() }); + } + + let agent_id = req + .agent_id + .as_ref() + .map(|agent_id| agent_id.trim().to_string()) + .filter(|agent_id| !agent_id.is_empty()); + let thread_id = req + .thread_id + .as_ref() + .map(|thread_id| thread_id.trim().to_string()) + .filter(|thread_id| !thread_id.is_empty()); + + if thread_id.is_some() && doc_type != Some(DocType::Chat) { + return Err(Error::InvalidRequest { + message: "thread_id requires doc_type=chat.".to_string(), + }); + } + + Ok(DocsSearchL0FiltersParsed { + scope, + status, + doc_type, + sparse_mode, + domain, + repo, + agent_id, + thread_id, + }) +} + +fn parse_docs_search_l0_ranges(req: &DocsSearchL0Request) -> Result<DocsSearchL0RangesParsed> { + let updated_after = parse_optional_rfc3339(req.updated_after.as_ref(), "$.updated_after")?; + let updated_before = parse_optional_rfc3339(req.updated_before.as_ref(), "$.updated_before")?; + let ts_gte = parse_optional_rfc3339(req.ts_gte.as_ref(), "$.ts_gte")?; + let ts_lte = parse_optional_rfc3339(req.ts_lte.as_ref(), "$.ts_lte")?; + + Ok(DocsSearchL0RangesParsed { updated_after, updated_before, ts_gte, ts_lte }) +} + +fn validate_docs_search_l0_temporal_ranges( + updated_after: Option<&OffsetDateTime>, + updated_before: Option<&OffsetDateTime>, + ts_gte: Option<&OffsetDateTime>, + ts_lte: Option<&OffsetDateTime>, +) -> Result<()> { + if let (Some(updated_after), Some(updated_before)) = (updated_after, updated_before) + && updated_after >= updated_before + { + return Err(Error::InvalidRequest { + message: "updated_after must be earlier than updated_before.".to_string(), + }); + } + if let (Some(ts_gte), Some(ts_lte)) = (ts_gte, ts_lte) + && ts_gte >= ts_lte + { + return Err(Error::InvalidRequest { + message: "ts_gte must be earlier than ts_lte.".to_string(), + }); + } + + Ok(()) +} + +fn parse_sparse_mode(raw: Option<&String>) -> Result<DocsSparseMode> { + let raw = raw.as_ref().map(|mode| mode.trim().to_lowercase()); + let Some(mode) = raw else { + return Ok(DocsSparseMode::Auto); + }; + let mode = mode.as_str(); + + match mode { + "auto" => Ok(DocsSparseMode::Auto), + "on" => Ok(DocsSparseMode::On), + "off" => Ok(DocsSparseMode::Off), + _ => Err(Error::InvalidRequest { + message: "sparse_mode must be one of: auto|on|off.".to_string(), + }), + } +} + +fn parse_optional_rfc3339(raw: Option<&String>, path: &str) -> Result<Option<OffsetDateTime>> { + let Some(raw) = raw else { + return Ok(None); + }; + let raw = raw.trim(); + + if raw.is_empty() { + return Err(Error::InvalidRequest { message: format!("{path} must be non-empty.") }); + } + + OffsetDateTime::parse(raw, &Rfc3339).map(Some).map_err(|_| Error::InvalidRequest { + message: format!("{path} must be an RFC3339 datetime string."), + }) +} + +fn find_non_english_path(value: &Value, path: &str) -> Option<String> { + find_non_english_path_inner(value, path, false) +} + +fn find_non_english_path_inner( + value: &Value, + path: &str, + is_identifier_lane: bool, +) -> Option<String> { + fn has_english_gate(text: &str, is_identifier_lane: bool) -> bool { + if is_identifier_lane { + return english_gate::is_english_identifier(text); + } + + english_gate::is_english_natural_language(text) + } + + match value { + Value::String(text) => + if !has_english_gate(text, is_identifier_lane) { + Some(path.to_string()) + } else { + None + }, + Value::Array(items) => { + for (idx, item) in items.iter().enumerate() { + let child_path = format!("{path}[{idx}]"); + + if let Some(found) = + find_non_english_path_inner(item, &child_path, is_identifier_lane) + { + return Some(found); + } + } + + None + }, + Value::Object(map) => { + for (key, value) in map.iter() { + let identifier_lane = is_identifier_lane + || matches!(key.as_str(), "ref" | "schema" | "resolver" | "hashes" | "state"); + let child_path = format!("{path}[\"{}\"]", escape_json_path_key(key)); + + if let Some(found) = + find_non_english_path_inner(value, &child_path, identifier_lane) + { + return Some(found); + } + } + + None + }, + _ => None, + } +} + +fn escape_json_path_key(key: &str) -> String { + key.replace('\\', "\\\\").replace('"', "\\\"") +} + +fn load_tokenizer(cfg: &Config) -> Result<Tokenizer> { + let tokenizer_repo = cfg.chunking.tokenizer_repo.trim(); + + if tokenizer_repo.is_empty() { + return Err(Error::InvalidRequest { + message: "chunking.tokenizer_repo must be set.".to_string(), + }); + } + + elf_chunking::load_tokenizer(tokenizer_repo).map_err(|err| Error::InvalidRequest { + message: format!("failed to load tokenizer: {err}"), + }) +} + +fn split_tokens_by_offsets( + text: &str, + profile_max_tokens: usize, + profile_overlap_tokens: usize, + max_chunks: usize, + tokenizer: &Tokenizer, +) -> Result<Vec<ByteChunk>> { + if profile_max_tokens == 0 { + return Err(Error::InvalidRequest { + message: "max_tokens must be greater than zero.".to_string(), + }); + } + if profile_overlap_tokens >= profile_max_tokens { + return Err(Error::InvalidRequest { + message: "overlap_tokens must be less than max_tokens.".to_string(), + }); + } + + let encoding = tokenizer.encode(text, false).map_err(|err| Error::InvalidRequest { + message: format!("failed to tokenize content: {err}"), + })?; + let offsets = encoding.get_offsets(); + let mut chunks = Vec::new(); + + if offsets.is_empty() { + return Ok(Vec::new()); + } + + let mut chunk_start_token = 0_usize; + + while chunk_start_token < offsets.len() { + let chunk_end_token = (chunk_start_token + profile_max_tokens).min(offsets.len()); + let (start_offset, end_offset) = { + let (start, _) = offsets[chunk_start_token]; + let (_, end) = offsets[chunk_end_token.saturating_sub(1)]; + + (start, end) + }; + let chunk_text = + text.get(start_offset..end_offset).ok_or_else(|| Error::InvalidRequest { + message: "computed chunk offset is invalid UTF-8 boundary.".to_string(), + })?; + + chunks.push(ByteChunk { + chunk_id: Uuid::new_v4(), + start_offset, + end_offset, + text: chunk_text.to_string(), + }); + + if chunk_end_token >= offsets.len() { + break; + } + if chunks.len() >= max_chunks { + return Err(Error::InvalidRequest { + message: "doc exceeds max_chunks_per_doc.".to_string(), + }); + } + + chunk_start_token = chunk_end_token.saturating_sub(profile_overlap_tokens); + } + + Ok(chunks) +} + +fn build_doc_search_filter( + tenant_id: &str, + project_id: &str, + caller_agent_id: &str, + allowed_scopes: &[String], + filters: &DocsSearchL0Filters, +) -> Filter { + let private_scope = "agent_private".to_string(); + let non_private_scopes: Vec<String> = + allowed_scopes.iter().filter(|scope| *scope != "agent_private").cloned().collect(); + let mut scope_should_conditions = Vec::new(); + + if allowed_scopes.iter().any(|scope| scope == "agent_private") { + let private_filter = Filter::all([ + Condition::matches("scope", private_scope), + Condition::matches("agent_id", caller_agent_id.to_string()), + ]); + + scope_should_conditions.push(Condition::from(private_filter)); + } + if !non_private_scopes.is_empty() { + scope_should_conditions.push(Condition::matches("scope", non_private_scopes)); + } + + let scope_min_should = if scope_should_conditions.is_empty() { + None + } else { + Some(MinShould { min_count: 1, conditions: scope_should_conditions }) + }; + let mut project_or_org_branches = vec![Condition::from(Filter { + must: vec![Condition::matches("project_id", project_id.to_string())], + should: Vec::new(), + must_not: Vec::new(), + min_should: scope_min_should, + })]; + + if allowed_scopes.iter().any(|scope| scope == "org_shared") { + let org_filter = Filter::all([ + Condition::matches("project_id", ORG_PROJECT_ID.to_string()), + Condition::matches("scope", "org_shared".to_string()), + ]); + + project_or_org_branches.push(Condition::from(org_filter)); + } + + Filter { + must: { + let mut must = vec![ + Condition::matches("tenant_id", tenant_id.to_string()), + Condition::matches("status", filters.status.clone()), + ]; + + if let Some(scope) = filters.scope.as_ref() { + must.push(Condition::matches("scope", scope.to_string())); + } + if let Some(doc_type) = filters.doc_type.as_ref() { + must.push(Condition::matches("doc_type", doc_type.as_str().to_string())); + } + if let Some(domain) = filters.domain.as_ref() { + must.push(Condition::matches("domain", domain.to_string())); + } + if let Some(repo) = filters.repo.as_ref() { + must.push(Condition::matches("repo", repo.to_string())); + } + if let Some(agent_id) = filters.agent_id.as_ref() { + must.push(Condition::matches("agent_id", agent_id.to_string())); + } + if let Some(thread_id) = filters.thread_id.as_ref() { + must.push(Condition::matches("thread_id", thread_id.to_string())); + } + if let Some(datetime_filter) = datetime_filter_range( + filters.updated_after.as_ref(), + filters.updated_before.as_ref(), + ) { + must.push(datetime_filter); + } + if let Some(datetime_filter) = + doc_ts_filter_range(filters.ts_gte.as_ref(), filters.ts_lte.as_ref()) + { + must.push(datetime_filter); + } + + must + }, + should: Vec::new(), + must_not: Vec::new(), + min_should: Some(MinShould { min_count: 1, conditions: project_or_org_branches }), + } +} + +fn datetime_filter_range( + updated_after: Option<&OffsetDateTime>, + updated_before: Option<&OffsetDateTime>, +) -> Option<Condition> { + let gt = updated_after.map(|updated_after| Timestamp { + seconds: updated_after.unix_timestamp(), + nanos: updated_after.nanosecond() as i32, + }); + let lt = updated_before.map(|updated_before| Timestamp { + seconds: updated_before.unix_timestamp(), + nanos: updated_before.nanosecond() as i32, + }); + + if gt.is_none() && lt.is_none() { + return None; + } + + Some(Condition::datetime_range("updated_at", DatetimeRange { lt, gt, gte: None, lte: None })) +} + +fn doc_ts_filter_range( + ts_gte: Option<&OffsetDateTime>, + ts_lte: Option<&OffsetDateTime>, +) -> Option<Condition> { + let gte = ts_gte.map(|ts_gte| Timestamp { + seconds: ts_gte.unix_timestamp(), + nanos: ts_gte.nanosecond() as i32, + }); + let lte = ts_lte.map(|ts_lte| Timestamp { + seconds: ts_lte.unix_timestamp(), + nanos: ts_lte.nanosecond() as i32, + }); + + if gte.is_none() && lte.is_none() { + return None; + } + + Some(Condition::datetime_range("doc_ts", DatetimeRange { lt: None, gt: None, gte, lte })) +} + +fn doc_read_allowed( + requester_agent_id: &str, + allowed_scopes: &[String], + shared_grants: &HashSet<SharedSpaceGrantKey>, + owner_agent_id: &str, + scope: &str, +) -> bool { + if !allowed_scopes.iter().any(|s| s == scope) { + return false; + } + if scope == "agent_private" { + return owner_agent_id == requester_agent_id; + } + if owner_agent_id == requester_agent_id { + return true; + } + + shared_grants.contains(&SharedSpaceGrantKey { + scope: scope.to_string(), + space_owner_agent_id: owner_agent_id.to_string(), + }) +} + +fn parse_scored_point_uuid_id(point: &ScoredPoint) -> Result<Uuid> { + let id = point + .id + .as_ref() + .ok_or_else(|| Error::Qdrant { message: "Qdrant returned item without id.".to_string() })?; + + match id.point_id_options.as_ref() { + Some(PointIdOptions::Uuid(s)) => Uuid::parse_str(s.as_str()) + .map_err(|_| Error::Qdrant { message: "Qdrant returned invalid uuid id.".to_string() }), + Some(other) => Err(Error::Qdrant { + message: format!("Qdrant returned unsupported id type: {other:?}."), + }), + None => Err(Error::Qdrant { message: "Qdrant returned item with missing id.".to_string() }), + } +} + +fn truncate_bytes(text: &str, max: usize) -> String { + if text.len() <= max { + return text.to_string(); + } + + let mut cut = max; + + while cut > 0 && !text.is_char_boundary(cut) { + cut -= 1; + } + + text.get(0..cut).unwrap_or("").to_string() +} + +fn locate_quote(text: &str, quote: &TextQuoteSelector) -> Option<(usize, usize)> { + let prefix = quote.prefix.as_deref().unwrap_or(""); + let suffix = quote.suffix.as_deref().unwrap_or(""); + + for (start, _) in text.match_indices(quote.exact.as_str()) { + let end = start + quote.exact.len(); + + if !text[..start].ends_with(prefix) { + continue; + } + if !text[end..].starts_with(suffix) { + continue; + } + + return Some((start, end)); + } + + None +} + +fn bounded_window( + match_start: usize, + match_end: usize, + text: &str, + max_bytes: usize, +) -> (usize, usize) { + let len = text.len(); + let match_center = match_start.saturating_add(match_end.saturating_sub(match_start) / 2); + let half = max_bytes / 2; + let mut start = match_center.saturating_sub(half); + let mut end = (start + max_bytes).min(len); + + if end - start < max_bytes && start > 0 { + start = start.saturating_sub(max_bytes - (end - start)); + } + + while start < len && !text.is_char_boundary(start) { + start += 1; + } + while end > start && !text.is_char_boundary(end) { + end -= 1; + } + + (start, end) +} + +fn docs_search_sparse_enabled(mode: DocsSparseMode, query: &str) -> bool { + match mode { + DocsSparseMode::Auto => should_enable_sparse_auto(query), + DocsSparseMode::On => true, + DocsSparseMode::Off => false, + } +} + +fn should_enable_sparse_auto(query: &str) -> bool { + let trimmed = query.trim(); + + if trimmed.is_empty() { + return false; + } + if trimmed.contains("://") + || trimmed.contains('/') + || trimmed.contains('\\') + || trimmed.contains('?') + { + return true; + } + + let has_mixed_alpha_num = trimmed.split_whitespace().any(|token| { + token.chars().any(|ch| ch.is_ascii_alphabetic()) + && token.chars().any(|ch| ch.is_ascii_digit()) + }); + let special_count = trimmed + .chars() + .filter(|ch| !(ch.is_ascii_alphanumeric() || ch.is_ascii_whitespace() || *ch == '_')) + .count(); + let compact_hex_like = { + let compact = trimmed.chars().filter(|ch| !ch.is_ascii_whitespace()).collect::<String>(); + + compact.len() >= 12 && compact.chars().all(|ch| ch.is_ascii_hexdigit() || ch == '-') + }; + + special_count >= 2 || compact_hex_like || (has_mixed_alpha_num && trimmed.len() > 12) +} + +async fn load_docs_excerpt_context( + cfg: &Config, + pool: &PgPool, + tenant_id: &str, + project_id: &str, + agent_id: &str, + read_profile: &str, + doc_id: Uuid, +) -> Result<DocDocument> { + let allowed_scopes = search::resolve_read_profile_scopes(cfg, read_profile)?; + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope == "org_shared"); + let shared_grants = access::load_shared_read_grants_with_org_shared( + pool, + tenant_id, + project_id, + agent_id, + org_shared_allowed, + ) + .await?; + let doc = load_doc_document_for_read(pool, doc_id, tenant_id, project_id) + .await? + .ok_or_else(|| Error::NotFound { message: "Doc not found.".to_string() })?; + + if doc.status != "active" + || !doc_read_allowed( + agent_id, + &allowed_scopes, + &shared_grants, + doc.agent_id.as_str(), + doc.scope.as_str(), + ) { + return Err(Error::NotFound { message: "Doc not found.".to_string() }); + } + + Ok(doc) +} + +async fn docs_excerpts_resolve_windowed_match( + pool: &PgPool, + doc: &DocDocument, + req: &DocsExcerptsGetRequest, + level_max: usize, + trajectory: &mut DocTrajectoryBuilder, + verified: &mut bool, + verification_errors: &mut Vec<String>, +) -> Result<DocExcerptRange> { + let DocExcerptMatch { selector_kind, match_start_offset, match_end_offset } = + docs_excerpts_resolve_match(pool, doc, req, verified, verification_errors).await?; + + trajectory.push( + "match_resolution", + serde_json::json!({ + "selector_kind": selector_kind.as_str(), + "match_start": match_start_offset, + "match_end": match_end_offset, + }), + ); + + let (start_offset, end_offset) = + bounded_window(match_start_offset, match_end_offset, doc.content.as_str(), level_max); + + trajectory.push( + "window_projection", + serde_json::json!({ + "window_start": start_offset, + "window_end": end_offset, + "content_len": doc.content.len(), + }), + ); + + Ok(DocExcerptRange { + selector_kind, + match_start_offset, + match_end_offset, + start_offset, + end_offset, + }) +} + +async fn docs_excerpts_resolve_match( + pool: &PgPool, + doc: &DocDocument, + req: &DocsExcerptsGetRequest, + verified: &mut bool, + verification_errors: &mut Vec<String>, +) -> Result<DocExcerptMatch> { + let (match_start_offset, match_end_offset, selector_kind) = + resolve_excerpts_match_range(pool, doc, req, verified, verification_errors).await?; + + Ok(DocExcerptMatch { selector_kind, match_start_offset, match_end_offset }) +} + +async fn load_doc_document_for_read( + executor: impl PgExecutor<'_>, + doc_id: Uuid, + tenant_id: &str, + project_id: &str, +) -> Result<Option<DocDocument>> { + let row: Option<DocDocument> = sqlx::query_as::<_, DocDocument>( + "\ +SELECT + doc_id, + tenant_id, + project_id, + agent_id, + scope, + doc_type, + status, + title, + COALESCE(source_ref, '{}'::jsonb) AS source_ref, + content, + content_bytes, + content_hash, + created_at, + updated_at +FROM doc_documents +WHERE doc_id = $1 + AND tenant_id = $2 + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + ) +LIMIT 1", + ) + .bind(doc_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +async fn resolve_excerpts_match_range( + pool: &PgPool, + doc: &DocDocument, + req: &DocsExcerptsGetRequest, + verified: &mut bool, + verification_errors: &mut Vec<String>, +) -> Result<(usize, usize, ExcerptsSelectorKind)> { + if let Some(chunk_id) = req.chunk_id { + let chunk = docs::get_doc_chunk(pool, chunk_id).await?; + let Some(chunk) = chunk else { + return Err(Error::NotFound { message: "Chunk not found.".to_string() }); + }; + + if chunk.doc_id != doc.doc_id { + return Err(Error::NotFound { message: "Chunk not found.".to_string() }); + } + + return Ok(( + chunk.start_offset.max(0) as usize, + chunk.end_offset.max(0) as usize, + ExcerptsSelectorKind::ChunkId, + )); + } + if let Some(quote) = req.quote.as_ref() { + return Ok(match locate_quote(&doc.content, quote) { + Some((s, e)) => (s, e, ExcerptsSelectorKind::Quote), + None => { + *verified = false; + + verification_errors.push("QUOTE_SELECTOR_NOT_FOUND".to_string()); + + if let Some(pos) = req.position.as_ref() { + ( + pos.start.min(doc.content.len()), + pos.end.min(doc.content.len()), + ExcerptsSelectorKind::Position, + ) + } else { + return Err(Error::NotFound { + message: "Selector did not match document.".to_string(), + }); + } + }, + }); + } + if let Some(pos) = req.position.as_ref() { + return Ok(( + pos.start.min(doc.content.len()), + pos.end.min(doc.content.len()), + ExcerptsSelectorKind::Position, + )); + } + + Err(Error::InvalidRequest { + message: "One of chunk_id, quote, or position is required.".to_string(), + }) +} + +async fn run_doc_fusion_query( + client: &Qdrant, + collection: &str, + query_text: &str, + vector: &[f32], + filter: &Filter, + sparse_mode: DocsSparseMode, + candidate_k: u32, +) -> Result<Vec<ScoredPoint>> { + let sparse_enabled = docs_search_sparse_enabled(sparse_mode, query_text); + let dense_prefetch = PrefetchQueryBuilder::default() + .query(Query::new_nearest(vector.to_vec())) + .using(DENSE_VECTOR_NAME) + .filter(filter.clone()) + .limit(candidate_k as u64); + let mut search = QueryPointsBuilder::new(collection.to_string()); + + search = search.add_prefetch(dense_prefetch); + + if sparse_enabled { + let bm25_prefetch = PrefetchQueryBuilder::default() + .query(Query::new_nearest(Document::new(query_text.to_string(), BM25_MODEL))) + .using(BM25_VECTOR_NAME) + .filter(filter.clone()) + .limit(candidate_k as u64); + + search = search.add_prefetch(bm25_prefetch); + } + + let search = search.with_payload(false).query(Fusion::Rrf).limit(candidate_k as u64); + let response = + client.query(search).await.map_err(|err| Error::Qdrant { message: err.to_string() })?; + + Ok(response.result) +} + +async fn load_doc_search_rows( + executor: impl PgExecutor<'_>, + tenant_id: &str, + project_id: &str, + status: &str, + chunk_ids: &[Uuid], +) -> Result<HashMap<Uuid, DocSearchRow>> { + if chunk_ids.is_empty() { + return Ok(HashMap::new()); + } + + let rows: Vec<DocSearchRow> = sqlx::query_as( + "\ +SELECT + c.chunk_id, + c.doc_id, + d.scope, + d.doc_type, + d.project_id, + d.agent_id, + d.updated_at, + d.content_hash, + c.chunk_hash, + c.chunk_text +FROM doc_chunks c +JOIN doc_documents d ON d.doc_id = c.doc_id +WHERE c.chunk_id = ANY($1) + AND d.tenant_id = $2 + AND d.status = $4 + AND ( + d.project_id = $3 + OR (d.project_id = $5 AND d.scope = 'org_shared') + )", + ) + .bind(chunk_ids) + .bind(tenant_id) + .bind(project_id) + .bind(status) + .bind(ORG_PROJECT_ID) + .fetch_all(executor) + .await?; + let mut map = HashMap::with_capacity(rows.len()); + + for row in rows { + map.insert(row.chunk_id, row); + } + + Ok(map) +} + +#[cfg(test)] +mod tests { + use ahash::AHashMap; + use qdrant_client::qdrant::{ + DatetimeRange, Filter, condition::ConditionOneOf, r#match::MatchValue, + }; + use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + use tokenizers::{ + Tokenizer, models::wordlevel::WordLevel, pre_tokenizers::whitespace::Whitespace, + }; + + use crate::docs::{ + self, DocType, DocsPutRequest, DocsSearchL0Filters, DocsSearchL0Request, DocsSparseMode, + Error, + }; + use elf_domain::writegate::{WritePolicy, WriteSpan}; + + const TENANT_ID: &str = "tenant"; + const PROJECT_ID: &str = "project"; + + fn test_request_with_query(query: &str) -> DocsSearchL0Request { + DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: query.to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + } + } + + fn first_datetime_range(filter: &Filter, key: &str) -> Option<DatetimeRange> { + for condition in &filter.must { + if let Some(ConditionOneOf::Field(field)) = condition.condition_one_of.as_ref() { + if field.key != key { + continue; + } + + if let Some(range) = field.datetime_range.as_ref() { + return Some(*range); + } + } + } + + None + } + + fn first_match_value(filter: &Filter, key: &str) -> Option<String> { + for condition in &filter.must { + if let Some(ConditionOneOf::Field(field)) = condition.condition_one_of.as_ref() { + if field.key != key { + continue; + } + + if let Some(r#match) = field.r#match.as_ref() { + let Some(match_value) = r#match.match_value.as_ref() else { + continue; + }; + + return match match_value { + MatchValue::Keyword(value) => Some(value.clone()), + _ => None, + }; + } + } + } + + None + } + + fn test_tokenizer() -> Tokenizer { + let mut vocab = AHashMap::new(); + + vocab.insert("alpha".to_string(), 1_u32); + vocab.insert("beta".to_string(), 2_u32); + vocab.insert("charlie".to_string(), 3_u32); + vocab.insert("delta".to_string(), 4_u32); + vocab.insert("<unk>".to_string(), 0_u32); + + let model = WordLevel::builder() + .vocab(vocab) + .unk_token("<unk>".to_string()) + .build() + .expect("Failed to build test tokenizer."); + let mut tokenizer = Tokenizer::new(model); + + tokenizer.with_pre_tokenizer(Some(Whitespace)); + + tokenizer + } + + #[test] + fn doc_type_parses_and_serializes() { + let encoded = + serde_json::to_string(&DocType::Knowledge).expect("Expected DocType serialization."); + let parsed = + serde_json::from_str::<DocType>("\"knowledge\"").expect("Expected parse to succeed."); + let invalid: Result<DocType, _> = serde_json::from_str("\"invalid\""); + + assert_eq!(encoded, "\"knowledge\""); + assert_eq!(parsed, DocType::Knowledge); + assert!(invalid.is_err()); + } + + #[test] + fn docs_search_l0_requires_chat_doc_type_for_thread_id() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "thread".to_string(), + scope: None, + status: None, + doc_type: Some("search".to_string()), + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: Some("thread-1".to_string()), + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected thread_id to require doc_type=chat."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("thread_id requires")), + other => panic!("Unexpected error: {other:?}"), + } + + docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "thread".to_string(), + scope: None, + status: None, + doc_type: Some("chat".to_string()), + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: Some("thread-1".to_string()), + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect("Expected thread_id filter to be accepted for chat."); + } + + #[test] + fn validate_docs_put_rejects_invalid_doc_type() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "invalid", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected invalid doc_type to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("doc_type")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn resolve_doc_chunking_profile_is_deterministic_by_doc_type() { + let small = docs::resolve_doc_chunking_profile(DocType::Chat); + + assert_eq!(small.max_tokens, 1_024); + assert_eq!(small.overlap_tokens, 128); + + let default = docs::resolve_doc_chunking_profile(DocType::Knowledge); + + assert_eq!(default.max_tokens, 2_048); + assert_eq!(default.overlap_tokens, 256); + } + + #[test] + fn validate_docs_search_l0_defaults_status_and_filters_dates() { + let filters = docs::validate_docs_search_l0(&test_request_with_query("hello world")) + .expect("valid request"); + + assert_eq!(filters.status, "active"); + + let bad_dates = DocsSearchL0Request { + updated_after: Some("2026-02-25T12:00:00Z".to_string()), + updated_before: Some("2026-02-25T11:00:00Z".to_string()), + sparse_mode: None, + domain: None, + repo: None, + ..test_request_with_query("status") + }; + let err = docs::validate_docs_search_l0(&bad_dates) + .expect_err("Expected bad date order to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("earlier")); + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_rejects_invalid_status() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: Some("archived".to_string()), + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected invalid status to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("status")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_rejects_invalid_datetime_format() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: Some("2026-02-25T12:00:00".to_string()), + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected invalid RFC3339 datetime to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("RFC3339")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn build_doc_search_filter_applies_status_and_requested_filters() { + let filters = DocsSearchL0Filters { + scope: Some("project_shared".to_string()), + status: "deleted".to_string(), + doc_type: Some(DocType::Chat), + sparse_mode: DocsSparseMode::Auto, + domain: None, + repo: None, + agent_id: Some("owner".to_string()), + thread_id: Some("thread-7".to_string()), + updated_after: Some( + OffsetDateTime::parse("2026-02-20T00:00:00Z", &Rfc3339) + .expect("Invalid timestamp."), + ), + updated_before: Some( + OffsetDateTime::parse("2026-02-28T00:00:00Z", &Rfc3339) + .expect("Invalid timestamp."), + ), + ts_gte: Some( + OffsetDateTime::parse("2026-01-01T00:00:00Z", &Rfc3339) + .expect("Invalid timestamp."), + ), + ts_lte: Some( + OffsetDateTime::parse("2026-12-31T00:00:00Z", &Rfc3339) + .expect("Invalid timestamp."), + ), + }; + let filter = super::build_doc_search_filter( + TENANT_ID, + PROJECT_ID, + "requester", + &["agent_private".to_string(), "project_shared".to_string()], + &filters, + ); + + assert_eq!(first_match_value(&filter, "tenant_id").as_deref(), Some("tenant")); + assert_eq!(first_match_value(&filter, "status").as_deref(), Some("deleted")); + assert_eq!(first_match_value(&filter, "scope").as_deref(), Some("project_shared")); + assert_eq!(first_match_value(&filter, "doc_type").as_deref(), Some("chat")); + assert_eq!(first_match_value(&filter, "agent_id").as_deref(), Some("owner")); + assert_eq!(first_match_value(&filter, "thread_id").as_deref(), Some("thread-7")); + assert_eq!(first_match_value(&filter, "domain").as_deref(), None); + assert_eq!(first_match_value(&filter, "repo").as_deref(), None); + + let datetime_range = first_datetime_range(&filter, "updated_at") + .expect("Expected datetime filter for updated_at."); + let after = + OffsetDateTime::parse("2026-02-20T00:00:00Z", &Rfc3339).expect("Invalid timestamp."); + let before = + OffsetDateTime::parse("2026-02-28T00:00:00Z", &Rfc3339).expect("Invalid timestamp."); + let lt = datetime_range.lt.as_ref().expect("Expected datetime filter .lt value."); + let gt = datetime_range.gt.as_ref().expect("Expected datetime filter .gt value."); + + assert_eq!(lt.seconds, before.unix_timestamp()); + assert_eq!(lt.nanos, before.nanosecond() as i32); + assert_eq!(gt.seconds, after.unix_timestamp()); + assert_eq!(gt.nanos, after.nanosecond() as i32); + assert!(datetime_range.gte.is_none()); + assert!(datetime_range.lte.is_none()); + + let doc_ts_range = + first_datetime_range(&filter, "doc_ts").expect("Expected datetime filter for doc_ts."); + let gte = doc_ts_range.gte.as_ref().expect("Expected datetime filter .gte value."); + let lte = doc_ts_range.lte.as_ref().expect("Expected datetime filter .lte value."); + let doc_ts_gte = + OffsetDateTime::parse("2026-01-01T00:00:00Z", &Rfc3339).expect("Invalid timestamp."); + let doc_ts_lte = + OffsetDateTime::parse("2026-12-31T00:00:00Z", &Rfc3339).expect("Invalid timestamp."); + + assert_eq!(gte.seconds, doc_ts_gte.unix_timestamp()); + assert_eq!(gte.nanos, doc_ts_gte.nanosecond() as i32); + assert_eq!(lte.seconds, doc_ts_lte.unix_timestamp()); + assert_eq!(lte.nanos, doc_ts_lte.nanosecond() as i32); + assert!(doc_ts_range.gt.is_none()); + assert!(doc_ts_range.lt.is_none()); + } + + #[test] + fn validate_docs_search_l0_rejects_invalid_doc_ts_order() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: Some("2026-02-25T12:00:00Z".to_string()), + ts_lte: Some("2026-02-25T11:00:00Z".to_string()), + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected bad doc_ts order to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("earlier")); + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_rejects_invalid_sparse_mode() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: Some("invalid".to_string()), + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected invalid sparse mode to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("sparse_mode")); + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_rejects_domain_without_doc_type_search() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: Some("example.com".to_string()), + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected domain without doc_type=search to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("doc_type=search")); + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_rejects_repo_without_doc_type_dev() { + let err = docs::validate_docs_search_l0(&DocsSearchL0Request { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + caller_agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + query: "status".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: Some("hack-ink/ELF".to_string()), + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + top_k: None, + candidate_k: None, + explain: None, + }) + .expect_err("Expected repo without doc_type=dev to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("doc_type=dev")); + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_search_l0_default_sparse_mode() { + let filters = docs::validate_docs_search_l0(&test_request_with_query("status")) + .expect("valid request"); + + assert!(matches!(filters.sparse_mode, DocsSparseMode::Auto)); + } + + #[test] + fn should_enable_sparse_auto_uses_symbol_cues() { + assert!(super::should_enable_sparse_auto("https://example.com/search?q=abc")); + assert!(!super::should_enable_sparse_auto("how to debug a timeout")); + } + + #[test] + fn excerpt_level_max_supports_l0_and_rejects_unknown_level() { + assert_eq!( + super::excerpt_level_max("L0").expect("Expected L0 to be supported."), + super::DEFAULT_L0_MAX_BYTES + ); + assert!(super::excerpt_level_max("L3").is_err()); + } + + #[test] + fn validate_docs_put_rejects_missing_source_ref() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Knowledge.as_str().to_string()), + title: None, + write_policy: None, + source_ref: serde_json::json!({"schema":"doc_source_ref/v1", "doc_type":"knowledge"}), + content: "Hello world.".to_string(), + }) + .expect_err("Expected missing source_ref.ts to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("source_ref[\"ts\"]")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_non_object_source_ref() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: None, + write_policy: None, + source_ref: serde_json::json!("legacy-shape"), + content: "Hello world.".to_string(), + }) + .expect_err("Expected non-object source_ref to be rejected."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("source_ref must be a JSON object")) + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_mismatched_request_and_source_ref_doc_type() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Chat.as_str().to_string()), + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected mismatched doc_type to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("match")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_wrong_source_ref_schema() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "note_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected wrong source_ref.schema to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("doc_source_ref/v1")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_chat_source_ref_with_missing_thread_metadata() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Chat.as_str().to_string()), + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected chat source_ref to require thread_id/role."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("thread_id")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_search_source_ref_with_missing_domain() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Search.as_str().to_string()), + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "search", + "ts": "2026-02-25T12:00:00Z", + "query": "test", + "url": "https://example.com", + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected search source_ref to require domain."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("domain")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_rejects_dev_source_ref_with_multiple_identifiers() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Dev.as_str().to_string()), + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": "2026-02-25T12:00:00Z", + "repo": "hack-ink/ELF", + "commit_sha": "9f0a3f4c4eb58bfcf4a5f4f9d0c7be0e13c2f8d19", + "issue_number": 123, + }), + content: "Hello world.".to_string(), + }) + .expect_err("Expected dev source_ref to enforce exactly one identifier field."); + + match err { + Error::InvalidRequest { message } => { + assert!(message.contains("exactly one of commit_sha, pr_number, or issue_number")) + }, + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_uses_source_ref_doc_type_when_request_doc_type_is_absent() { + let resolved_doc_type = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: None, + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-02-25T12:00:00Z", + "thread_id": "thread-1", + "role": "assistant" + }), + content: "Hello world.".to_string(), + }) + .expect("Expected valid source_ref to resolve doc_type."); + + assert_eq!(resolved_doc_type.doc_type, DocType::Chat); + } + + #[test] + fn validate_docs_put_applies_write_policy_and_includes_audit() { + let validated = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Knowledge.as_str().to_string()), + title: None, + write_policy: Some(WritePolicy { + exclusions: vec![WriteSpan { start: 6, end: 35 }], + redactions: vec![], + }), + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello sk-abcdefghijklmnopqrstuvwxyz!".to_string(), + }) + .expect("Expected valid write policy transformation."); + let expected_audit = elf_domain::writegate::WritePolicyAudit { + exclusions: vec![WriteSpan { start: 6, end: 35 }], + ..Default::default() + }; + + assert_eq!(validated.content, "Hello !".to_string()); + assert_eq!(validated.write_policy_audit.unwrap_or_default(), expected_audit); + } + + #[test] + fn validate_docs_put_rejects_secret_after_write_policy() { + let err = docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: Some(DocType::Knowledge.as_str().to_string()), + title: None, + write_policy: Some(WritePolicy { exclusions: vec![], redactions: vec![] }), + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: "Hello sk-abcdefghijklmnopqrstuvwxyz!".to_string(), + }) + .expect_err("Expected secret-bearing content to be rejected."); + + match err { + Error::InvalidRequest { message } => assert!(message.contains("contains secrets")), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn validate_docs_put_allows_doc_source_ref_v1_and_rejects_free_text() { + docs::validate_docs_put(&DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("English title".to_string()), + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "notes": "English only." + }), + content: "English content.".to_string(), + }) + .expect("Expected doc_source_ref/v1 source_ref to be accepted."); + + let err = docs::validate_docs_put(&DocsPutRequest { + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "notes": "\u{4f60}\u{597d}\u{4e16}\u{754c}" + }), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("English title".to_string()), + write_policy: None, + content: "English content.".to_string(), + }) + .expect_err("Expected non-English free-text in source_ref."); + + match err { + Error::NonEnglishInput { field } => assert_eq!(field, "$.source_ref[\"notes\"]"), + other => panic!("Unexpected error: {other:?}"), + } + + let err = docs::validate_docs_put(&DocsPutRequest { + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "ref": "\u{4f60}\u{597d}\u{4e16}\u{754c}" + }), + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("English title".to_string()), + write_policy: None, + content: "English content.".to_string(), + }) + .expect_err("Expected identifier lane with non-Latin text to be rejected."); + + match err { + Error::NonEnglishInput { field } => assert_eq!(field, "$.source_ref[\"ref\"]"), + other => panic!("Unexpected error: {other:?}"), + } + } + + #[test] + fn split_tokens_by_offsets_preserves_original_substring_offsets() { + let tokenizer = test_tokenizer(); + let chunks = + super::split_tokens_by_offsets("alpha bravo charlie delta", 2, 1, 10, &tokenizer) + .expect("Expected token chunking to succeed."); + + assert_eq!(chunks.len(), 3); + assert_eq!(chunks[0].start_offset, 0); + assert_eq!(chunks[0].end_offset, 11); + assert_eq!(chunks[1].start_offset, 6); + assert_eq!(chunks[1].end_offset, 19); + assert_eq!(chunks[2].start_offset, 12); + assert_eq!(chunks[2].end_offset, 25); + + for chunk in &chunks { + assert_eq!( + chunk.text, + "alpha bravo charlie delta"[chunk.start_offset..chunk.end_offset] + ); + } + } +} diff --git a/packages/elf-service/src/error.rs b/packages/elf-service/src/error.rs new file mode 100644 index 00000000..4cdea109 --- /dev/null +++ b/packages/elf-service/src/error.rs @@ -0,0 +1,72 @@ +/// Service-layer result type. +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Errors returned by ELF service APIs. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// The request contained non-English input in the named field path. + #[error("Non-English input detected at {field}.")] + NonEnglishInput { + /// Field path that failed the English gate. + field: String, + }, + /// The request payload was invalid. + #[error("Invalid request: {message}")] + InvalidRequest { + /// Human-readable validation failure. + message: String, + }, + /// The caller is not allowed to act on the requested scope. + #[error("Scope denied: {message}")] + ScopeDenied { + /// Human-readable access failure. + message: String, + }, + /// The requested service resource could not be found. + #[error("Not found: {message}")] + NotFound { + /// Human-readable lookup failure. + message: String, + }, + /// The requested mutation conflicts with existing state. + #[error("Conflict: {message}")] + Conflict { + /// Human-readable conflict reason. + message: String, + }, + /// An external model or provider returned an error. + #[error("Provider error: {message}")] + Provider { + /// Human-readable provider failure. + message: String, + }, + /// Postgres or other storage work failed. + #[error("Storage error: {message}")] + Storage { + /// Human-readable storage failure. + message: String, + }, + /// Qdrant vector-store work failed. + #[error("Qdrant error: {message}")] + Qdrant { + /// Human-readable Qdrant failure. + message: String, + }, +} +impl From<sqlx::Error> for Error { + fn from(err: sqlx::Error) -> Self { + Self::Storage { message: err.to_string() } + } +} + +impl From<elf_storage::Error> for Error { + fn from(err: elf_storage::Error) -> Self { + match err { + elf_storage::Error::Sqlx(inner) => Self::Storage { message: inner.to_string() }, + elf_storage::Error::InvalidArgument(message) => Self::InvalidRequest { message }, + elf_storage::Error::NotFound(message) => Self::NotFound { message }, + elf_storage::Error::Conflict(message) => Self::Conflict { message }, + elf_storage::Error::Qdrant(inner) => Self::Qdrant { message: inner.to_string() }, + } + } +} diff --git a/packages/elf-service/src/graph.rs b/packages/elf-service/src/graph.rs new file mode 100644 index 00000000..8b187100 --- /dev/null +++ b/packages/elf-service/src/graph.rs @@ -0,0 +1,87 @@ +//! Graph retrieval and mutation APIs. + +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ElfService, Error, Result}; +use elf_storage::graph; + +/// Temporal state for a graph relation fact relative to a read timestamp. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum RelationTemporalStatus { + /// The fact's validity window starts after the read timestamp. + Future, + /// The fact is valid at the read timestamp. + #[default] + Current, + /// The fact was invalidated before or at the read timestamp. + Historical, +} + +#[allow(dead_code)] +pub(crate) struct GraphUpsertFactArgs<'a> { + pub tenant_id: &'a str, + pub project_id: &'a str, + pub agent_id: &'a str, + pub scope: &'a str, + pub subject_entity_id: Uuid, + pub predicate: &'a str, + pub object_entity_id: Option<Uuid>, + pub object_value: Option<&'a str>, + pub valid_from: OffsetDateTime, + pub valid_to: Option<OffsetDateTime>, + pub evidence_note_ids: &'a [Uuid], +} + +impl ElfService { + #[allow(dead_code)] + pub(crate) async fn graph_upsert_fact(&self, args: GraphUpsertFactArgs<'_>) -> Result<Uuid> { + let mut tx = self.db.pool.begin().await?; + let predicate = graph::resolve_or_register_predicate( + &mut tx, + args.tenant_id, + args.project_id, + args.predicate, + ) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + let fact_id = graph::insert_fact_with_evidence( + &mut tx, + args.tenant_id, + args.project_id, + args.agent_id, + args.scope, + args.subject_entity_id, + args.predicate, + predicate.predicate_id, + args.object_entity_id, + args.object_value, + args.valid_from, + args.valid_to, + args.evidence_note_ids, + ) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + + tx.commit().await?; + + Ok(fact_id) + } +} + +pub(crate) fn relation_temporal_status( + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + read_at: OffsetDateTime, +) -> RelationTemporalStatus { + if valid_from > read_at { + return RelationTemporalStatus::Future; + } + if valid_to.is_some_and(|valid_to| valid_to <= read_at) { + return RelationTemporalStatus::Historical; + } + + RelationTemporalStatus::Current +} diff --git a/packages/elf-service/src/graph_ingestion.rs b/packages/elf-service/src/graph_ingestion.rs new file mode 100644 index 00000000..1a210713 --- /dev/null +++ b/packages/elf-service/src/graph_ingestion.rs @@ -0,0 +1,179 @@ +use sqlx::{Postgres, Transaction}; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::{Error, Result, StructuredFields, structured_fields::StructuredEntity}; +use elf_storage::graph; + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn persist_graph_fields_tx( + tx: &mut Transaction<'_, Postgres>, + tenant_id: &str, + project_id: &str, + agent_id: &str, + scope: &str, + note_id: Uuid, + structured: &StructuredFields, + now: OffsetDateTime, +) -> Result<()> { + if !structured.has_graph_fields() { + return Ok(()); + } + + if let Some(entities) = structured.entities.as_ref() { + for (entity_idx, entity) in entities.iter().enumerate() { + let base_path = format!("structured.entities[{entity_idx}]"); + + upsert_graph_entity_and_aliases(tx, tenant_id, project_id, entity, base_path.as_str()) + .await?; + } + } + + let relations = structured.relations.as_deref().unwrap_or_default(); + + for (relation_idx, relation) in relations.iter().enumerate() { + let relation_now = now + Duration::microseconds(relation_idx as i64); + let relation_path = format!("structured.relations[{relation_idx}]"); + let subject = relation.subject.as_ref().ok_or_else(|| Error::InvalidRequest { + message: format!("{relation_path}.subject is required."), + })?; + let predicate = relation.predicate.as_deref().ok_or_else(|| Error::InvalidRequest { + message: format!("{relation_path}.predicate is required."), + })?; + let subject_entity_id = upsert_graph_entity_and_aliases( + tx, + tenant_id, + project_id, + subject, + &format!("{relation_path}.subject"), + ) + .await?; + let valid_from = relation.valid_from.unwrap_or(relation_now); + let valid_to = relation.valid_to; + + if let Some(valid_to) = valid_to + && valid_to <= valid_from + { + return Err(Error::InvalidRequest { + message: format!("{relation_path}.valid_to must be greater than valid_from."), + }); + } + + let object = relation.object.as_ref().ok_or_else(|| Error::InvalidRequest { + message: format!("{relation_path}.object is required."), + })?; + let (object_entity_id, object_value) = match (&object.entity, &object.value) { + (Some(entity), None) => { + let entity_id = upsert_graph_entity_and_aliases( + tx, + tenant_id, + project_id, + entity, + &format!("{relation_path}.object.entity"), + ) + .await?; + + (Some(entity_id), None) + }, + (None, Some(value)) => (None, Some(value.as_str())), + _ => { + return Err(Error::InvalidRequest { + message: format!( + "{relation_path}.object must provide exactly one of entity or value.", + ), + }); + }, + }; + let predicate_row = + graph::resolve_or_register_predicate(tx, tenant_id, project_id, predicate) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + + reject_deprecated_predicate(predicate_row.status.as_str(), relation_path.as_str())?; + + let fact_id = graph::upsert_fact_with_evidence( + tx, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_row.predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + &[note_id], + ) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + let is_current_truth = predicate_row.status == "active" + && predicate_row.cardinality == "single" + && valid_to.is_none() + && valid_from <= relation_now; + + if is_current_truth { + graph::supersede_conflicting_active_facts( + tx, + tenant_id, + project_id, + scope, + subject_entity_id, + predicate_row.predicate_id, + fact_id, + note_id, + valid_from, + ) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + } + } + + Ok(()) +} + +fn reject_deprecated_predicate(status: &str, relation_path: &str) -> Result<()> { + if status == "deprecated" { + return Err(Error::InvalidRequest { + message: format!("{relation_path}.predicate is deprecated and cannot be used."), + }); + } + + Ok(()) +} + +async fn upsert_graph_entity_and_aliases( + tx: &mut Transaction<'_, Postgres>, + tenant_id: &str, + project_id: &str, + entity: &StructuredEntity, + context_path: &str, +) -> Result<Uuid> { + let canonical = entity.canonical.as_deref().ok_or_else(|| Error::InvalidRequest { + message: format!("{context_path}.canonical is required."), + })?; + let canonical = canonical.trim(); + let entity_id = + graph::upsert_entity(tx, tenant_id, project_id, canonical, entity.kind.as_deref()) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + + if let Some(aliases) = entity.aliases.as_ref() { + for (alias_idx, alias) in aliases.iter().enumerate() { + let alias = alias.trim(); + + if alias.is_empty() { + return Err(Error::InvalidRequest { + message: format!("{context_path}.aliases[{alias_idx}] must not be empty."), + }); + } + + graph::upsert_entity_alias(tx, entity_id, alias) + .await + .map_err(|err| Error::Storage { message: err.to_string() })?; + } + } + + Ok(entity_id) +} diff --git a/packages/elf-service/src/graph_query.rs b/packages/elf-service/src/graph_query.rs new file mode 100644 index 00000000..75e37d73 --- /dev/null +++ b/packages/elf-service/src/graph_query.rs @@ -0,0 +1,829 @@ +//! Structured graph query APIs. + +use std::collections::HashSet; + +use serde::{Deserialize, Serialize}; +use sqlx::{FromRow, PgConnection}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + ElfService, Error, Result, + access::{self, ORG_PROJECT_ID}, + graph::RelationTemporalStatus, + search, +}; +use elf_storage::{graph, models::GraphEntity}; + +/// Schema identifier for graph-query responses. +pub const ELF_GRAPH_QUERY_SCHEMA_V1: &str = "elf.graph_query/v1"; + +const DEFAULT_GRAPH_QUERY_LIMIT: u32 = 50; +const MAX_GRAPH_QUERY_LIMIT: u32 = 200; +const GRAPH_QUERY_EVIDENCE_LIMIT: i64 = 16; +const GRAPH_QUERY_FACTS_SQL: &str = "\ +SELECT + fact_id, + scope, + agent_id AS actor, + predicate, + predicate_id, + object_entity_id, + object_entity.canonical AS object_canonical, + object_entity.kind AS object_kind, + object_value, + valid_from, + valid_to, + COALESCE( + (SELECT ARRAY_AGG(e.note_id ORDER BY e.created_at ASC, e.note_id ASC) + FROM ( + SELECT note_id, created_at + FROM graph_fact_evidence + WHERE fact_id = gf.fact_id + ORDER BY created_at ASC, note_id ASC + LIMIT $9 + ) e), + '{}'::uuid[] + ) AS evidence_note_ids +FROM graph_facts AS gf +LEFT JOIN graph_entities AS object_entity + ON object_entity.entity_id = gf.object_entity_id + AND object_entity.tenant_id = gf.tenant_id + AND object_entity.project_id = gf.project_id +WHERE gf.tenant_id = $1 + AND (gf.project_id = $2 OR (gf.project_id = $10 AND gf.scope = 'org_shared')) + AND gf.subject_entity_id = $3 + AND gf.scope = ANY($4::text[]) + AND gf.valid_from <= $5 + AND (gf.valid_to IS NULL OR gf.valid_to > $5) + AND ($11::uuid IS NULL OR gf.predicate_id = $11) + AND ( + (gf.scope = 'agent_private' AND gf.agent_id = $6) + OR (gf.scope <> 'agent_private' AND ( + gf.agent_id = $6 OR (gf.scope || ':' || gf.agent_id) = ANY($7::text[]) + )) + ) +ORDER BY gf.valid_from DESC, gf.fact_id ASC +LIMIT $8"; + +/// Subject selector used by graph-query APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(untagged)] +pub enum GraphQueryEntityRef { + /// Resolve the subject by entity identifier. + EntityId { + /// Entity identifier to resolve. + entity_id: Uuid, + }, + /// Resolve the subject by canonical or alias surface. + Surface { + /// Canonical or alias surface to resolve. + surface: String, + }, +} + +/// Predicate selector used by graph-query APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(untagged)] +pub enum GraphQueryPredicateRef { + /// Resolve the predicate by predicate identifier. + PredicateId { + /// Predicate identifier to resolve. + predicate_id: Uuid, + }, + /// Resolve the predicate by canonical or alias surface. + Surface { + /// Canonical or alias surface to resolve. + surface: String, + }, +} + +/// Request payload for graph-query lookups. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct GraphQueryRequest { + /// Tenant to query within. + pub tenant_id: String, + /// Project to query within. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Read profile that determines visible scopes. + pub read_profile: String, + /// Subject entity selector. + pub subject: GraphQueryEntityRef, + + /// Optional predicate selector used to narrow the results. + pub predicate: Option<GraphQueryPredicateRef>, + + /// Optional requested scopes. + pub scopes: Option<Vec<String>>, + #[serde(with = "crate::time_serde::option")] + /// Point-in-time view for temporal facts. + pub as_of: Option<OffsetDateTime>, + /// Optional maximum number of returned facts. + pub limit: Option<u32>, + /// When true, includes explain metadata. + pub explain: Option<bool>, +} + +/// Response payload for graph-query lookups. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryResponse { + #[serde(with = "crate::time_serde")] + /// Effective point-in-time view used for the query. + pub as_of: OffsetDateTime, + /// Resolved subject entity. + pub subject: GraphQueryEntity, + #[serde(skip_serializing_if = "Option::is_none")] + /// Resolved predicate, when the request filtered by predicate. + pub predicate: Option<GraphQueryPredicate>, + /// Effective scopes used for the query. + pub scopes: Vec<String>, + /// Whether the result set was truncated by the limit. + pub truncated: bool, + /// Returned fact rows. + pub facts: Vec<GraphQueryFact>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional explain metadata. + pub explain: Option<GraphQueryExplain>, +} + +/// Resolved graph entity reference. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryEntity { + /// Entity identifier. + pub entity_id: Uuid, + /// Canonical entity surface. + pub canonical: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional entity kind. + pub kind: Option<String>, +} + +/// Resolved graph predicate reference. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryPredicate { + /// Predicate identifier. + pub predicate_id: Uuid, + /// Canonical predicate surface. + pub canonical: String, +} + +/// One graph fact returned by the query. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryFact { + /// Fact identifier. + pub fact_id: Uuid, + /// Scope key for the fact. + pub scope: String, + /// Agent that emitted the fact. + pub actor: String, + /// Predicate surface recorded on the fact. + pub predicate: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Resolved predicate identifier, when available. + pub predicate_id: Option<Uuid>, + #[serde(with = "crate::time_serde")] + /// Start of the fact validity window. + pub valid_from: OffsetDateTime, + #[serde(with = "crate::time_serde::option")] + /// End of the fact validity window, if superseded. + pub valid_to: Option<OffsetDateTime>, + /// Temporal state for the fact relative to the service read timestamp. + pub temporal_status: RelationTemporalStatus, + /// Object payload for the fact. + pub object: GraphQueryObject, + /// Evidence note identifiers supporting the fact. + pub evidence_note_ids: Vec<Uuid>, +} + +/// Object payload returned for a graph fact. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryObject { + #[serde(skip_serializing_if = "Option::is_none")] + /// Entity-shaped object value. + pub entity: Option<GraphQueryObjectEntity>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Scalar object value. + pub value: Option<String>, +} + +/// Resolved entity payload for a graph-fact object. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryObjectEntity { + /// Entity identifier. + pub entity_id: Uuid, + /// Canonical entity surface. + pub canonical: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional entity kind. + pub kind: Option<String>, +} + +/// Explain metadata for a graph-query response. +#[derive(Clone, Debug, Serialize)] +pub struct GraphQueryExplain { + /// Explain schema identifier. + pub schema: String, + #[serde(with = "crate::time_serde")] + /// Effective point-in-time view used for the query. + pub as_of: OffsetDateTime, + /// Requested result limit. + pub requested_limit: u32, + /// Scopes allowed by the read profile. + pub allowed_scopes: Vec<String>, + /// Scopes effectively queried after request filtering. + pub effective_scopes: Vec<String>, + /// Number of rows read from storage. + pub queried_rows: usize, + /// Number of rows returned to the caller. + pub returned_rows: usize, + /// Whether the result set was truncated by the limit. + pub truncated: bool, +} + +#[derive(Debug)] +struct PreparedGraphQuery { + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + subject: GraphQueryEntityRef, + predicate: Option<GraphQueryPredicateRef>, + requested_scopes: Vec<String>, + as_of: OffsetDateTime, + limit: usize, + explain: bool, +} + +#[derive(Debug)] +struct ResolvedGraphQuerySubject { + entity_id: Uuid, + canonical: String, + kind: Option<String>, +} + +#[derive(Debug)] +struct ResolvedGraphQueryPredicate { + id: Uuid, + canonical: String, +} + +#[derive(Debug)] +struct GraphQueryRowsFetchParams<'a> { + tenant_id: &'a str, + project_id: &'a str, + subject_entity_id: Uuid, + scopes: &'a [String], + as_of: OffsetDateTime, + actor: &'a str, + shared_scope_keys: &'a [String], + predicate_id: Option<Uuid>, + limit_plus_one: i64, +} + +#[derive(Debug, FromRow)] +struct GraphQueryFactRow { + fact_id: Uuid, + scope: String, + actor: String, + predicate: String, + predicate_id: Option<Uuid>, + object_entity_id: Option<Uuid>, + object_canonical: Option<String>, + object_kind: Option<String>, + object_value: Option<String>, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + evidence_note_ids: Vec<Uuid>, +} + +impl ElfService { + /// Resolves a subject and returns active graph facts visible to the caller. + pub async fn graph_query(&self, req: GraphQueryRequest) -> Result<GraphQueryResponse> { + let prepared = validate_graph_query_request(req)?; + let allowed_scopes = + search::resolve_read_profile_scopes(&self.cfg, prepared.read_profile.as_str())?; + let effective_scopes = + resolve_effective_scopes(&allowed_scopes, prepared.requested_scopes.as_slice())?; + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope.trim() == "org_shared"); + let mut conn = self.db.pool.acquire().await?; + let subject = + resolve_subject(&mut conn, &prepared.tenant_id, &prepared.project_id, prepared.subject) + .await?; + let predicate = resolve_predicate( + &mut conn, + &prepared.tenant_id, + &prepared.project_id, + prepared.predicate, + ) + .await?; + let shared_grants = access::load_shared_read_grants_with_org_shared( + conn.as_mut(), + prepared.tenant_id.as_str(), + prepared.project_id.as_str(), + prepared.agent_id.as_str(), + org_shared_allowed, + ) + .await?; + let shared_scope_keys: Vec<String> = shared_grants + .into_iter() + .map(|item| format!("{}:{}", item.scope, item.space_owner_agent_id)) + .collect(); + let predicate_id = predicate.as_ref().map(|predicate| predicate.id); + let read_at = OffsetDateTime::now_utc(); + let rows = fetch_graph_query_rows( + &mut conn, + GraphQueryRowsFetchParams { + tenant_id: prepared.tenant_id.as_str(), + project_id: prepared.project_id.as_str(), + subject_entity_id: subject.entity_id, + scopes: effective_scopes.as_slice(), + as_of: prepared.as_of, + actor: prepared.agent_id.as_str(), + shared_scope_keys: shared_scope_keys.as_slice(), + predicate_id, + limit_plus_one: (prepared.limit as i64) + 1, + }, + ) + .await?; + let facts: Vec<GraphQueryFact> = rows + .into_iter() + .map(|row| { + let object = if let Some(entity_id) = row.object_entity_id { + GraphQueryObject { + entity: Some(GraphQueryObjectEntity { + entity_id, + canonical: row.object_canonical.unwrap_or_else(|| "".to_string()), + kind: row.object_kind, + }), + value: None, + } + } else { + GraphQueryObject { entity: None, value: row.object_value } + }; + + GraphQueryFact { + fact_id: row.fact_id, + scope: row.scope, + actor: row.actor, + predicate: row.predicate, + predicate_id: row.predicate_id, + valid_from: row.valid_from, + valid_to: row.valid_to, + temporal_status: crate::graph::relation_temporal_status( + row.valid_from, + row.valid_to, + read_at, + ), + object, + evidence_note_ids: row.evidence_note_ids, + } + }) + .collect(); + let queried_rows = facts.len(); + let (facts, truncated) = truncate_graph_query_facts(facts, prepared.limit); + let explain = if prepared.explain { + Some(build_graph_query_explain( + prepared.as_of, + &allowed_scopes, + &effective_scopes, + prepared.limit, + queried_rows, + facts.len(), + truncated, + )) + } else { + None + }; + + Ok(GraphQueryResponse { + as_of: prepared.as_of, + subject: GraphQueryEntity { + entity_id: subject.entity_id, + canonical: subject.canonical, + kind: subject.kind, + }, + predicate: predicate.map(|resolved| GraphQueryPredicate { + predicate_id: resolved.id, + canonical: resolved.canonical, + }), + scopes: effective_scopes, + truncated, + facts, + explain, + }) + } +} + +pub(crate) fn resolve_effective_scopes( + allowed_scopes: &[String], + requested_scopes: &[String], +) -> Result<Vec<String>> { + let allowed = allowed_scopes + .iter() + .map(|scope| scope.trim()) + .filter(|scope| !scope.is_empty()) + .collect::<Vec<_>>(); + + if allowed.is_empty() { + return Err(Error::InvalidRequest { + message: "read_profile resolves to no readable scopes.".to_string(), + }); + } + if requested_scopes.is_empty() { + let mut deduped = Vec::with_capacity(allowed.len()); + + for scope in allowed { + if !deduped.iter().any(|value| value == scope) { + deduped.push(scope.to_string()); + } + } + + return Ok(deduped); + } + + let mut effective = Vec::new(); + + for requested_scope in requested_scopes { + if !allowed.iter().any(|scope| scope == requested_scope) { + return Err(Error::InvalidRequest { + message: format!("scope is not readable under read_profile: {}", requested_scope), + }); + } + if !effective.iter().any(|scope| scope == requested_scope) { + effective.push(requested_scope.to_string()); + } + } + + Ok(effective) +} + +pub(crate) fn truncate_graph_query_facts( + mut facts: Vec<GraphQueryFact>, + limit: usize, +) -> (Vec<GraphQueryFact>, bool) { + let truncated = facts.len() > limit; + + if truncated { + facts.truncate(limit); + } + + (facts, truncated) +} + +pub(crate) fn build_graph_query_explain( + as_of: OffsetDateTime, + allowed_scopes: &[String], + effective_scopes: &[String], + requested_limit: usize, + queried_rows: usize, + returned_rows: usize, + truncated: bool, +) -> GraphQueryExplain { + GraphQueryExplain { + schema: ELF_GRAPH_QUERY_SCHEMA_V1.to_string(), + as_of, + requested_limit: requested_limit as u32, + allowed_scopes: allowed_scopes.to_vec(), + effective_scopes: effective_scopes.to_vec(), + queried_rows, + returned_rows, + truncated, + } +} + +fn validate_graph_query_request(req: GraphQueryRequest) -> Result<PreparedGraphQuery> { + let tenant_id = normalize_required_field(req.tenant_id.as_str(), "tenant_id")?; + let project_id = normalize_required_field(req.project_id.as_str(), "project_id")?; + let agent_id = normalize_required_field(req.agent_id.as_str(), "agent_id")?; + let read_profile = normalize_required_field(req.read_profile.as_str(), "read_profile")?; + let subject = match req.subject { + GraphQueryEntityRef::EntityId { entity_id } => GraphQueryEntityRef::EntityId { entity_id }, + GraphQueryEntityRef::Surface { surface } => { + let surface = normalize_required_field(surface.as_str(), "subject.surface")?; + + GraphQueryEntityRef::Surface { surface } + }, + }; + let predicate = match req.predicate { + Some(GraphQueryPredicateRef::PredicateId { predicate_id }) => + Some(GraphQueryPredicateRef::PredicateId { predicate_id }), + Some(GraphQueryPredicateRef::Surface { surface }) => { + let surface = normalize_required_field(surface.as_str(), "predicate.surface")?; + + Some(GraphQueryPredicateRef::Surface { surface }) + }, + None => None, + }; + let requested_scopes = normalize_scopes(req.scopes)?; + let limit = req.limit.unwrap_or(DEFAULT_GRAPH_QUERY_LIMIT); + + if !matches!(limit, 1..=MAX_GRAPH_QUERY_LIMIT) { + return Err(Error::InvalidRequest { + message: format!("limit must be between 1 and {MAX_GRAPH_QUERY_LIMIT}."), + }); + } + + Ok(PreparedGraphQuery { + tenant_id, + project_id, + agent_id, + read_profile, + subject, + predicate, + requested_scopes, + as_of: req.as_of.unwrap_or_else(OffsetDateTime::now_utc), + limit: limit as usize, + explain: req.explain.unwrap_or(false), + }) +} + +fn normalize_required_field(value: &str, field: &str) -> Result<String> { + let trimmed = value.trim(); + + if trimmed.is_empty() { + return Err(Error::InvalidRequest { message: format!("{field} is required.") }); + } + + Ok(trimmed.to_string()) +} + +fn normalize_scopes(scopes: Option<Vec<String>>) -> Result<Vec<String>> { + let scopes = scopes.unwrap_or_default(); + let mut seen = HashSet::new(); + let mut normalized = Vec::new(); + + for scope in scopes { + let scope = scope.trim().to_string(); + + if scope.is_empty() { + return Err(Error::InvalidRequest { + message: "scopes entries must be non-empty strings.".to_string(), + }); + } + if seen.insert(scope.clone()) { + normalized.push(scope); + } + } + + Ok(normalized) +} + +async fn resolve_subject( + conn: &mut PgConnection, + tenant_id: &str, + project_id: &str, + subject: GraphQueryEntityRef, +) -> Result<ResolvedGraphQuerySubject> { + match subject { + GraphQueryEntityRef::EntityId { entity_id } => { + let row = sqlx::query_as::<_, GraphEntity>( + "\ +SELECT + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind, + created_at, + updated_at +FROM graph_entities +WHERE tenant_id = $1 + AND project_id = $2 + AND entity_id = $3", + ) + .bind(tenant_id) + .bind(project_id) + .bind(entity_id) + .fetch_optional(conn) + .await?; + let Some(row) = row else { + return Err(Error::NotFound { + message: format!("graph entity not found for subject entity_id={entity_id}"), + }); + }; + + Ok(ResolvedGraphQuerySubject { + entity_id: row.entity_id, + canonical: row.canonical, + kind: row.kind, + }) + }, + GraphQueryEntityRef::Surface { surface } => { + let Some(row) = + graph::resolve_entity_by_surface(conn, tenant_id, project_id, &surface).await? + else { + return Err(Error::NotFound { + message: format!("graph entity not found for subject surface={surface}"), + }); + }; + + Ok(ResolvedGraphQuerySubject { + entity_id: row.entity_id, + canonical: row.canonical, + kind: row.kind, + }) + }, + } +} + +async fn resolve_predicate( + conn: &mut PgConnection, + tenant_id: &str, + project_id: &str, + predicate: Option<GraphQueryPredicateRef>, +) -> Result<Option<ResolvedGraphQueryPredicate>> { + let Some(predicate) = predicate else { + return Ok(None); + }; + + match predicate { + GraphQueryPredicateRef::PredicateId { predicate_id } => { + let row = graph::get_predicate_by_id(conn, predicate_id).await?; + let Some(row) = row else { + return Err(Error::NotFound { + message: format!("graph predicate not found: {predicate_id}"), + }); + }; + + Ok(Some(ResolvedGraphQueryPredicate { id: row.predicate_id, canonical: row.canonical })) + }, + GraphQueryPredicateRef::Surface { surface } => { + let Some(row) = + graph::resolve_predicate_no_register(conn, tenant_id, project_id, &surface).await? + else { + return Err(Error::NotFound { + message: format!("graph predicate not found for surface={surface}"), + }); + }; + + Ok(Some(ResolvedGraphQueryPredicate { id: row.predicate_id, canonical: row.canonical })) + }, + } +} + +async fn fetch_graph_query_rows( + conn: &mut PgConnection, + params: GraphQueryRowsFetchParams<'_>, +) -> Result<Vec<GraphQueryFactRow>> { + let GraphQueryRowsFetchParams { + tenant_id, + project_id, + subject_entity_id, + scopes, + as_of, + actor, + shared_scope_keys, + predicate_id, + limit_plus_one, + } = params; + let rows = sqlx::query_as::<_, GraphQueryFactRow>(GRAPH_QUERY_FACTS_SQL) + .bind(tenant_id) + .bind(project_id) + .bind(subject_entity_id) + .bind(scopes) + .bind(as_of) + .bind(actor) + .bind(shared_scope_keys) + .bind(limit_plus_one) + .bind(GRAPH_QUERY_EVIDENCE_LIMIT) + .bind(ORG_PROJECT_ID) + .bind(predicate_id) + .fetch_all(conn) + .await?; + + Ok(rows) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use uuid::Uuid; + + use crate::{ + ELF_GRAPH_QUERY_SCHEMA_V1, Error, GraphQueryFact, GraphQueryObject, GraphQueryObjectEntity, + graph::RelationTemporalStatus, + graph_query::{self, GraphQueryEntityRef, GraphQueryRequest, OffsetDateTime}, + }; + + fn base_request() -> GraphQueryRequest { + GraphQueryRequest { + tenant_id: "tenant".to_string(), + project_id: "project".to_string(), + agent_id: "agent".to_string(), + read_profile: "private_plus_project".to_string(), + subject: GraphQueryEntityRef::Surface { surface: "Alice".to_string() }, + predicate: None, + scopes: None, + as_of: None, + limit: Some(10), + explain: Some(true), + } + } + + #[test] + fn test_validate_graph_query_request_rejects_invalid_fields() { + let mut request = base_request(); + + request.subject = GraphQueryEntityRef::Surface { surface: " ".to_string() }; + + let err = graph_query::validate_graph_query_request(request) + .expect_err("invalid subject should fail"); + + assert!(matches!(err, Error::InvalidRequest { .. }), "expected invalid request error"); + } + + #[test] + fn test_truncate_graph_query_facts_and_explain_shaping() { + let facts = vec![ + GraphQueryFact { + fact_id: Uuid::from_u128(1), + scope: "project_shared".to_string(), + actor: "agent1".to_string(), + predicate: "knows".to_string(), + predicate_id: None, + valid_from: OffsetDateTime::from_unix_timestamp(1).expect("valid timestamp"), + valid_to: None, + temporal_status: RelationTemporalStatus::Current, + object: GraphQueryObject { + entity: Some(GraphQueryObjectEntity { + entity_id: Uuid::from_u128(100), + canonical: "Bob".to_string(), + kind: Some("person".to_string()), + }), + value: None, + }, + evidence_note_ids: vec![], + }, + GraphQueryFact { + fact_id: Uuid::from_u128(2), + scope: "project_shared".to_string(), + actor: "agent1".to_string(), + predicate: "likes".to_string(), + predicate_id: None, + valid_from: OffsetDateTime::from_unix_timestamp(2).expect("valid timestamp"), + valid_to: None, + temporal_status: RelationTemporalStatus::Current, + object: GraphQueryObject { + entity: Some(GraphQueryObjectEntity { + entity_id: Uuid::from_u128(101), + canonical: "Carol".to_string(), + kind: Some("person".to_string()), + }), + value: None, + }, + evidence_note_ids: vec![], + }, + GraphQueryFact { + fact_id: Uuid::from_u128(3), + scope: "project_shared".to_string(), + actor: "agent2".to_string(), + predicate: "located_in".to_string(), + predicate_id: None, + valid_from: OffsetDateTime::from_unix_timestamp(3).expect("valid timestamp"), + valid_to: None, + temporal_status: RelationTemporalStatus::Current, + object: GraphQueryObject { entity: None, value: Some("office".to_string()) }, + evidence_note_ids: vec![], + }, + ]; + let (trimmed, truncated) = graph_query::truncate_graph_query_facts(facts, 2); + + assert!(truncated); + assert_eq!(trimmed.len(), 2); + + let explain = graph_query::build_graph_query_explain( + OffsetDateTime::from_unix_timestamp(4).expect("valid timestamp"), + &["private_plus_project".to_string()], + &["private_plus_project".to_string()], + 2, + 3, + trimmed.len(), + truncated, + ); + + assert_eq!(explain.queried_rows, 3); + assert_eq!(explain.returned_rows, 2); + assert!(explain.truncated); + assert_eq!(explain.schema, ELF_GRAPH_QUERY_SCHEMA_V1); + } + + #[test] + fn test_resolve_effective_scopes_validates_requested_scopes() { + let allowed = vec![ + "agent_private".to_string(), + "project_shared".to_string(), + "org_shared".to_string(), + ]; + let requested = vec!["project_shared".to_string(), "project_shared".to_string()]; + let resolved = + graph_query::resolve_effective_scopes(&allowed, &requested).expect("valid scopes"); + let deduped: HashSet<_> = resolved.iter().collect(); + + assert_eq!(resolved, vec!["project_shared".to_string()]); + assert_eq!(deduped.len(), 1); + } +} diff --git a/packages/elf-service/src/ingest_audit.rs b/packages/elf-service/src/ingest_audit.rs new file mode 100644 index 00000000..77b2d5f6 --- /dev/null +++ b/packages/elf-service/src/ingest_audit.rs @@ -0,0 +1,154 @@ +use sqlx::{Postgres, Transaction}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{NoteOp, Result}; +use elf_domain::{memory_policy::MemoryPolicyDecision, writegate::WritePolicyAudit}; + +pub(crate) struct IngestAuditArgs<'a> { + pub tenant_id: &'a str, + pub project_id: &'a str, + pub agent_id: &'a str, + pub scope: &'a str, + pub pipeline: &'a str, + pub note_type: &'a str, + pub note_key: Option<&'a str>, + pub note_id: Option<Uuid>, + pub note_version_id: Option<Uuid>, + pub base_decision: MemoryPolicyDecision, + pub policy_decision: MemoryPolicyDecision, + pub note_op: NoteOp, + pub reason_code: Option<&'a str>, + pub similarity_best: Option<f32>, + pub key_match: bool, + pub matched_dup: bool, + pub dup_sim_threshold: f32, + pub update_sim_threshold: f32, + pub confidence: f32, + pub importance: f32, + pub structured_present: bool, + pub graph_present: bool, + pub policy_rule: Option<&'a str>, + pub min_confidence: Option<f32>, + pub min_importance: Option<f32>, + pub write_policy_audits: Option<Vec<WritePolicyAudit>>, + pub ingestion_profile_id: Option<&'a str>, + pub ingestion_profile_version: Option<i32>, + pub ts: OffsetDateTime, +} + +pub(crate) async fn insert_ingest_decision( + tx: &mut Transaction<'_, Postgres>, + args: IngestAuditArgs<'_>, +) -> Result<()> { + let IngestAuditArgs { + tenant_id, + project_id, + agent_id, + scope, + pipeline, + note_type, + note_key, + note_id, + note_version_id, + base_decision, + policy_decision, + note_op, + reason_code, + similarity_best, + key_match, + matched_dup, + dup_sim_threshold, + update_sim_threshold, + confidence, + importance, + structured_present, + graph_present, + policy_rule, + min_confidence, + min_importance, + write_policy_audits, + ingestion_profile_id, + ingestion_profile_version, + ts, + } = args; + + sqlx::query( + "\ +INSERT INTO memory_ingest_decisions ( + decision_id, + tenant_id, + project_id, + agent_id, + scope, + pipeline, + note_type, + note_key, + note_id, + note_version_id, + base_decision, + policy_decision, + note_op, + reason_code, + details, + ts +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16)", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(scope) + .bind(pipeline) + .bind(note_type) + .bind(note_key) + .bind(note_id) + .bind(note_version_id) + .bind(memory_policy_decision_to_str(base_decision)) + .bind(memory_policy_decision_to_str(policy_decision)) + .bind(note_op_to_str(note_op)) + .bind(reason_code) + .bind(serde_json::json!({ + "similarity_best": similarity_best, + "key_match": key_match, + "matched_dup": matched_dup, + "dup_sim_threshold": dup_sim_threshold, + "update_sim_threshold": update_sim_threshold, + "confidence": confidence, + "importance": importance, + "structured_present": structured_present, + "graph_present": graph_present, + "policy_rule": policy_rule, + "min_confidence": min_confidence, + "min_importance": min_importance, + "write_policy_audits": write_policy_audits, + "ingestion_profile": ingestion_profile_id.zip(ingestion_profile_version).map( + |(id, version)| serde_json::json!({ "id": id, "version": version }), + ), + })) + .bind(ts) + .execute(&mut **tx) + .await?; + + Ok(()) +} + +fn memory_policy_decision_to_str(decision: MemoryPolicyDecision) -> &'static str { + match decision { + MemoryPolicyDecision::Remember => "remember", + MemoryPolicyDecision::Update => "update", + MemoryPolicyDecision::Ignore => "ignore", + MemoryPolicyDecision::Reject => "reject", + } +} + +fn note_op_to_str(op: NoteOp) -> &'static str { + match op { + NoteOp::Add => "ADD", + NoteOp::Update => "UPDATE", + NoteOp::None => "NONE", + NoteOp::Delete => "DELETE", + NoteOp::Rejected => "REJECTED", + } +} diff --git a/packages/elf-service/src/ingestion_profiles.rs b/packages/elf-service/src/ingestion_profiles.rs new file mode 100644 index 00000000..3955c856 --- /dev/null +++ b/packages/elf-service/src/ingestion_profiles.rs @@ -0,0 +1,887 @@ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{FromRow, PgPool}; +use time::OffsetDateTime; + +use crate::{ElfService, Error, Result}; +use elf_config::LlmProviderConfig; + +const ADD_EVENT_PIPELINE: &str = "add_event"; +const DEFAULT_PROFILE_ID: &str = "default"; +const DEFAULT_PROFILE_VERSION: i32 = 1; + +/// Selector for an ingestion profile and optional version. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct IngestionProfileSelector { + /// Profile identifier. + pub id: String, + /// Optional explicit version. + pub version: Option<i32>, +} + +/// Resolved ingestion-profile reference. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct IngestionProfileRef { + /// Profile identifier. + pub id: String, + /// Resolved version. + pub version: i32, +} + +/// Request payload for creating an ingestion profile version. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileCreateRequest { + /// Tenant that owns the profile. + pub tenant_id: String, + /// Project that owns the profile. + pub project_id: String, + /// Profile identifier. + pub profile_id: String, + /// Optional explicit version number. + pub version: Option<i32>, + /// JSON profile payload. + pub profile: Value, + /// Actor creating the profile version. + pub created_by: String, +} + +/// Request payload for listing ingestion profiles. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileListRequest { + /// Tenant that owns the profiles. + pub tenant_id: String, + /// Project that owns the profiles. + pub project_id: String, +} + +/// Request payload for fetching one ingestion profile. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileGetRequest { + /// Tenant that owns the profile. + pub tenant_id: String, + /// Project that owns the profile. + pub project_id: String, + /// Profile identifier. + pub profile_id: String, + /// Optional explicit version. + pub version: Option<i32>, +} + +/// Request payload for listing all versions of one ingestion profile. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileVersionsListRequest { + /// Tenant that owns the profile. + pub tenant_id: String, + /// Project that owns the profile. + pub project_id: String, + /// Profile identifier. + pub profile_id: String, +} + +/// Request payload for reading the default ingestion profile pointer. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileDefaultGetRequest { + /// Tenant that owns the default pointer. + pub tenant_id: String, + /// Project that owns the default pointer. + pub project_id: String, +} + +/// Request payload for updating the default ingestion profile pointer. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct AdminIngestionProfileDefaultSetRequest { + /// Tenant that owns the default pointer. + pub tenant_id: String, + /// Project that owns the default pointer. + pub project_id: String, + /// Profile identifier to make default. + pub profile_id: String, + /// Optional explicit version to make default. + pub version: Option<i32>, +} + +/// Response payload for one ingestion profile version. +#[derive(Clone, Debug, Serialize)] +pub struct AdminIngestionProfileResponse { + /// Profile identifier. + pub profile_id: String, + /// Profile version. + pub version: i32, + /// JSON profile payload. + pub profile: Value, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Actor that created the version. + pub created_by: String, +} + +/// Summary row for an ingestion profile version. +#[derive(Clone, Debug, Serialize)] +pub struct AdminIngestionProfileSummary { + /// Profile identifier. + pub profile_id: String, + /// Profile version. + pub version: i32, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Actor that created the version. + pub created_by: String, +} + +/// Response payload for listing ingestion profiles. +#[derive(Clone, Debug, Serialize)] +pub struct AdminIngestionProfilesListResponse { + /// Returned profile summaries. + pub profiles: Vec<AdminIngestionProfileSummary>, +} + +/// Response payload for listing versions of one ingestion profile. +#[derive(Clone, Debug, Serialize)] +pub struct AdminIngestionProfileVersionsListResponse { + /// Returned profile-version summaries. + pub profiles: Vec<AdminIngestionProfileSummary>, +} + +/// Response payload for reading the default ingestion profile pointer. +#[derive(Clone, Debug, Serialize)] +pub struct AdminIngestionProfileDefaultResponse { + /// Default profile identifier. + pub profile_id: String, + /// Default profile version, when pinned. + pub version: Option<i32>, + #[serde(with = "crate::time_serde")] + /// Last update timestamp for the default pointer. + pub updated_at: OffsetDateTime, +} + +#[derive(Clone, Debug)] +pub(crate) struct ResolvedIngestionProfile { + pub profile_ref: IngestionProfileRef, + pub prompt_schema: Value, + pub prompt_system: String, + pub prompt_user_template: String, + pub model: Option<String>, + pub temperature: Option<f32>, + pub timeout_ms: Option<u64>, +} +impl ResolvedIngestionProfile { + pub(crate) fn build_extractor_messages( + &self, + messages_json: &str, + max_notes: u32, + max_note_chars: u32, + ) -> Result<Vec<Value>> { + let schema = + serde_json::to_string(&self.prompt_schema).map_err(|_| Error::InvalidRequest { + message: "Failed to serialize ingestion profile schema.".to_string(), + })?; + let user_prompt = self + .prompt_user_template + .replace("{SCHEMA}", &schema) + .replace("{MAX_NOTES}", max_notes.to_string().as_str()) + .replace("{MAX_NOTE_CHARS}", max_note_chars.to_string().as_str()) + .replace("{MESSAGES_JSON}", messages_json); + + Ok(vec![ + serde_json::json!({ "role": "system", "content": self.prompt_system.clone() }), + serde_json::json!({ "role": "user", "content": user_prompt }), + ]) + } + + pub(crate) fn resolved_llm_config(&self, base: &LlmProviderConfig) -> LlmProviderConfig { + LlmProviderConfig { + provider_id: base.provider_id.clone(), + api_base: base.api_base.clone(), + api_key: base.api_key.clone(), + path: base.path.clone(), + model: self.model.clone().unwrap_or_else(|| base.model.clone()), + temperature: self.temperature.unwrap_or(base.temperature), + timeout_ms: self.timeout_ms.unwrap_or(base.timeout_ms), + default_headers: base.default_headers.clone(), + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct IngestionProfileV1 { + #[serde(default = "default_schema_version")] + schema_version: i32, + + prompt_schema: Option<Value>, + + prompt_system_template: Option<String>, + + prompt_user_template: Option<String>, + + model: Option<String>, + + temperature: Option<f32>, + + timeout_ms: Option<u64>, +} +impl IngestionProfileV1 { + fn with_defaults(self) -> Self { + let defaults = builtin_profile_v1(); + let mut merged = defaults; + + if self.schema_version != 0 { + merged.schema_version = self.schema_version; + } + + merged.prompt_schema = self.prompt_schema.or(merged.prompt_schema); + merged.prompt_system_template = + self.prompt_system_template.or(merged.prompt_system_template); + merged.prompt_user_template = self.prompt_user_template.or(merged.prompt_user_template); + merged.model = self.model.or(merged.model); + merged.temperature = self.temperature.or(merged.temperature); + merged.timeout_ms = self.timeout_ms.or(merged.timeout_ms); + + merged + } +} + +#[derive(FromRow)] +struct ProfileRow { + profile_id: String, + version: i32, + profile: Value, +} + +#[derive(FromRow)] +struct ProfileMetadataRow { + profile_id: String, + version: i32, + profile: Value, + created_at: OffsetDateTime, + created_by: String, +} + +#[derive(FromRow)] +struct ProfileSummaryRow { + profile_id: String, + version: i32, + created_at: OffsetDateTime, + created_by: String, +} + +#[derive(FromRow)] +struct ProfileDefaultRow { + profile_id: String, + version: Option<i32>, + updated_at: OffsetDateTime, +} + +impl ElfService { + /// Creates a new ingestion profile version. + pub async fn admin_ingestion_profile_create( + &self, + req: AdminIngestionProfileCreateRequest, + ) -> Result<AdminIngestionProfileResponse> { + let profile_id = req.profile_id.trim().to_string(); + let created_by = req.created_by.trim().to_string(); + + if profile_id.is_empty() { + return Err(Error::InvalidRequest { + message: "profile_id must be non-empty.".to_string(), + }); + } + if created_by.is_empty() { + return Err(Error::InvalidRequest { + message: "created_by must be non-empty.".to_string(), + }); + } + if !req.profile.is_object() { + return Err(Error::InvalidRequest { + message: "profile must be a JSON object.".to_string(), + }); + } + + let _ = parse_profile(req.profile.clone())?; + let version = match req.version { + Some(version) if version > 0 => version, + Some(_) => { + return Err(Error::InvalidRequest { + message: "version must be greater than 0.".to_string(), + }); + }, + None => { + sqlx::query_scalar::<_, i32>( + "SELECT COALESCE(MAX(version), 0) + 1 FROM memory_ingestion_profiles WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .bind(profile_id.as_str()) + .fetch_one(&self.db.pool) + .await? + } + }; + let row = sqlx::query_as::<_, ProfileMetadataRow>( + "\ +INSERT INTO memory_ingestion_profiles ( + tenant_id, + project_id, + pipeline, + profile_id, + version, + profile, + created_by +) VALUES ($1,$2,$3,$4,$5,$6::jsonb,$7) +ON CONFLICT DO NOTHING +RETURNING profile_id, version, profile, created_at, created_by", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .bind(profile_id.as_str()) + .bind(version) + .bind(req.profile) + .bind(created_by.as_str()) + .fetch_optional(&self.db.pool) + .await?; + let row = row.ok_or_else(|| Error::Conflict { + message: format!( + "Ingestion profile '{}' version {} already exists for tenant '{}' project '{}' pipeline '{}'.", + profile_id, version, req.tenant_id, req.project_id, ADD_EVENT_PIPELINE, + ), + })?; + + Ok(AdminIngestionProfileResponse { + profile_id: row.profile_id, + version: row.version, + profile: row.profile, + created_at: row.created_at, + created_by: row.created_by, + }) + } + + /// Lists the latest visible ingestion profile versions. + pub async fn admin_ingestion_profiles_list( + &self, + req: AdminIngestionProfileListRequest, + ) -> Result<AdminIngestionProfilesListResponse> { + let rows = sqlx::query_as::<_, ProfileSummaryRow>( + "\ +SELECT DISTINCT ON (profile_id) + profile_id, version, created_at, created_by +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 +ORDER BY profile_id, version DESC", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .fetch_all(&self.db.pool) + .await?; + let profiles = rows + .into_iter() + .map(|row| AdminIngestionProfileSummary { + profile_id: row.profile_id, + version: row.version, + created_at: row.created_at, + created_by: row.created_by, + }) + .collect(); + + Ok(AdminIngestionProfilesListResponse { profiles }) + } + + /// Fetches one ingestion profile version. + pub async fn admin_ingestion_profile_get( + &self, + req: AdminIngestionProfileGetRequest, + ) -> Result<AdminIngestionProfileResponse> { + let selector = IngestionProfileSelector { + id: req.profile_id.trim().to_string(), + version: req.version, + }; + + if selector.id.is_empty() { + return Err(Error::InvalidRequest { + message: "profile_id must be non-empty.".to_string(), + }); + } + + if let Some(version) = selector.version + && version <= 0 + { + return Err(Error::InvalidRequest { + message: "version must be greater than 0.".to_string(), + }); + } + + let row = select_profile_metadata( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + &selector, + ) + .await?; + + Ok(AdminIngestionProfileResponse { + profile_id: row.profile_id, + version: row.version, + profile: row.profile, + created_at: row.created_at, + created_by: row.created_by, + }) + } + + /// Lists all versions for one ingestion profile. + pub async fn admin_ingestion_profile_versions_list( + &self, + req: AdminIngestionProfileVersionsListRequest, + ) -> Result<AdminIngestionProfileVersionsListResponse> { + let profile_id = req.profile_id.trim().to_string(); + + if profile_id.is_empty() { + return Err(Error::InvalidRequest { + message: "profile_id must be non-empty.".to_string(), + }); + } + + let rows = sqlx::query_as::<_, ProfileSummaryRow>( + "\ +SELECT profile_id, version, created_at, created_by +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4 +ORDER BY version DESC", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .bind(profile_id) + .fetch_all(&self.db.pool) + .await?; + let profiles = rows + .into_iter() + .map(|row| AdminIngestionProfileSummary { + profile_id: row.profile_id, + version: row.version, + created_at: row.created_at, + created_by: row.created_by, + }) + .collect(); + + Ok(AdminIngestionProfileVersionsListResponse { profiles }) + } + + /// Reads the default ingestion profile pointer. + pub async fn admin_ingestion_profile_default_get( + &self, + req: AdminIngestionProfileDefaultGetRequest, + ) -> Result<AdminIngestionProfileDefaultResponse> { + seed_default_profile(&self.db.pool, req.tenant_id.as_str(), req.project_id.as_str()) + .await?; + + let row = sqlx::query_as::<_, ProfileDefaultRow>( + "\ +SELECT profile_id, version, updated_at +FROM memory_ingestion_profile_defaults +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .fetch_optional(&self.db.pool) + .await?; + let row = match row { + Some(row) => row, + None => { + let selector = select_default_selector( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + ) + .await?; + + ProfileDefaultRow { + profile_id: selector.id, + version: selector.version, + updated_at: OffsetDateTime::now_utc(), + } + }, + }; + + Ok(AdminIngestionProfileDefaultResponse { + profile_id: row.profile_id, + version: row.version, + updated_at: row.updated_at, + }) + } + + /// Updates the default ingestion profile pointer. + pub async fn admin_ingestion_profile_default_set( + &self, + req: AdminIngestionProfileDefaultSetRequest, + ) -> Result<AdminIngestionProfileDefaultResponse> { + let profile_id = req.profile_id.trim().to_string(); + + if profile_id.is_empty() { + return Err(Error::InvalidRequest { + message: "profile_id must be non-empty.".to_string(), + }); + } + + if let Some(version) = req.version + && version <= 0 + { + return Err(Error::InvalidRequest { + message: "version must be greater than 0.".to_string(), + }); + } + + let selector = IngestionProfileSelector { id: profile_id.clone(), version: req.version }; + let row = select_profile_metadata( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + &selector, + ) + .await?; + let version = row.version; + let row = sqlx::query_as::<_, ProfileDefaultRow>( + "\ +INSERT INTO memory_ingestion_profile_defaults ( + tenant_id, + project_id, + pipeline, + profile_id, + version +) VALUES ($1,$2,$3,$4,$5) +ON CONFLICT (tenant_id, project_id, pipeline) DO UPDATE +SET profile_id = EXCLUDED.profile_id, + version = EXCLUDED.version, + updated_at = now() +RETURNING profile_id, version, updated_at", + ) + .bind(req.tenant_id.as_str()) + .bind(req.project_id.as_str()) + .bind(ADD_EVENT_PIPELINE) + .bind(row.profile_id) + .bind(version) + .fetch_one(&self.db.pool) + .await?; + + Ok(AdminIngestionProfileDefaultResponse { + profile_id: row.profile_id, + version: row.version, + updated_at: row.updated_at, + }) + } +} + +pub(crate) async fn resolve_add_event_profile( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + selector: Option<&IngestionProfileSelector>, +) -> Result<ResolvedIngestionProfile> { + seed_default_profile(pool, tenant_id, project_id).await?; + + let selector = if let Some(selector) = selector { + selector.clone() + } else { + select_default_selector(pool, tenant_id, project_id).await? + }; + let row = select_profile(pool, tenant_id, project_id, &selector).await?; + let parsed = parse_profile(row.profile)?; + let merged = parsed.with_defaults(); + + if merged.schema_version != 1 { + return Err(Error::InvalidRequest { + message: "Unsupported ingestion profile schema version.".to_string(), + }); + } + + let prompt_schema = merged.prompt_schema.ok_or_else(|| Error::InvalidRequest { + message: "Missing prompt schema in ingestion profile.".to_string(), + })?; + let prompt_system_template = + merged.prompt_system_template.ok_or_else(|| Error::InvalidRequest { + message: "Missing system prompt template in ingestion profile.".to_string(), + })?; + let prompt_user_template = + merged.prompt_user_template.ok_or_else(|| Error::InvalidRequest { + message: "Missing user prompt template in ingestion profile.".to_string(), + })?; + + Ok(ResolvedIngestionProfile { + profile_ref: IngestionProfileRef { id: row.profile_id, version: row.version }, + prompt_schema, + prompt_system: prompt_system_template, + prompt_user_template, + model: merged.model, + temperature: merged.temperature, + timeout_ms: merged.timeout_ms, + }) +} + +fn default_schema_version() -> i32 { + 1 +} + +fn parse_profile(profile: Value) -> Result<IngestionProfileV1> { + let parsed = serde_json::from_value::<IngestionProfileV1>(profile.clone()).or_else(|_| { + if profile.is_object() { + Ok(IngestionProfileV1 { + schema_version: 1, + prompt_schema: Some(profile), + prompt_system_template: None, + prompt_user_template: None, + model: None, + temperature: None, + timeout_ms: None, + }) + } else { + Err(Error::InvalidRequest { + message: "Ingestion profile JSON has unsupported format.".to_string(), + }) + } + })?; + + Ok(parsed) +} + +fn builtin_profile_v1() -> IngestionProfileV1 { + IngestionProfileV1 { + schema_version: 1, + prompt_schema: Some(builtin_profile_schema()), + prompt_system_template: Some( + "You are a memory extraction engine for an agent memory system. Output must be valid JSON only and must match the provided schema exactly. \ +Extract at most MAX_NOTES high-signal, cross-session reusable memory notes from the given messages. \ +Each note must be one English sentence and must not contain any non-English text. \ +The structured field is optional. If present, summary must be short, facts must be short sentences supported by the evidence quotes, and concepts must be short phrases. \ +structured.entities and structured.relations should mirror the structured schema with optional entity and relation metadata and relation timestamps. \ +Preserve numbers, dates, percentages, currency amounts, tickers, URLs, and code snippets exactly. \ +Never store secrets or PII: API keys, tokens, private keys, seed phrases, passwords, bank IDs, personal addresses. \ +For every note, provide 1 to 2 evidence quotes copied verbatim from the input messages and include the message_index. \ +If you cannot provide verbatim evidence, omit the note. \ +If content is ephemeral or not useful long-term, return an empty notes array." + .to_string(), + ), + prompt_user_template: Some( + "Return JSON matching this exact schema:\n{SCHEMA}\nConstraints:\n- MAX_NOTES = {MAX_NOTES}\n- MAX_NOTE_CHARS = {MAX_NOTE_CHARS}\nHere are the messages as JSON:\n{MESSAGES_JSON}" + .to_string(), + ), + model: None, + temperature: None, + timeout_ms: None, + } +} + +fn builtin_profile_schema() -> Value { + serde_json::json!({ + "notes": [ + { + "type": "preference|constraint|decision|profile|fact|plan", + "key": "string|null", + "text": "English-only sentence <= MAX_NOTE_CHARS", + "structured": { + "summary": "string|null", + "facts": "string[]|null", + "concepts": "string[]|null", + "entities": [ + { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + } + ], + "relations": [ + { + "subject": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }, + "predicate": "string", + "object": { + "entity": { + "canonical": "string|null", + "kind": "string|null", + "aliases": "string[]|null" + }, + "value": "string|null" + }, + "valid_from": "string|null", + "valid_to": "string|null" + } + ] + }, + "importance": 0.0, + "confidence": 0.0, + "ttl_days": "number|null", + "scope_suggestion": "agent_private|project_shared|org_shared|null", + "evidence": [ + { "message_index": "number", "quote": "string" } + ], + "reason": "string" + } + ] + }) +} + +async fn select_profile_metadata( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + selector: &IngestionProfileSelector, +) -> Result<ProfileMetadataRow> { + let row = if let Some(version) = selector.version { + sqlx::query_as::<_, ProfileMetadataRow>( + "\ +SELECT profile_id, version, profile, created_at, created_by +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4 AND version=$5", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(selector.id.as_str()) + .bind(version) + .fetch_optional(pool) + .await? + } else { + sqlx::query_as::<_, ProfileMetadataRow>( + "\ +SELECT profile_id, version, profile, created_at, created_by +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4 +ORDER BY version DESC +LIMIT 1", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(selector.id.as_str()) + .fetch_optional(pool) + .await? + }; + + row.ok_or_else(|| Error::InvalidRequest { + message: format!( + "Ingestion profile '{}' not found for tenant '{}' project '{}' pipeline '{}'.", + selector.id, tenant_id, project_id, ADD_EVENT_PIPELINE, + ), + }) +} + +async fn select_profile( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + selector: &IngestionProfileSelector, +) -> Result<ProfileRow> { + let row = if let Some(version) = selector.version { + sqlx::query_as::<_, ProfileRow>( + "\ +SELECT profile_id, version, profile +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4 AND version=$5", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(selector.id.as_str()) + .bind(version) + .fetch_optional(pool) + .await? + } else { + sqlx::query_as::<_, ProfileRow>( + "\ +SELECT profile_id, version, profile +FROM memory_ingestion_profiles +WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3 AND profile_id=$4 +ORDER BY version DESC +LIMIT 1", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(selector.id.as_str()) + .fetch_optional(pool) + .await? + }; + + row.ok_or_else(|| Error::InvalidRequest { + message: format!( + "Ingestion profile '{}' not found for tenant '{}' project '{}' pipeline '{}'.", + selector.id, tenant_id, project_id, ADD_EVENT_PIPELINE + ), + }) +} + +async fn select_default_selector( + pool: &PgPool, + tenant_id: &str, + project_id: &str, +) -> Result<IngestionProfileSelector> { + let row = sqlx::query_as::<_, (String, Option<i32>)>( + "SELECT profile_id, version FROM memory_ingestion_profile_defaults WHERE tenant_id=$1 AND project_id=$2 AND pipeline=$3", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .fetch_optional(pool) + .await?; + let row = match row { + Some((profile_id, version)) => IngestionProfileSelector { id: profile_id, version }, + None => IngestionProfileSelector { + id: DEFAULT_PROFILE_ID.to_string(), + version: Some(DEFAULT_PROFILE_VERSION), + }, + }; + + Ok(row) +} + +async fn seed_default_profile(pool: &PgPool, tenant_id: &str, project_id: &str) -> Result<()> { + let profile = + serde_json::to_value(builtin_profile_v1()).map_err(|_| Error::InvalidRequest { + message: "Failed to serialize default ingestion profile.".to_string(), + })?; + + sqlx::query( + "\ +INSERT INTO memory_ingestion_profiles ( + tenant_id, + project_id, + pipeline, + profile_id, + version, + profile +) VALUES ($1,$2,$3,$4,$5,$6::jsonb) +ON CONFLICT DO NOTHING", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(DEFAULT_PROFILE_ID) + .bind(DEFAULT_PROFILE_VERSION) + .bind(profile) + .execute(pool) + .await?; + sqlx::query( + "\ +INSERT INTO memory_ingestion_profile_defaults ( + tenant_id, + project_id, + pipeline, + profile_id, + version +) VALUES ($1,$2,$3,$4,$5) +ON CONFLICT DO NOTHING", + ) + .bind(tenant_id) + .bind(project_id) + .bind(ADD_EVENT_PIPELINE) + .bind(DEFAULT_PROFILE_ID) + .bind(DEFAULT_PROFILE_VERSION) + .execute(pool) + .await?; + + Ok(()) +} diff --git a/packages/elf-service/src/knowledge.rs b/packages/elf-service/src/knowledge.rs new file mode 100644 index 00000000..cdc9b24d --- /dev/null +++ b/packages/elf-service/src/knowledge.rs @@ -0,0 +1,2105 @@ +//! Deterministic derived knowledge page rebuild and readback service APIs. + +use std::collections::{BTreeMap, BTreeSet, HashMap}; + +use serde::{Deserialize, Serialize}; +use serde_json::{self, Map, Value}; +use sqlx::{Postgres, Transaction}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ElfService, Error, Result}; +use elf_domain::{ + english_gate, + knowledge::{ + KNOWLEDGE_PAGE_CONTRACT_SCHEMA_V1, KNOWLEDGE_PAGE_REBUILD_SCHEMA_V1, + KNOWLEDGE_PAGE_SOURCE_COVERAGE_SCHEMA_V1, KnowledgePageKind, KnowledgeSourceKind, + }, +}; +use elf_storage::{ + knowledge::{ + self, KnowledgeEventSource, KnowledgeNoteSource, KnowledgePageLintFindingInsert, + KnowledgePageSearchRow, KnowledgePageSectionInsert, KnowledgePageSourceRefInsert, + KnowledgePageUpsert, KnowledgeProposalSource, KnowledgeRelationSource, + }, + models::{ + KnowledgePage, KnowledgePageLintFinding, KnowledgePageSection, KnowledgePageSourceRef, + }, +}; + +const DEFAULT_LIST_LIMIT: i64 = 50; +const MAX_LIST_LIMIT: i64 = 200; +const SEARCH_SNIPPET_CHARS: usize = 280; + +/// Request to rebuild one derived knowledge page from explicit source ids. +#[derive(Clone, Debug, Deserialize)] +pub struct KnowledgePageRebuildRequest { + /// Tenant that owns the page and source records. + pub tenant_id: String, + /// Project that owns the page and source records. + pub project_id: String, + /// Agent requesting the rebuild. + pub agent_id: String, + /// Page kind. + pub page_kind: KnowledgePageKind, + /// Stable page key within the tenant/project/kind namespace. + pub page_key: String, + /// Optional display title; a deterministic title is generated when omitted. + pub title: Option<String>, + #[serde(default)] + /// Memory note sources to compile into the page. + pub note_ids: Vec<Uuid>, + #[serde(default)] + /// Durable add_event audit source ids to compile into the page. + pub event_ids: Vec<Uuid>, + #[serde(default)] + /// Graph relation fact ids to compile into the page. + pub relation_ids: Vec<Uuid>, + #[serde(default)] + /// Applied consolidation proposal ids to compile into the page. + pub proposal_ids: Vec<Uuid>, + #[serde(default = "empty_object")] + /// Provider metadata for nondeterministic or future LLM-derived rebuilds. + pub provider_metadata: Value, +} + +/// Response returned after rebuilding a derived knowledge page. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageRebuildResponse { + /// Rebuilt page with sections, source refs, and lint findings. + pub page: KnowledgePageResponse, +} + +/// Request to get one derived knowledge page. +#[derive(Clone, Debug, Deserialize)] +pub struct KnowledgePageGetRequest { + /// Tenant that owns the page. + pub tenant_id: String, + /// Project that owns the page. + pub project_id: String, + /// Page identifier. + pub page_id: Uuid, +} + +/// Request to list derived knowledge pages. +#[derive(Clone, Debug, Deserialize)] +pub struct KnowledgePagesListRequest { + /// Tenant that owns the pages. + pub tenant_id: String, + /// Project that owns the pages. + pub project_id: String, + /// Optional page-kind filter. + pub page_kind: Option<KnowledgePageKind>, + /// Maximum number of pages to return. + pub limit: Option<u32>, +} + +/// Response returned by derived knowledge page listing. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePagesListResponse { + /// Returned pages. + pub pages: Vec<KnowledgePageSummary>, +} + +/// Request to lint one derived knowledge page against current source snapshots. +#[derive(Clone, Debug, Deserialize)] +pub struct KnowledgePageLintRequest { + /// Tenant that owns the page. + pub tenant_id: String, + /// Project that owns the page. + pub project_id: String, + /// Page identifier. + pub page_id: Uuid, +} + +/// Request to search derived knowledge page sections. +#[derive(Clone, Debug, Deserialize)] +pub struct KnowledgePageSearchRequest { + /// Tenant that owns the pages. + pub tenant_id: String, + /// Project that owns the pages. + pub project_id: String, + /// English-only query for page title, key, heading, or section content. + pub query: String, + /// Optional page-kind filter. + pub page_kind: Option<KnowledgePageKind>, + /// Maximum number of section snippets to return. + pub limit: Option<u32>, +} + +/// Response returned after linting one knowledge page. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageLintResponse { + /// Page identifier. + pub page_id: Uuid, + /// Current lint findings. + pub findings: Vec<KnowledgePageLintFindingResponse>, +} + +/// Response returned by derived knowledge page section search. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSearchResponse { + /// Matching derived page snippets. + pub items: Vec<KnowledgePageSearchItem>, +} + +/// Summary DTO for one derived knowledge page. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSummary { + /// Page identifier. + pub page_id: Uuid, + /// Tenant that owns the page. + pub tenant_id: String, + /// Project that owns the page. + pub project_id: String, + /// Page kind. + pub page_kind: String, + /// Stable page key. + pub page_key: String, + /// Page title. + pub title: String, + /// Versioned page contract schema. + pub contract_schema: String, + /// Page lifecycle status. + pub status: String, + /// Canonical source snapshot hash. + pub rebuild_source_hash: String, + /// Canonical page content hash. + pub content_hash: String, + /// Source coverage metadata. + pub source_coverage: Value, + /// Rebuild metadata. + pub rebuild_metadata: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Last rebuild timestamp. + pub rebuilt_at: OffsetDateTime, +} +impl From<KnowledgePage> for KnowledgePageSummary { + fn from(page: KnowledgePage) -> Self { + Self { + page_id: page.page_id, + tenant_id: page.tenant_id, + project_id: page.project_id, + page_kind: page.page_kind, + page_key: page.page_key, + title: page.title, + contract_schema: page.contract_schema, + status: page.status, + rebuild_source_hash: page.rebuild_source_hash, + content_hash: page.content_hash, + source_coverage: page.source_coverage, + rebuild_metadata: page.rebuild_metadata, + created_at: page.created_at, + updated_at: page.updated_at, + rebuilt_at: page.rebuilt_at, + } + } +} + +/// Full readback DTO for one derived knowledge page. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageResponse { + /// Page summary. + pub page: KnowledgePageSummary, + /// Page sections. + pub sections: Vec<KnowledgePageSectionResponse>, + /// Normalized source refs. + pub source_refs: Vec<KnowledgePageSourceRefResponse>, + /// Lint findings. + pub lint_findings: Vec<KnowledgePageLintFindingResponse>, +} + +/// Readback DTO for one page section. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSectionResponse { + /// Section identifier. + pub section_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Stable section key. + pub section_key: String, + /// Section heading. + pub heading: String, + /// Section role. + pub role: String, + /// Section content. + pub content: String, + /// Display order. + pub ordinal: i32, + /// Serialized citation array. + pub citations: Value, + /// Reason this section is intentionally unsupported, when present. + pub unsupported_reason: Option<String>, + /// Count of section-local citations. + pub citation_count: usize, + /// Count of normalized source refs attached to this section. + pub source_ref_count: usize, + /// True when the section has both citations and normalized source backlinks. + pub coverage_complete: bool, + /// Section-local normalized source backlinks. + pub source_backlinks: Vec<KnowledgePageSectionSourceBacklink>, + /// Section content hash. + pub content_hash: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} +impl From<KnowledgePageSection> for KnowledgePageSectionResponse { + fn from(section: KnowledgePageSection) -> Self { + Self { + section_id: section.section_id, + page_id: section.page_id, + section_key: section.section_key, + heading: section.heading, + role: section.role, + content: section.content, + ordinal: section.ordinal, + citations: section.citations, + unsupported_reason: section.unsupported_reason, + citation_count: 0, + source_ref_count: 0, + coverage_complete: false, + source_backlinks: Vec::new(), + content_hash: section.content_hash, + created_at: section.created_at, + updated_at: section.updated_at, + } + } +} + +/// Section-local source backlink used by page readback and viewer provenance. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSectionSourceBacklink { + /// Source kind. + pub source_kind: String, + /// Authoritative source identifier. + pub source_id: Uuid, + /// Captured source status. + pub source_status: Option<String>, + /// Captured source update timestamp. + pub source_updated_at: Option<OffsetDateTime>, + /// Captured source content hash. + pub source_content_hash: Option<String>, +} +impl From<&KnowledgePageSourceRef> for KnowledgePageSectionSourceBacklink { + fn from(source_ref: &KnowledgePageSourceRef) -> Self { + Self { + source_kind: source_ref.source_kind.clone(), + source_id: source_ref.source_id, + source_status: source_ref.source_status.clone(), + source_updated_at: source_ref.source_updated_at, + source_content_hash: source_ref.source_content_hash.clone(), + } + } +} + +/// Readback DTO for one normalized source reference. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSourceRefResponse { + /// Source-reference row identifier. + pub ref_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Citing section, when section-scoped. + pub section_id: Option<Uuid>, + /// Source kind. + pub source_kind: String, + /// Authoritative source identifier. + pub source_id: Uuid, + /// Captured source status. + pub source_status: Option<String>, + /// Captured source update timestamp. + pub source_updated_at: Option<OffsetDateTime>, + /// Captured source content hash. + pub source_content_hash: Option<String>, + /// Captured source snapshot. + pub source_snapshot: Value, + /// Citation-local metadata. + pub citation_metadata: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} +impl From<KnowledgePageSourceRef> for KnowledgePageSourceRefResponse { + fn from(source_ref: KnowledgePageSourceRef) -> Self { + Self { + ref_id: source_ref.ref_id, + page_id: source_ref.page_id, + section_id: source_ref.section_id, + source_kind: source_ref.source_kind, + source_id: source_ref.source_id, + source_status: source_ref.source_status, + source_updated_at: source_ref.source_updated_at, + source_content_hash: source_ref.source_content_hash, + source_snapshot: source_ref.source_snapshot, + citation_metadata: source_ref.citation_metadata, + created_at: source_ref.created_at, + } + } +} + +/// Readback DTO for one knowledge page lint finding. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageLintFindingResponse { + /// Lint finding identifier. + pub finding_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Associated section, when available. + pub section_id: Option<Uuid>, + /// Finding type. + pub finding_type: String, + /// Finding severity. + pub severity: String, + /// Source kind associated with the finding, when available. + pub source_kind: Option<String>, + /// Source identifier associated with the finding, when available. + pub source_id: Option<Uuid>, + /// Human-readable finding message. + pub message: String, + /// Structured finding details. + pub details: Value, + /// Operator guidance for repair or rebuild. + pub repair_guidance: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} +impl From<KnowledgePageLintFinding> for KnowledgePageLintFindingResponse { + fn from(finding: KnowledgePageLintFinding) -> Self { + let repair_guidance = + repair_guidance_for_finding_type(finding.finding_type.as_str()).to_string(); + + Self { + finding_id: finding.finding_id, + page_id: finding.page_id, + section_id: finding.section_id, + finding_type: finding.finding_type, + severity: finding.severity, + source_kind: finding.source_kind, + source_id: finding.source_id, + message: finding.message, + repair_guidance, + details: finding.details, + created_at: finding.created_at, + } + } +} + +/// Search result for one derived knowledge page section. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageSearchItem { + /// Result type discriminator for clients that mix pages with notes. + pub result_kind: String, + /// Derived page identifier. + pub page_id: Uuid, + /// Page kind. + pub page_kind: String, + /// Stable page key. + pub page_key: String, + /// Page title. + pub title: String, + /// Page lifecycle status. + pub status: String, + /// Section identifier. + pub section_id: Uuid, + /// Stable section key. + pub section_key: String, + /// Section heading. + pub heading: String, + /// Section role. + pub role: String, + /// Bounded matching section snippet. + pub snippet: String, + /// Section citations for visible provenance. + pub citations: Value, + /// Count of section-local citations. + pub citation_count: usize, + /// Count of normalized source refs attached to this section. + pub source_ref_count: usize, + /// Section-local source refs for backlink readback. + pub source_refs: Vec<KnowledgePageSourceRefResponse>, + /// Page-level source coverage metadata. + pub source_coverage: Value, + /// Page-level rebuild metadata. + pub rebuild_metadata: Value, + /// Lint summary for distinguishing clean, stale, and unsupported pages. + pub lint_summary: KnowledgePageLintSummary, + /// Trust state discriminator for viewer/search clients. + pub trust_state: String, + /// Explicit notice that the result is derived, not authoritative source truth. + pub derived_notice: String, + /// Repair or rebuild guidance when lint or coverage indicates risk. + pub repair_guidance: Option<String>, + /// Page update timestamp. + pub updated_at: OffsetDateTime, + /// Page rebuild timestamp. + pub rebuilt_at: OffsetDateTime, +} + +/// Aggregate lint counts for page search results. +#[derive(Clone, Debug, Serialize)] +pub struct KnowledgePageLintSummary { + /// Error finding count. + pub error_count: i64, + /// Warning finding count. + pub warning_count: i64, + /// Info finding count. + pub info_count: i64, + /// True when at least one error finding exists. + pub has_errors: bool, + /// True when at least one warning finding exists. + pub has_warnings: bool, +} + +#[derive(Clone, Debug)] +struct SourceSnapshot { + kind: KnowledgeSourceKind, + id: Uuid, + status: Option<String>, + updated_at: Option<OffsetDateTime>, + content_hash: Option<String>, + snapshot: Value, + citation_metadata: Value, + line: String, +} + +#[derive(Clone, Debug)] +struct DraftSection { + section_id: Uuid, + section_key: String, + heading: String, + role: String, + content: String, + ordinal: i32, + source_indexes: Vec<usize>, + unsupported_reason: Option<String>, + content_hash: String, + citations: Value, +} + +#[derive(Clone, Debug)] +struct LintDraft { + section_id: Option<Uuid>, + finding_type: String, + severity: String, + source_kind: Option<KnowledgeSourceKind>, + source_id: Option<Uuid>, + message: String, + details: Value, +} + +#[derive(Clone, Debug)] +struct SourceIds { + note_ids: Vec<Uuid>, + event_ids: Vec<Uuid>, + relation_ids: Vec<Uuid>, + proposal_ids: Vec<Uuid>, +} +impl SourceIds { + fn from_request(req: &KnowledgePageRebuildRequest) -> Result<Self> { + let ids = Self { + note_ids: sorted_unique(&req.note_ids), + event_ids: sorted_unique(&req.event_ids), + relation_ids: sorted_unique(&req.relation_ids), + proposal_ids: sorted_unique(&req.proposal_ids), + }; + + ids.validate_non_empty()?; + + Ok(ids) + } + + fn from_source_refs(source_refs: &[KnowledgePageSourceRef]) -> Result<Self> { + let mut note_ids = Vec::new(); + let mut event_ids = Vec::new(); + let mut relation_ids = Vec::new(); + let mut proposal_ids = Vec::new(); + + for source_ref in source_refs { + match KnowledgeSourceKind::parse(source_ref.source_kind.as_str()) { + Some(KnowledgeSourceKind::Note) => note_ids.push(source_ref.source_id), + Some(KnowledgeSourceKind::Event) => event_ids.push(source_ref.source_id), + Some(KnowledgeSourceKind::Relation) => relation_ids.push(source_ref.source_id), + Some(KnowledgeSourceKind::Proposal) => proposal_ids.push(source_ref.source_id), + None => { + return Err(Error::InvalidRequest { + message: "stored knowledge page source kind is invalid".to_string(), + }); + }, + } + } + + Ok(Self { + note_ids: sorted_unique(¬e_ids), + event_ids: sorted_unique(&event_ids), + relation_ids: sorted_unique(&relation_ids), + proposal_ids: sorted_unique(&proposal_ids), + }) + } + + fn validate_non_empty(&self) -> Result<()> { + if self.note_ids.is_empty() + && self.event_ids.is_empty() + && self.relation_ids.is_empty() + && self.proposal_ids.is_empty() + { + return Err(Error::InvalidRequest { + message: "at least one source id is required for a knowledge page rebuild" + .to_string(), + }); + } + + Ok(()) + } + + fn require_counts( + &self, + notes: usize, + events: usize, + relations: usize, + proposals: usize, + ) -> Result<()> { + if notes != self.note_ids.len() + || events != self.event_ids.len() + || relations != self.relation_ids.len() + || proposals != self.proposal_ids.len() + { + return Err(Error::InvalidRequest { + message: + "all requested knowledge page sources must exist and proposals must be applied" + .to_string(), + }); + } + + Ok(()) + } +} + +impl ElfService { + /// Rebuilds and persists one derived knowledge page from explicit source ids. + pub async fn knowledge_page_rebuild( + &self, + req: KnowledgePageRebuildRequest, + ) -> Result<KnowledgePageRebuildResponse> { + validate_context(req.tenant_id.as_str(), req.project_id.as_str(), req.agent_id.as_str())?; + validate_non_empty("page_key", req.page_key.as_str())?; + validate_object("provider_metadata", &req.provider_metadata)?; + + let ids = SourceIds::from_request(&req)?; + let title = + req.title.clone().unwrap_or_else(|| generated_title(req.page_kind, &req.page_key)); + let sources = self.resolve_sources(&req, &ids).await?; + let now = OffsetDateTime::now_utc(); + let source_snapshot = source_snapshot_value(&sources); + let source_hash = hash_json(&source_snapshot)?; + let mut sections = build_sections(&sources)?; + let lint = lint_unsupported_sections(§ions); + + for section in &mut sections { + section.citations = citations_value(section, &sources); + section.content_hash = hash_json(§ion_hash_payload(section))?; + } + + let source_coverage = + source_coverage_value(req.page_kind, &req.page_key, §ions, &sources); + let rebuild_metadata = rebuild_metadata(&source_hash, &req.provider_metadata); + let content_hash = + page_content_hash(&title, §ions, &source_coverage, &rebuild_metadata)?; + let page_id = Uuid::new_v4(); + let mut tx = self.db.pool.begin().await?; + let page = knowledge::upsert_knowledge_page( + &mut *tx, + KnowledgePageUpsert { + page_id, + tenant_id: req.tenant_id.as_str(), + project_id: req.project_id.as_str(), + page_kind: req.page_kind.as_str(), + page_key: req.page_key.as_str(), + title: title.as_str(), + contract_schema: KNOWLEDGE_PAGE_CONTRACT_SCHEMA_V1, + status: "active", + rebuild_source_hash: source_hash.as_str(), + content_hash: content_hash.as_str(), + source_coverage: &source_coverage, + source_snapshot: &source_snapshot, + rebuild_metadata: &rebuild_metadata, + now, + }, + ) + .await?; + + replace_page_children(&mut tx, page.page_id, §ions, &sources, &lint, now).await?; + + tx.commit().await?; + + Ok(KnowledgePageRebuildResponse { page: self.knowledge_page_response(page).await? }) + } + + /// Gets one derived knowledge page with sections, source refs, and lint findings. + pub async fn knowledge_page_get( + &self, + req: KnowledgePageGetRequest, + ) -> Result<KnowledgePageResponse> { + let page = knowledge::get_knowledge_page( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.page_id, + ) + .await? + .ok_or_else(|| Error::NotFound { message: "knowledge page not found".to_string() })?; + + self.knowledge_page_response(page).await + } + + /// Lists derived knowledge pages. + pub async fn knowledge_pages_list( + &self, + req: KnowledgePagesListRequest, + ) -> Result<KnowledgePagesListResponse> { + let page_kind = req.page_kind.map(KnowledgePageKind::as_str); + let pages = knowledge::list_knowledge_pages( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + page_kind, + bounded_limit(req.limit), + ) + .await? + .into_iter() + .map(KnowledgePageSummary::from) + .collect(); + + Ok(KnowledgePagesListResponse { pages }) + } + + /// Searches derived knowledge page sections and returns provenance-rich snippets. + pub async fn knowledge_pages_search( + &self, + req: KnowledgePageSearchRequest, + ) -> Result<KnowledgePageSearchResponse> { + validate_non_empty("tenant_id", req.tenant_id.as_str())?; + validate_non_empty("project_id", req.project_id.as_str())?; + validate_non_empty("query", req.query.as_str())?; + + if !english_gate::is_english_natural_language(req.query.as_str()) { + return Err(Error::NonEnglishInput { field: "$.query".to_string() }); + } + + let query = req.query.trim().to_ascii_lowercase(); + let query_pattern = format!("%{query}%"); + let page_kind = req.page_kind.map(KnowledgePageKind::as_str); + let rows = knowledge::search_knowledge_page_sections( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + page_kind, + query_pattern.as_str(), + bounded_limit(req.limit), + ) + .await?; + let page_ids = sorted_unique(&rows.iter().map(|row| row.page_id).collect::<Vec<_>>()); + let source_refs = + knowledge::list_knowledge_page_source_refs_for_pages(&self.db.pool, &page_ids).await?; + let source_refs_by_section = source_refs_by_section(&source_refs); + let items = rows + .into_iter() + .map(|row| { + let refs = cloned_source_refs(source_refs_by_section.get(&row.section_id)); + + knowledge_page_search_item(row, refs, req.query.as_str()) + }) + .collect(); + + Ok(KnowledgePageSearchResponse { items }) + } + + /// Lints a derived knowledge page against current source snapshots. + pub async fn knowledge_page_lint( + &self, + req: KnowledgePageLintRequest, + ) -> Result<KnowledgePageLintResponse> { + let page = knowledge::get_knowledge_page( + &self.db.pool, + req.tenant_id.as_str(), + req.project_id.as_str(), + req.page_id, + ) + .await? + .ok_or_else(|| Error::NotFound { message: "knowledge page not found".to_string() })?; + let source_refs = + knowledge::list_knowledge_page_source_refs(&self.db.pool, page.page_id).await?; + let sections = knowledge::list_knowledge_page_sections(&self.db.pool, page.page_id).await?; + let mut findings = self.lint_source_refs(&page, &source_refs).await?; + + findings.extend(lint_page_sections(&page, §ions, &source_refs)); + + let now = OffsetDateTime::now_utc(); + let mut tx = self.db.pool.begin().await?; + + knowledge::delete_knowledge_page_lint_findings(&mut *tx, page.page_id).await?; + + for finding in &findings { + insert_lint_finding(&mut tx, page.page_id, finding, now).await?; + } + + tx.commit().await?; + + let persisted = knowledge::list_knowledge_page_lint_findings(&self.db.pool, page.page_id) + .await? + .into_iter() + .map(KnowledgePageLintFindingResponse::from) + .collect(); + + Ok(KnowledgePageLintResponse { page_id: page.page_id, findings: persisted }) + } + + async fn knowledge_page_response(&self, page: KnowledgePage) -> Result<KnowledgePageResponse> { + let page_id = page.page_id; + let section_rows = knowledge::list_knowledge_page_sections(&self.db.pool, page_id).await?; + let source_ref_rows = + knowledge::list_knowledge_page_source_refs(&self.db.pool, page_id).await?; + let source_refs_by_section = source_refs_by_section(&source_ref_rows); + let sections = section_rows + .into_iter() + .map(|section| { + let refs = cloned_source_refs(source_refs_by_section.get(§ion.section_id)); + + section_response(section, refs) + }) + .collect(); + let source_refs = + source_ref_rows.into_iter().map(KnowledgePageSourceRefResponse::from).collect(); + let lint_findings = knowledge::list_knowledge_page_lint_findings(&self.db.pool, page_id) + .await? + .into_iter() + .map(KnowledgePageLintFindingResponse::from) + .collect(); + + Ok(KnowledgePageResponse { + page: KnowledgePageSummary::from(page), + sections, + source_refs, + lint_findings, + }) + } + + async fn resolve_sources( + &self, + req: &KnowledgePageRebuildRequest, + ids: &SourceIds, + ) -> Result<Vec<SourceSnapshot>> { + let (notes, events, relations, proposals) = self + .resolve_existing_source_rows(req.tenant_id.as_str(), req.project_id.as_str(), ids) + .await?; + + ids.require_counts(notes.len(), events.len(), relations.len(), proposals.len())?; + + Ok(source_snapshots(notes, events, relations, proposals)) + } + + async fn resolve_existing_source_rows( + &self, + tenant_id: &str, + project_id: &str, + ids: &SourceIds, + ) -> Result<( + Vec<KnowledgeNoteSource>, + Vec<KnowledgeEventSource>, + Vec<KnowledgeRelationSource>, + Vec<KnowledgeProposalSource>, + )> { + let notes = knowledge::fetch_knowledge_note_sources( + &self.db.pool, + tenant_id, + project_id, + &ids.note_ids, + ) + .await?; + let events = knowledge::fetch_knowledge_event_sources( + &self.db.pool, + tenant_id, + project_id, + &ids.event_ids, + ) + .await?; + let relations = knowledge::fetch_knowledge_relation_sources( + &self.db.pool, + tenant_id, + project_id, + &ids.relation_ids, + ) + .await?; + let proposals = knowledge::fetch_knowledge_proposal_sources( + &self.db.pool, + tenant_id, + project_id, + &ids.proposal_ids, + ) + .await?; + + Ok((notes, events, relations, proposals)) + } + + async fn lint_source_refs( + &self, + page: &KnowledgePage, + source_refs: &[KnowledgePageSourceRef], + ) -> Result<Vec<LintDraft>> { + let ids = SourceIds::from_source_refs(source_refs)?; + let current = self.resolve_current_source_map(page, &ids).await?; + let mut findings = Vec::new(); + + for source_ref in source_refs { + let key = current_key(source_ref.source_kind.as_str(), source_ref.source_id); + let Some(snapshot) = current.get(&key) else { + findings.push(missing_source_finding(source_ref)); + + continue; + }; + + if source_changed(source_ref, snapshot) { + findings.push(stale_source_finding(source_ref, snapshot)); + } + } + + Ok(findings) + } + + async fn resolve_current_source_map( + &self, + page: &KnowledgePage, + ids: &SourceIds, + ) -> Result<BTreeMap<String, SourceSnapshot>> { + let _page_kind = KnowledgePageKind::parse(page.page_kind.as_str()).ok_or_else(|| { + Error::InvalidRequest { message: "stored knowledge page kind is invalid".to_string() } + })?; + let (notes, events, relations, proposals) = self + .resolve_existing_source_rows(page.tenant_id.as_str(), page.project_id.as_str(), ids) + .await?; + let mut sources = source_snapshots(notes, events, relations, proposals); + + Ok(sources.drain(..).map(|source| (source_key(&source), source)).collect()) + } +} + +fn source_snapshots( + notes: Vec<KnowledgeNoteSource>, + events: Vec<KnowledgeEventSource>, + relations: Vec<KnowledgeRelationSource>, + proposals: Vec<KnowledgeProposalSource>, +) -> Vec<SourceSnapshot> { + let mut sources = Vec::new(); + + sources.extend(notes.into_iter().map(note_source_snapshot)); + sources.extend(events.into_iter().map(event_source_snapshot)); + sources.extend(relations.into_iter().map(relation_source_snapshot)); + sources.extend(proposals.into_iter().map(proposal_source_snapshot)); + sources.sort_by_key(source_sort_key); + + sources +} + +fn source_refs_by_section( + source_refs: &[KnowledgePageSourceRef], +) -> HashMap<Uuid, Vec<KnowledgePageSourceRef>> { + let mut by_section = HashMap::<Uuid, Vec<KnowledgePageSourceRef>>::new(); + + for source_ref in source_refs { + let Some(section_id) = source_ref.section_id else { + continue; + }; + + by_section.entry(section_id).or_default().push(clone_source_ref(source_ref)); + } + + by_section +} + +fn cloned_source_refs( + source_refs: Option<&Vec<KnowledgePageSourceRef>>, +) -> Vec<KnowledgePageSourceRef> { + source_refs.map(|refs| refs.iter().map(clone_source_ref).collect()).unwrap_or_default() +} + +fn clone_source_ref(source_ref: &KnowledgePageSourceRef) -> KnowledgePageSourceRef { + KnowledgePageSourceRef { + ref_id: source_ref.ref_id, + page_id: source_ref.page_id, + section_id: source_ref.section_id, + source_kind: source_ref.source_kind.clone(), + source_id: source_ref.source_id, + source_status: source_ref.source_status.clone(), + source_updated_at: source_ref.source_updated_at, + source_content_hash: source_ref.source_content_hash.clone(), + source_snapshot: source_ref.source_snapshot.clone(), + citation_metadata: source_ref.citation_metadata.clone(), + created_at: source_ref.created_at, + } +} + +fn section_response( + section: KnowledgePageSection, + source_refs: Vec<KnowledgePageSourceRef>, +) -> KnowledgePageSectionResponse { + let citation_count = citation_count(§ion.citations); + let source_ref_count = source_refs.len(); + let source_backlinks = + source_refs.iter().map(KnowledgePageSectionSourceBacklink::from).collect(); + + KnowledgePageSectionResponse { + citation_count, + source_ref_count, + coverage_complete: citation_count > 0 && source_ref_count > 0, + source_backlinks, + ..KnowledgePageSectionResponse::from(section) + } +} + +fn knowledge_page_search_item( + row: KnowledgePageSearchRow, + source_refs: Vec<KnowledgePageSourceRef>, + query: &str, +) -> KnowledgePageSearchItem { + let source_ref_count = usize::try_from(row.section_source_ref_count).unwrap_or(0); + let citation_count = citation_count(&row.citations); + let lint_summary = KnowledgePageLintSummary { + error_count: row.lint_error_count, + warning_count: row.lint_warning_count, + info_count: row.lint_info_count, + has_errors: row.lint_error_count > 0, + has_warnings: row.lint_warning_count > 0, + }; + let coverage_complete = + row.source_coverage.get("coverage_complete").and_then(Value::as_bool).unwrap_or(false); + let trust_state = search_trust_state(&lint_summary, coverage_complete, &row); + let repair_guidance = search_repair_guidance(&trust_state); + + KnowledgePageSearchItem { + result_kind: "knowledge_page_section".to_string(), + page_id: row.page_id, + page_kind: row.page_kind, + page_key: row.page_key, + title: row.title, + status: row.status, + section_id: row.section_id, + section_key: row.section_key, + heading: row.heading, + role: row.role, + snippet: snippet_for_query(row.content.as_str(), query, SEARCH_SNIPPET_CHARS), + citations: row.citations, + citation_count, + source_ref_count, + source_refs: source_refs.into_iter().map(KnowledgePageSourceRefResponse::from).collect(), + source_coverage: row.source_coverage, + rebuild_metadata: row.rebuild_metadata, + lint_summary, + trust_state, + derived_notice: + "Derived knowledge page snippet. Verify cited source notes, events, relations, or proposals before treating it as authoritative." + .to_string(), + repair_guidance, + updated_at: row.page_updated_at, + rebuilt_at: row.rebuilt_at, + } +} + +fn search_trust_state( + lint: &KnowledgePageLintSummary, + coverage_complete: bool, + row: &KnowledgePageSearchRow, +) -> String { + if lint.has_errors { + return "derived_error".to_string(); + } + if lint.has_warnings || row.unsupported_reason.is_some() { + return "derived_warning".to_string(); + } + + if !coverage_complete || row.section_source_ref_count == 0 { + return "derived_low_coverage".to_string(); + } + + "derived_clean".to_string() +} + +fn search_repair_guidance(trust_state: &str) -> Option<String> { + match trust_state { + "derived_error" => Some( + "Run knowledge page lint, inspect stale or missing source refs, then rebuild the page from current authoritative sources." + .to_string(), + ), + "derived_warning" => Some( + "Inspect unsupported or stale findings before using this derived snippet; rebuild after source review." + .to_string(), + ), + "derived_low_coverage" => Some( + "Rebuild with complete citations or add source-backed sections before relying on this page." + .to_string(), + ), + _ => None, + } +} + +fn build_sections(sources: &[SourceSnapshot]) -> Result<Vec<DraftSection>> { + let note_indexes = source_indexes(sources, KnowledgeSourceKind::Note); + let event_indexes = source_indexes(sources, KnowledgeSourceKind::Event); + let relation_indexes = source_indexes(sources, KnowledgeSourceKind::Relation); + let proposal_indexes = source_indexes(sources, KnowledgeSourceKind::Proposal); + let mut sections = Vec::new(); + + push_section( + &mut sections, + "source-notes", + "Source Notes", + "current_truth", + sources, + note_indexes, + ); + push_section(&mut sections, "event-audits", "Event Audits", "history", sources, event_indexes); + push_section(&mut sections, "relations", "Relations", "relations", sources, relation_indexes); + push_section( + &mut sections, + "reviewed-proposals", + "Reviewed Proposals", + "proposals", + sources, + proposal_indexes, + ); + + if sections.is_empty() { + return Err(Error::InvalidRequest { + message: "knowledge page rebuild did not produce any cited sections".to_string(), + }); + } + + Ok(sections) +} + +fn push_section( + sections: &mut Vec<DraftSection>, + section_key: &str, + heading: &str, + role: &str, + sources: &[SourceSnapshot], + source_indexes: Vec<usize>, +) { + if source_indexes.is_empty() { + return; + } + + let ordinal = i32::try_from(sections.len()).unwrap_or(i32::MAX); + let content = source_indexes + .iter() + .filter_map(|index| sources.get(*index)) + .map(|source| format!("- {}", source.line)) + .collect::<Vec<_>>() + .join("\n"); + + sections.push(DraftSection { + section_id: Uuid::new_v4(), + section_key: section_key.to_string(), + heading: heading.to_string(), + role: role.to_string(), + content, + ordinal, + source_indexes, + unsupported_reason: None, + content_hash: String::new(), + citations: Value::Array(Vec::new()), + }); +} + +fn lint_unsupported_sections(sections: &[DraftSection]) -> Vec<LintDraft> { + sections + .iter() + .filter_map(|section| { + section.unsupported_reason.as_ref().map(|reason| LintDraft { + section_id: Some(section.section_id), + finding_type: "unsupported_claim".to_string(), + severity: "warning".to_string(), + source_kind: None, + source_id: None, + message: format!("Knowledge page section has unsupported content: {reason}"), + details: serde_json::json!({ + "section_key": section.section_key, + "unsupported_reason": reason, + "repair_guidance": repair_guidance_for_finding_type("unsupported_claim"), + }), + }) + }) + .collect() +} + +fn lint_page_sections( + page: &KnowledgePage, + sections: &[KnowledgePageSection], + source_refs: &[KnowledgePageSourceRef], +) -> Vec<LintDraft> { + let source_refs_by_section = source_refs_by_section(source_refs); + let mut findings = Vec::new(); + + for section in sections { + findings.extend(lint_one_section(section, &source_refs_by_section)); + } + + if !coverage_complete(page.source_coverage.as_object()) { + findings.push(low_source_coverage_finding(page)); + } + + findings +} + +fn lint_one_section( + section: &KnowledgePageSection, + source_refs_by_section: &HashMap<Uuid, Vec<KnowledgePageSourceRef>>, +) -> Vec<LintDraft> { + let citation_count = citation_count(§ion.citations); + let source_ref_count = + source_refs_by_section.get(§ion.section_id).map(Vec::len).unwrap_or_default(); + let mut findings = Vec::new(); + + if let Some(reason) = §ion.unsupported_reason { + findings.push(section_finding( + section, + "unsupported_claim", + "warning", + "Knowledge page section contains unsupported content.", + serde_json::json!({ + "unsupported_reason": reason, + "citation_count": citation_count, + "source_ref_count": source_ref_count, + }), + )); + } + + if citation_count == 0 && section.unsupported_reason.is_none() { + findings.push(section_finding( + section, + "missing_citation", + "error", + "Knowledge page section has no citations.", + serde_json::json!({ "source_ref_count": source_ref_count }), + )); + } + if source_ref_count == 0 && section.unsupported_reason.is_none() { + findings.push(section_finding( + section, + "missing_source_ref", + "error", + "Knowledge page section has no normalized source backlinks.", + serde_json::json!({ "citation_count": citation_count }), + )); + } + + findings +} + +fn section_finding( + section: &KnowledgePageSection, + finding_type: &str, + severity: &str, + message: &str, + details: Value, +) -> LintDraft { + LintDraft { + section_id: Some(section.section_id), + finding_type: finding_type.to_string(), + severity: severity.to_string(), + source_kind: None, + source_id: None, + message: message.to_string(), + details: with_repair_guidance( + details, + section.section_key.as_str(), + repair_guidance_for_finding_type(finding_type), + ), + } +} + +fn low_source_coverage_finding(page: &KnowledgePage) -> LintDraft { + LintDraft { + section_id: None, + finding_type: "low_source_coverage".to_string(), + severity: "warning".to_string(), + source_kind: None, + source_id: None, + message: "Knowledge page source coverage is incomplete.".to_string(), + details: serde_json::json!({ + "source_coverage": page.source_coverage.clone(), + "repair_guidance": repair_guidance_for_finding_type("low_source_coverage"), + }), + } +} + +fn with_repair_guidance(details: Value, section_key: &str, guidance: &str) -> Value { + let mut object = details.as_object().cloned().unwrap_or_default(); + + object.insert("section_key".to_string(), Value::String(section_key.to_string())); + object.insert("repair_guidance".to_string(), Value::String(guidance.to_string())); + + Value::Object(object) +} + +fn coverage_complete(coverage: Option<&Map<String, Value>>) -> bool { + let Some(coverage) = coverage else { + return false; + }; + let source_count = coverage.get("source_count").and_then(Value::as_u64).unwrap_or(0); + let cited_count = coverage.get("cited_source_count").and_then(Value::as_u64).unwrap_or(0); + let complete = coverage.get("coverage_complete").and_then(Value::as_bool).unwrap_or(false); + + complete && source_count == cited_count +} + +fn citation_count(citations: &Value) -> usize { + citations.as_array().map(Vec::len).unwrap_or_default() +} + +fn source_indexes(sources: &[SourceSnapshot], kind: KnowledgeSourceKind) -> Vec<usize> { + sources + .iter() + .enumerate() + .filter_map(|(index, source)| (source.kind == kind).then_some(index)) + .collect() +} + +fn citations_value(section: &DraftSection, sources: &[SourceSnapshot]) -> Value { + Value::Array( + section + .source_indexes + .iter() + .filter_map(|index| sources.get(*index)) + .map(source_citation_value) + .collect(), + ) +} + +fn note_source_snapshot(row: KnowledgeNoteSource) -> SourceSnapshot { + let content_hash = hash_text(row.text.as_str()); + let line = format!("{}{}", note_prefix(&row), row.text); + let snapshot = serde_json::json!({ + "kind": "note", + "note_id": row.note_id, + "agent_id": row.agent_id.clone(), + "scope": row.scope.clone(), + "type": row.note_type.clone(), + "key": row.key.clone(), + "status": row.status.clone(), + "updated_at": row.updated_at, + "created_at": row.created_at, + "expires_at": row.expires_at, + "embedding_version": row.embedding_version.clone(), + "content_hash": content_hash, + "source_ref": row.source_ref.clone(), + "importance": row.importance, + "confidence": row.confidence, + }); + + SourceSnapshot { + kind: KnowledgeSourceKind::Note, + id: row.note_id, + status: Some(row.status), + updated_at: Some(row.updated_at), + content_hash: Some(content_hash), + snapshot, + citation_metadata: serde_json::json!({ "section_role": "source_note" }), + line, + } +} + +fn event_source_snapshot(row: KnowledgeEventSource) -> SourceSnapshot { + let content_hash = hash_json_lossy(&row.details); + let line = format!( + "add_event audit {} {} for {}{}", + row.note_op, + row.policy_decision, + row.note_type, + row.note_key.as_ref().map(|key| format!(" key {key}")).unwrap_or_default() + ); + let snapshot = serde_json::json!({ + "kind": "event", + "decision_id": row.decision_id, + "agent_id": row.agent_id.clone(), + "scope": row.scope.clone(), + "pipeline": row.pipeline.clone(), + "note_type": row.note_type.clone(), + "note_key": row.note_key.clone(), + "note_id": row.note_id, + "policy_decision": row.policy_decision.clone(), + "note_op": row.note_op.clone(), + "reason_code": row.reason_code.clone(), + "details_hash": content_hash, + "ts": row.ts, + }); + + SourceSnapshot { + kind: KnowledgeSourceKind::Event, + id: row.decision_id, + status: Some(row.policy_decision), + updated_at: Some(row.ts), + content_hash: Some(content_hash), + snapshot, + citation_metadata: serde_json::json!({ "section_role": "event_audit" }), + line, + } +} + +fn relation_source_snapshot(row: KnowledgeRelationSource) -> SourceSnapshot { + let object = row.object_entity.clone().or(row.object_value.clone()).unwrap_or_default(); + let temporal_status = if row.valid_to.is_some() { "historical" } else { "current" }; + let line = format!("{} {} {} ({temporal_status}).", row.subject, row.predicate, object); + let content_hash = hash_text(line.as_str()); + let snapshot = serde_json::json!({ + "kind": "relation", + "fact_id": row.fact_id, + "agent_id": row.agent_id.clone(), + "scope": row.scope.clone(), + "subject": { "canonical": row.subject.clone(), "kind": row.subject_kind.clone() }, + "predicate": row.predicate.clone(), + "object": { + "entity": row.object_entity.clone(), + "kind": row.object_kind.clone(), + "value": row.object_value.clone() + }, + "valid_from": row.valid_from, + "valid_to": row.valid_to, + "updated_at": row.updated_at, + "content_hash": content_hash, + "evidence_notes": row.evidence_notes.clone(), + }); + + SourceSnapshot { + kind: KnowledgeSourceKind::Relation, + id: row.fact_id, + status: Some(temporal_status.to_string()), + updated_at: Some(row.updated_at), + content_hash: Some(content_hash), + snapshot, + citation_metadata: serde_json::json!({ "section_role": "relation_fact" }), + line, + } +} + +fn proposal_source_snapshot(row: KnowledgeProposalSource) -> SourceSnapshot { + let content_hash = hash_json_lossy(&serde_json::json!({ + "diff": row.diff.clone(), + "proposed_payload": row.proposed_payload.clone(), + "review_state": row.review_state.clone(), + })); + let summary = + row.diff.get("summary").and_then(Value::as_str).unwrap_or("Applied consolidation proposal"); + let line = format!("Applied proposal {}: {summary}", row.proposal_kind); + let snapshot = serde_json::json!({ + "kind": "proposal", + "proposal_id": row.proposal_id, + "run_id": row.run_id, + "agent_id": row.agent_id.clone(), + "proposal_kind": row.proposal_kind.clone(), + "apply_intent": row.apply_intent.clone(), + "review_state": row.review_state.clone(), + "source_refs": row.source_refs.clone(), + "source_snapshot": row.source_snapshot.clone(), + "lineage": row.lineage.clone(), + "diff": row.diff.clone(), + "confidence": row.confidence, + "unsupported_claim_flags": row.unsupported_claim_flags.clone(), + "contradiction_markers": row.contradiction_markers.clone(), + "staleness_markers": row.staleness_markers.clone(), + "target_ref": row.target_ref.clone(), + "proposed_payload_hash": content_hash, + "updated_at": row.updated_at, + }); + + SourceSnapshot { + kind: KnowledgeSourceKind::Proposal, + id: row.proposal_id, + status: Some(row.review_state), + updated_at: Some(row.updated_at), + content_hash: Some(content_hash), + snapshot, + citation_metadata: serde_json::json!({ "section_role": "reviewed_proposal" }), + line, + } +} + +fn source_citation_value(source: &SourceSnapshot) -> Value { + serde_json::json!({ + "source_kind": source.kind.as_str(), + "source_id": source.id, + "source_status": source.status.clone(), + "source_updated_at": source.updated_at, + "source_content_hash": source.content_hash.clone(), + "source_snapshot": source.snapshot.clone(), + "citation_metadata": source.citation_metadata.clone(), + }) +} + +fn source_snapshot_value(sources: &[SourceSnapshot]) -> Value { + serde_json::json!({ + "schema": KNOWLEDGE_PAGE_CONTRACT_SCHEMA_V1, + "sources": sources.iter().map(source_citation_value).collect::<Vec<_>>(), + }) +} + +fn source_coverage_value( + page_kind: KnowledgePageKind, + page_key: &str, + sections: &[DraftSection], + sources: &[SourceSnapshot], +) -> Value { + let cited = sections + .iter() + .flat_map(|section| section.source_indexes.iter().copied()) + .collect::<BTreeSet<_>>(); + let counts = source_counts(sources); + + serde_json::json!({ + "schema": KNOWLEDGE_PAGE_SOURCE_COVERAGE_SCHEMA_V1, + "page_kind": page_kind.as_str(), + "page_key": page_key, + "source_counts": counts, + "source_count": sources.len(), + "cited_source_count": cited.len(), + "section_count": sections.len(), + "unsupported_section_count": sections.iter().filter(|section| section.unsupported_reason.is_some()).count(), + "coverage_complete": cited.len() == sources.len(), + }) +} + +fn source_counts(sources: &[SourceSnapshot]) -> Value { + let mut counts = BTreeMap::<&str, usize>::new(); + + for source in sources { + *counts.entry(source.kind.as_str()).or_insert(0) += 1; + } + + serde_json::json!(counts) +} + +fn rebuild_metadata(source_hash: &str, provider_metadata: &Value) -> Value { + let llm_derived = + provider_metadata.get("llm_derived").and_then(Value::as_bool).unwrap_or(false); + + serde_json::json!({ + "schema": KNOWLEDGE_PAGE_REBUILD_SCHEMA_V1, + "source_snapshot_hash": source_hash, + "deterministic": !llm_derived, + "provider_metadata": provider_metadata, + "allowed_variance": if llm_derived { + serde_json::json!(["LLM-derived page text may vary; provider metadata records the nondeterministic input path."]) + } else { + serde_json::json!([]) + }, + }) +} + +fn section_hash_payload(section: &DraftSection) -> Value { + serde_json::json!({ + "section_key": section.section_key.clone(), + "heading": section.heading.clone(), + "role": section.role.clone(), + "content": section.content.clone(), + "citations": section.citations.clone(), + "unsupported_reason": section.unsupported_reason.clone(), + }) +} + +fn page_content_hash( + title: &str, + sections: &[DraftSection], + source_coverage: &Value, + rebuild_metadata: &Value, +) -> Result<String> { + hash_json(&serde_json::json!({ + "title": title, + "sections": sections.iter().map(section_hash_payload).collect::<Vec<_>>(), + "source_coverage": source_coverage, + "rebuild_metadata": rebuild_metadata, + })) +} + +fn missing_source_finding(source_ref: &KnowledgePageSourceRef) -> LintDraft { + LintDraft { + section_id: source_ref.section_id, + finding_type: "stale_source_ref".to_string(), + severity: "error".to_string(), + source_kind: KnowledgeSourceKind::parse(source_ref.source_kind.as_str()), + source_id: Some(source_ref.source_id), + message: "Knowledge page source reference no longer resolves.".to_string(), + details: serde_json::json!({ + "source_kind": source_ref.source_kind.clone(), + "source_id": source_ref.source_id, + "repair_guidance": repair_guidance_for_finding_type("stale_source_ref"), + }), + } +} + +fn stale_source_finding( + source_ref: &KnowledgePageSourceRef, + current: &SourceSnapshot, +) -> LintDraft { + LintDraft { + section_id: source_ref.section_id, + finding_type: "stale_source_ref".to_string(), + severity: "warning".to_string(), + source_kind: Some(current.kind), + source_id: Some(current.id), + message: "Knowledge page source reference snapshot is stale.".to_string(), + details: serde_json::json!({ + "stored": { + "status": source_ref.source_status.clone(), + "updated_at": source_ref.source_updated_at, + "content_hash": source_ref.source_content_hash.clone(), + }, + "current": { + "status": current.status.clone(), + "updated_at": current.updated_at, + "content_hash": current.content_hash.clone(), + }, + "repair_guidance": repair_guidance_for_finding_type("stale_source_ref"), + }), + } +} + +fn repair_guidance_for_finding_type(finding_type: &str) -> &'static str { + match finding_type { + "stale_source_ref" => + "Inspect the stale or missing source, then rebuild the page from current authoritative sources.", + "unsupported_claim" => + "Replace the unsupported section content with source-backed text or rebuild from cited sources.", + "missing_citation" => + "Rebuild the page section with explicit citations or mark the section unsupported with a reason.", + "missing_source_ref" => + "Rebuild the page so each section citation is normalized into knowledge_page_source_refs.", + "low_source_coverage" => + "Rebuild with all intended sources or remove uncited material before relying on this page.", + _ => "Inspect the finding and rebuild the page after source review.", + } +} + +fn source_changed(source_ref: &KnowledgePageSourceRef, current: &SourceSnapshot) -> bool { + source_ref.source_status.as_deref() != current.status.as_deref() + || source_ref.source_updated_at != current.updated_at + || source_ref.source_content_hash.as_deref() != current.content_hash.as_deref() +} + +fn snippet_for_query(content: &str, query: &str, max_chars: usize) -> String { + let normalized = normalize_whitespace(content); + let query = query.trim(); + + if query.is_empty() { + return truncate_chars(normalized.as_str(), max_chars); + } + + let lower = normalized.to_ascii_lowercase(); + let lower_query = query.to_ascii_lowercase(); + let Some(byte_idx) = lower.find(lower_query.as_str()) else { + return truncate_chars(normalized.as_str(), max_chars); + }; + let before_chars = normalized[..byte_idx].chars().count(); + let start = before_chars.saturating_sub(40); + let mut snippet: String = normalized.chars().skip(start).take(max_chars).collect(); + + if start > 0 { + snippet = format!("...{snippet}"); + } + if normalized.chars().count() > start + snippet.chars().count() { + snippet.push_str("..."); + } + + snippet +} + +fn normalize_whitespace(raw: &str) -> String { + let mut out = String::with_capacity(raw.len()); + let mut prev_space = false; + + for ch in raw.chars() { + if ch.is_whitespace() { + if !prev_space { + out.push(' '); + + prev_space = true; + } + + continue; + } + + out.push(ch); + + prev_space = false; + } + + out.trim().to_string() +} + +fn truncate_chars(raw: &str, max_chars: usize) -> String { + if raw.chars().count() <= max_chars { + return raw.to_string(); + } + + const TRUNCATION_MARKER: &str = "..."; + + let marker_chars = TRUNCATION_MARKER.chars().count(); + + if max_chars <= marker_chars { + return TRUNCATION_MARKER.chars().take(max_chars).collect(); + } + + let truncated_chars = max_chars - marker_chars; + let mut out = raw.chars().take(truncated_chars).collect::<String>(); + + out.push_str(TRUNCATION_MARKER); + + out +} + +fn source_sort_key(source: &SourceSnapshot) -> (String, Uuid) { + (source.kind.as_str().to_string(), source.id) +} + +fn source_key(source: &SourceSnapshot) -> String { + current_key(source.kind.as_str(), source.id) +} + +fn current_key(kind: &str, source_id: Uuid) -> String { + format!("{kind}:{source_id}") +} + +fn note_prefix(row: &KnowledgeNoteSource) -> String { + row.key + .as_ref() + .map(|key| format!("[{}:{key}] ", row.note_type)) + .unwrap_or_else(|| format!("[{}] ", row.note_type)) +} + +fn generated_title(page_kind: KnowledgePageKind, page_key: &str) -> String { + format!("{} Knowledge Page: {page_key}", title_kind(page_kind)) +} + +fn title_kind(page_kind: KnowledgePageKind) -> &'static str { + match page_kind { + KnowledgePageKind::Project => "Project", + KnowledgePageKind::Entity => "Entity", + KnowledgePageKind::Concept => "Concept", + KnowledgePageKind::Issue => "Issue", + KnowledgePageKind::Decision => "Decision", + } +} + +fn sorted_unique(ids: &[Uuid]) -> Vec<Uuid> { + ids.iter().copied().collect::<BTreeSet<_>>().into_iter().collect() +} + +fn bounded_limit(limit: Option<u32>) -> i64 { + limit.map(i64::from).unwrap_or(DEFAULT_LIST_LIMIT).clamp(1, MAX_LIST_LIMIT) +} + +fn validate_context(tenant_id: &str, project_id: &str, agent_id: &str) -> Result<()> { + validate_non_empty("tenant_id", tenant_id)?; + validate_non_empty("project_id", project_id)?; + + validate_non_empty("agent_id", agent_id) +} + +fn validate_non_empty(field: &'static str, value: &str) -> Result<()> { + if value.trim().is_empty() { + return Err(Error::InvalidRequest { message: format!("{field} must not be empty.") }); + } + + Ok(()) +} + +fn validate_object(field: &str, value: &Value) -> Result<()> { + if matches!(value, Value::Object(_)) { + Ok(()) + } else { + Err(Error::InvalidRequest { message: format!("{field} must be a JSON object.") }) + } +} + +fn empty_object() -> Value { + Value::Object(Map::new()) +} + +fn hash_text(text: &str) -> String { + blake3::hash(text.as_bytes()).to_hex().to_string() +} + +fn hash_json_lossy(value: &Value) -> String { + serde_json::to_vec(value) + .map(|raw| blake3::hash(&raw).to_hex().to_string()) + .unwrap_or_else(|_| hash_text(value.to_string().as_str())) +} + +fn hash_json(value: &Value) -> Result<String> { + let raw = serde_json::to_vec(value).map_err(|err| Error::InvalidRequest { + message: format!("failed to serialize knowledge page payload: {err}"), + })?; + + Ok(blake3::hash(&raw).to_hex().to_string()) +} + +async fn replace_page_children( + tx: &mut Transaction<'_, Postgres>, + page_id: Uuid, + sections: &[DraftSection], + sources: &[SourceSnapshot], + lint: &[LintDraft], + now: OffsetDateTime, +) -> Result<()> { + knowledge::delete_knowledge_page_children(&mut **tx, page_id).await?; + + for section in sections { + insert_section(tx, page_id, section, now).await?; + + for source_index in §ion.source_indexes { + let source = sources.get(*source_index).ok_or_else(|| Error::InvalidRequest { + message: "knowledge page section referenced an unknown source".to_string(), + })?; + + insert_source_ref(tx, page_id, section.section_id, source, now).await?; + } + } + for finding in lint { + insert_lint_finding(tx, page_id, finding, now).await?; + } + + Ok(()) +} + +async fn insert_section( + tx: &mut Transaction<'_, Postgres>, + page_id: Uuid, + section: &DraftSection, + now: OffsetDateTime, +) -> Result<()> { + knowledge::insert_knowledge_page_section( + &mut **tx, + KnowledgePageSectionInsert { + section_id: section.section_id, + page_id, + section_key: section.section_key.as_str(), + heading: section.heading.as_str(), + role: section.role.as_str(), + content: section.content.as_str(), + ordinal: section.ordinal, + citations: §ion.citations, + unsupported_reason: section.unsupported_reason.as_deref(), + content_hash: section.content_hash.as_str(), + now, + }, + ) + .await + .map_err(Error::from) +} + +async fn insert_source_ref( + tx: &mut Transaction<'_, Postgres>, + page_id: Uuid, + section_id: Uuid, + source: &SourceSnapshot, + now: OffsetDateTime, +) -> Result<()> { + knowledge::insert_knowledge_page_source_ref( + &mut **tx, + KnowledgePageSourceRefInsert { + ref_id: Uuid::new_v4(), + page_id, + section_id: Some(section_id), + source_kind: source.kind.as_str(), + source_id: source.id, + source_status: source.status.as_deref(), + source_updated_at: source.updated_at, + source_content_hash: source.content_hash.as_deref(), + source_snapshot: &source.snapshot, + citation_metadata: &source.citation_metadata, + now, + }, + ) + .await + .map_err(Error::from) +} + +async fn insert_lint_finding( + tx: &mut Transaction<'_, Postgres>, + page_id: Uuid, + finding: &LintDraft, + now: OffsetDateTime, +) -> Result<()> { + knowledge::insert_knowledge_page_lint_finding( + &mut **tx, + KnowledgePageLintFindingInsert { + finding_id: Uuid::new_v4(), + page_id, + section_id: finding.section_id, + finding_type: finding.finding_type.as_str(), + severity: finding.severity.as_str(), + source_kind: finding.source_kind.map(KnowledgeSourceKind::as_str), + source_id: finding.source_id, + message: finding.message.as_str(), + details: &finding.details, + now, + }, + ) + .await + .map_err(Error::from) +} + +#[cfg(test)] +mod tests { + use crate::knowledge::{ + self, KnowledgePage, KnowledgePageKind, KnowledgePageSearchRow, KnowledgePageSection, + KnowledgePageSourceRef, KnowledgeSourceKind, OffsetDateTime, SourceSnapshot, Uuid, + }; + + fn test_source(kind: KnowledgeSourceKind, raw_id: u128, line: &str) -> SourceSnapshot { + let id = Uuid::from_u128(raw_id); + let content_hash = knowledge::hash_text(line); + + SourceSnapshot { + kind, + id, + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::UNIX_EPOCH), + content_hash: Some(content_hash.clone()), + snapshot: serde_json::json!({ + "kind": kind.as_str(), + "id": id, + "status": "active", + "updated_at": OffsetDateTime::UNIX_EPOCH, + "content_hash": content_hash, + }), + citation_metadata: serde_json::json!({ "fixture": "knowledge_unit" }), + line: line.to_string(), + } + } + + #[test] + fn build_sections_preserves_citations_and_deterministic_hashes() { + let sources = vec![ + test_source(KnowledgeSourceKind::Note, 1, "A source note supports the page."), + test_source(KnowledgeSourceKind::Event, 2, "An event audit supports the page."), + test_source(KnowledgeSourceKind::Relation, 3, "A relation supports the page."), + test_source(KnowledgeSourceKind::Proposal, 4, "An applied proposal supports the page."), + ]; + let mut first_sections = + knowledge::build_sections(&sources).expect("sections should build"); + + for section in &mut first_sections { + section.citations = knowledge::citations_value(section, &sources); + section.content_hash = knowledge::hash_json(&knowledge::section_hash_payload(section)) + .expect("section hash should serialize"); + } + + assert_eq!(first_sections.len(), 4); + assert!(first_sections.iter().all(|section| { + section.citations.as_array().is_some_and(|citations| !citations.is_empty()) + })); + + let coverage = knowledge::source_coverage_value( + KnowledgePageKind::Project, + "elf", + &first_sections, + &sources, + ); + let metadata = knowledge::rebuild_metadata("source-hash", &knowledge::empty_object()); + let first_hash = knowledge::page_content_hash("ELF", &first_sections, &coverage, &metadata) + .expect("page hash should serialize"); + let second_hash = + knowledge::page_content_hash("ELF", &first_sections, &coverage, &metadata) + .expect("page hash should serialize"); + + assert_eq!(coverage["coverage_complete"], true); + assert_eq!(metadata["deterministic"], true); + assert_eq!(first_hash, second_hash); + } + + #[test] + fn rebuild_metadata_records_llm_variance() { + let metadata = knowledge::rebuild_metadata( + "source-hash", + &serde_json::json!({ + "llm_derived": true, + "provider_id": "fixture", + "model": "fixture-model", + }), + ); + + assert_eq!(metadata["deterministic"], false); + assert!(metadata["allowed_variance"].as_array().is_some_and(|items| !items.is_empty())); + assert_eq!(metadata["provider_metadata"]["provider_id"], "fixture"); + } + + #[test] + fn stale_source_comparison_detects_changed_snapshot() { + let source_id = Uuid::from_u128(42); + let stored = KnowledgePageSourceRef { + ref_id: Uuid::from_u128(1), + page_id: Uuid::from_u128(2), + section_id: Some(Uuid::from_u128(3)), + source_kind: "note".to_string(), + source_id, + source_status: Some("active".to_string()), + source_updated_at: Some(OffsetDateTime::UNIX_EPOCH), + source_content_hash: Some("old-hash".to_string()), + source_snapshot: serde_json::json!({}), + citation_metadata: serde_json::json!({}), + created_at: OffsetDateTime::UNIX_EPOCH, + }; + let current = SourceSnapshot { + kind: KnowledgeSourceKind::Note, + id: source_id, + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::UNIX_EPOCH), + content_hash: Some("new-hash".to_string()), + snapshot: serde_json::json!({}), + citation_metadata: serde_json::json!({}), + line: "Updated note source.".to_string(), + }; + let finding = knowledge::stale_source_finding(&stored, ¤t); + + assert!(knowledge::source_changed(&stored, ¤t)); + assert_eq!(finding.finding_type, "stale_source_ref"); + assert_eq!(finding.source_kind, Some(KnowledgeSourceKind::Note)); + assert_eq!(finding.source_id, Some(source_id)); + } + + #[test] + fn lint_page_sections_detects_unsupported_missing_and_low_coverage() { + let page = test_page(); + let unsupported = test_section( + Uuid::from_u128(10), + "unsupported", + serde_json::json!([]), + Some("No source supports this claim.".to_string()), + ); + let missing = test_section(Uuid::from_u128(11), "missing", serde_json::json!([]), None); + let findings = knowledge::lint_page_sections(&page, &[unsupported, missing], &[]); + let finding_types = + findings.iter().map(|finding| finding.finding_type.as_str()).collect::<Vec<_>>(); + + assert!(finding_types.contains(&"unsupported_claim")); + assert!(finding_types.contains(&"missing_citation")); + assert!(finding_types.contains(&"missing_source_ref")); + assert!(finding_types.contains(&"low_source_coverage")); + assert!(findings.iter().all(|finding| { + finding + .details + .get("repair_guidance") + .and_then(serde_json::Value::as_str) + .is_some_and(|guidance| !guidance.is_empty()) + })); + } + + #[test] + fn search_item_marks_derived_page_snippet_with_provenance() { + let section_id = Uuid::from_u128(20); + let source_ref = test_source_ref(section_id); + let row = KnowledgePageSearchRow { + page_id: Uuid::from_u128(21), + page_kind: "project".to_string(), + page_key: "elf".to_string(), + title: "ELF Knowledge".to_string(), + status: "active".to_string(), + source_coverage: serde_json::json!({ + "source_count": 1, + "cited_source_count": 1, + "coverage_complete": true + }), + rebuild_metadata: serde_json::json!({ "deterministic": true }), + page_updated_at: OffsetDateTime::UNIX_EPOCH, + rebuilt_at: OffsetDateTime::UNIX_EPOCH, + section_id, + section_key: "source-notes".to_string(), + heading: "Source Notes".to_string(), + role: "current_truth".to_string(), + content: "Derived knowledge pages cite source notes before they are trusted." + .to_string(), + ordinal: 0, + citations: serde_json::json!([{ "source_kind": "note", "source_id": source_ref.source_id }]), + unsupported_reason: None, + lint_error_count: 0, + lint_warning_count: 1, + lint_info_count: 0, + section_source_ref_count: 1, + }; + let item = knowledge::knowledge_page_search_item(row, vec![source_ref], "source notes"); + + assert_eq!(item.result_kind, "knowledge_page_section"); + assert_eq!(item.trust_state, "derived_warning"); + assert_eq!(item.citation_count, 1); + assert_eq!(item.source_ref_count, 1); + assert_eq!(item.source_refs.len(), 1); + assert!(item.derived_notice.contains("Derived knowledge page snippet")); + assert!(item.repair_guidance.is_some()); + assert!(item.snippet.contains("source notes")); + } + + fn test_page() -> KnowledgePage { + KnowledgePage { + page_id: Uuid::from_u128(1), + tenant_id: "tenant".to_string(), + project_id: "project".to_string(), + page_kind: "project".to_string(), + page_key: "elf".to_string(), + title: "ELF".to_string(), + contract_schema: "elf.knowledge_page/v1".to_string(), + status: "active".to_string(), + rebuild_source_hash: "source-hash".to_string(), + content_hash: "content-hash".to_string(), + source_coverage: serde_json::json!({ + "source_count": 2, + "cited_source_count": 1, + "coverage_complete": false + }), + source_snapshot: serde_json::json!({}), + rebuild_metadata: serde_json::json!({}), + created_at: OffsetDateTime::UNIX_EPOCH, + updated_at: OffsetDateTime::UNIX_EPOCH, + rebuilt_at: OffsetDateTime::UNIX_EPOCH, + } + } + + fn test_section( + section_id: Uuid, + section_key: &str, + citations: serde_json::Value, + unsupported_reason: Option<String>, + ) -> KnowledgePageSection { + KnowledgePageSection { + section_id, + page_id: Uuid::from_u128(1), + section_key: section_key.to_string(), + heading: section_key.to_string(), + role: "current_truth".to_string(), + content: "Section content.".to_string(), + ordinal: 0, + citations, + unsupported_reason, + content_hash: "section-hash".to_string(), + created_at: OffsetDateTime::UNIX_EPOCH, + updated_at: OffsetDateTime::UNIX_EPOCH, + } + } + + fn test_source_ref(section_id: Uuid) -> KnowledgePageSourceRef { + KnowledgePageSourceRef { + ref_id: Uuid::from_u128(30), + page_id: Uuid::from_u128(21), + section_id: Some(section_id), + source_kind: "note".to_string(), + source_id: Uuid::from_u128(31), + source_status: Some("active".to_string()), + source_updated_at: Some(OffsetDateTime::UNIX_EPOCH), + source_content_hash: Some("source-hash".to_string()), + source_snapshot: serde_json::json!({}), + citation_metadata: serde_json::json!({}), + created_at: OffsetDateTime::UNIX_EPOCH, + } + } +} diff --git a/packages/elf-service/src/lib.rs b/packages/elf-service/src/lib.rs index 4da3d587..726e1e87 100644 --- a/packages/elf-service/src/lib.rs +++ b/packages/elf-service/src/lib.rs @@ -1,119 +1,324 @@ +#![cfg_attr(test, allow(unused_crate_dependencies))] + +//! Service-layer request models and orchestration for ELF. + pub mod add_event; pub mod add_note; pub mod admin; +pub mod admin_graph_predicates; +pub mod consolidation; +pub mod core_blocks; pub mod delete; +pub mod docs; +pub mod graph; +pub mod graph_query; +pub mod knowledge; pub mod list; pub mod notes; +pub mod progressive_search; +pub mod provenance; pub mod search; +pub mod sharing; +pub mod structured_fields; pub mod time_serde; pub mod update; -// std +mod access; +mod error; +mod graph_ingestion; +mod ingest_audit; +mod ingestion_profiles; +mod ranking_explain_v2; + +pub use self::{ + add_event::{AddEventRequest, AddEventResponse, AddEventResult, EventMessage}, + add_note::{AddNoteInput, AddNoteRequest, AddNoteResponse, AddNoteResult}, + admin::RebuildReport, + admin_graph_predicates::{ + AdminGraphPredicateAliasAddRequest, AdminGraphPredicateAliasResponse, + AdminGraphPredicateAliasesListRequest, AdminGraphPredicateAliasesResponse, + AdminGraphPredicatePatchRequest, AdminGraphPredicateResponse, + AdminGraphPredicatesListRequest, AdminGraphPredicatesListResponse, + }, + consolidation::{ + ConsolidationProposalGetRequest, ConsolidationProposalInput, ConsolidationProposalResponse, + ConsolidationProposalReviewEventResponse, ConsolidationProposalReviewRequest, + ConsolidationProposalsListRequest, ConsolidationProposalsListResponse, + ConsolidationRunCreateRequest, ConsolidationRunCreateResponse, ConsolidationRunGetRequest, + ConsolidationRunResponse, ConsolidationRunsListRequest, ConsolidationRunsListResponse, + }, + core_blocks::{ + CoreBlockAttachRequest, CoreBlockAttachResponse, CoreBlockDetachRequest, + CoreBlockDetachResponse, CoreBlockItem, CoreBlockRecord, CoreBlockUpsertRequest, + CoreBlockUpsertResponse, CoreBlocksGetRequest, CoreBlocksResponse, + ELF_CORE_MEMORY_BLOCKS_SCHEMA_V1, + }, + delete::{DeleteRequest, DeleteResponse}, + docs::{ + DocType, DocsExcerptResponse, DocsExcerptsGetRequest, DocsGetRequest, DocsGetResponse, + DocsPutRequest, DocsPutResponse, DocsSearchL0Request, DocsSearchL0Response, + TextPositionSelector, TextQuoteSelector, + }, + error::{Error, Result}, + graph::RelationTemporalStatus, + graph_query::{ + ELF_GRAPH_QUERY_SCHEMA_V1, GraphQueryEntity, GraphQueryEntityRef, GraphQueryExplain, + GraphQueryFact, GraphQueryObject, GraphQueryObjectEntity, GraphQueryPredicate, + GraphQueryPredicateRef, GraphQueryRequest, GraphQueryResponse, + }, + ingestion_profiles::{ + AdminIngestionProfileCreateRequest, AdminIngestionProfileDefaultGetRequest, + AdminIngestionProfileDefaultResponse, AdminIngestionProfileDefaultSetRequest, + AdminIngestionProfileGetRequest, AdminIngestionProfileListRequest, + AdminIngestionProfileResponse, AdminIngestionProfileSummary, + AdminIngestionProfileVersionsListRequest, AdminIngestionProfileVersionsListResponse, + AdminIngestionProfilesListResponse, IngestionProfileRef, IngestionProfileSelector, + }, + knowledge::{ + KnowledgePageGetRequest, KnowledgePageLintFindingResponse, KnowledgePageLintRequest, + KnowledgePageLintResponse, KnowledgePageLintSummary, KnowledgePageRebuildRequest, + KnowledgePageRebuildResponse, KnowledgePageResponse, KnowledgePageSearchItem, + KnowledgePageSearchRequest, KnowledgePageSearchResponse, KnowledgePageSectionResponse, + KnowledgePageSectionSourceBacklink, KnowledgePageSourceRefResponse, KnowledgePageSummary, + KnowledgePagesListRequest, KnowledgePagesListResponse, + }, + list::{ListItem, ListRequest, ListResponse}, + notes::{NoteFetchRequest, NoteFetchResponse}, + progressive_search::{ + SearchDetailsError, SearchDetailsRequest, SearchDetailsResponse, SearchDetailsResult, + SearchIndexItem, SearchIndexPlannedResponse, SearchIndexResponse, SearchSessionGetRequest, + SearchTimelineGroup, SearchTimelineRequest, SearchTimelineResponse, + }, + provenance::{ + MemoryHistoryEvent, MemoryHistoryGetRequest, MemoryHistoryResponse, + NoteProvenanceBundleResponse, NoteProvenanceGetRequest, NoteProvenanceIndexingOutbox, + NoteProvenanceIngestDecision, NoteProvenanceNote, NoteProvenanceNoteVersion, + NoteProvenanceRecentTrace, + }, + search::{ + BlendRankingOverride, BlendSegmentOverride, PayloadLevel, QueryPlan, QueryPlanBlendSegment, + QueryPlanBudget, QueryPlanDynamicGate, QueryPlanFusionPolicy, QueryPlanIntent, + QueryPlanRerankPolicy, QueryPlanRetrievalStage, QueryPlanRewrite, QueryPlanStage, + RankingRequestOverride, SearchExplain, SearchExplainItem, SearchExplainRequest, + SearchExplainResponse, SearchExplainTrajectory, SearchExplainTrajectoryStage, SearchItem, + SearchRawPlannedResponse, SearchRequest, SearchResponse, SearchTrace, + SearchTrajectoryResponse, SearchTrajectoryStage, SearchTrajectoryStageItem, + SearchTrajectorySummary, SearchTrajectorySummaryStage, TraceBundleGetRequest, + TraceBundleResponse, TraceGetRequest, TraceGetResponse, TraceRecentListRequest, + TraceRecentListResponse, TraceTrajectoryGetRequest, + }, + sharing::{ + GranteeKind, PublishNoteRequest, PublishNoteResponse, ShareScope, SpaceGrantItem, + SpaceGrantRevokeRequest, SpaceGrantRevokeResponse, SpaceGrantUpsertRequest, + SpaceGrantUpsertResponse, SpaceGrantsListRequest, SpaceGrantsListResponse, + UnpublishNoteRequest, UnpublishNoteResponse, + }, + structured_fields::StructuredFields, + update::{UpdateRequest, UpdateResponse}, +}; + use std::{future::Future, pin::Pin, sync::Arc}; -// crates.io +use serde::{Deserialize, Serialize}; use serde_json::Value; -use sqlx::Row; +use sqlx::PgExecutor; +use time::OffsetDateTime; use uuid::Uuid; -// self -pub use add_event::{AddEventRequest, AddEventResponse, AddEventResult, EventMessage}; -pub use add_note::{AddNoteInput, AddNoteRequest, AddNoteResponse, AddNoteResult}; -pub use admin::RebuildReport; -pub use delete::{DeleteRequest, DeleteResponse}; use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_domain::writegate::RejectCode; use elf_providers::{embedding, extractor, rerank}; use elf_storage::{db::Db, models::MemoryNote, qdrant::QdrantStore}; -pub use list::{ListItem, ListRequest, ListResponse}; -pub use notes::{NoteFetchRequest, NoteFetchResponse}; -pub use search::{ - SearchBoost, SearchExplain, SearchExplainItem, SearchExplainRequest, SearchExplainResponse, - SearchItem, SearchRequest, SearchResponse, SearchTrace, -}; -pub use update::{UpdateRequest, UpdateResponse}; - -pub type ServiceResult<T> = Result<T, ServiceError>; +/// Boxed future type used by provider traits. pub type BoxFuture<'a, T> = Pin<Box<dyn Future<Output = T> + Send + 'a>>; +/// Rejection code emitted when event evidence quotes do not match the source messages. pub const REJECT_EVIDENCE_MISMATCH: &str = "REJECT_EVIDENCE_MISMATCH"; - +/// Rejection code emitted when a write policy and extracted output disagree. +pub const REJECT_WRITE_POLICY_MISMATCH: &str = "REJECT_WRITE_POLICY_MISMATCH"; + +const RESOLVE_UPDATE_QUERY: &str = "\ +WITH key_match AS ( + SELECT note_id + FROM memory_notes + WHERE tenant_id = $1 + AND project_id = $2 + AND agent_id = $3 + AND scope = $4 + AND type = $5 + AND $6::text IS NOT NULL + AND key = $6 + AND status = 'active' + AND (expires_at IS NULL OR expires_at > $7) + LIMIT 1 +), +existing AS ( + SELECT note_id + FROM memory_notes + WHERE tenant_id = $1 + AND project_id = $2 + AND agent_id = $3 + AND scope = $4 + AND type = $5 + AND status = 'active' + AND (expires_at IS NULL OR expires_at > $7) +), +best AS ( + SELECT + note_id, + (1 - (vec <=> $8::text::vector))::real AS similarity + FROM note_embeddings + WHERE note_id = ANY(ARRAY(SELECT note_id FROM existing)) + AND embedding_version = $9 + ORDER BY similarity DESC + LIMIT 1 +) + SELECT + (SELECT note_id FROM key_match) AS key_note_id, + (SELECT note_id FROM best) AS best_note_id, + (SELECT similarity FROM best) AS best_similarity"; + +/// Embedding provider contract used by the service layer. pub trait EmbeddingProvider where Self: Send + Sync, { + /// Embeds one or more texts into dense vectors. fn embed<'a>( &'a self, cfg: &'a EmbeddingProviderConfig, texts: &'a [String], - ) -> BoxFuture<'a, color_eyre::Result<Vec<Vec<f32>>>>; + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>>; } +/// Rerank provider contract used by the service layer. pub trait RerankProvider where Self: Send + Sync, { + /// Scores candidate documents for one query. fn rerank<'a>( &'a self, cfg: &'a ProviderConfig, query: &'a str, docs: &'a [String], - ) -> BoxFuture<'a, color_eyre::Result<Vec<f32>>>; + ) -> BoxFuture<'a, Result<Vec<f32>>>; } +/// Extractor provider contract used by the service layer. pub trait ExtractorProvider where Self: Send + Sync, { + /// Extracts structured JSON output from a message transcript. fn extract<'a>( &'a self, cfg: &'a LlmProviderConfig, messages: &'a [Value], - ) -> BoxFuture<'a, color_eyre::Result<Value>>; + ) -> BoxFuture<'a, Result<Value>>; } -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +/// Note operation emitted by service mutations. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] #[serde(rename_all = "SCREAMING_SNAKE_CASE")] pub enum NoteOp { + /// A new note was inserted. Add, + /// An existing note was updated. Update, + /// No persisted change was required. None, + /// A note was deleted. Delete, + /// The request was rejected before persistence. Rejected, } -#[derive(Debug)] -pub enum ServiceError { - NonEnglishInput { field: String }, - InvalidRequest { message: String }, - ScopeDenied { message: String }, - Provider { message: String }, - Storage { message: String }, - Qdrant { message: String }, +#[derive(Clone, Copy, Debug)] +pub(crate) enum UpdateDecision { + Add { note_id: Uuid, metadata: UpdateDecisionMetadata }, + Update { note_id: Uuid, metadata: UpdateDecisionMetadata }, + None { note_id: Uuid, metadata: UpdateDecisionMetadata }, } +impl UpdateDecision { + pub(crate) fn note_id(&self) -> Uuid { + match self { + Self::Add { note_id, .. } + | Self::Update { note_id, .. } + | Self::None { note_id, .. } => *note_id, + } + } -#[derive(Debug, Clone, Copy)] -pub(crate) enum UpdateDecision { - Add { note_id: Uuid }, - Update { note_id: Uuid }, - None { note_id: Uuid }, + pub(crate) fn metadata(&self) -> UpdateDecisionMetadata { + match self { + Self::Add { metadata, .. } + | Self::Update { metadata, .. } + | Self::None { metadata, .. } => *metadata, + } + } +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct UpdateDecisionMetadata { + pub similarity_best: Option<f32>, + pub key_match: bool, + pub matched_dup: bool, } +/// Provider bundle used by `ElfService`. #[derive(Clone)] pub struct Providers { + /// Dense embedding provider implementation. pub embedding: Arc<dyn EmbeddingProvider>, + /// Rerank provider implementation. pub rerank: Arc<dyn RerankProvider>, + /// Structured extraction provider implementation. pub extractor: Arc<dyn ExtractorProvider>, } +impl Providers { + /// Builds a provider bundle from explicit provider implementations. + pub fn new( + embedding: Arc<dyn EmbeddingProvider>, + rerank: Arc<dyn RerankProvider>, + extractor: Arc<dyn ExtractorProvider>, + ) -> Self { + Self { embedding, rerank, extractor } + } +} + +impl Default for Providers { + fn default() -> Self { + let provider = Arc::new(DefaultProviders); + + Self { embedding: provider.clone(), rerank: provider.clone(), extractor: provider } + } +} +/// Main service container for ELF request handling. pub struct ElfService { + /// Repository configuration snapshot. pub cfg: Config, + /// Postgres storage handle. pub db: Db, + /// Qdrant storage handle. pub qdrant: QdrantStore, + /// External model-provider adapters. pub providers: Providers, } +impl ElfService { + /// Builds a service with the default provider adapters. + pub fn new(cfg: Config, db: Db, qdrant: QdrantStore) -> Self { + Self { cfg, db, qdrant, providers: Providers::default() } + } + + /// Builds a service with explicit provider adapters. + pub fn with_providers(cfg: Config, db: Db, qdrant: QdrantStore, providers: Providers) -> Self { + Self { cfg, db, qdrant, providers } + } +} -pub(crate) struct ResolveUpdateArgs<'a> { +struct ResolveUpdateArgs<'a> { pub(crate) cfg: &'a Config, pub(crate) providers: &'a Providers, pub(crate) tenant_id: &'a str, @@ -123,57 +328,31 @@ pub(crate) struct ResolveUpdateArgs<'a> { pub(crate) note_type: &'a str, pub(crate) key: Option<&'a str>, pub(crate) text: &'a str, - pub(crate) now: time::OffsetDateTime, + pub(crate) now: OffsetDateTime, } -pub(crate) struct InsertVersionArgs<'a> { +struct InsertVersionArgs<'a> { pub(crate) note_id: Uuid, pub(crate) op: &'a str, pub(crate) prev_snapshot: Option<Value>, pub(crate) new_snapshot: Option<Value>, pub(crate) reason: &'a str, pub(crate) actor: &'a str, - pub(crate) ts: time::OffsetDateTime, + pub(crate) ts: OffsetDateTime, } struct DefaultProviders; - -impl std::fmt::Display for ServiceError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::NonEnglishInput { field } => { - write!(f, "Non-English input detected at {field}.") - }, - Self::InvalidRequest { message } => write!(f, "Invalid request: {message}"), - Self::ScopeDenied { message } => write!(f, "Scope denied: {message}"), - Self::Provider { message } => write!(f, "Provider error: {message}"), - Self::Storage { message } => write!(f, "Storage error: {message}"), - Self::Qdrant { message } => write!(f, "Qdrant error: {message}"), - } - } -} - -impl std::error::Error for ServiceError {} - -impl From<sqlx::Error> for ServiceError { - fn from(err: sqlx::Error) -> Self { - Self::Storage { message: err.to_string() } - } -} - -impl From<color_eyre::Report> for ServiceError { - fn from(err: color_eyre::Report) -> Self { - Self::Provider { message: err.to_string() } - } -} - impl EmbeddingProvider for DefaultProviders { fn embed<'a>( &'a self, cfg: &'a EmbeddingProviderConfig, texts: &'a [String], - ) -> BoxFuture<'a, color_eyre::Result<Vec<Vec<f32>>>> { - Box::pin(embedding::embed(cfg, texts)) + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { + Box::pin(async move { + embedding::embed(cfg, texts) + .await + .map_err(|err| Error::Provider { message: err.to_string() }) + }) } } @@ -183,8 +362,12 @@ impl RerankProvider for DefaultProviders { cfg: &'a ProviderConfig, query: &'a str, docs: &'a [String], - ) -> BoxFuture<'a, color_eyre::Result<Vec<f32>>> { - Box::pin(rerank::rerank(cfg, query, docs)) + ) -> BoxFuture<'a, Result<Vec<f32>>> { + Box::pin(async move { + rerank::rerank(cfg, query, docs) + .await + .map_err(|err| Error::Provider { message: err.to_string() }) + }) } } @@ -193,35 +376,12 @@ impl ExtractorProvider for DefaultProviders { &'a self, cfg: &'a LlmProviderConfig, messages: &'a [Value], - ) -> BoxFuture<'a, color_eyre::Result<Value>> { - Box::pin(extractor::extract(cfg, messages)) - } -} - -impl Providers { - pub fn new( - embedding: Arc<dyn EmbeddingProvider>, - rerank: Arc<dyn RerankProvider>, - extractor: Arc<dyn ExtractorProvider>, - ) -> Self { - Self { embedding, rerank, extractor } - } -} - -impl Default for Providers { - fn default() -> Self { - let provider = Arc::new(DefaultProviders); - Self { embedding: provider.clone(), rerank: provider.clone(), extractor: provider } - } -} - -impl ElfService { - pub fn new(cfg: Config, db: Db, qdrant: QdrantStore) -> Self { - Self { cfg, db, qdrant, providers: Providers::default() } - } - - pub fn with_providers(cfg: Config, db: Db, qdrant: QdrantStore, providers: Providers) -> Self { - Self { cfg, db, qdrant, providers } + ) -> BoxFuture<'a, Result<Value>> { + Box::pin(async move { + extractor::extract(cfg, messages) + .await + .map_err(|err| Error::Provider { message: err.to_string() }) + }) } } @@ -234,10 +394,9 @@ pub(crate) fn embedding_version(cfg: &Config) -> String { ) } -pub(crate) fn writegate_reason_code(code: elf_domain::writegate::RejectCode) -> &'static str { - use elf_domain::writegate::RejectCode; +pub(crate) fn writegate_reason_code(code: RejectCode) -> &'static str { match code { - RejectCode::RejectCjk => "REJECT_CJK", + RejectCode::RejectNonEnglish => "REJECT_NON_ENGLISH", RejectCode::RejectTooLong => "REJECT_TOO_LONG", RejectCode::RejectSecret => "REJECT_SECRET", RejectCode::RejectInvalidType => "REJECT_INVALID_TYPE", @@ -248,40 +407,76 @@ pub(crate) fn writegate_reason_code(code: elf_domain::writegate::RejectCode) -> pub(crate) fn vector_to_pg(vec: &[f32]) -> String { let mut out = String::with_capacity(vec.len() * 8); + out.push('['); + for (i, value) in vec.iter().enumerate() { if i > 0 { out.push(','); } + out.push_str(&value.to_string()); } + out.push(']'); + out } -pub(crate) fn parse_pg_vector(text: &str) -> Result<Vec<f32>, ServiceError> { +pub(crate) fn parse_pg_vector(text: &str) -> Result<Vec<f32>> { let trimmed = text.trim(); let without_brackets = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']')).ok_or_else(|| { - ServiceError::InvalidRequest { message: "Vector text is not bracketed.".to_string() } + Error::InvalidRequest { message: "Vector text is not bracketed.".to_string() } })?; + if without_brackets.trim().is_empty() { return Ok(Vec::new()); } + let mut vec = Vec::new(); + for part in without_brackets.split(',') { - let value: f32 = part.trim().parse().map_err(|_| ServiceError::InvalidRequest { + let value: f32 = part.trim().parse().map_err(|_| Error::InvalidRequest { message: "Vector text contains a non-numeric value.".to_string(), })?; + vec.push(value); } + Ok(vec) } -pub(crate) async fn resolve_update( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, +pub(crate) fn note_snapshot(note: &MemoryNote) -> Value { + serde_json::json!({ + "note_id": note.note_id, + "tenant_id": note.tenant_id, + "project_id": note.project_id, + "agent_id": note.agent_id, + "scope": note.scope, + "type": note.r#type, + "key": note.key, + "text": note.text, + "importance": note.importance, + "confidence": note.confidence, + "status": note.status, + "created_at": note.created_at, + "updated_at": note.updated_at, + "expires_at": note.expires_at, + "embedding_version": note.embedding_version, + "source_ref": note.source_ref, + "hit_count": note.hit_count, + "last_hit_at": note.last_hit_at, + }) +} + +pub(crate) async fn resolve_update<'e, E>( + executor: E, args: ResolveUpdateArgs<'_>, -) -> ServiceResult<UpdateDecision> { +) -> Result<UpdateDecision> +where + E: PgExecutor<'e>, +{ let ResolveUpdateArgs { cfg, providers, @@ -294,106 +489,122 @@ pub(crate) async fn resolve_update( text, now, } = args; - - if let Some(key) = key.filter(|value| !value.trim().is_empty()) - && let Some(note_id) = sqlx::query_scalar::<_, Uuid>( - "SELECT note_id FROM memory_notes \ - WHERE tenant_id = $1 AND project_id = $2 AND agent_id = $3 AND scope = $4 \ - AND type = $5 AND key = $6 AND status = 'active' \ - AND (expires_at IS NULL OR expires_at > $7) \ - LIMIT 1", - ) - .bind(tenant_id) - .bind(project_id) - .bind(agent_id) - .bind(scope) - .bind(note_type) - .bind(key) - .bind(now) - .fetch_optional(&mut **tx) - .await? - { - return Ok(UpdateDecision::Update { note_id }); - } - - let existing_ids: Vec<Uuid> = sqlx::query_scalar( - "SELECT note_id FROM memory_notes \ - WHERE tenant_id = $1 AND project_id = $2 AND agent_id = $3 AND scope = $4 \ - AND type = $5 AND status = 'active' \ - AND (expires_at IS NULL OR expires_at > $6)", - ) - .bind(tenant_id) - .bind(project_id) - .bind(agent_id) - .bind(scope) - .bind(note_type) - .bind(now) - .fetch_all(&mut **tx) - .await?; - - if existing_ids.is_empty() { - return Ok(UpdateDecision::Add { note_id: Uuid::new_v4() }); - } - let embeddings = providers.embedding.embed(&cfg.providers.embedding, &[text.to_string()]).await?; let Some(vec) = embeddings.into_iter().next() else { - return Err(ServiceError::Provider { + return Err(Error::Provider { message: "Embedding provider returned no vectors.".to_string(), }); }; + if vec.len() != cfg.storage.qdrant.vector_dim as usize { - return Err(ServiceError::Provider { + return Err(Error::Provider { message: "Embedding vector dimension mismatch.".to_string(), }); } + let vec_text = vector_to_pg(&vec); let embed_version = embedding_version(cfg); - - let rows = sqlx::query( - "SELECT note_id, (1 - (vec <=> $1::vector))::real AS similarity \ - FROM note_embeddings WHERE note_id = ANY($2) AND embedding_version = $3", - ) - .bind(vec_text) - .bind(&existing_ids) - .bind(embed_version) - .fetch_all(&mut **tx) - .await?; - - let mut best: Option<(Uuid, f32)> = None; - for row in rows { - let note_id: Uuid = row.try_get("note_id")?; - let similarity: f32 = row.try_get("similarity")?; - if best.map(|(_, score)| similarity > score).unwrap_or(true) { - best = Some((note_id, similarity)); - } + let key = key.map(|value| value.trim()).filter(|value| !value.is_empty()); + let row: (Option<Uuid>, Option<Uuid>, Option<f32>) = sqlx::query_as(RESOLVE_UPDATE_QUERY) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(scope) + .bind(note_type) + .bind(key) + .bind(now) + .bind(vec_text.as_str()) + .bind(embed_version.as_str()) + .fetch_one(executor) + .await?; + let (key_note_id, best_note_id, best_similarity) = row; + + if let Some(note_id) = key_note_id { + return Ok(UpdateDecision::Update { + note_id, + metadata: UpdateDecisionMetadata { + similarity_best: None, + key_match: true, + matched_dup: false, + }, + }); } - let Some((best_id, best_score)) = best else { - return Ok(UpdateDecision::Add { note_id: Uuid::new_v4() }); + let Some(best_id) = best_note_id else { + return Ok(UpdateDecision::Add { + note_id: Uuid::new_v4(), + metadata: UpdateDecisionMetadata { + similarity_best: None, + key_match: false, + matched_dup: false, + }, + }); + }; + let Some(best_score) = best_similarity else { + return Ok(UpdateDecision::Add { + note_id: Uuid::new_v4(), + metadata: UpdateDecisionMetadata { + similarity_best: None, + key_match: false, + matched_dup: false, + }, + }); }; if best_score >= cfg.memory.dup_sim_threshold { - return Ok(UpdateDecision::None { note_id: best_id }); + return Ok(UpdateDecision::None { + note_id: best_id, + metadata: UpdateDecisionMetadata { + similarity_best: Some(best_score), + key_match: false, + matched_dup: true, + }, + }); } if best_score >= cfg.memory.update_sim_threshold { - return Ok(UpdateDecision::Update { note_id: best_id }); + return Ok(UpdateDecision::Update { + note_id: best_id, + metadata: UpdateDecisionMetadata { + similarity_best: Some(best_score), + key_match: false, + matched_dup: false, + }, + }); } - Ok(UpdateDecision::Add { note_id: Uuid::new_v4() }) + Ok(UpdateDecision::Add { + note_id: Uuid::new_v4(), + metadata: UpdateDecisionMetadata { + similarity_best: Some(best_score), + key_match: false, + matched_dup: false, + }, + }) } -pub(crate) async fn insert_version( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, - args: InsertVersionArgs<'_>, -) -> ServiceResult<()> { +pub(crate) async fn insert_version<'e, E>(executor: E, args: InsertVersionArgs<'_>) -> Result<Uuid> +where + E: PgExecutor<'e>, +{ let InsertVersionArgs { note_id, op, prev_snapshot, new_snapshot, reason, actor, ts } = args; + let version_id = Uuid::new_v4(); + sqlx::query( - "INSERT INTO memory_note_versions \ - (version_id, note_id, op, prev_snapshot, new_snapshot, reason, actor, ts) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8)", + "\ +INSERT INTO memory_note_versions ( + version_id, + note_id, + op, + prev_snapshot, + new_snapshot, + reason, + actor, + ts +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8)", ) - .bind(Uuid::new_v4()) + .bind(version_id) .bind(note_id) .bind(op) .bind(prev_snapshot) @@ -401,22 +612,35 @@ pub(crate) async fn insert_version( .bind(reason) .bind(actor) .bind(ts) - .execute(&mut **tx) + .execute(executor) .await?; - Ok(()) + + Ok(version_id) } -pub(crate) async fn enqueue_outbox_tx( - tx: &mut sqlx::Transaction<'_, sqlx::Postgres>, +pub(crate) async fn enqueue_outbox_tx<'e, E>( + executor: E, note_id: Uuid, op: &str, embedding_version: &str, - now: time::OffsetDateTime, -) -> ServiceResult<()> { + now: OffsetDateTime, +) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query( - "INSERT INTO indexing_outbox \ - (outbox_id, note_id, op, embedding_version, status, created_at, updated_at, available_at) \ - VALUES ($1,$2,$3,$4,'PENDING',$5,$6,$7)", + "\ +INSERT INTO indexing_outbox ( + outbox_id, + note_id, + op, + embedding_version, + status, + created_at, + updated_at, + available_at +) +VALUES ($1,$2,$3,$4,'PENDING',$5,$6,$7)", ) .bind(Uuid::new_v4()) .bind(note_id) @@ -425,30 +649,8 @@ pub(crate) async fn enqueue_outbox_tx( .bind(now) .bind(now) .bind(now) - .execute(&mut **tx) + .execute(executor) .await?; - Ok(()) -} -pub(crate) fn note_snapshot(note: &MemoryNote) -> Value { - serde_json::json!({ - "note_id": note.note_id, - "tenant_id": note.tenant_id, - "project_id": note.project_id, - "agent_id": note.agent_id, - "scope": note.scope, - "type": note.r#type, - "key": note.key, - "text": note.text, - "importance": note.importance, - "confidence": note.confidence, - "status": note.status, - "created_at": note.created_at, - "updated_at": note.updated_at, - "expires_at": note.expires_at, - "embedding_version": note.embedding_version, - "source_ref": note.source_ref, - "hit_count": note.hit_count, - "last_hit_at": note.last_hit_at, - }) + Ok(()) } diff --git a/packages/elf-service/src/list.rs b/packages/elf-service/src/list.rs index a372d162..d1e94803 100644 --- a/packages/elf-service/src/list.rs +++ b/packages/elf-service/src/list.rs @@ -1,123 +1,280 @@ -// crates.io +//! Note listing APIs. + +use std::collections::HashSet; + +use serde::{Deserialize, Serialize}; use serde_json::Value; -use sqlx::QueryBuilder; +use sqlx::{PgPool, QueryBuilder}; use time::OffsetDateTime; use uuid::Uuid; -// self +use crate::{ + ElfService, Error, Result, + access::{self, ORG_PROJECT_ID}, +}; use elf_storage::models::MemoryNote; -use crate::{ElfService, ServiceError, ServiceResult}; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Request payload for note listing. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ListRequest { + /// Tenant to list notes from. pub tenant_id: String, + /// Project to list notes from. pub project_id: String, + /// Optional agent filter and required owner for `agent_private`. pub agent_id: Option<String>, + /// Optional scope filter. pub scope: Option<String>, + /// Optional lifecycle status filter. pub status: Option<String>, - #[serde(rename = "type")] - pub note_type: Option<String>, + /// Optional note-type filter. + pub r#type: Option<String>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// One note returned by `list`. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ListItem { + /// Note identifier. pub note_id: Uuid, - #[serde(rename = "type")] - pub note_type: String, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key. pub key: Option<String>, + /// Scope key for the note. pub scope: String, + /// Lifecycle status for the note. pub status: String, + /// Note body text. pub text: String, + /// Importance score. pub importance: f32, + /// Confidence score. pub confidence: f32, #[serde(with = "crate::time_serde")] + /// Last update timestamp. pub updated_at: OffsetDateTime, #[serde(with = "crate::time_serde::option")] + /// Optional expiry timestamp. pub expires_at: Option<OffsetDateTime>, + /// Structured source reference metadata. pub source_ref: Value, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for note listing. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct ListResponse { + /// Notes visible to the caller after access filtering. pub items: Vec<ListItem>, } impl ElfService { - pub async fn list(&self, req: ListRequest) -> ServiceResult<ListResponse> { + /// Lists notes visible to the caller under the requested filters. + pub async fn list(&self, req: ListRequest) -> Result<ListResponse> { + let now = OffsetDateTime::now_utc(); let tenant_id = req.tenant_id.trim(); let project_id = req.project_id.trim(); - if tenant_id.is_empty() || project_id.is_empty() { - return Err(ServiceError::InvalidRequest { - message: "tenant_id and project_id are required.".to_string(), - }); - } - if let Some(agent_id) = req.agent_id.as_ref() - && agent_id.trim().is_empty() - { - return Err(ServiceError::InvalidRequest { - message: "agent_id must not be empty when provided.".to_string(), - }); - } - if let Some(scope) = req.scope.as_ref() - && !self.cfg.scopes.allowed.iter().any(|value| value == scope) - { - return Err(ServiceError::ScopeDenied { message: "Scope is not allowed.".to_string() }); - } + let agent_id = req.agent_id.as_ref().map(|value| value.trim()).unwrap_or(""); + let requested_status = requested_list_status(req.status.as_ref()); + let status_for_note_read = + requested_status.unwrap_or("active").eq_ignore_ascii_case("active"); + let non_private_scopes = match req.scope.as_deref().map(str::trim) { + Some("agent_private") => None, + Some(scope) => Some(vec![scope.to_string()]), + None => Some( + self.cfg.scopes.allowed.iter().filter(|s| *s != "agent_private").cloned().collect(), + ), + }; - let mut builder = QueryBuilder::new( - "SELECT note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at \ - FROM memory_notes WHERE tenant_id = ", + validate_list_request(&req, tenant_id, project_id, agent_id, &self.cfg.scopes.allowed)?; + + let shared_grants = + list_shared_grants(&self.db.pool, tenant_id, project_id, agent_id, &non_private_scopes) + .await?; + let notes = + list_notes(&self.db.pool, &req, tenant_id, project_id, requested_status, agent_id, now) + .await?; + let items = map_list_items( + notes, + agent_id, + non_private_scopes.as_deref(), + &shared_grants, + status_for_note_read, + now, ); - builder.push_bind(tenant_id); + + Ok(ListResponse { items }) + } +} + +fn requested_list_status(requested_status: Option<&String>) -> Option<&str> { + requested_status.map(|value| value.trim()).filter(|value| !value.is_empty()) +} + +fn validate_list_request( + req: &ListRequest, + tenant_id: &str, + project_id: &str, + agent_id: &str, + allowed_scopes: &[String], +) -> Result<()> { + if tenant_id.is_empty() || project_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + + if let Some(scope) = req.scope.as_ref() + && !allowed_scopes.iter().any(|value| value == scope) + { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + if let Some(agent_id) = req.agent_id.as_ref() + && agent_id.trim().is_empty() + { + return Err(Error::InvalidRequest { + message: "agent_id must not be empty when provided.".to_string(), + }); + } + + if req.scope.as_deref() == Some("agent_private") && agent_id.is_empty() { + return Err(Error::ScopeDenied { + message: "agent_id is required for agent_private scope.".to_string(), + }); + } + + Ok(()) +} + +fn map_list_items( + notes: Vec<MemoryNote>, + agent_id: &str, + non_private_scopes: Option<&[String]>, + shared_grants: &HashSet<access::SharedSpaceGrantKey>, + status_for_note_read: bool, + now: OffsetDateTime, +) -> Vec<ListItem> { + notes + .into_iter() + .filter(|note| { + let Some(scopes) = non_private_scopes else { + return true; + }; + + if status_for_note_read { + return access::note_read_allowed(note, agent_id, scopes, shared_grants, now); + } + + note.agent_id == agent_id + || shared_grants.contains(&crate::access::SharedSpaceGrantKey { + scope: note.scope.clone(), + space_owner_agent_id: note.agent_id.clone(), + }) + }) + .map(|note| ListItem { + note_id: note.note_id, + r#type: note.r#type, + key: note.key, + scope: note.scope, + status: note.status, + text: note.text, + importance: note.importance, + confidence: note.confidence, + updated_at: note.updated_at, + expires_at: note.expires_at, + source_ref: note.source_ref, + }) + .collect() +} + +async fn list_shared_grants( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + agent_id: &str, + non_private_scopes: &Option<Vec<String>>, +) -> Result<HashSet<access::SharedSpaceGrantKey>> { + if non_private_scopes.is_none() || agent_id.is_empty() { + return Ok(HashSet::new()); + } + + let org_shared_allowed = + non_private_scopes.as_ref().is_some_and(|scopes| scopes.iter().any(|s| s == "org_shared")); + + access::load_shared_read_grants_with_org_shared( + pool, + tenant_id, + project_id, + agent_id, + org_shared_allowed, + ) + .await +} + +async fn list_notes( + pool: &PgPool, + req: &ListRequest, + tenant_id: &str, + project_id: &str, + requested_status: Option<&str>, + agent_id: &str, + now: OffsetDateTime, +) -> Result<Vec<MemoryNote>> { + let mut builder = QueryBuilder::new( + "SELECT note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at \ + FROM memory_notes WHERE tenant_id = ", + ); + + builder.push_bind(tenant_id); + + let include_org_shared = match req.scope.as_deref().map(str::trim) { + None => true, + Some("org_shared") => true, + Some(_) => false, + }; + + if include_org_shared { + builder.push(" AND (project_id = "); + builder.push_bind(project_id); + builder.push(" OR (project_id = "); + builder.push_bind(ORG_PROJECT_ID); + builder.push(" AND scope = "); + builder.push_bind("org_shared"); + builder.push("))"); + } else { builder.push(" AND project_id = "); builder.push_bind(project_id); + } - if let Some(scope) = &req.scope { - builder.push(" AND scope = "); - builder.push_bind(scope); - if scope == "agent_private" { - let agent_id = req.agent_id.as_ref().map(|value| value.trim()).unwrap_or(""); - if agent_id.is_empty() { - return Err(ServiceError::ScopeDenied { - message: "agent_id is required for agent_private scope.".to_string(), - }); - } - builder.push(" AND agent_id = "); - builder.push_bind(agent_id); - } - } else { - builder.push(" AND scope != "); - builder.push_bind("agent_private"); - } - if let Some(status) = &req.status { - builder.push(" AND status = "); - builder.push_bind(status); - } - if let Some(note_type) = &req.note_type { - builder.push(" AND type = "); - builder.push_bind(note_type); + if let Some(scope) = &req.scope { + builder.push(" AND scope = "); + builder.push_bind(scope); + + if scope == "agent_private" { + builder.push(" AND agent_id = "); + builder.push_bind(agent_id); } + } else { + builder.push(" AND scope != "); + builder.push_bind("agent_private"); + } + if let Some(status) = requested_status { + builder.push(" AND status = "); + builder.push_bind(status); + } else { + builder.push(" AND status = "); + builder.push_bind("active"); + } - let notes: Vec<MemoryNote> = builder.build_query_as().fetch_all(&self.db.pool).await?; - - let items = notes - .into_iter() - .map(|note| ListItem { - note_id: note.note_id, - note_type: note.r#type, - key: note.key, - scope: note.scope, - status: note.status, - text: note.text, - importance: note.importance, - confidence: note.confidence, - updated_at: note.updated_at, - expires_at: note.expires_at, - source_ref: note.source_ref, - }) - .collect(); + if requested_status.unwrap_or("active").eq_ignore_ascii_case("active") { + builder.push(" AND (expires_at IS NULL OR expires_at > "); + builder.push_bind(now); + builder.push(")"); + } - Ok(ListResponse { items }) + if let Some(note_type) = &req.r#type { + builder.push(" AND type = "); + builder.push_bind(note_type); } + + builder.build_query_as().fetch_all(pool).await.map_err(Into::into) } diff --git a/packages/elf-service/src/notes.rs b/packages/elf-service/src/notes.rs index ef7ecdf4..5b4a2f5d 100644 --- a/packages/elf-service/src/notes.rs +++ b/packages/elf-service/src/notes.rs @@ -1,56 +1,136 @@ -// crates.io +//! Individual note fetch APIs. + +use std::{collections::HashSet, slice}; + +use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; use uuid::Uuid; -// self +use crate::{ + ElfService, Error, Result, + access::{self, ORG_PROJECT_ID}, + structured_fields::{self, StructuredFields}, +}; use elf_storage::models::MemoryNote; -use crate::{ElfService, ServiceError, ServiceResult}; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Request payload for fetching one note. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct NoteFetchRequest { + /// Tenant that owns the note. + pub tenant_id: String, + /// Project that owns the note. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Identifier of the note to fetch. pub note_id: Uuid, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for fetching one note. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct NoteFetchResponse { + /// Note identifier. pub note_id: Uuid, + /// Tenant that owns the note. pub tenant_id: String, + /// Project that owns the note. pub project_id: String, + /// Agent that wrote the note. pub agent_id: String, + /// Scope key for the note. pub scope: String, - #[serde(rename = "type")] - pub note_type: String, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key. pub key: Option<String>, + /// Note body text. pub text: String, + /// Importance score. pub importance: f32, + /// Confidence score. pub confidence: f32, + /// Lifecycle status for the note. pub status: String, #[serde(with = "crate::time_serde")] + /// Last update timestamp. pub updated_at: OffsetDateTime, #[serde(with = "crate::time_serde::option")] + /// Optional expiry timestamp. pub expires_at: Option<OffsetDateTime>, + /// Structured source reference metadata. pub source_ref: Value, + /// Structured fields stored for the note, when present. + pub structured: Option<StructuredFields>, } impl ElfService { - pub async fn get_note(&self, req: NoteFetchRequest) -> ServiceResult<NoteFetchResponse> { - let row: Option<MemoryNote> = - sqlx::query_as("SELECT * FROM memory_notes WHERE note_id = $1") - .bind(req.note_id) - .fetch_optional(&self.db.pool) - .await?; + /// Fetches one note when it is visible to the caller. + pub async fn get_note(&self, req: NoteFetchRequest) -> Result<NoteFetchResponse> { + let now = OffsetDateTime::now_utc(); + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let allowed_scopes = self.cfg.scopes.allowed.clone(); + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope == "org_shared"); + let row: Option<MemoryNote> = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 + AND tenant_id = $2 + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + )", + ) + .bind(req.note_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(&self.db.pool) + .await?; let Some(note) = row else { - return Err(ServiceError::InvalidRequest { message: "Unknown note_id.".to_string() }); + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + }; + let shared_grants = if note.scope == "agent_private" { + HashSet::new() + } else { + access::load_shared_read_grants_with_org_shared( + &self.db.pool, + tenant_id, + project_id, + agent_id, + org_shared_allowed, + ) + .await? }; + + if !access::note_read_allowed(¬e, agent_id, &allowed_scopes, &shared_grants, now) { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + + let structured = structured_fields::fetch_structured_fields( + &self.db.pool, + slice::from_ref(¬e.note_id), + ) + .await? + .remove(¬e.note_id); + Ok(NoteFetchResponse { note_id: note.note_id, tenant_id: note.tenant_id, project_id: note.project_id, agent_id: note.agent_id, scope: note.scope, - note_type: note.r#type, + r#type: note.r#type, key: note.key, text: note.text, importance: note.importance, @@ -59,6 +139,7 @@ impl ElfService { updated_at: note.updated_at, expires_at: note.expires_at, source_ref: note.source_ref, + structured, }) } } diff --git a/packages/elf-service/src/progressive_search.rs b/packages/elf-service/src/progressive_search.rs new file mode 100644 index 00000000..32a8b50d --- /dev/null +++ b/packages/elf-service/src/progressive_search.rs @@ -0,0 +1,1235 @@ +//! Progressive-search APIs. + +use std::{ + cmp::Ordering, + collections::{BTreeMap, HashMap, hash_map::DefaultHasher, hash_set::HashSet}, + hash::{Hash, Hasher}, + str::FromStr, +}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{FromRow, PgExecutor}; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::{ + ElfService, NoteFetchResponse, PayloadLevel, QueryPlan, SearchRequest, SearchTrajectorySummary, + access::{self, ORG_PROJECT_ID, SharedSpaceGrantKey}, + structured_fields::{self, StructuredFields}, +}; +use elf_config::Config; +use elf_domain::english_gate; +use elf_storage::models::MemoryNote; + +const SESSION_SLIDING_TTL_HOURS: i64 = 6; +const SESSION_ABSOLUTE_TTL_HOURS: i64 = 24; + +/// Lightweight session-storable search hit used by progressive-search APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchIndexItem { + /// Note identifier. + pub note_id: Uuid, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key. + pub key: Option<String>, + /// Scope key for the note. + pub scope: String, + /// Importance score. + pub importance: f32, + /// Confidence score. + pub confidence: f32, + #[serde(with = "crate::time_serde")] + /// Last update timestamp. + pub updated_at: OffsetDateTime, + #[serde(with = "crate::time_serde::option")] + /// Optional expiry timestamp. + pub expires_at: Option<OffsetDateTime>, + /// Final ranked score. + pub final_score: f32, + /// Short display summary. + pub summary: String, +} + +/// Response payload for initial indexed search results. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchIndexResponse { + /// Search trace identifier. + pub trace_id: Uuid, + /// Search session identifier used for follow-up requests. + pub search_session_id: Uuid, + #[serde(with = "crate::time_serde")] + /// Session expiry timestamp. + pub expires_at: OffsetDateTime, + /// Stored search hits. + pub items: Vec<SearchIndexItem>, + /// Optional condensed explain output. + pub trajectory_summary: Option<SearchTrajectorySummary>, +} + +/// Search-session mode used by progressive-search APIs. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum SearchSessionMode { + /// Quick-find session without a stored query plan. + QuickFind, + /// Planned-search session with a stored query plan. + PlannedSearch, +} +impl SearchSessionMode { + fn as_str(self) -> &'static str { + match self { + Self::QuickFind => "quick_find", + Self::PlannedSearch => "planned_search", + } + } +} + +impl FromStr for SearchSessionMode { + type Err = crate::Error; + + fn from_str(value: &str) -> std::result::Result<Self, Self::Err> { + match value { + "quick_find" => Ok(Self::QuickFind), + "planned_search" => Ok(Self::PlannedSearch), + _ => Err(crate::Error::Storage { + message: format!("Unknown search session mode: {value}"), + }), + } + } +} + +impl From<SearchSessionizePath> for SearchSessionMode { + fn from(path: SearchSessionizePath) -> Self { + match path { + SearchSessionizePath::Quick => Self::QuickFind, + SearchSessionizePath::Planned => Self::PlannedSearch, + } + } +} + +/// Response payload for reloading a stored search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchSessionGetResponse { + /// Search trace identifier. + pub trace_id: Uuid, + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(with = "crate::time_serde")] + /// Session expiry timestamp. + pub expires_at: OffsetDateTime, + /// Stored hits after trimming to the requested limit. + pub items: Vec<SearchIndexItem>, + /// Session mode. + pub mode: SearchSessionMode, + /// Stored query plan for planned-search sessions. + pub query_plan: Option<QueryPlan>, + /// Optional condensed explain output. + pub trajectory_summary: Option<SearchTrajectorySummary>, +} + +/// Planned-search variant of the indexed search response. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchIndexPlannedResponse { + /// Search trace identifier. + pub trace_id: Uuid, + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(with = "crate::time_serde")] + /// Session expiry timestamp. + pub expires_at: OffsetDateTime, + /// Stored hits. + pub items: Vec<SearchIndexItem>, + /// Optional condensed explain output. + pub trajectory_summary: Option<SearchTrajectorySummary>, + /// Stored query plan for the session. + pub query_plan: QueryPlan, +} + +/// Request payload for reloading a search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchSessionGetRequest { + /// Tenant that owns the session. + pub tenant_id: String, + /// Project that owns the session. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(default)] + /// Desired payload-detail level. + pub payload_level: PayloadLevel, + /// Optional limit on returned items. + pub top_k: Option<u32>, + /// When true, extends the sliding session TTL. + pub touch: Option<bool>, +} + +/// Request payload for timeline projection of a search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTimelineRequest { + /// Tenant that owns the session. + pub tenant_id: String, + /// Project that owns the session. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Search session identifier. + pub search_session_id: Uuid, + /// Desired payload-detail level. + pub payload_level: PayloadLevel, + /// Optional timeline grouping mode. + pub group_by: Option<String>, +} + +/// One timeline bucket for a search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTimelineGroup { + /// Group key, usually a day string. + pub date: String, + /// Items that belong to the group. + pub items: Vec<SearchIndexItem>, +} + +/// Response payload for timeline projection. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTimelineResponse { + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(with = "crate::time_serde")] + /// Session expiry timestamp. + pub expires_at: OffsetDateTime, + /// Timeline groups. + pub groups: Vec<SearchTimelineGroup>, +} + +/// Request payload for materializing details from a search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchDetailsRequest { + /// Tenant that owns the session. + pub tenant_id: String, + /// Project that owns the session. + pub project_id: String, + /// Agent requesting the read. + pub agent_id: String, + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(default)] + /// Desired payload-detail level. + pub payload_level: PayloadLevel, + /// Requested subset of note identifiers. + pub note_ids: Vec<Uuid>, + /// When true, records note-hit metrics for returned details. + pub record_hits: Option<bool>, +} + +/// Per-note error payload for detail materialization. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchDetailsError { + /// Machine-readable error code. + pub code: String, + /// Human-readable error message. + pub message: String, +} + +/// Per-note detail result for a search session. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchDetailsResult { + /// Requested note identifier. + pub note_id: Uuid, + /// Materialized note payload, when loading succeeded. + pub note: Option<NoteFetchResponse>, + /// Per-note failure, when loading failed. + pub error: Option<SearchDetailsError>, +} + +/// Response payload for detail materialization. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchDetailsResponse { + /// Search session identifier. + pub search_session_id: Uuid, + #[serde(with = "crate::time_serde")] + /// Session expiry timestamp. + pub expires_at: OffsetDateTime, + /// Per-note results. + pub results: Vec<SearchDetailsResult>, +} + +struct HitItem { + note_id: Uuid, + chunk_id: Uuid, + rank: u32, + final_score: f32, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SearchSessionizePath { + Quick, + Planned, +} + +struct SearchSessionizedOutput { + index: SearchIndexResponse, + query_plan: Option<QueryPlan>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct SearchSessionItemRecord { + rank: u32, + note_id: Uuid, + chunk_id: Uuid, + final_score: f32, + #[serde(with = "crate::time_serde")] + updated_at: OffsetDateTime, + #[serde(with = "crate::time_serde::option")] + expires_at: Option<OffsetDateTime>, + r#type: String, + key: Option<String>, + scope: String, + importance: f32, + confidence: f32, + summary: String, +} +impl SearchSessionItemRecord { + fn to_index_item(&self) -> SearchIndexItem { + SearchIndexItem { + note_id: self.note_id, + r#type: self.r#type.clone(), + key: self.key.clone(), + scope: self.scope.clone(), + importance: self.importance, + confidence: self.confidence, + updated_at: self.updated_at, + expires_at: self.expires_at, + final_score: self.final_score, + summary: self.summary.clone(), + } + } +} + +struct SearchSession { + search_session_id: Uuid, + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + mode: SearchSessionMode, + trajectory_summary: Option<SearchTrajectorySummary>, + query_plan: Option<QueryPlan>, + items: Vec<SearchSessionItemRecord>, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +#[derive(FromRow)] +struct SearchSessionRow { + search_session_id: Uuid, + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + mode: String, + trajectory_summary: Option<Value>, + query_plan: Option<Value>, + items: Value, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +struct NewSearchSession<'a> { + search_session_id: Uuid, + trace_id: Uuid, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + read_profile: &'a str, + query: &'a str, + mode: SearchSessionMode, + trajectory_summary: Option<&'a SearchTrajectorySummary>, + query_plan: Option<&'a QueryPlan>, + items: &'a [SearchSessionItemRecord], + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +impl ElfService { + /// Runs the default progressive-search path and returns indexed results. + pub async fn search(&self, req: SearchRequest) -> crate::Result<SearchIndexResponse> { + let response = self.search_planned(req).await?; + + Ok(SearchIndexResponse { + trace_id: response.trace_id, + search_session_id: response.search_session_id, + expires_at: response.expires_at, + items: response.items, + trajectory_summary: response.trajectory_summary, + }) + } + + /// Runs quick-find search and stores a quick session without a query plan. + pub async fn search_quick(&self, req: SearchRequest) -> crate::Result<SearchIndexResponse> { + self.search_sessionized(req, SearchSessionizePath::Quick).await.map(|output| output.index) + } + + /// Runs planned search and stores a session with a query plan. + pub async fn search_planned( + &self, + req: SearchRequest, + ) -> crate::Result<SearchIndexPlannedResponse> { + let output = self.search_sessionized(req, SearchSessionizePath::Planned).await?; + let query_plan = output.query_plan.ok_or_else(|| crate::Error::Storage { + message: "Planned search response is missing query_plan.".to_string(), + })?; + + Ok(SearchIndexPlannedResponse { + trace_id: output.index.trace_id, + search_session_id: output.index.search_session_id, + expires_at: output.index.expires_at, + items: output.index.items, + trajectory_summary: output.index.trajectory_summary, + query_plan, + }) + } + + async fn search_sessionized( + &self, + req: SearchRequest, + path: SearchSessionizePath, + ) -> crate::Result<SearchSessionizedOutput> { + let top_k = req.top_k.unwrap_or(self.cfg.memory.top_k).max(1); + let candidate_k = req.candidate_k.unwrap_or(self.cfg.memory.candidate_k).max(top_k); + let mut raw_req = req.clone(); + + raw_req.top_k = Some(candidate_k); + raw_req.record_hits = Some(false); + + let (trace_id, raw_items, trajectory_summary, query_plan) = match path { + SearchSessionizePath::Quick => { + let raw = self.search_raw_quick(raw_req).await?; + + (raw.trace_id, raw.items, raw.trajectory_summary, None) + }, + SearchSessionizePath::Planned => { + let raw = self.search_raw_planned(raw_req).await?; + + (raw.trace_id, raw.items, raw.trajectory_summary, Some(raw.query_plan)) + }, + }; + let now = OffsetDateTime::now_utc(); + let expires_at = now + Duration::hours(SESSION_SLIDING_TTL_HOURS); + let search_session_id = Uuid::new_v4(); + let note_ids: Vec<Uuid> = raw_items.iter().map(|item| item.note_id).collect(); + let structured_by_note = + structured_fields::fetch_structured_fields(&self.db.pool, ¬e_ids).await?; + let mut items = Vec::with_capacity(raw_items.len()); + + for (idx, item) in raw_items.iter().enumerate() { + let summary = structured_by_note + .get(&item.note_id) + .and_then(|value| value.summary.clone()) + .unwrap_or_else(|| { + build_summary(&item.snippet, self.cfg.memory.max_note_chars as usize) + }); + + items.push(SearchSessionItemRecord { + rank: idx as u32 + 1, + note_id: item.note_id, + chunk_id: item.chunk_id, + final_score: item.final_score, + updated_at: item.updated_at, + expires_at: item.expires_at, + r#type: item.r#type.clone(), + key: item.key.clone(), + scope: item.scope.clone(), + importance: item.importance, + confidence: item.confidence, + summary, + }); + } + + store_search_session( + &self.db.pool, + NewSearchSession { + search_session_id, + trace_id, + tenant_id: &req.tenant_id, + project_id: &req.project_id, + agent_id: &req.agent_id, + read_profile: &req.read_profile, + query: &req.query, + mode: SearchSessionMode::from(path), + query_plan: query_plan.as_ref(), + trajectory_summary: trajectory_summary.as_ref(), + items: &items, + created_at: now, + expires_at, + }, + ) + .await?; + + let response_items: Vec<SearchIndexItem> = + items.into_iter().take(top_k as usize).map(|item| item.to_index_item()).collect(); + + Ok(SearchSessionizedOutput { + index: SearchIndexResponse { + trace_id, + search_session_id, + expires_at, + items: response_items, + trajectory_summary, + }, + query_plan, + }) + } + + /// Reloads a stored search session and optionally extends its TTL. + pub async fn search_session_get( + &self, + req: SearchSessionGetRequest, + ) -> crate::Result<SearchSessionGetResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let now = OffsetDateTime::now_utc(); + let session = load_search_session(&self.db.pool, req.search_session_id, now).await?; + + validate_search_session_access(&session, tenant_id, project_id, agent_id)?; + + let touch = req.touch.unwrap_or(true); + let expires_at = if touch { + touch_search_session(&self.db.pool, &session, now).await? + } else { + session.expires_at + }; + let top_k = req.top_k.unwrap_or(self.cfg.memory.top_k).max(1); + let items: Vec<SearchIndexItem> = session + .items + .into_iter() + .take(top_k as usize) + .map(|item| item.to_index_item()) + .collect(); + + Ok(SearchSessionGetResponse { + trace_id: session.trace_id, + search_session_id: session.search_session_id, + expires_at, + items, + mode: session.mode, + query_plan: session.query_plan, + trajectory_summary: session.trajectory_summary, + }) + } + + /// Reprojects a stored search session into timeline groups. + pub async fn search_timeline( + &self, + req: SearchTimelineRequest, + ) -> crate::Result<SearchTimelineResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let now = OffsetDateTime::now_utc(); + let session = load_search_session(&self.db.pool, req.search_session_id, now).await?; + + validate_search_session_access(&session, tenant_id, project_id, agent_id)?; + + let expires_at = touch_search_session(&self.db.pool, &session, now).await?; + let payload_level = req.payload_level; + let group_by = req.group_by.unwrap_or_else(|| { + if payload_level == PayloadLevel::L0 { "none".to_string() } else { "day".to_string() } + }); + + match group_by.as_str() { + "day" => build_timeline_by_day(session.search_session_id, expires_at, &session.items), + "none" => Ok(SearchTimelineResponse { + search_session_id: session.search_session_id, + expires_at, + groups: vec![SearchTimelineGroup { + date: "all".to_string(), + items: session + .items + .iter() + .map(SearchSessionItemRecord::to_index_item) + .collect(), + }], + }), + _ => Err(crate::Error::InvalidRequest { + message: "group_by must be one of: day, none.".to_string(), + }), + } + } + + /// Materializes selected note details out of a stored search session. + pub async fn search_details( + &self, + req: SearchDetailsRequest, + ) -> crate::Result<SearchDetailsResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let now = OffsetDateTime::now_utc(); + let session = load_search_session(&self.db.pool, req.search_session_id, now).await?; + + validate_search_session_access(&session, tenant_id, project_id, agent_id)?; + + let expires_at = touch_search_session(&self.db.pool, &session, now).await?; + let mut by_note_id: HashMap<Uuid, SearchSessionItemRecord> = HashMap::new(); + + for item in &session.items { + by_note_id.insert(item.note_id, item.clone()); + } + + let mut requested_in_session = Vec::new(); + let mut seen = HashSet::new(); + + for note_id in &req.note_ids { + if by_note_id.contains_key(note_id) && seen.insert(*note_id) { + requested_in_session.push(*note_id); + } + } + + let mut notes_by_id = HashMap::new(); + + if !requested_in_session.is_empty() { + let rows: Vec<MemoryNote> = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = ANY($1::uuid[]) + AND tenant_id = $2 + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + )", + ) + .bind(requested_in_session.as_slice()) + .bind(session.tenant_id.as_str()) + .bind(session.project_id.as_str()) + .bind(ORG_PROJECT_ID) + .fetch_all(&self.db.pool) + .await?; + + for note in rows { + notes_by_id.insert(note.note_id, note); + } + } + + let structured_by_note = if req.payload_level == PayloadLevel::L0 { + HashMap::new() + } else { + structured_fields::fetch_structured_fields( + &self.db.pool, + requested_in_session.as_slice(), + ) + .await? + }; + let allowed_scopes = resolve_read_scopes(&self.cfg, &session.read_profile)?; + let shared_grants = access::load_shared_read_grants_with_org_shared( + &self.db.pool, + session.tenant_id.as_str(), + session.project_id.as_str(), + agent_id, + allowed_scopes.iter().any(|scope| scope == "org_shared"), + ) + .await?; + let record_hits = req.record_hits.unwrap_or(true); + let details_args = SearchDetailsBuildArgs { + session_items_by_note_id: &by_note_id, + notes_by_id: ¬es_by_id, + structured_by_note: &structured_by_note, + session: &session, + shared_grants: &shared_grants, + allowed_scopes: &allowed_scopes, + now, + record_hits_enabled: record_hits, + payload_level: req.payload_level, + max_note_chars: self.cfg.memory.max_note_chars as usize, + }; + let (results, hits) = build_search_details_results(req.note_ids, details_args); + + if !hits.is_empty() { + let mut tx = self.db.pool.begin().await?; + + record_detail_hits(&mut *tx, &session.query, &hits, now).await?; + + tx.commit().await?; + } + + Ok(SearchDetailsResponse { + search_session_id: session.search_session_id, + expires_at, + results, + }) + } +} + +struct SearchDetailsBuildArgs<'a> { + session_items_by_note_id: &'a HashMap<Uuid, SearchSessionItemRecord>, + notes_by_id: &'a HashMap<Uuid, MemoryNote>, + structured_by_note: &'a HashMap<Uuid, StructuredFields>, + session: &'a SearchSession, + shared_grants: &'a HashSet<SharedSpaceGrantKey>, + allowed_scopes: &'a [String], + now: OffsetDateTime, + record_hits_enabled: bool, + payload_level: PayloadLevel, + max_note_chars: usize, +} + +fn build_search_details_results( + requested_note_ids: Vec<Uuid>, + args: SearchDetailsBuildArgs<'_>, +) -> (Vec<SearchDetailsResult>, Vec<HitItem>) { + let mut results = Vec::with_capacity(requested_note_ids.len()); + let mut hits = Vec::new(); + let mut hit_seen = HashSet::new(); + + for note_id in requested_note_ids { + let Some(session_item) = args.session_items_by_note_id.get(¬e_id) else { + results.push(SearchDetailsResult { + note_id, + note: None, + error: Some(SearchDetailsError { + code: "NOT_IN_SESSION".to_string(), + message: "Requested note_id is not present in the search session.".to_string(), + }), + }); + + continue; + }; + let Some(note) = args.notes_by_id.get(¬e_id) else { + results.push(SearchDetailsResult { + note_id, + note: None, + error: Some(SearchDetailsError { + code: "NOTE_NOT_FOUND".to_string(), + message: "Note not found.".to_string(), + }), + }); + + continue; + }; + let error = validate_note_access( + note, + args.session, + args.allowed_scopes, + args.shared_grants, + args.now, + ); + + if let Some(error) = error { + results.push(SearchDetailsResult { note_id, note: None, error: Some(error) }); + + continue; + } + + let structured = if args.payload_level == PayloadLevel::L0 { + None + } else { + args.structured_by_note.get(¬e.note_id).cloned() + }; + let note_text = apply_payload_level_to_search_details_text( + note.text.as_str(), + structured.as_ref(), + args.payload_level, + args.max_note_chars, + ); + let source_ref = if args.payload_level == PayloadLevel::L2 { + note.source_ref.clone() + } else { + serde_json::json!({}) + }; + let note_response = NoteFetchResponse { + note_id: note.note_id, + tenant_id: note.tenant_id.clone(), + project_id: note.project_id.clone(), + agent_id: note.agent_id.clone(), + scope: note.scope.clone(), + r#type: note.r#type.clone(), + key: note.key.clone(), + text: note_text, + importance: note.importance, + confidence: note.confidence, + status: note.status.clone(), + updated_at: note.updated_at, + expires_at: note.expires_at, + source_ref, + structured, + }; + + results.push(SearchDetailsResult { note_id, note: Some(note_response), error: None }); + + if args.record_hits_enabled && hit_seen.insert(note_id) { + hits.push(HitItem { + note_id, + chunk_id: session_item.chunk_id, + rank: session_item.rank, + final_score: session_item.final_score, + }); + } + } + + (results, hits) +} + +fn apply_payload_level_to_search_details_text( + raw_text: &str, + structured: Option<&StructuredFields>, + payload_level: PayloadLevel, + max_note_chars: usize, +) -> String { + match payload_level { + PayloadLevel::L0 => build_summary(raw_text, max_note_chars), + PayloadLevel::L1 => { + let candidate_text = structured + .and_then(|item| item.summary.as_deref()) + .filter(|summary| !summary.trim().is_empty()) + .unwrap_or(raw_text); + + build_summary(candidate_text, max_note_chars) + }, + PayloadLevel::L2 => raw_text.to_string(), + } +} + +fn build_timeline_by_day( + search_session_id: Uuid, + expires_at: OffsetDateTime, + items: &[SearchSessionItemRecord], +) -> crate::Result<SearchTimelineResponse> { + let mut grouped: BTreeMap<String, Vec<SearchIndexItem>> = BTreeMap::new(); + + for item in items { + let date = item.updated_at.date().to_string(); + + grouped.entry(date).or_default().push(item.to_index_item()); + } + + let mut groups = Vec::with_capacity(grouped.len()); + + for (date, mut items) in grouped.into_iter().rev() { + items.sort_by(|a, b| { + b.updated_at + .cmp(&a.updated_at) + .then_with(|| b.final_score.partial_cmp(&a.final_score).unwrap_or(Ordering::Equal)) + }); + groups.push(SearchTimelineGroup { date, items }); + } + + Ok(SearchTimelineResponse { search_session_id, expires_at, groups }) +} + +fn build_summary(raw: &str, max_chars: usize) -> String { + let normalized = normalize_whitespace(raw); + + truncate_chars(&normalized, max_chars) +} + +fn normalize_whitespace(raw: &str) -> String { + let mut out = String::with_capacity(raw.len()); + let mut prev_space = false; + + for ch in raw.chars() { + if ch.is_whitespace() { + if !prev_space { + out.push(' '); + + prev_space = true; + } + + continue; + } + + out.push(ch); + + prev_space = false; + } + + out.trim().to_string() +} + +fn truncate_chars(raw: &str, max_chars: usize) -> String { + if raw.chars().count() <= max_chars { + return raw.to_string(); + } + + const TRUNCATION_MARKER: &str = "..."; + + let marker_chars = TRUNCATION_MARKER.chars().count(); + + if max_chars <= marker_chars { + return TRUNCATION_MARKER.chars().take(max_chars).collect(); + } + + let truncated_chars = max_chars - marker_chars; + let mut out = String::with_capacity(max_chars); + + for (idx, ch) in raw.chars().enumerate() { + if idx >= truncated_chars { + break; + } + + out.push(ch); + } + + out.push_str(TRUNCATION_MARKER); + + out +} + +fn resolve_read_scopes(cfg: &Config, profile: &str) -> crate::Result<Vec<String>> { + match profile { + "private_only" => Ok(cfg.scopes.read_profiles.private_only.clone()), + "private_plus_project" => Ok(cfg.scopes.read_profiles.private_plus_project.clone()), + "all_scopes" => Ok(cfg.scopes.read_profiles.all_scopes.clone()), + _ => Err(crate::Error::InvalidRequest { message: "Unknown read_profile.".to_string() }), + } +} + +fn validate_search_session_access( + session: &SearchSession, + tenant_id: &str, + project_id: &str, + agent_id: &str, +) -> crate::Result<()> { + if session.tenant_id != tenant_id + || session.project_id != project_id + || session.agent_id != agent_id + { + return Err(crate::Error::InvalidRequest { + message: "Unknown search_session_id.".to_string(), + }); + } + + Ok(()) +} + +fn validate_note_access( + note: &MemoryNote, + session: &SearchSession, + allowed_scopes: &[String], + shared_grants: &HashSet<SharedSpaceGrantKey>, + now: OffsetDateTime, +) -> Option<SearchDetailsError> { + if note.status != "active" { + return Some(SearchDetailsError { + code: "NOTE_INACTIVE".to_string(), + message: "Note is not active.".to_string(), + }); + } + if note.expires_at.map(|ts| ts <= now).unwrap_or(false) { + return Some(SearchDetailsError { + code: "NOTE_EXPIRED".to_string(), + message: "Note is expired.".to_string(), + }); + } + if !allowed_scopes.iter().any(|scope| scope == ¬e.scope) { + return Some(SearchDetailsError { + code: "SCOPE_DENIED".to_string(), + message: "Note scope is not allowed for this read_profile.".to_string(), + }); + } + if !access::note_read_allowed( + note, + session.agent_id.as_str(), + allowed_scopes, + shared_grants, + now, + ) { + return Some(SearchDetailsError { + code: "SCOPE_DENIED".to_string(), + message: "Note scope is not allowed for this read_profile.".to_string(), + }); + } + + None +} + +fn hash_query(query: &str) -> String { + let mut hasher = DefaultHasher::new(); + + Hash::hash(query, &mut hasher); + + format!("{:x}", hasher.finish()) +} + +async fn store_search_session<'e, E>( + executor: E, + session: NewSearchSession<'_>, +) -> crate::Result<()> +where + E: PgExecutor<'e>, +{ + let items_json = serde_json::to_value(session.items).map_err(|err| crate::Error::Storage { + message: format!("Failed to encode search session items: {err}"), + })?; + let query_plan_json = + session.query_plan.map(serde_json::to_value).transpose().map_err(|err| { + crate::Error::Storage { + message: format!("Failed to encode search session query plan: {err}"), + } + })?; + let trajectory_summary_json = + session.trajectory_summary.map(serde_json::to_value).transpose().map_err(|err| { + crate::Error::Storage { + message: format!("Failed to encode search session trajectory summary: {err}"), + } + })?; + + sqlx::query( + "\ +INSERT INTO search_sessions ( + search_session_id, + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + mode, + trajectory_summary, + query_plan, + items, + created_at, + expires_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)", + ) + .bind(session.search_session_id) + .bind(session.trace_id) + .bind(session.tenant_id.trim()) + .bind(session.project_id.trim()) + .bind(session.agent_id.trim()) + .bind(session.read_profile) + .bind(session.query) + .bind(session.mode.as_str()) + .bind(trajectory_summary_json) + .bind(query_plan_json) + .bind(items_json) + .bind(session.created_at) + .bind(session.expires_at) + .execute(executor) + .await?; + + Ok(()) +} + +async fn load_search_session<'e, E>( + executor: E, + search_session_id: Uuid, + now: OffsetDateTime, +) -> crate::Result<SearchSession> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, SearchSessionRow>( + "\ +SELECT + search_session_id, + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + mode, + trajectory_summary, + query_plan, + items, + created_at, + expires_at +FROM search_sessions +WHERE search_session_id = $1", + ) + .bind(search_session_id) + .fetch_optional(executor) + .await?; + let Some(row) = row else { + return Err(crate::Error::InvalidRequest { + message: "Unknown search_session_id.".to_string(), + }); + }; + let expires_at: OffsetDateTime = row.expires_at; + + if expires_at <= now { + return Err(crate::Error::InvalidRequest { + message: "Search session expired.".to_string(), + }); + } + + let items: Vec<SearchSessionItemRecord> = serde_json::from_value(row.items).map_err(|err| { + crate::Error::Storage { message: format!("Failed to decode search session items: {err}") } + })?; + let mode = SearchSessionMode::from_str(row.mode.as_str())?; + let query_plan = match row.query_plan { + Some(value) => + Some(serde_json::from_value(value).map_err(|err| crate::Error::Storage { + message: format!("Failed to decode search session query_plan: {err}"), + })?), + None => None, + }; + let trajectory_summary = match row.trajectory_summary { + Some(value) => + Some(serde_json::from_value(value).map_err(|err| crate::Error::Storage { + message: format!("Failed to decode search session trajectory summary: {err}"), + })?), + None => None, + }; + + Ok(SearchSession { + search_session_id: row.search_session_id, + trace_id: row.trace_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + read_profile: row.read_profile, + query: row.query, + items, + mode, + trajectory_summary, + query_plan, + created_at: row.created_at, + expires_at, + }) +} + +async fn touch_search_session<'e, E>( + executor: E, + session: &SearchSession, + now: OffsetDateTime, +) -> crate::Result<OffsetDateTime> +where + E: PgExecutor<'e>, +{ + let absolute_expires_at = session.created_at + Duration::hours(SESSION_ABSOLUTE_TTL_HOURS); + let sliding_expires_at = now + Duration::hours(SESSION_SLIDING_TTL_HOURS); + let touched = if sliding_expires_at < absolute_expires_at { + sliding_expires_at + } else { + absolute_expires_at + }; + + if touched <= session.expires_at { + return Ok(session.expires_at); + } + + sqlx::query( + "UPDATE search_sessions SET expires_at = $1 WHERE search_session_id = $2 AND expires_at < $1", + ) + .bind(touched) + .bind(session.search_session_id) + .execute(executor) + .await?; + + Ok(touched) +} + +async fn record_detail_hits<'e, E>( + executor: E, + query: &str, + items: &[HitItem], + now: OffsetDateTime, +) -> crate::Result<()> +where + E: PgExecutor<'e>, +{ + if !english_gate::is_english_natural_language(query) { + return Err(crate::Error::NonEnglishInput { field: "$.query".to_string() }); + } + + let query_hash = hash_query(query); + let mut hit_ids = Vec::with_capacity(items.len()); + let mut note_ids = Vec::with_capacity(items.len()); + let mut chunk_ids = Vec::with_capacity(items.len()); + let mut ranks = Vec::with_capacity(items.len()); + let mut final_scores = Vec::with_capacity(items.len()); + + for item in items { + let rank = i32::try_from(item.rank).map_err(|_| crate::Error::InvalidRequest { + message: "Search session rank is out of range.".to_string(), + })?; + + hit_ids.push(Uuid::new_v4()); + note_ids.push(item.note_id); + chunk_ids.push(item.chunk_id); + ranks.push(rank); + final_scores.push(item.final_score); + } + + sqlx::query( + "\ +WITH hits AS ( + SELECT * + FROM unnest( + $1::uuid[], + $2::uuid[], + $3::uuid[], + $4::int4[], + $5::real[] +) AS t(hit_id, note_id, chunk_id, rank, final_score) +), +updated AS ( +UPDATE memory_notes +SET + hit_count = hit_count + 1, + last_hit_at = $6 +WHERE note_id = ANY($2) +) +INSERT INTO memory_hits ( + hit_id, + note_id, + chunk_id, + query_hash, + rank, + final_score, + ts +) +SELECT + hit_id, + note_id, + chunk_id, + $7, + rank, + final_score, + $6 +FROM hits", + ) + .bind(&hit_ids) + .bind(¬e_ids) + .bind(&chunk_ids) + .bind(&ranks) + .bind(&final_scores) + .bind(now) + .bind(query_hash.as_str()) + .execute(executor) + .await?; + + Ok(()) +} diff --git a/packages/elf-service/src/provenance.rs b/packages/elf-service/src/provenance.rs new file mode 100644 index 00000000..c39af394 --- /dev/null +++ b/packages/elf-service/src/provenance.rs @@ -0,0 +1,1134 @@ +//! Provenance inspection APIs. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::{self, Value}; +use sqlx::{FromRow, PgPool}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ElfService, Error, Result}; +use elf_storage::models::MemoryNote; + +const NOTE_PROVENANCE_BUNDLE_SCHEMA_V1: &str = "elf.note_provenance_bundle/v1"; +const NOTE_PROVENANCE_INGEST_DECISIONS_LIMIT: i64 = 100; +const NOTE_PROVENANCE_NOTE_VERSIONS_LIMIT: i64 = 100; +const NOTE_PROVENANCE_OUTBOX_LIMIT: i64 = 100; +const NOTE_PROVENANCE_RECENT_TRACES_LIMIT: i64 = 20; +const NOTE_PROVENANCE_HISTORY_LIMIT: i64 = 200; +const MEMORY_HISTORY_SCHEMA_V1: &str = "elf.memory_history/v1"; + +/// Request payload for note provenance lookup. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceGetRequest { + /// Tenant that owns the note. + pub tenant_id: String, + /// Project that owns the note. + pub project_id: String, + /// Identifier of the note to inspect. + pub note_id: Uuid, +} + +/// Request payload for memory-history lookup. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct MemoryHistoryGetRequest { + /// Tenant that owns the memory. + pub tenant_id: String, + /// Project that owns the memory. + pub project_id: String, + /// Identifier of the note to inspect. + pub note_id: Uuid, +} + +/// Timeline response for one memory. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct MemoryHistoryResponse { + /// History schema identifier. + pub schema: String, + /// Inspected note identifier. + pub note_id: Uuid, + /// Chronological memory events. + pub events: Vec<MemoryHistoryEvent>, +} + +/// Full provenance bundle for one note. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceBundleResponse { + /// Provenance bundle schema identifier. + pub schema: String, + /// Current persisted note snapshot. + pub note: NoteProvenanceNote, + /// Recorded ingestion decisions for the note. + pub ingest_decisions: Vec<NoteProvenanceIngestDecision>, + /// Version-history rows for the note. + pub note_versions: Vec<NoteProvenanceNoteVersion>, + /// Indexing outbox history for the note. + pub indexing_outbox: Vec<NoteProvenanceIndexingOutbox>, + /// Recent search traces that referenced the note. + pub recent_traces: Vec<NoteProvenanceRecentTrace>, + /// Chronological memory event timeline for the note. + pub history: Vec<MemoryHistoryEvent>, +} + +/// Current note snapshot returned by provenance APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceNote { + /// Note identifier. + pub note_id: Uuid, + /// Tenant that owns the note. + pub tenant_id: String, + /// Project that owns the note. + pub project_id: String, + /// Agent that wrote the note. + pub agent_id: String, + /// Scope key for the note. + pub scope: String, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key. + pub key: Option<String>, + /// Note body text. + pub text: String, + /// Importance score. + pub importance: f32, + /// Confidence score. + pub confidence: f32, + /// Lifecycle status. + pub status: String, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Last update timestamp. + pub updated_at: OffsetDateTime, + #[serde(with = "crate::time_serde::option")] + /// Optional expiry timestamp. + pub expires_at: Option<OffsetDateTime>, + /// Structured source reference metadata. + pub source_ref: Value, + /// Embedding version associated with the note. + pub embedding_version: String, + /// Search hit counter. + pub hit_count: i64, + #[serde(with = "crate::time_serde::option")] + /// Timestamp of the most recent hit. + pub last_hit_at: Option<OffsetDateTime>, +} +impl From<MemoryNote> for NoteProvenanceNote { + fn from(note: MemoryNote) -> Self { + Self { + note_id: note.note_id, + tenant_id: note.tenant_id, + project_id: note.project_id, + agent_id: note.agent_id, + scope: note.scope, + r#type: note.r#type, + key: note.key, + text: note.text, + importance: note.importance, + confidence: note.confidence, + status: note.status, + created_at: note.created_at, + updated_at: note.updated_at, + expires_at: note.expires_at, + source_ref: note.source_ref, + embedding_version: note.embedding_version, + hit_count: note.hit_count, + last_hit_at: note.last_hit_at, + } + } +} + +/// One recorded ingestion decision for a note. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceIngestDecision { + /// Decision identifier. + pub decision_id: Uuid, + /// Tenant that owns the decision record. + pub tenant_id: String, + /// Project that owns the decision record. + pub project_id: String, + /// Agent that triggered the ingestion decision. + pub agent_id: String, + /// Scope key evaluated by the decision. + pub scope: String, + /// Pipeline name that produced the decision. + pub pipeline: String, + /// Note type discriminator under evaluation. + pub note_type: String, + /// Optional application-defined key under evaluation. + pub note_key: Option<String>, + /// Note identifier, when a note was persisted or matched. + pub note_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Note version produced by this decision, when applicable. + pub note_version_id: Option<Uuid>, + /// Pre-policy base decision. + pub base_decision: String, + /// Final policy decision. + pub policy_decision: String, + /// Persistence operation that followed the decision. + pub note_op: String, + /// Machine-readable reason code, if any. + pub reason_code: Option<String>, + /// Structured diagnostic details. + pub details: Value, + #[serde(with = "crate::time_serde")] + /// Decision timestamp. + pub ts: OffsetDateTime, +} +impl From<NoteIngestDecisionRow> for NoteProvenanceIngestDecision { + fn from(row: NoteIngestDecisionRow) -> Self { + Self { + decision_id: row.decision_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + scope: row.scope, + pipeline: row.pipeline, + note_type: row.note_type, + note_key: row.note_key, + note_id: row.note_id, + note_version_id: row.note_version_id, + base_decision: row.base_decision, + policy_decision: row.policy_decision, + note_op: row.note_op, + reason_code: row.reason_code, + details: row.details, + ts: row.ts, + } + } +} + +/// One version-history row for a note. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceNoteVersion { + /// Version row identifier. + pub version_id: Uuid, + /// Note identifier. + pub note_id: Uuid, + /// Operation recorded in the version row. + pub op: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Snapshot before the operation, when available. + pub prev_snapshot: Option<Value>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Snapshot after the operation, when available. + pub new_snapshot: Option<Value>, + /// Human-readable reason for the change. + pub reason: String, + /// Actor that performed the change. + pub actor: String, + #[serde(with = "crate::time_serde")] + /// Version timestamp. + pub ts: OffsetDateTime, +} +impl From<NoteVersionRow> for NoteProvenanceNoteVersion { + fn from(row: NoteVersionRow) -> Self { + Self { + version_id: row.version_id, + note_id: row.note_id, + op: row.op, + prev_snapshot: row.prev_snapshot, + new_snapshot: row.new_snapshot, + reason: row.reason, + actor: row.actor, + ts: row.ts, + } + } +} + +/// One indexing-outbox row for a note. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceIndexingOutbox { + /// Outbox identifier. + pub outbox_id: Uuid, + /// Note identifier. + pub note_id: Uuid, + /// Requested indexing operation. + pub op: String, + /// Embedding version targeted by the job. + pub embedding_version: String, + /// Current outbox status. + pub status: String, + /// Number of attempts already made. + pub attempts: i32, + #[serde(skip_serializing_if = "Option::is_none")] + /// Most recent failure text, if any. + pub last_error: Option<String>, + #[serde(with = "crate::time_serde")] + /// Earliest time the job may be claimed again. + pub available_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Creation timestamp. + pub created_at: OffsetDateTime, + #[serde(with = "crate::time_serde")] + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} +impl From<NoteIndexingOutboxRow> for NoteProvenanceIndexingOutbox { + fn from(row: NoteIndexingOutboxRow) -> Self { + Self { + outbox_id: row.outbox_id, + note_id: row.note_id, + op: row.op, + embedding_version: row.embedding_version, + status: row.status, + attempts: row.attempts, + last_error: row.last_error, + available_at: row.available_at, + created_at: row.created_at, + updated_at: row.updated_at, + } + } +} + +/// Recent search trace that referenced the note. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct NoteProvenanceRecentTrace { + /// Search trace identifier. + pub trace_id: Uuid, + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent that ran the search. + pub agent_id: String, + /// Read profile used for the trace. + pub read_profile: String, + /// Search query text. + pub query: String, + #[serde(with = "crate::time_serde")] + /// Trace creation timestamp. + pub created_at: OffsetDateTime, +} + +/// One normalized memory-history event. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct MemoryHistoryEvent { + /// Stable event identifier within its source table. + pub event_id: String, + /// Normalized event type. + pub event_type: String, + /// Subject kind for the event. + pub subject_type: String, + /// Inspected note identifier. + pub note_id: Uuid, + /// Durable source table behind the event. + pub source_table: String, + /// Source row identifier when available. + pub source_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Related note version, when an ingest decision produced a version row. + pub related_note_version_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Related ingest decision, when a version or history event was caused by ingestion. + pub related_decision_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Related consolidation proposal, when a derived memory proposal references the note. + pub related_proposal_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Actor that caused the event, when available. + pub actor: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Source operation string. + pub op: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Machine-readable reason code, when available. + pub reason_code: Option<String>, + /// Human-readable one-line event summary. + pub summary: String, + /// Source-specific event details. + pub details: Value, + #[serde(with = "crate::time_serde")] + /// Event timestamp. + pub ts: OffsetDateTime, +} + +#[derive(Clone, Debug)] +struct ValidatedNoteProvenanceRequest { + tenant_id: String, + project_id: String, + note_id: Uuid, +} + +#[derive(FromRow)] +struct NoteIngestDecisionRow { + decision_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + scope: String, + pipeline: String, + note_type: String, + note_key: Option<String>, + note_id: Option<Uuid>, + note_version_id: Option<Uuid>, + base_decision: String, + policy_decision: String, + note_op: String, + reason_code: Option<String>, + details: Value, + ts: OffsetDateTime, +} + +#[derive(FromRow)] +struct NoteVersionRow { + version_id: Uuid, + note_id: Uuid, + op: String, + prev_snapshot: Option<Value>, + new_snapshot: Option<Value>, + reason: String, + actor: String, + ts: OffsetDateTime, +} + +#[derive(FromRow)] +struct NoteIndexingOutboxRow { + outbox_id: Uuid, + note_id: Uuid, + op: String, + embedding_version: String, + status: String, + attempts: i32, + last_error: Option<String>, + available_at: OffsetDateTime, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, +} + +#[derive(FromRow)] +struct NoteRecentTraceRow { + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + created_at: OffsetDateTime, +} + +#[derive(FromRow)] +struct NoteDerivedProposalRow { + proposal_id: Uuid, + run_id: Uuid, + agent_id: String, + proposal_kind: String, + apply_intent: String, + review_state: String, + source_refs: Value, + source_snapshot: Value, + lineage: Value, + diff: Value, + confidence: f32, + target_ref: Value, + proposed_payload: Value, + created_at: OffsetDateTime, +} + +#[derive(FromRow)] +struct NoteProposalReviewRow { + review_id: Uuid, + proposal_id: Uuid, + run_id: Uuid, + reviewer_agent_id: String, + action: String, + from_review_state: String, + to_review_state: String, + review_comment: Option<String>, + created_at: OffsetDateTime, + proposal_kind: String, + apply_intent: String, + diff: Value, +} + +impl ElfService { + /// Loads the current note plus recent provenance tables as one bundle. + pub async fn note_provenance_get( + &self, + req: NoteProvenanceGetRequest, + ) -> Result<NoteProvenanceBundleResponse> { + let req = validate_note_provenance_request(req)?; + let note = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 + AND tenant_id = $2 + AND project_id = $3", + ) + .bind(req.note_id) + .bind(&req.tenant_id) + .bind(&req.project_id) + .fetch_optional(&self.db.pool) + .await?; + let Some(note_row) = note else { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + }; + let ingest_decisions = load_ingest_decisions(&self.db.pool, &req).await?; + let note_versions = + load_note_versions(&self.db.pool, &req.tenant_id, &req.project_id, req.note_id).await?; + let indexing_outbox = + load_indexing_outbox(&self.db.pool, &req.tenant_id, &req.project_id, req.note_id) + .await?; + let recent_traces = load_recent_traces_for_note( + &self.db.pool, + &req.tenant_id, + &req.project_id, + req.note_id, + ) + .await?; + let history = load_memory_history_events(&self.db.pool, &req, ¬e_row).await?; + + Ok(NoteProvenanceBundleResponse { + schema: NOTE_PROVENANCE_BUNDLE_SCHEMA_V1.to_string(), + note: NoteProvenanceNote::from(note_row), + ingest_decisions, + note_versions, + indexing_outbox, + recent_traces, + history, + }) + } + + /// Loads the normalized memory-history timeline for one note. + pub async fn memory_history_get( + &self, + req: MemoryHistoryGetRequest, + ) -> Result<MemoryHistoryResponse> { + let req = validate_note_provenance_request(NoteProvenanceGetRequest { + tenant_id: req.tenant_id, + project_id: req.project_id, + note_id: req.note_id, + })?; + let note_row = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 + AND tenant_id = $2 + AND project_id = $3", + ) + .bind(req.note_id) + .bind(&req.tenant_id) + .bind(&req.project_id) + .fetch_optional(&self.db.pool) + .await?; + let Some(note_row) = note_row else { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + }; + let events = load_memory_history_events(&self.db.pool, &req, ¬e_row).await?; + + Ok(MemoryHistoryResponse { + schema: MEMORY_HISTORY_SCHEMA_V1.to_string(), + note_id: req.note_id, + events, + }) + } +} + +fn validate_note_provenance_request( + req: NoteProvenanceGetRequest, +) -> Result<ValidatedNoteProvenanceRequest> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + + Ok(ValidatedNoteProvenanceRequest { + tenant_id: tenant_id.to_string(), + project_id: project_id.to_string(), + note_id: req.note_id, + }) +} + +fn to_recent_trace(item: NoteRecentTraceRow) -> NoteProvenanceRecentTrace { + NoteProvenanceRecentTrace { + trace_id: item.trace_id, + tenant_id: item.tenant_id, + project_id: item.project_id, + agent_id: item.agent_id, + read_profile: item.read_profile, + query: item.query, + created_at: item.created_at, + } +} + +fn version_history_event( + version: &NoteProvenanceNoteVersion, + decision: Option<&&NoteProvenanceIngestDecision>, +) -> MemoryHistoryEvent { + let event_type = version_event_type(version.op.as_str(), version.reason.as_str()); + let related_decision_id = decision.map(|decision| decision.decision_id); + let details = serde_json::json!({ + "reason": version.reason, + "prev_snapshot": version.prev_snapshot, + "new_snapshot": version.new_snapshot, + "ingest_decision": decision.map(|decision| serde_json::json!({ + "decision_id": decision.decision_id, + "pipeline": decision.pipeline, + "base_decision": decision.base_decision, + "policy_decision": decision.policy_decision, + "note_op": decision.note_op, + "reason_code": decision.reason_code, + })), + }); + + MemoryHistoryEvent { + event_id: format!("memory_note_versions:{}", version.version_id), + event_type: event_type.to_string(), + subject_type: "note".to_string(), + note_id: version.note_id, + source_table: "memory_note_versions".to_string(), + source_id: Some(version.version_id), + related_note_version_id: Some(version.version_id), + related_decision_id, + related_proposal_id: None, + actor: Some(version.actor.clone()), + op: Some(version.op.clone()), + reason_code: None, + summary: version_summary(event_type, version.reason.as_str()), + details, + ts: version.ts, + } +} + +fn decision_history_event( + note_id: Uuid, + decision: &NoteProvenanceIngestDecision, +) -> MemoryHistoryEvent { + let event_type = decision_event_type(decision); + let details = serde_json::json!({ + "pipeline": decision.pipeline, + "note_type": decision.note_type, + "note_key": decision.note_key, + "base_decision": decision.base_decision, + "policy_decision": decision.policy_decision, + "note_op": decision.note_op, + "details": decision.details, + }); + + MemoryHistoryEvent { + event_id: format!("memory_ingest_decisions:{}", decision.decision_id), + event_type: event_type.to_string(), + subject_type: "note".to_string(), + note_id, + source_table: "memory_ingest_decisions".to_string(), + source_id: Some(decision.decision_id), + related_note_version_id: decision.note_version_id, + related_decision_id: Some(decision.decision_id), + related_proposal_id: None, + actor: Some(decision.agent_id.clone()), + op: Some(decision.note_op.clone()), + reason_code: decision.reason_code.clone(), + summary: decision_summary(event_type, decision), + details, + ts: decision.ts, + } +} + +fn expire_history_event(note: &MemoryNote, expires_at: OffsetDateTime) -> MemoryHistoryEvent { + MemoryHistoryEvent { + event_id: format!("memory_notes:{}:expire:{expires_at}", note.note_id), + event_type: "expire".to_string(), + subject_type: "note".to_string(), + note_id: note.note_id, + source_table: "memory_notes".to_string(), + source_id: Some(note.note_id), + related_note_version_id: None, + related_decision_id: None, + related_proposal_id: None, + actor: Some(note.agent_id.clone()), + op: Some("EXPIRE".to_string()), + reason_code: None, + summary: "Note reached its persisted expires_at timestamp.".to_string(), + details: serde_json::json!({ + "status": note.status, + "expires_at": expires_at, + }), + ts: expires_at, + } +} + +fn derived_proposal_history_event( + note_id: Uuid, + proposal: NoteDerivedProposalRow, +) -> MemoryHistoryEvent { + MemoryHistoryEvent { + event_id: format!("consolidation_proposals:{}", proposal.proposal_id), + event_type: "derived".to_string(), + subject_type: "note".to_string(), + note_id, + source_table: "consolidation_proposals".to_string(), + source_id: Some(proposal.proposal_id), + related_note_version_id: None, + related_decision_id: None, + related_proposal_id: Some(proposal.proposal_id), + actor: Some(proposal.agent_id), + op: Some(proposal.apply_intent.clone()), + reason_code: None, + summary: format!( + "Derived proposal '{}' was created with review_state '{}'.", + proposal.proposal_kind, proposal.review_state + ), + details: serde_json::json!({ + "run_id": proposal.run_id, + "proposal_kind": proposal.proposal_kind, + "apply_intent": proposal.apply_intent, + "review_state": proposal.review_state, + "source_refs": proposal.source_refs, + "source_snapshot": proposal.source_snapshot, + "lineage": proposal.lineage, + "diff": proposal.diff, + "confidence": proposal.confidence, + "target_ref": proposal.target_ref, + "proposed_payload": proposal.proposed_payload, + }), + ts: proposal.created_at, + } +} + +fn proposal_review_history_event( + note_id: Uuid, + review: NoteProposalReviewRow, +) -> MemoryHistoryEvent { + let event_type = proposal_review_event_type(review.action.as_str()); + + MemoryHistoryEvent { + event_id: format!("consolidation_proposal_reviews:{}", review.review_id), + event_type: event_type.to_string(), + subject_type: "note".to_string(), + note_id, + source_table: "consolidation_proposal_reviews".to_string(), + source_id: Some(review.review_id), + related_note_version_id: None, + related_decision_id: None, + related_proposal_id: Some(review.proposal_id), + actor: Some(review.reviewer_agent_id), + op: Some(review.action.clone()), + reason_code: None, + summary: format!( + "Proposal review action '{}' moved '{}' from '{}' to '{}'.", + review.action, review.proposal_kind, review.from_review_state, review.to_review_state + ), + details: serde_json::json!({ + "proposal_id": review.proposal_id, + "run_id": review.run_id, + "proposal_kind": review.proposal_kind, + "apply_intent": review.apply_intent, + "from_review_state": review.from_review_state, + "to_review_state": review.to_review_state, + "review_comment": review.review_comment, + "diff": review.diff, + }), + ts: review.created_at, + } +} + +fn should_emit_decision_event(decision: &NoteProvenanceIngestDecision) -> bool { + if matches!(decision.note_op.as_str(), "NONE" | "REJECTED") { + return true; + } + + decision.note_version_id.is_none() +} + +fn version_event_type(op: &str, reason: &str) -> &'static str { + let reason = reason.to_ascii_lowercase(); + + match op { + "ADD" => "add", + "UPDATE" => "update", + "DELETE" if reason.contains("expire") => "expire", + "DELETE" => "delete", + "PUBLISH" | "UNPUBLISH" => "related", + "DEPRECATE" | "INVALIDATE" => "invalidated", + _ => "related", + } +} + +fn decision_event_type(decision: &NoteProvenanceIngestDecision) -> &'static str { + if decision.policy_decision == "reject" || decision.note_op == "REJECTED" { + return "reject"; + } + if decision.policy_decision == "ignore" || decision.note_op == "NONE" { + return "ignore"; + } + + match decision.note_op.as_str() { + "ADD" => "add", + "UPDATE" => "update", + "DELETE" => "delete", + _ => "related", + } +} + +fn proposal_review_event_type(action: &str) -> &'static str { + match action { + "apply" => "applied", + "discard" | "defer" => "invalidated", + "approve" => "related", + _ => "related", + } +} + +fn version_summary(event_type: &str, reason: &str) -> String { + match event_type { + "add" => format!("Note was added by {reason}."), + "update" => format!("Note was updated by {reason}."), + "delete" => format!("Note was deleted by {reason}."), + "expire" => format!("Note expired through {reason}."), + "invalidated" => format!("Note was invalidated by {reason}."), + _ => format!("Note recorded related transition {reason}."), + } +} + +fn decision_summary(event_type: &str, decision: &NoteProvenanceIngestDecision) -> String { + let reason = decision.reason_code.as_deref().unwrap_or("no_reason_code"); + + match event_type { + "ignore" => format!("Ingestion ignored candidate memory with {reason}."), + "reject" => format!("Ingestion rejected candidate memory with {reason}."), + _ => format!( + "Ingestion recorded {} decision for operation {}.", + decision.policy_decision, decision.note_op + ), + } +} + +async fn load_ingest_decisions( + pool: &PgPool, + req: &ValidatedNoteProvenanceRequest, +) -> Result<Vec<NoteProvenanceIngestDecision>> { + let rows: Vec<NoteIngestDecisionRow> = sqlx::query_as::<_, NoteIngestDecisionRow>( + "\ +SELECT + decision_id, + tenant_id, + project_id, + agent_id, + scope, + pipeline, + note_type, + note_key, + note_id, + note_version_id, + base_decision, + policy_decision, + note_op, + reason_code, + details, + ts +FROM memory_ingest_decisions +WHERE note_id = $1 AND tenant_id = $2 AND project_id = $3 +ORDER BY ts DESC +LIMIT $4", + ) + .bind(req.note_id) + .bind(&req.tenant_id) + .bind(&req.project_id) + .bind(NOTE_PROVENANCE_INGEST_DECISIONS_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(NoteProvenanceIngestDecision::from).collect()) +} + +async fn load_note_versions( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + note_id: Uuid, +) -> Result<Vec<NoteProvenanceNoteVersion>> { + let rows: Vec<NoteVersionRow> = sqlx::query_as::<_, NoteVersionRow>( + "\ +SELECT + memory_note_versions.version_id, + memory_note_versions.note_id, + memory_note_versions.op, + memory_note_versions.prev_snapshot, + memory_note_versions.new_snapshot, + memory_note_versions.reason, + memory_note_versions.actor, + memory_note_versions.ts +FROM memory_note_versions +JOIN memory_notes n ON n.note_id = memory_note_versions.note_id +WHERE memory_note_versions.note_id = $1 + AND n.tenant_id = $2 + AND n.project_id = $3 +ORDER BY memory_note_versions.ts DESC +LIMIT $4", + ) + .bind(note_id) + .bind(tenant_id) + .bind(project_id) + .bind(NOTE_PROVENANCE_NOTE_VERSIONS_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(NoteProvenanceNoteVersion::from).collect()) +} + +async fn load_indexing_outbox( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + note_id: Uuid, +) -> Result<Vec<NoteProvenanceIndexingOutbox>> { + let rows: Vec<NoteIndexingOutboxRow> = sqlx::query_as::<_, NoteIndexingOutboxRow>( + "\ +SELECT + indexing_outbox.outbox_id, + indexing_outbox.note_id, + indexing_outbox.op, + indexing_outbox.embedding_version, + indexing_outbox.status, + indexing_outbox.attempts, + indexing_outbox.last_error, + indexing_outbox.available_at, + indexing_outbox.created_at, + indexing_outbox.updated_at +FROM indexing_outbox +JOIN memory_notes n ON n.note_id = indexing_outbox.note_id +WHERE indexing_outbox.note_id = $1 + AND n.tenant_id = $2 + AND n.project_id = $3 +ORDER BY indexing_outbox.updated_at DESC +LIMIT $4", + ) + .bind(note_id) + .bind(tenant_id) + .bind(project_id) + .bind(NOTE_PROVENANCE_OUTBOX_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(NoteProvenanceIndexingOutbox::from).collect()) +} + +async fn load_recent_traces_for_note( + pool: &PgPool, + tenant_id: &str, + project_id: &str, + note_id: Uuid, +) -> Result<Vec<NoteProvenanceRecentTrace>> { + let rows: Vec<NoteRecentTraceRow> = sqlx::query_as::<_, NoteRecentTraceRow>( + "\ +SELECT + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + created_at +FROM search_traces +WHERE tenant_id = $1 + AND project_id = $2 + AND trace_id IN (SELECT DISTINCT trace_id FROM search_trace_items WHERE note_id = $3) +ORDER BY created_at DESC, trace_id DESC +LIMIT $4", + ) + .bind(tenant_id) + .bind(project_id) + .bind(note_id) + .bind(NOTE_PROVENANCE_RECENT_TRACES_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows.into_iter().map(to_recent_trace).collect()) +} + +async fn load_memory_history_events( + pool: &PgPool, + req: &ValidatedNoteProvenanceRequest, + note: &MemoryNote, +) -> Result<Vec<MemoryHistoryEvent>> { + let decisions = load_ingest_decisions(pool, req).await?; + let versions = load_note_versions(pool, &req.tenant_id, &req.project_id, req.note_id).await?; + let proposal_ref = serde_json::json!([{ "kind": "note", "id": req.note_id }]); + let proposals = load_derived_proposals_for_note(pool, req, &proposal_ref).await?; + let reviews = load_proposal_reviews_for_note(pool, req, &proposal_ref).await?; + let mut decision_by_version = HashMap::new(); + + for decision in &decisions { + if let Some(version_id) = decision.note_version_id { + decision_by_version.insert(version_id, decision); + } + } + + let mut events = Vec::new(); + + for version in &versions { + events.push(version_history_event(version, decision_by_version.get(&version.version_id))); + } + for decision in &decisions { + if should_emit_decision_event(decision) { + events.push(decision_history_event(req.note_id, decision)); + } + } + + if let Some(expires_at) = note.expires_at + && expires_at <= OffsetDateTime::now_utc() + && !events.iter().any(|event| event.event_type == "expire") + { + events.push(expire_history_event(note, expires_at)); + } + + for proposal in proposals { + events.push(derived_proposal_history_event(req.note_id, proposal)); + } + for review in reviews { + events.push(proposal_review_history_event(req.note_id, review)); + } + + events.sort_by(|left, right| { + left.ts.cmp(&right.ts).then_with(|| left.event_id.cmp(&right.event_id)) + }); + + let history_limit = NOTE_PROVENANCE_HISTORY_LIMIT as usize; + + if events.len() > history_limit { + let drop_count = events.len() - history_limit; + + events.drain(0..drop_count); + } + + Ok(events) +} + +async fn load_derived_proposals_for_note( + pool: &PgPool, + req: &ValidatedNoteProvenanceRequest, + proposal_ref: &Value, +) -> Result<Vec<NoteDerivedProposalRow>> { + let rows = sqlx::query_as::<_, NoteDerivedProposalRow>( + "\ +SELECT + proposal_id, + run_id, + agent_id, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + COALESCE(target_ref, '{}'::jsonb) AS target_ref, + COALESCE(proposed_payload, '{}'::jsonb) AS proposed_payload, + created_at +FROM consolidation_proposals +WHERE tenant_id = $1 + AND project_id = $2 + AND source_refs @> $3 +ORDER BY created_at DESC, proposal_id DESC +LIMIT $4", + ) + .bind(&req.tenant_id) + .bind(&req.project_id) + .bind(proposal_ref) + .bind(NOTE_PROVENANCE_HISTORY_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows) +} + +async fn load_proposal_reviews_for_note( + pool: &PgPool, + req: &ValidatedNoteProvenanceRequest, + proposal_ref: &Value, +) -> Result<Vec<NoteProposalReviewRow>> { + let rows = sqlx::query_as::<_, NoteProposalReviewRow>( + "\ +SELECT + reviews.review_id, + reviews.proposal_id, + reviews.run_id, + reviews.reviewer_agent_id, + reviews.action, + reviews.from_review_state, + reviews.to_review_state, + reviews.review_comment, + reviews.created_at, + proposals.proposal_kind, + proposals.apply_intent, + proposals.diff +FROM consolidation_proposal_reviews reviews +JOIN consolidation_proposals proposals + ON proposals.proposal_id = reviews.proposal_id +WHERE reviews.tenant_id = $1 + AND reviews.project_id = $2 + AND proposals.source_refs @> $3 +ORDER BY reviews.created_at DESC, reviews.review_id DESC +LIMIT $4", + ) + .bind(&req.tenant_id) + .bind(&req.project_id) + .bind(proposal_ref) + .bind(NOTE_PROVENANCE_HISTORY_LIMIT) + .fetch_all(pool) + .await?; + + Ok(rows) +} + +#[cfg(test)] +mod tests { + use uuid::Uuid; + + use crate::provenance::{self, Error, NoteProvenanceGetRequest}; + + #[test] + fn normalize_note_provenance_request_trims_ids() { + let request = NoteProvenanceGetRequest { + tenant_id: " tenant-a ".to_string(), + project_id: " project-a\n".to_string(), + note_id: Uuid::new_v4(), + }; + let result = + provenance::validate_note_provenance_request(request).expect("expected valid request"); + + assert_eq!(result.tenant_id, "tenant-a"); + assert_eq!(result.project_id, "project-a"); + } + + #[test] + fn note_provenance_request_requires_tenant_and_project() { + let missing_tenant = NoteProvenanceGetRequest { + tenant_id: " ".to_string(), + project_id: "project-a".to_string(), + note_id: Uuid::new_v4(), + }; + let empty_project = NoteProvenanceGetRequest { + tenant_id: "tenant-a".to_string(), + project_id: " ".to_string(), + note_id: Uuid::new_v4(), + }; + let first = provenance::validate_note_provenance_request(missing_tenant) + .expect_err("expected tenant validation error"); + let second = provenance::validate_note_provenance_request(empty_project) + .expect_err("expected project validation error"); + + match first { + Error::InvalidRequest { message } => { + assert!(message.contains("tenant_id")); + }, + _ => panic!("tenant validation should produce InvalidRequest"), + } + match second { + Error::InvalidRequest { message } => { + assert!(message.contains("tenant_id") || message.contains("project_id")); + }, + _ => panic!("project validation should produce InvalidRequest"), + } + } +} diff --git a/packages/elf-service/src/ranking_explain_v2.rs b/packages/elf-service/src/ranking_explain_v2.rs new file mode 100644 index 00000000..d991a694 --- /dev/null +++ b/packages/elf-service/src/ranking_explain_v2.rs @@ -0,0 +1,231 @@ +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use elf_config::Config; + +/// Schema identifier for ranking explanations returned by the search service. +pub const SEARCH_RANKING_EXPLAIN_SCHEMA_V2: &str = "search_ranking_explain/v2"; + +/// One named term that contributed to a ranking score. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchRankingTerm { + /// Stable term identifier. + pub name: String, + /// Numeric contribution for the term. + pub value: f32, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional raw inputs used to compute the term. + pub inputs: Option<BTreeMap<String, Value>>, +} + +/// Full ranking explanation for one search result. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchRankingExplain { + /// Explanation schema identifier. + pub schema: String, + /// Ranking-policy fingerprint used to compute the score. + pub policy_id: String, + /// Final blended score. + pub final_score: f32, + /// Individual score terms. + pub terms: Vec<SearchRankingTerm>, +} + +/// Arguments used to build per-term ranking explanations for a trace item. +pub struct TraceTermsArgs<'a> { + /// Service configuration snapshot. + pub cfg: &'a Config, + /// Whether blend ranking was enabled. + pub blend_enabled: bool, + /// Retrieval-score normalization label. + pub retrieval_normalization: &'a str, + /// Rerank-score normalization label. + pub rerank_normalization: &'a str, + /// Retrieval weight chosen by the blend policy. + pub blend_retrieval_weight: f32, + /// 1-based retrieval rank. + pub retrieval_rank: u32, + /// Normalized retrieval score. + pub retrieval_norm: f32, + /// Final retrieval contribution term. + pub retrieval_term: f32, + /// Raw rerank model score. + pub rerank_score: f32, + /// 1-based rerank rank. + pub rerank_rank: u32, + /// Normalized rerank score. + pub rerank_norm: f32, + /// Final rerank contribution term. + pub rerank_term: f32, + /// Tie-breaker contribution. + pub tie_breaker_score: f32, + /// Item importance score. + pub importance: f32, + /// Item age in days. + pub age_days: f32, + /// Item scope key. + pub scope: &'a str, + /// Scope-context boost contribution. + pub scope_context_boost: f32, + /// Lexical overlap ratio used by deterministic ranking. + pub deterministic_lexical_overlap_ratio: f32, + /// Deterministic lexical bonus contribution. + pub deterministic_lexical_bonus: f32, + /// Historical hit count. + pub deterministic_hit_count: i64, + /// Age of the last hit in days, when known. + pub deterministic_last_hit_age_days: Option<f32>, + /// Deterministic hit boost contribution. + pub deterministic_hit_boost: f32, + /// Deterministic decay penalty contribution. + pub deterministic_decay_penalty: f32, +} + +/// Removes raw inputs from ranking terms while keeping names and values. +pub fn strip_term_inputs(terms: &[SearchRankingTerm]) -> Vec<SearchRankingTerm> { + terms + .iter() + .map(|term| SearchRankingTerm { name: term.name.clone(), value: term.value, inputs: None }) + .collect() +} + +/// Builds the term list used by `SEARCH_RANKING_EXPLAIN_SCHEMA_V2`. +pub fn build_trace_terms_v2(args: TraceTermsArgs<'_>) -> Vec<SearchRankingTerm> { + let cfg = args.cfg; + let blend_enabled = args.blend_enabled; + let mut terms = Vec::new(); + let mut blend_retrieval_inputs = BTreeMap::new(); + + blend_retrieval_inputs.insert("enabled".to_string(), serde_json::json!(blend_enabled)); + blend_retrieval_inputs + .insert("retrieval_rank".to_string(), serde_json::json!(args.retrieval_rank)); + blend_retrieval_inputs + .insert("retrieval_norm".to_string(), serde_json::json!(args.retrieval_norm)); + blend_retrieval_inputs.insert( + "retrieval_normalization".to_string(), + serde_json::json!(args.retrieval_normalization), + ); + blend_retrieval_inputs.insert( + "blend_retrieval_weight".to_string(), + serde_json::json!(args.blend_retrieval_weight), + ); + terms.push(SearchRankingTerm { + name: "blend.retrieval".to_string(), + value: args.retrieval_term, + inputs: Some(blend_retrieval_inputs), + }); + + let mut blend_rerank_inputs = BTreeMap::new(); + + blend_rerank_inputs.insert("enabled".to_string(), serde_json::json!(blend_enabled)); + blend_rerank_inputs.insert("rerank_score".to_string(), serde_json::json!(args.rerank_score)); + blend_rerank_inputs.insert("rerank_rank".to_string(), serde_json::json!(args.rerank_rank)); + blend_rerank_inputs.insert("rerank_norm".to_string(), serde_json::json!(args.rerank_norm)); + blend_rerank_inputs + .insert("rerank_normalization".to_string(), serde_json::json!(args.rerank_normalization)); + blend_rerank_inputs.insert( + "blend_retrieval_weight".to_string(), + serde_json::json!(args.blend_retrieval_weight), + ); + terms.push(SearchRankingTerm { + name: "blend.rerank".to_string(), + value: args.rerank_term, + inputs: Some(blend_rerank_inputs), + }); + + let recency_decay = if cfg.ranking.recency_tau_days > 0.0 { + (-args.age_days / cfg.ranking.recency_tau_days).exp() + } else { + 1.0 + }; + let mut tie_breaker_inputs = BTreeMap::new(); + + tie_breaker_inputs.insert( + "tie_breaker_weight".to_string(), + serde_json::json!(cfg.ranking.tie_breaker_weight), + ); + tie_breaker_inputs.insert("importance".to_string(), serde_json::json!(args.importance)); + tie_breaker_inputs.insert("age_days".to_string(), serde_json::json!(args.age_days)); + tie_breaker_inputs + .insert("recency_tau_days".to_string(), serde_json::json!(cfg.ranking.recency_tau_days)); + tie_breaker_inputs.insert("recency_decay".to_string(), serde_json::json!(recency_decay)); + terms.push(SearchRankingTerm { + name: "tie_breaker".to_string(), + value: args.tie_breaker_score, + inputs: Some(tie_breaker_inputs), + }); + + let mut scope_boost_inputs = BTreeMap::new(); + + scope_boost_inputs.insert("scope".to_string(), serde_json::json!(args.scope)); + scope_boost_inputs.insert( + "scope_boost_weight".to_string(), + serde_json::json!(cfg.context.as_ref().and_then(|ctx| ctx.scope_boost_weight)), + ); + terms.push(SearchRankingTerm { + name: "context.scope_boost".to_string(), + value: args.scope_context_boost, + inputs: Some(scope_boost_inputs), + }); + + push_deterministic_terms(&mut terms, cfg, &args); + + terms +} + +fn push_deterministic_terms( + terms: &mut Vec<SearchRankingTerm>, + cfg: &Config, + args: &TraceTermsArgs<'_>, +) { + let det = &cfg.ranking.deterministic; + let mut lex_inputs = BTreeMap::new(); + + lex_inputs.insert("enabled".to_string(), serde_json::json!(det.enabled && det.lexical.enabled)); + lex_inputs.insert("weight".to_string(), serde_json::json!(det.lexical.weight)); + lex_inputs.insert("min_ratio".to_string(), serde_json::json!(det.lexical.min_ratio)); + lex_inputs + .insert("max_query_terms".to_string(), serde_json::json!(det.lexical.max_query_terms)); + lex_inputs.insert("max_text_terms".to_string(), serde_json::json!(det.lexical.max_text_terms)); + lex_inputs.insert( + "overlap_ratio".to_string(), + serde_json::json!(args.deterministic_lexical_overlap_ratio), + ); + terms.push(SearchRankingTerm { + name: "deterministic.lexical_bonus".to_string(), + value: args.deterministic_lexical_bonus, + inputs: Some(lex_inputs), + }); + + let mut hits_inputs = BTreeMap::new(); + + hits_inputs.insert("enabled".to_string(), serde_json::json!(det.enabled && det.hits.enabled)); + hits_inputs.insert("weight".to_string(), serde_json::json!(det.hits.weight)); + hits_inputs.insert("half_saturation".to_string(), serde_json::json!(det.hits.half_saturation)); + hits_inputs + .insert("last_hit_tau_days".to_string(), serde_json::json!(det.hits.last_hit_tau_days)); + hits_inputs.insert("hit_count".to_string(), serde_json::json!(args.deterministic_hit_count)); + hits_inputs.insert( + "last_hit_age_days".to_string(), + serde_json::json!(args.deterministic_last_hit_age_days), + ); + terms.push(SearchRankingTerm { + name: "deterministic.hit_boost".to_string(), + value: args.deterministic_hit_boost, + inputs: Some(hits_inputs), + }); + + let mut decay_inputs = BTreeMap::new(); + + decay_inputs.insert("enabled".to_string(), serde_json::json!(det.enabled && det.decay.enabled)); + decay_inputs.insert("weight".to_string(), serde_json::json!(det.decay.weight)); + decay_inputs.insert("tau_days".to_string(), serde_json::json!(det.decay.tau_days)); + decay_inputs.insert("age_days".to_string(), serde_json::json!(args.age_days)); + terms.push(SearchRankingTerm { + name: "deterministic.decay_penalty".to_string(), + value: args.deterministic_decay_penalty, + inputs: Some(decay_inputs), + }); +} diff --git a/packages/elf-service/src/search.rs b/packages/elf-service/src/search.rs index 923672db..efbbccb3 100644 --- a/packages/elf-service/src/search.rs +++ b/packages/elf-service/src/search.rs @@ -1,193 +1,1176 @@ -// std +//! Search APIs and ranking explanations. + +mod filter; +mod ranking; + +pub use crate::ranking_explain_v2::{SearchRankingExplain, SearchRankingTerm}; + use std::{ - collections::{HashMap, HashSet, hash_map::DefaultHasher}, - hash::{Hash, Hasher}, + cmp::Ordering, + collections::{BTreeMap, HashMap, HashSet, VecDeque}, slice, }; -// crates.io use qdrant_client::qdrant::{ Condition, Document, Filter, Fusion, MinShould, PrefetchQueryBuilder, Query, - QueryPointsBuilder, ScoredPoint, Value, point_id::PointIdOptions, value::Kind, + QueryPointsBuilder, ScoredPoint, }; -use serde::de::DeserializeOwned; -use sqlx::{QueryBuilder, Row}; +use serde::{Deserialize, Deserializer, Serialize, Serializer, de}; +use serde_json::Value; +use sqlx::{FromRow, PgConnection, PgExecutor, PgPool, QueryBuilder, Row}; use time::{Duration, OffsetDateTime}; use uuid::Uuid; -// self -use elf_domain::cjk; +use crate::{ + ElfService, Result, + access::{self, ORG_PROJECT_ID}, + graph::RelationTemporalStatus, + ranking_explain_v2::{self, SEARCH_RANKING_EXPLAIN_SCHEMA_V2, TraceTermsArgs}, +}; +use elf_config::{Config, SearchCache}; +use elf_domain::english_gate; use elf_storage::{ models::MemoryNote, qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}, }; +use filter::{SearchFilter, SearchFilterImpact}; +use ranking::{ + NormalizationKind, ResolvedBlendPolicy, ResolvedDiversityPolicy, ResolvedRetrievalSourcesPolicy, +}; -use crate::{ElfService, ServiceError, ServiceResult}; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +const TRACE_VERSION: i32 = 3; +const MAX_MATCHED_TERMS: usize = 8; +const MAX_TRAJECTORY_STAGE_ITEMS: usize = 256; +const MAX_CANDIDATE_K: u32 = 1_024; +const QUERY_PLAN_SCHEMA: &str = "elf.search.query_plan"; +const QUERY_PLAN_VERSION: &str = "v1"; +const SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1: &str = "search_retrieval_trajectory/v1"; +const SEARCH_FILTER_IMPACT_SCHEMA_V1: &str = "search_filter_impact/v1"; +const RECENT_TRACES_SCHEMA_V1: &str = "elf.recent_traces/v1"; +const TRACE_BUNDLE_SCHEMA_V1: &str = "elf.trace_bundle/v1"; +const MAX_RECENT_TRACES_LIMIT: u32 = 200; +const DEFAULT_RECENT_TRACES_LIMIT: u32 = 50; +const DEFAULT_BOUNDED_STAGE_ITEMS_LIMIT: u32 = 64; +const DEFAULT_FULL_STAGE_ITEMS_LIMIT: u32 = 256; +const DEFAULT_BOUNDED_CANDIDATES_LIMIT: u32 = 0; +const DEFAULT_FULL_CANDIDATES_LIMIT: u32 = 200; +const MAX_TRACE_BUNDLE_ITEMS_LIMIT: u32 = 256; +const MAX_TRACE_BUNDLE_CANDIDATES_LIMIT: u32 = 1_000; +const RELATION_CONTEXT_SQL: &str = r#" +WITH selected_facts AS ( + SELECT DISTINCT ON (snc.selected_note_id, gf.fact_id) + snc.selected_note_id, + gf.fact_id, + gf.scope, + subject_entity.canonical AS subject_canonical, + subject_entity.kind AS subject_kind, + gf.predicate, + gf.object_entity_id, + object_entity.canonical AS object_canonical, + object_entity.kind AS object_kind, + gf.object_value, + gf.valid_from, + gf.valid_to, + (gf.valid_from <= $4 AND (gf.valid_to IS NULL OR gf.valid_to > $4)) AS is_current + FROM unnest($7::uuid[]) AS snc(selected_note_id) + JOIN graph_fact_evidence gfe + ON gfe.note_id = snc.selected_note_id + JOIN graph_facts gf + ON gf.fact_id = gfe.fact_id + JOIN graph_entities subject_entity + ON subject_entity.entity_id = gf.subject_entity_id + AND subject_entity.tenant_id = $1 + AND subject_entity.project_id = $2 + LEFT JOIN graph_entities object_entity + ON object_entity.entity_id = gf.object_entity_id + AND object_entity.tenant_id = $1 + AND object_entity.project_id = $2 + WHERE gf.tenant_id = $1 + AND gf.project_id = $2 + AND ( + ($5 AND gf.scope = 'agent_private' AND gf.agent_id = $3) + OR gf.scope = ANY($6::text[]) + ) + AND gf.valid_from <= $4 + ORDER BY + snc.selected_note_id, + gf.fact_id, + (gf.valid_from <= $4 AND (gf.valid_to IS NULL OR gf.valid_to > $4)) DESC, + gf.valid_from DESC, + gf.fact_id ASC +), +ranked_facts AS ( + SELECT + selected_note_id, + fact_id, + scope, + subject_canonical, + subject_kind, + predicate, + object_entity_id, + object_canonical, + object_kind, + object_value, + valid_from, + valid_to, + is_current, + ROW_NUMBER() OVER ( + PARTITION BY selected_note_id + ORDER BY is_current DESC, valid_from DESC, fact_id ASC + ) AS fact_rank + FROM selected_facts +), +bounded_facts AS ( + SELECT + selected_note_id, + fact_id, + scope, + subject_canonical, + subject_kind, + predicate, + object_entity_id, + object_canonical, + object_kind, + object_value, + valid_from, + valid_to, + is_current, + fact_rank + FROM ranked_facts + WHERE fact_rank <= $9 +), +evidence_ranked AS ( + SELECT + bf.selected_note_id, + bf.fact_id, + bf.scope, + bf.subject_canonical, + bf.subject_kind, + bf.predicate, + bf.object_entity_id, + bf.object_canonical, + bf.object_kind, + bf.object_value, + bf.valid_from, + bf.valid_to, + bf.is_current, + bf.fact_rank, + e.note_id AS evidence_note_id, + e.created_at AS evidence_created_at, + ROW_NUMBER() OVER ( + PARTITION BY bf.selected_note_id, bf.fact_id + ORDER BY e.created_at ASC, e.note_id ASC + ) AS evidence_rank + FROM bounded_facts bf + JOIN graph_fact_evidence e + ON e.fact_id = bf.fact_id +), +fact_contexts AS ( + SELECT + selected_note_id, + fact_id, + scope, + subject_canonical, + subject_kind, + predicate, + object_entity_id, + object_canonical, + object_kind, + object_value, + valid_from, + valid_to, + is_current, + fact_rank, + ARRAY_AGG(evidence_note_id ORDER BY evidence_created_at ASC, evidence_note_id ASC) AS evidence_note_ids + FROM evidence_ranked + WHERE evidence_rank <= $8 + GROUP BY + selected_note_id, + fact_id, + scope, + subject_canonical, + subject_kind, + predicate, + object_entity_id, + object_canonical, + object_kind, + object_value, + valid_from, + valid_to, + is_current, + fact_rank +) +SELECT + selected_note_id AS note_id, + fact_id, + scope, + subject_canonical, + subject_kind, + predicate, + object_entity_id, + object_canonical, + object_kind, + object_value, + valid_from, + valid_to, + is_current, + evidence_note_ids +FROM fact_contexts +ORDER BY note_id, fact_rank +"#; + +/// Request payload for search APIs. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchRequest { + /// Tenant to search within. pub tenant_id: String, + /// Project to search within. pub project_id: String, + /// Agent requesting the search. pub agent_id: String, + /// Optional auth token identifier used for role checks. + pub token_id: Option<String>, + #[serde(default)] + /// Requested payload-detail level. + pub payload_level: PayloadLevel, + /// Read profile that determines visible scopes. pub read_profile: String, + /// Search query text. pub query: String, + /// Requested number of returned items. pub top_k: Option<u32>, + /// Retrieval breadth before ranking and projection. pub candidate_k: Option<u32>, + + /// Optional structured filter expression. + pub filter: Option<Value>, + /// When true, records note-hit metrics for returned items. pub record_hits: Option<bool>, + /// Optional ranking-policy overrides. + pub ranking: Option<RankingRequestOverride>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct SearchBoost { - pub name: String, - pub score: f32, +/// Ranking override bundle supplied on a search request. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct RankingRequestOverride { + /// Blend-ranking override. + pub blend: Option<BlendRankingOverride>, + /// Diversity-ranking override. + pub diversity: Option<DiversityRankingOverride>, + /// Retrieval-source weighting override. + pub retrieval_sources: Option<RetrievalSourcesRankingOverride>, +} + +/// Blend-ranking override supplied on a search request. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct BlendRankingOverride { + /// Enables or disables blend ranking. + pub enabled: Option<bool>, + /// Override for rerank-score normalization. + pub rerank_normalization: Option<String>, + /// Override for retrieval-score normalization. + pub retrieval_normalization: Option<String>, + /// Override for blend segments. + pub segments: Option<Vec<BlendSegmentOverride>>, +} + +/// One blend segment override. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct BlendSegmentOverride { + /// Highest retrieval rank covered by the segment. + pub max_retrieval_rank: u32, + /// Retrieval weight applied within the segment. + pub retrieval_weight: f32, +} + +/// Diversity-ranking override supplied on a search request. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DiversityRankingOverride { + /// Enables or disables diversity selection. + pub enabled: Option<bool>, + /// Similarity threshold for duplicate suppression. + pub sim_threshold: Option<f32>, + /// MMR lambda value. + pub mmr_lambda: Option<f32>, + /// Maximum number of candidates to skip while selecting diverse results. + pub max_skips: Option<u32>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Retrieval-source weighting override supplied on a search request. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct RetrievalSourcesRankingOverride { + /// Weight for fusion retrieval. + pub fusion_weight: Option<f32>, + /// Weight for structured-field retrieval. + pub structured_field_weight: Option<f32>, + /// Priority for fusion retrieval. + pub fusion_priority: Option<u32>, + /// Priority for structured-field retrieval. + pub structured_field_priority: Option<u32>, + /// Weight for recursive retrieval. + pub recursive_weight: Option<f32>, + /// Priority for recursive retrieval. + pub recursive_priority: Option<u32>, +} + +/// Full explanation attached to one search item. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchExplain { - pub retrieval_score: Option<f32>, - pub retrieval_rank: Option<u32>, - pub rerank_score: f32, - pub tie_breaker_score: f32, - pub final_score: f32, - pub boosts: Vec<SearchBoost>, + /// Match-specific explanation. + pub r#match: SearchMatchExplain, + /// Ranking-term explanation. + pub ranking: SearchRankingExplain, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional relation-context snippets supporting the match. + pub relation_context: Option<Vec<SearchExplainRelationContext>>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional diversity-selection explanation. + pub diversity: Option<SearchDiversityExplain>, +} + +/// Relation-context row attached to a search explanation. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainRelationContext { + /// Fact identifier. + pub fact_id: Uuid, + /// Scope key for the fact. + pub scope: String, + /// Subject entity reference. + pub subject: SearchExplainRelationEntityRef, + /// Predicate surface. + pub predicate: String, + /// Object payload. + pub object: SearchExplainRelationContextObject, + #[serde(with = "crate::time_serde")] + /// Start of the fact validity window. + pub valid_from: OffsetDateTime, + #[serde(with = "crate::time_serde::option")] + /// End of the fact validity window, if superseded. + pub valid_to: Option<OffsetDateTime>, + #[serde(default)] + /// Temporal state for the fact relative to the search read timestamp. + pub temporal_status: RelationTemporalStatus, + #[serde(default)] + /// Evidence note identifiers supporting the fact. + pub evidence_note_ids: Vec<Uuid>, +} + +/// Lightweight entity reference used in search explanations. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainRelationEntityRef { + #[serde(skip_serializing_if = "Option::is_none")] + /// Canonical entity surface. + pub canonical: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional entity kind. + pub kind: Option<String>, +} + +/// Object payload used in search explanation relation context. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainRelationContextObject { + #[serde(skip_serializing_if = "Option::is_none")] + /// Entity-shaped object value. + pub entity: Option<SearchExplainRelationEntityRef>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Scalar object value. + pub value: Option<String>, +} + +/// Match-level explanation for a search item. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchMatchExplain { + /// Query terms matched by the item. pub matched_terms: Vec<String>, + /// Fields that supplied the matches. pub matched_fields: Vec<String>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Diversity-selection explanation for a search item. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchDiversityExplain { + /// Whether diversity ranking was enabled. + pub enabled: bool, + /// Reason the item was selected. + pub selected_reason: String, + #[serde(skip_serializing_if = "Option::is_none")] + /// Reason the item was skipped, when applicable. + pub skipped_reason: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Nearest already selected note that influenced the decision. + pub nearest_selected_note_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Similarity to the nearest selected note. + pub similarity: Option<f32>, + #[serde(skip_serializing_if = "Option::is_none")] + /// MMR score used by diversity selection. + pub mmr_score: Option<f32>, + #[serde(default)] + /// Whether the item lacked an embedding needed for diversity scoring. + pub missing_embedding: bool, +} + +/// One ranked search result item. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchItem { + /// Stable result-handle identifier for explain APIs. pub result_handle: Uuid, + /// Note identifier. pub note_id: Uuid, + /// Chunk identifier. pub chunk_id: Uuid, + /// Zero-based chunk position. pub chunk_index: i32, + /// Inclusive start byte offset of the snippet chunk. pub start_offset: i32, + /// Exclusive end byte offset of the snippet chunk. pub end_offset: i32, + /// Returned snippet text. pub snippet: String, - #[serde(rename = "type")] - pub note_type: String, + /// Note type discriminator. + pub r#type: String, + /// Optional application-defined key. pub key: Option<String>, + /// Scope key for the note. pub scope: String, + /// Importance score. pub importance: f32, + /// Confidence score. pub confidence: f32, #[serde(with = "crate::time_serde")] + /// Last update timestamp. pub updated_at: OffsetDateTime, #[serde(with = "crate::time_serde::option")] + /// Optional expiry timestamp. pub expires_at: Option<OffsetDateTime>, + /// Final ranked score. pub final_score: f32, - pub source_ref: serde_json::Value, + /// Structured source reference metadata. + pub source_ref: Value, + /// Item-level explanation payload. pub explain: SearchExplain, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for raw search results. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchResponse { + /// Search trace identifier. + pub trace_id: Uuid, + /// Ranked search items. + pub items: Vec<SearchItem>, + /// Optional condensed explain output. + pub trajectory_summary: Option<SearchTrajectorySummary>, +} + +/// Planned-search variant of the raw search response. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchRawPlannedResponse { + /// Search trace identifier. pub trace_id: Uuid, + /// Ranked search items. pub items: Vec<SearchItem>, + /// Optional condensed explain output. + pub trajectory_summary: Option<SearchTrajectorySummary>, + /// Query plan used for the search. + pub query_plan: QueryPlan, +} + +/// Query plan emitted by planned search. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlan { + /// Query-plan schema identifier. + pub schema: String, + /// Query-plan version string. + pub version: String, + /// Ordered planning stages. + pub stages: Vec<QueryPlanStage>, + /// Request intent snapshot. + pub intent: QueryPlanIntent, + /// Query rewrite output. + pub rewrite: QueryPlanRewrite, + /// Retrieval-stage plan. + pub retrieval_stages: Vec<QueryPlanRetrievalStage>, + /// Fusion-policy snapshot. + pub fusion_policy: QueryPlanFusionPolicy, + /// Rerank-policy snapshot. + pub rerank_policy: QueryPlanRerankPolicy, + /// Budget snapshot. + pub budget: QueryPlanBudget, +} + +/// One stage in a query plan. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanStage { + /// Stage name. + pub name: String, + /// Free-form stage details. + pub details: Value, +} + +/// Request intent captured in a query plan. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanIntent { + /// Original search query text. + pub query: String, + /// Tenant to search within. + pub tenant_id: String, + /// Project to search within. + pub project_id: String, + /// Agent requesting the search. + pub agent_id: String, + /// Read profile used for the search. + pub read_profile: String, + /// Scopes allowed by the read profile. + pub allowed_scopes: Vec<String>, +} + +/// Rewrite section of a query plan. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanRewrite { + /// Expansion mode label. + pub expansion_mode: String, + /// Expanded query strings. + pub expanded_queries: Vec<String>, + /// Dynamic-gate summary. + pub dynamic_gate: QueryPlanDynamicGate, +} + +/// Dynamic-query-expansion gate summary. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanDynamicGate { + /// Whether the dynamic gate was considered. + pub considered: bool, + /// Whether the dynamic gate decided to expand. + pub should_expand: Option<bool>, + /// Candidate count observed by the gate. + pub observed_candidates: Option<u32>, + /// Top score observed by the gate. + pub observed_top_score: Option<f32>, + /// Minimum candidates threshold. + pub min_candidates: u32, + /// Minimum top-score threshold. + pub min_top_score: f32, +} + +/// Retrieval-stage entry in a query plan. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanRetrievalStage { + /// Stage name. + pub name: String, + /// Retrieval source label. + pub source: String, + /// Whether the stage is enabled. + pub enabled: bool, + /// Candidate limit for the stage. + pub candidate_limit: u32, +} + +/// Fusion-policy snapshot used during search. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanFusionPolicy { + /// Fusion strategy label. + pub strategy: String, + /// Weight for fusion retrieval. + pub fusion_weight: f32, + /// Weight for structured-field retrieval. + pub structured_field_weight: f32, + /// Weight for recursive retrieval. + pub recursive_weight: f32, + /// Priority for fusion retrieval. + pub fusion_priority: u32, + /// Priority for structured-field retrieval. + pub structured_field_priority: u32, + /// Priority for recursive retrieval. + pub recursive_priority: u32, +} + +/// One blend segment in the rerank policy. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanBlendSegment { + /// Highest retrieval rank covered by the segment. + pub max_retrieval_rank: u32, + /// Retrieval weight applied within the segment. + pub retrieval_weight: f32, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Rerank-policy snapshot used during search. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanRerankPolicy { + /// Provider identifier. + pub provider_id: String, + /// Model identifier. + pub model: String, + /// Whether blend ranking was enabled. + pub blend_enabled: bool, + /// Rerank normalization label. + pub rerank_normalization: String, + /// Retrieval normalization label. + pub retrieval_normalization: String, + /// Blend segments used by the policy. + pub blend_segments: Vec<QueryPlanBlendSegment>, + /// Whether diversity ranking was enabled. + pub diversity_enabled: bool, + /// Diversity similarity threshold. + pub diversity_sim_threshold: f32, + /// Diversity MMR lambda. + pub diversity_mmr_lambda: f32, + /// Diversity max-skips limit. + pub diversity_max_skips: u32, +} + +/// Budget snapshot used during search. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct QueryPlanBudget { + /// Final top-k budget. + pub top_k: u32, + /// Candidate-k budget. + pub candidate_k: u32, + /// Prefilter candidate cap. + pub prefilter_max_candidates: u32, + /// Query-expansion cap. + pub expansion_max_queries: u32, + /// Whether ranking caches were enabled. + pub cache_enabled: bool, +} + +/// Request payload for loading one item-level explanation. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchExplainRequest { + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent requesting the explain payload. + pub agent_id: String, + /// Result-handle identifier returned by search. pub result_handle: Uuid, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Search trace metadata persisted for one search run. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchTrace { + /// Search trace identifier. pub trace_id: Uuid, + /// Tenant that owns the trace. pub tenant_id: String, + /// Project that owns the trace. pub project_id: String, + /// Agent that ran the search. pub agent_id: String, + /// Read profile used for the search. pub read_profile: String, + /// Search query text. pub query: String, + /// Expansion mode label. pub expansion_mode: String, + /// Expanded query strings. pub expanded_queries: Vec<String>, + /// Scopes allowed by the read profile. pub allowed_scopes: Vec<String>, + /// Candidate count observed by the search. pub candidate_count: u32, + /// Top-k budget used by the search. pub top_k: u32, - pub config_snapshot: serde_json::Value, + /// Config snapshot captured for the trace. + pub config_snapshot: Value, #[serde(with = "crate::time_serde")] + /// Trace creation timestamp. pub created_at: OffsetDateTime, + /// Trace schema version. pub trace_version: i32, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Condensed search-trajectory explanation. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTrajectorySummary { + /// Summary schema identifier. + pub schema: String, + /// Ordered summary stages. + pub stages: Vec<SearchTrajectorySummaryStage>, +} + +/// One stage in a condensed search trajectory. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTrajectorySummaryStage { + /// Zero-based stage order. + pub stage_order: u32, + /// Stable stage name. + pub stage_name: String, + /// Number of items after the stage. + pub item_count: u32, + /// Free-form stage statistics. + pub stats: Value, +} + +/// One full search-trajectory stage. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTrajectoryStage { + /// Zero-based stage order. + pub stage_order: u32, + /// Stable stage name. + pub stage_name: String, + /// Stage-level payload. + pub stage_payload: Value, + /// Item rows for the stage. + pub items: Vec<SearchTrajectoryStageItem>, +} + +/// One item row inside a search-trajectory stage. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTrajectoryStageItem { + /// Stage-item identifier, when persisted. + pub item_id: Option<Uuid>, + /// Note identifier, when applicable. + pub note_id: Option<Uuid>, + /// Chunk identifier, when applicable. + pub chunk_id: Option<Uuid>, + /// Free-form per-item metrics. + pub metrics: Value, +} + +/// Full search-trajectory response. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchTrajectoryResponse { + /// Trace metadata. + pub trace: SearchTrace, + /// Condensed trajectory summary. + pub trajectory: SearchTrajectorySummary, + /// Full trajectory stages. + pub stages: Vec<SearchTrajectoryStage>, +} + +/// Item-level explain trajectory. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainTrajectory { + /// Trajectory schema identifier. + pub schema: String, + /// Ordered explain stages. + pub stages: Vec<SearchExplainTrajectoryStage>, +} + +/// One stage in an item-level explain trajectory. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainTrajectoryStage { + /// Zero-based stage order. + pub stage_order: u32, + /// Stable stage name. + pub stage_name: String, + /// Stage-level payload. + pub stage_payload: Value, + /// Per-item metrics. + pub metrics: Value, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional match information for the selected item. + pub match_info: Option<SearchExplainTrajectoryMatch>, +} + +/// Match reference for one explain trajectory stage. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SearchExplainTrajectoryMatch { + /// Match kind label. + pub kind: String, + /// Stage-item identifier, when persisted. + pub item_id: Option<Uuid>, + /// Note identifier, when applicable. + pub note_id: Option<Uuid>, + /// Chunk identifier, when applicable. + pub chunk_id: Option<Uuid>, +} + +/// Explain payload for one ranked search item. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchExplainItem { + /// Stable result-handle identifier. pub result_handle: Uuid, + /// Note identifier. pub note_id: Uuid, + /// Chunk identifier, when applicable. pub chunk_id: Option<Uuid>, + /// 1-based final rank. pub rank: u32, + /// Item-level explanation payload. pub explain: SearchExplain, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for item-level explanations. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct SearchExplainResponse { + /// Trace metadata. pub trace: SearchTrace, + /// Explained item payload. pub item: SearchExplainItem, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional explain trajectory. + pub trajectory: Option<SearchExplainTrajectory>, } -const TRACE_VERSION: i32 = 1; -const MAX_MATCHED_TERMS: usize = 8; +/// Request payload for listing recent traces. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceRecentListRequest { + /// Tenant that owns the traces. + pub tenant_id: String, + /// Project that owns the traces. + pub project_id: String, + /// Agent requesting the list. + pub agent_id: String, -#[derive(Debug, Clone)] -struct QueryEmbedding { - text: String, - vector: Vec<f32>, + /// Maximum number of traces to return. + pub limit: Option<u32>, + + /// Cursor creation timestamp for pagination. + pub cursor_created_at: Option<OffsetDateTime>, + + /// Cursor trace identifier for pagination. + pub cursor_trace_id: Option<Uuid>, + + /// Optional agent filter. + pub agent_id_filter: Option<String>, + + /// Optional read-profile filter. + pub read_profile: Option<String>, + #[serde(with = "crate::time_serde::option")] + /// Optional lower bound for trace creation time. + pub created_after: Option<OffsetDateTime>, + #[serde(with = "crate::time_serde::option")] + /// Optional upper bound for trace creation time. + pub created_before: Option<OffsetDateTime>, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ExpansionMode { - Off, - Always, - Dynamic, +/// Header row returned by recent-trace listing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct RecentTraceHeader { + /// Trace identifier. + pub trace_id: Uuid, + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent that ran the trace. + pub agent_id: String, + /// Read profile used for the trace. + pub read_profile: String, + /// Search query text. + pub query: String, + #[serde(with = "crate::time_serde")] + /// Trace creation timestamp. + pub created_at: OffsetDateTime, } -#[derive(Debug, Clone, Copy)] -enum CacheKind { - Expansion, - Rerank, +/// Pagination cursor returned by recent-trace listing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceRecentCursor { + #[serde(with = "crate::time_serde")] + /// Cursor creation timestamp. + pub created_at: OffsetDateTime, + /// Cursor trace identifier. + pub trace_id: Uuid, } -impl CacheKind { - fn as_str(self) -> &'static str { - match self { - Self::Expansion => "expansion", - Self::Rerank => "rerank", - } - } + +/// Response payload for recent-trace listing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceRecentListResponse { + /// Response schema identifier. + pub schema: String, + /// Returned trace headers. + pub traces: Vec<RecentTraceHeader>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Cursor for the next page, when more results remain. + pub next_cursor: Option<TraceRecentCursor>, } -#[derive(Debug, Clone, Copy)] -struct RetrievalInfo { - score: f32, - rank: u32, +/// Request payload for loading a trace bundle. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceBundleGetRequest { + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent requesting the bundle. + pub agent_id: String, + /// Trace identifier. + pub trace_id: Uuid, + #[serde(default)] + /// Bundle mode controlling output size. + pub mode: TraceBundleMode, + + /// Optional cap for per-stage items. + pub stage_items_limit: Option<u32>, + + /// Optional cap for replay candidates. + pub candidates_limit: Option<u32>, +} + +/// Response payload for trace bundles. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceBundleResponse { + /// Response schema identifier. + pub schema: String, + #[serde(with = "crate::time_serde")] + /// Bundle generation timestamp. + pub generated_at: OffsetDateTime, + /// Trace metadata. + pub trace: SearchTrace, + /// Explained items from the trace. + pub items: Vec<SearchExplainItem>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional condensed trajectory summary. + pub trajectory_summary: Option<SearchTrajectorySummary>, + /// Full trajectory stages. + pub stages: Vec<SearchTrajectoryStage>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional replay candidates. + pub candidates: Option<Vec<TraceReplayCandidate>>, +} + +/// Request payload for loading trace metadata and items. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceGetRequest { + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent requesting the trace. + pub agent_id: String, + /// Trace identifier. + pub trace_id: Uuid, +} + +/// Request payload for loading full trajectory stages. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceTrajectoryGetRequest { + /// Tenant that owns the trace. + pub tenant_id: String, + /// Project that owns the trace. + pub project_id: String, + /// Agent requesting the trajectory. + pub agent_id: String, + /// Trace identifier. + pub trace_id: Uuid, +} + +/// Response payload for trace metadata and explained items. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceGetResponse { + /// Trace metadata. + pub trace: SearchTrace, + /// Explained items from the trace. + pub items: Vec<SearchExplainItem>, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional condensed trajectory summary. + pub trajectory_summary: Option<SearchTrajectorySummary>, +} + +/// Context needed to replay ranking against stored candidates. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceReplayContext { + /// Trace identifier. + pub trace_id: Uuid, + /// Search query text. + pub query: String, + /// Candidate count observed during the trace. + pub candidate_count: u32, + /// Top-k budget used during the trace. + pub top_k: u32, + #[serde(with = "crate::time_serde")] + /// Trace creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Candidate row used for replaying ranking offline. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceReplayCandidate { + /// Note identifier. + pub note_id: Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, + /// Zero-based chunk position. + pub chunk_index: i32, + /// Candidate snippet text. + pub snippet: String, + /// 1-based retrieval rank. + pub retrieval_rank: u32, + #[serde(skip_serializing_if = "Option::is_none")] + /// Optional merged retrieval score captured before rerank. + pub retrieval_score: Option<f32>, + /// Raw rerank-model score. + pub rerank_score: f32, + /// Scope key for the note. + pub note_scope: String, + /// Note importance score. + pub note_importance: f32, + #[serde(with = "crate::time_serde")] + /// Note last-update timestamp. + pub note_updated_at: OffsetDateTime, + /// Note hit counter. + pub note_hit_count: i64, + #[serde(with = "crate::time_serde::option")] + /// Timestamp of the note's most recent hit. + pub note_last_hit_at: Option<OffsetDateTime>, + /// Whether the candidate was selected by diversity ranking. + pub diversity_selected: Option<bool>, + /// Final selected rank under diversity ranking. + pub diversity_selected_rank: Option<u32>, + /// Reason the candidate was selected by diversity ranking. + pub diversity_selected_reason: Option<String>, + /// Reason the candidate was skipped by diversity ranking. + pub diversity_skipped_reason: Option<String>, + /// Nearest selected note that influenced the diversity decision. + pub diversity_nearest_selected_note_id: Option<Uuid>, + /// Similarity to the nearest selected note. + pub diversity_similarity: Option<f32>, + /// MMR score used for diversity selection. + pub diversity_mmr_score: Option<f32>, + /// Whether the candidate lacked an embedding for diversity scoring. + pub diversity_missing_embedding: Option<bool>, +} + +/// Final replayed ranking item. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TraceReplayItem { + /// Note identifier. + pub note_id: Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, + /// 1-based retrieval rank. + pub retrieval_rank: u32, + /// Final replayed score. + pub final_score: f32, + /// Recomputed explanation payload. + pub explain: SearchExplain, +} + +struct ScoreSnippetArgs<'a, 'k> { + query: &'a str, + snippet_items: Vec<ChunkSnippet>, + scope_context_boost_by_scope: &'a HashMap<&'k str, f32>, + det_query_tokens: &'a [String], + blend_policy: &'a ResolvedBlendPolicy, + cache_cfg: &'a SearchCache, + now: OffsetDateTime, + candidate_count: usize, + skip_rerank: bool, +} + +struct ScoreCandidateCtx<'a, 'k> { + cfg: &'a Config, + blend_policy: &'a ResolvedBlendPolicy, + scope_context_boost_by_scope: &'a HashMap<&'k str, f32>, + det_query_tokens: &'a [String], + now: OffsetDateTime, + total_rerank: u32, + total_retrieval: u32, +} + +struct MaybeDynamicSearchArgs<'a> { + path: RawSearchPath, + enabled: bool, + trace_id: Uuid, + query: &'a str, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + token_id: Option<&'a str>, + read_profile: &'a str, + allowed_scopes: &'a [String], + project_context_description: Option<&'a str>, + filter: &'a Filter, + service_filter: Option<&'a SearchFilter>, + candidate_k: u32, + requested_candidate_k: u32, + effective_candidate_k: u32, + top_k: u32, + record_hits_enabled: bool, + ranking_override: Option<&'a RankingRequestOverride>, + retrieval_sources_policy: &'a ResolvedRetrievalSourcesPolicy, + payload_level: PayloadLevel, +} + +struct SearchRetrievalArgs<'a> { + query: &'a str, + expansion_mode: ExpansionMode, + project_context_description: Option<&'a str>, + filter: &'a Filter, + candidate_k: u32, + baseline_vector: Option<&'a Vec<f32>>, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + allowed_scopes: &'a [String], + retrieval_sources_policy: &'a ResolvedRetrievalSourcesPolicy, +} + +struct RecursiveRetrievalArgs<'a> { + query: &'a str, + query_vec: &'a [f32], + filter: &'a Filter, + candidate_k: u32, + retrieval_sources_policy: &'a ResolvedRetrievalSourcesPolicy, + seed_candidates: &'a [ChunkCandidate], +} + +struct SearchRetrievalResult { + expanded_queries: Vec<String>, + candidates: Vec<ChunkCandidate>, + structured_matches: HashMap<Uuid, Vec<String>>, + recursive: Option<RecursiveRetrievalResult>, +} + +#[derive(Clone, Debug, Default)] +struct RecursiveRetrievalResult { + enabled: bool, + rounds_executed: u32, + scopes_seeded: usize, + scopes_queried: usize, + candidates_before: usize, + candidates_after: usize, + candidates_added: usize, + total_queries: u32, + stop_reason: Option<String>, + candidates: Vec<ChunkCandidate>, +} + +#[derive(Clone, Debug)] +struct QueryEmbedding { + text: String, + vector: Vec<f32>, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] struct ChunkCandidate { chunk_id: Uuid, note_id: Uuid, chunk_index: i32, - retrieval_score: f32, retrieval_rank: u32, + retrieval_score: Option<f32>, + scope: Option<String>, + updated_at: Option<OffsetDateTime>, + embedding_version: Option<String>, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] struct RerankCacheCandidate { chunk_id: Uuid, updated_at: OffsetDateTime, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] struct NoteMeta { note_id: Uuid, note_type: String, key: Option<String>, scope: String, + agent_id: String, importance: f32, confidence: f32, updated_at: OffsetDateTime, expires_at: Option<OffsetDateTime>, - source_ref: serde_json::Value, + source_ref: Value, + embedding_version: String, + hit_count: i64, + last_hit_at: Option<OffsetDateTime>, } -#[derive(Debug, Clone, sqlx::FromRow)] +#[derive(Clone, Debug, FromRow)] struct ChunkRow { chunk_id: Uuid, note_id: Uuid, @@ -197,59 +1180,222 @@ struct ChunkRow { text: String, } -#[derive(Debug, Clone)] -struct ChunkMeta { - chunk_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, +#[derive(Clone, Debug, FromRow)] +struct NoteVectorRow { + note_id: Uuid, + vec_text: String, } -#[derive(Debug, Clone)] -struct ChunkSnippet { - note: NoteMeta, - chunk: ChunkMeta, - snippet: String, +#[derive(Clone, Debug, FromRow)] +struct SearchExplainTraceRow { + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + expansion_mode: String, + expanded_queries: Value, + allowed_scopes: Value, + candidate_count: i32, + top_k: i32, + config_snapshot: Value, + trace_version: i32, + created_at: OffsetDateTime, + item_id: Uuid, + note_id: Uuid, + chunk_id: Option<Uuid>, + rank: i32, + explain: Value, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -struct ExpansionCachePayload { - queries: Vec<String>, +#[derive(Clone, Debug, FromRow)] +struct SearchRelationContextRow { + note_id: Uuid, + fact_id: Uuid, + scope: String, + subject_canonical: Option<String>, + subject_kind: Option<String>, + predicate: String, + object_entity_id: Option<Uuid>, + object_canonical: Option<String>, + object_kind: Option<String>, + object_value: Option<String>, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + is_current: bool, + evidence_note_ids: Vec<Uuid>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -struct RerankCacheItem { - chunk_id: Uuid, +#[derive(Clone, Debug, FromRow)] +struct SearchTraceRow { + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + expansion_mode: String, + expanded_queries: Value, + allowed_scopes: Value, + candidate_count: i32, + top_k: i32, + config_snapshot: Value, + trace_version: i32, + created_at: OffsetDateTime, +} + +#[derive(Clone, Debug, FromRow)] +struct SearchTraceItemRow { + item_id: Uuid, + note_id: Uuid, + chunk_id: Option<Uuid>, + rank: i32, + explain: Value, +} + +#[derive(Clone, Debug, FromRow)] +struct SearchRecentTraceRow { + trace_id: Uuid, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + query: String, + created_at: OffsetDateTime, +} + +#[derive(Clone, Debug, FromRow)] +struct TraceCandidateSnapshotRow { + candidate_snapshot: Value, +} + +#[derive(Clone, Debug, FromRow)] +struct StructuredFieldHitRow { + note_id: Uuid, + field_kind: String, +} + +#[derive(Clone, Debug, FromRow)] +struct BestChunkForNoteRow { + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, +} + +#[derive(Clone, Debug)] +struct ChunkMeta { + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, +} + +#[derive(Clone, Debug)] +struct ChunkSnippet { + note: NoteMeta, + chunk: ChunkMeta, + snippet: String, + retrieval_rank: u32, + retrieval_score: Option<f32>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ExpansionCachePayload { + queries: Vec<String>, +} + +#[derive(Debug, Deserialize)] +struct ExpansionOutput { + queries: Vec<String>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RerankCacheItem { + chunk_id: Uuid, updated_at: OffsetDateTime, score: f32, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct RerankCachePayload { items: Vec<RerankCacheItem>, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] struct CachePayload { - value: serde_json::Value, + value: Value, size_bytes: usize, } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ScoredChunk { item: ChunkSnippet, + final_score: f32, rerank_score: f32, + rerank_rank: u32, + rerank_norm: f32, + retrieval_norm: f32, + blend_retrieval_weight: f32, + retrieval_term: f32, + rerank_term: f32, tie_breaker_score: f32, - final_score: f32, + scope_context_boost: f32, + age_days: f32, + importance: f32, + deterministic_lexical_overlap_ratio: f32, + deterministic_lexical_bonus: f32, + deterministic_hit_count: i64, + deterministic_last_hit_age_days: Option<f32>, + deterministic_hit_boost: f32, + deterministic_decay_penalty: f32, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug)] +struct DiversityDecision { + selected: bool, + selected_rank: Option<u32>, + selected_reason: String, + skipped_reason: Option<String>, + nearest_selected_note_id: Option<Uuid>, + similarity: Option<f32>, + mmr_score: Option<f32>, + missing_embedding: bool, +} + +#[derive(Clone, Copy, Debug)] +struct DeterministicRankingTerms { + lexical_overlap_ratio: f32, + lexical_bonus: f32, + hit_count: i64, + last_hit_age_days: Option<f32>, + hit_boost: f32, + decay_penalty: f32, +} +impl Default for DeterministicRankingTerms { + fn default() -> Self { + Self { + lexical_overlap_ratio: 0.0, + lexical_bonus: 0.0, + hit_count: 0, + last_hit_age_days: None, + hit_boost: 0.0, + decay_penalty: 0.0, + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] struct TracePayload { trace: TraceRecord, items: Vec<TraceItemRecord>, + #[serde(default)] + candidates: Vec<TraceCandidateRecord>, + #[serde(default)] + stages: Vec<TraceTrajectoryStageRecord>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct TraceRecord { trace_id: Uuid, tenant_id: String, @@ -262,26 +1408,60 @@ struct TraceRecord { allowed_scopes: Vec<String>, candidate_count: u32, top_k: u32, - config_snapshot: serde_json::Value, + config_snapshot: Value, trace_version: i32, created_at: OffsetDateTime, expires_at: OffsetDateTime, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] struct TraceItemRecord { item_id: Uuid, note_id: Uuid, chunk_id: Option<Uuid>, rank: u32, - retrieval_score: Option<f32>, - retrieval_rank: Option<u32>, - rerank_score: f32, - tie_breaker_score: f32, final_score: f32, - boosts: Vec<SearchBoost>, - matched_terms: Vec<String>, - matched_fields: Vec<String>, + explain: SearchExplain, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceCandidateRecord { + candidate_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + snippet: String, + #[serde(default)] + candidate_snapshot: Value, + retrieval_rank: u32, + rerank_score: f32, + note_scope: String, + note_importance: f32, + note_updated_at: OffsetDateTime, + note_hit_count: i64, + note_last_hit_at: Option<OffsetDateTime>, + created_at: OffsetDateTime, + expires_at: OffsetDateTime, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceTrajectoryStageRecord { + stage_id: Uuid, + stage_order: u32, + stage_name: String, + stage_payload: Value, + created_at: OffsetDateTime, + #[serde(default)] + items: Vec<TraceTrajectoryStageItemRecord>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TraceTrajectoryStageItemRecord { + id: Uuid, + item_id: Option<Uuid>, + note_id: Option<Uuid>, + chunk_id: Option<Uuid>, + metrics: Value, } struct TraceContext<'a> { @@ -301,10 +1481,16 @@ struct TraceContext<'a> { struct SearchTraceBuilder { trace: TraceRecord, items: Vec<TraceItemRecord>, + candidates: Vec<TraceCandidateRecord>, + stages: Vec<TraceTrajectoryStageRecord>, } - impl SearchTraceBuilder { - fn new(context: TraceContext<'_>, cfg: &elf_config::Config, now: OffsetDateTime) -> Self { + fn new( + context: TraceContext<'_>, + config_snapshot: Value, + retention_days: i64, + now: OffsetDateTime, + ) -> Self { let trace = TraceRecord { trace_id: context.trace_id, tenant_id: context.tenant_id.to_string(), @@ -312,593 +1498,3144 @@ impl SearchTraceBuilder { agent_id: context.agent_id.to_string(), read_profile: context.read_profile.to_string(), query: context.query.to_string(), - expansion_mode: expansion_mode_label(context.expansion_mode).to_string(), + expansion_mode: ranking::expansion_mode_label(context.expansion_mode).to_string(), expanded_queries: context.expanded_queries, allowed_scopes: context.allowed_scopes.to_vec(), candidate_count: context.candidate_count as u32, top_k: context.top_k, - config_snapshot: build_config_snapshot(cfg), + config_snapshot, trace_version: TRACE_VERSION, created_at: now, - expires_at: now + Duration::days(cfg.search.explain.retention_days), + expires_at: now + Duration::days(retention_days), }; - Self { trace, items: Vec::new() } + + Self { trace, items: Vec::new(), candidates: Vec::new(), stages: Vec::new() } } fn push_item(&mut self, item: TraceItemRecord) { self.items.push(item); } + fn push_candidate(&mut self, candidate: TraceCandidateRecord) { + self.candidates.push(candidate); + } + + fn push_stage(&mut self, stage: TraceTrajectoryStageRecord) { + self.stages.push(stage); + } + fn build(self) -> TracePayload { - TracePayload { trace: self.trace, items: self.items } + TracePayload { + trace: self.trace, + items: self.items, + candidates: self.candidates, + stages: self.stages, + } } } struct FinishSearchArgs<'a> { + path: RawSearchPath, trace_id: Uuid, query: &'a str, tenant_id: &'a str, project_id: &'a str, agent_id: &'a str, + token_id: Option<&'a str>, read_profile: &'a str, allowed_scopes: &'a [String], expanded_queries: Vec<String>, expansion_mode: ExpansionMode, candidates: Vec<ChunkCandidate>, + structured_matches: HashMap<Uuid, Vec<String>>, + recursive_retrieval: Option<RecursiveRetrievalResult>, top_k: u32, record_hits_enabled: bool, + ranking_override: Option<RankingRequestOverride>, + filter: Option<&'a SearchFilter>, + requested_candidate_k: u32, + effective_candidate_k: u32, + payload_level: PayloadLevel, } -impl ElfService { - pub async fn search(&self, req: SearchRequest) -> ServiceResult<SearchResponse> { - let tenant_id = req.tenant_id.trim(); - let project_id = req.project_id.trim(); - let agent_id = req.agent_id.trim(); - if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { - return Err(ServiceError::InvalidRequest { - message: "tenant_id, project_id, and agent_id are required.".to_string(), - }); - } - if cjk::contains_cjk(&req.query) { - return Err(ServiceError::NonEnglishInput { field: "$.query".to_string() }); - } - - let top_k = req.top_k.unwrap_or(self.cfg.memory.top_k).max(1); - let candidate_k = req.candidate_k.unwrap_or(self.cfg.memory.candidate_k).max(top_k); - let query = req.query.clone(); - let read_profile = req.read_profile.clone(); - let record_hits_enabled = req.record_hits.unwrap_or(false); - let expansion_mode = resolve_expansion_mode(&self.cfg); - let trace_id = Uuid::new_v4(); +struct FinishSearchPolicies { + blend_policy: ResolvedBlendPolicy, + diversity_policy: ResolvedDiversityPolicy, + retrieval_sources_policy: ResolvedRetrievalSourcesPolicy, + policy_snapshot: Value, + policy_id: String, +} - let allowed_scopes = resolve_scopes(&self.cfg, &read_profile)?; - if allowed_scopes.is_empty() { - return self - .finish_search(FinishSearchArgs { - trace_id, - query: &query, - tenant_id, - project_id, - agent_id, - read_profile: &read_profile, - allowed_scopes: &allowed_scopes, - expanded_queries: vec![query.clone()], - expansion_mode, - candidates: Vec::new(), - top_k, - record_hits_enabled, - }) - .await; - } +struct FinishSearchScoringResult { + query_tokens: Vec<String>, + filtered_candidates: Vec<ChunkCandidate>, + scored_count: usize, + snippet_count: usize, + filtered_candidate_count: usize, + filter_impact: Option<SearchFilterImpact>, + trace_candidates: Vec<TraceCandidateRecord>, + fused_results: Vec<ScoredChunk>, + selected_results: Vec<ScoredChunk>, + diversity_decisions: HashMap<Uuid, DiversityDecision>, + selected_count: usize, +} - let private_scope = "agent_private".to_string(); - let non_private_scopes: Vec<String> = - allowed_scopes.iter().filter(|scope| *scope != "agent_private").cloned().collect(); - let mut should_conditions = Vec::new(); - if allowed_scopes.iter().any(|scope| scope == "agent_private") { - let private_filter = Filter::all([ - Condition::matches("scope", private_scope), - Condition::matches("agent_id", agent_id.to_string()), - ]); - should_conditions.push(Condition::from(private_filter)); - } - if !non_private_scopes.is_empty() { - should_conditions.push(Condition::matches("scope", non_private_scopes)); - } +struct BuildTraceArgs<'a> { + path: RawSearchPath, + trace_id: Uuid, + query: &'a str, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + token_id: Option<&'a str>, + read_profile: &'a str, + expansion_mode: ExpansionMode, + expanded_queries: Vec<String>, + allowed_scopes: &'a [String], + candidate_count: usize, + filtered_candidate_count: usize, + snippet_count: usize, + scored_count: usize, + fused_count: usize, + selected_count: usize, + top_k: u32, + query_tokens: &'a [String], + structured_matches: &'a HashMap<Uuid, Vec<String>>, + recursive_retrieval: Option<&'a RecursiveRetrievalResult>, + policies: &'a FinishSearchPolicies, + diversity_decisions: &'a HashMap<Uuid, DiversityDecision>, + recall_candidates: Vec<ChunkCandidate>, + fused_results: Vec<ScoredChunk>, + selected_results: Vec<ScoredChunk>, + relation_contexts: HashMap<Uuid, Vec<SearchExplainRelationContext>>, + trace_candidates: Vec<TraceCandidateRecord>, + now: OffsetDateTime, + ranking_override: &'a Option<RankingRequestOverride>, + filter_impact: Option<SearchFilterImpact>, + payload_level: PayloadLevel, +} - let (should, min_should) = if should_conditions.is_empty() { - (Vec::new(), None) - } else { - (Vec::new(), Some(MinShould { min_count: 1, conditions: should_conditions })) - }; +struct BuildQueryPlanArgs<'a> { + path: RawSearchPath, + query: &'a str, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + read_profile: &'a str, + allowed_scopes: &'a [String], + expansion_mode: ExpansionMode, + expanded_queries: Vec<String>, + top_k: u32, + candidate_k: u32, + retrieval_sources_policy: &'a ResolvedRetrievalSourcesPolicy, + recursive_enabled: bool, + policies: &'a FinishSearchPolicies, + dynamic_gate: DynamicGateSummary, +} - let filter = Filter { - must: vec![ - Condition::matches("tenant_id", tenant_id.to_string()), - Condition::matches("project_id", project_id.to_string()), - Condition::matches("status", "active".to_string()), - ], - should, - must_not: Vec::new(), - min_should, - }; +struct RawSearchExecutionContext { + tenant_id: String, + project_id: String, + agent_id: String, + token_id: Option<String>, + top_k: u32, + candidate_k: u32, + requested_candidate_k: u32, + effective_candidate_k: u32, + query: String, + read_profile: String, + payload_level: PayloadLevel, + filter: Option<SearchFilter>, + record_hits_enabled: bool, + ranking_override: Option<RankingRequestOverride>, + retrieval_sources_policy: ResolvedRetrievalSourcesPolicy, + expansion_mode: ExpansionMode, + trace_id: Uuid, + project_context_description: Option<String>, + allowed_scopes: Vec<String>, + policies: FinishSearchPolicies, +} - let mut baseline_vector: Option<Vec<f32>> = None; - if expansion_mode == ExpansionMode::Dynamic { - let query_vec = self.embed_single_query(&query).await?; - baseline_vector = Some(query_vec.clone()); - let baseline_points = self - .run_fusion_query( - &[QueryEmbedding { text: query.clone(), vector: query_vec }], - &filter, - candidate_k, - ) - .await?; - let top_score = baseline_points.first().map(|point| point.score).unwrap_or(0.0); - let candidates = collect_chunk_candidates( - &baseline_points, - self.cfg.search.prefilter.max_candidates, - candidate_k, - ); - let should_expand = - should_expand_dynamic(baseline_points.len(), top_score, &self.cfg.search.dynamic); - if !should_expand { - return self - .finish_search(FinishSearchArgs { - trace_id, - query: &query, - tenant_id, - project_id, - agent_id, - read_profile: &read_profile, - allowed_scopes: &allowed_scopes, - expanded_queries: vec![query.clone()], - expansion_mode, - candidates, - top_k, - record_hits_enabled, - }) - .await; - } - } +struct QueryPlanStagesArgs<'a> { + path: RawSearchPath, + query: &'a str, + read_profile: &'a str, + allowed_scope_count: usize, + rewrite: &'a QueryPlanRewrite, + retrieval_stages: &'a [QueryPlanRetrievalStage], + fusion_policy: &'a QueryPlanFusionPolicy, + rerank_policy: &'a QueryPlanRerankPolicy, + budget: &'a QueryPlanBudget, +} - let queries = match expansion_mode { - ExpansionMode::Off => vec![query.clone()], - ExpansionMode::Always | ExpansionMode::Dynamic => self.expand_queries(&query).await, - }; +struct BuildSearchItemArgs<'a> { + cfg: &'a Config, + policy_id: &'a str, + blend_policy: &'a ResolvedBlendPolicy, + diversity_policy: &'a ResolvedDiversityPolicy, + diversity_decisions: &'a HashMap<Uuid, DiversityDecision>, + query_tokens: &'a [String], + structured_matches: &'a HashMap<Uuid, Vec<String>>, + relation_contexts: &'a HashMap<Uuid, Vec<SearchExplainRelationContext>>, + scored_chunk: ScoredChunk, + rank: u32, +} - let expanded_queries = queries.clone(); - let query_embeddings = - self.embed_queries(&queries, &query, baseline_vector.as_ref()).await?; - let fusion_points = self.run_fusion_query(&query_embeddings, &filter, candidate_k).await?; - let candidates = collect_chunk_candidates( - &fusion_points, - self.cfg.search.prefilter.max_candidates, - candidate_k, - ); +struct StructuredFieldRetrievalArgs<'a> { + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + allowed_scopes: &'a [String], + query_vec: &'a [f32], + candidate_k: u32, + now: OffsetDateTime, +} - self.finish_search(FinishSearchArgs { - trace_id, - query: &query, - tenant_id, - project_id, - agent_id, - read_profile: &read_profile, - allowed_scopes: &allowed_scopes, - expanded_queries, - expansion_mode, - candidates, - top_k, - record_hits_enabled, - }) - .await - } +#[derive(Debug)] +struct FieldHit { + note_id: Uuid, + field_kind: String, +} - pub async fn search_explain( - &self, - req: SearchExplainRequest, - ) -> ServiceResult<SearchExplainResponse> { - let row = sqlx::query( - "SELECT \ - t.trace_id, t.tenant_id, t.project_id, t.agent_id, t.read_profile, t.query, \ - t.expansion_mode, t.expanded_queries, t.allowed_scopes, t.candidate_count, \ - t.top_k, t.config_snapshot, t.trace_version, t.created_at, \ - i.item_id, i.note_id, i.chunk_id, i.rank, i.retrieval_score, i.retrieval_rank, \ - i.rerank_score, i.tie_breaker_score, i.final_score, i.boosts, \ - i.matched_terms, i.matched_fields \ - FROM search_trace_items i \ - JOIN search_traces t ON i.trace_id = t.trace_id \ - WHERE i.item_id = $1", - ) - .bind(req.result_handle) - .fetch_optional(&self.db.pool) - .await?; +struct StructuredFieldHitArgs<'a> { + embed_version: &'a str, + tenant_id: &'a str, + project_id: &'a str, + agent_id: &'a str, + now: OffsetDateTime, + vec_text: &'a str, + retrieval_limit: i64, + private_allowed: bool, + non_private_scopes: &'a [String], +} - let Some(row) = row else { - return Err(ServiceError::InvalidRequest { - message: "Unknown result_handle or trace not yet persisted.".to_string(), - }); - }; +#[derive(Clone, Debug)] +struct StructuredFieldRetrievalResult { + candidates: Vec<ChunkCandidate>, + structured_matches: HashMap<Uuid, Vec<String>>, +} - let expanded_queries: Vec<String> = - decode_json(row.try_get("expanded_queries")?, "expanded_queries")?; - let allowed_scopes: Vec<String> = - decode_json(row.try_get("allowed_scopes")?, "allowed_scopes")?; - let config_snapshot: serde_json::Value = row.try_get("config_snapshot")?; - let boosts: Vec<SearchBoost> = decode_json(row.try_get("boosts")?, "boosts")?; - let matched_terms: Vec<String> = - decode_json(row.try_get("matched_terms")?, "matched_terms")?; - let matched_fields: Vec<String> = - decode_json(row.try_get("matched_fields")?, "matched_fields")?; +#[derive(Clone, Debug)] +struct RetrievalSourceCandidates { + source: RetrievalSourceKind, + candidates: Vec<ChunkCandidate>, +} - let trace = SearchTrace { - trace_id: row.try_get("trace_id")?, - tenant_id: row.try_get("tenant_id")?, - project_id: row.try_get("project_id")?, - agent_id: row.try_get("agent_id")?, - read_profile: row.try_get("read_profile")?, - query: row.try_get("query")?, - expansion_mode: row.try_get("expansion_mode")?, - expanded_queries, - allowed_scopes, - candidate_count: row.try_get::<i32, _>("candidate_count")? as u32, - top_k: row.try_get::<i32, _>("top_k")? as u32, - config_snapshot, - created_at: row.try_get("created_at")?, - trace_version: row.try_get("trace_version")?, - }; +#[derive(Clone, Debug)] +struct ScoredReplay { + note_id: Uuid, + chunk_id: Uuid, + retrieval_rank: u32, + final_score: f32, + rerank_score: f32, + rerank_rank: u32, + rerank_norm: f32, + retrieval_norm: f32, + blend_retrieval_weight: f32, + retrieval_term: f32, + rerank_term: f32, + tie_breaker_score: f32, + scope_context_boost: f32, + age_days: f32, + importance: f32, + note_scope: String, + deterministic_lexical_overlap_ratio: f32, + deterministic_lexical_bonus: f32, + deterministic_hit_count: i64, + deterministic_last_hit_age_days: Option<f32>, + deterministic_hit_boost: f32, + deterministic_decay_penalty: f32, +} - let explain = SearchExplain { - retrieval_score: row.try_get("retrieval_score")?, - retrieval_rank: row - .try_get::<Option<i32>, _>("retrieval_rank")? - .map(|rank| rank as u32), - rerank_score: row.try_get("rerank_score")?, - tie_breaker_score: row.try_get("tie_breaker_score")?, - final_score: row.try_get("final_score")?, - boosts, - matched_terms, - matched_fields, - }; +#[derive(Clone, Debug, Default)] +struct DynamicGateSummary { + considered: bool, + should_expand: Option<bool>, + observed_candidates: Option<u32>, + observed_top_score: Option<f32>, +} - let item = SearchExplainItem { - result_handle: row.try_get("item_id")?, - note_id: row.try_get("note_id")?, - chunk_id: row.try_get("chunk_id")?, - rank: row.try_get::<i32, _>("rank")? as u32, - explain, - }; +/// Bundle-size mode for trace exports. +#[derive(Clone, Copy, Debug, Deserialize, Serialize)] +#[serde(rename_all = "lowercase")] +#[derive(Default)] +pub enum TraceBundleMode { + #[default] + /// Return the bounded default export. + Bounded, + /// Return the full export. + Full, +} - Ok(SearchExplainResponse { trace, item }) +/// Payload-detail level used by search and trace APIs. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum PayloadLevel { + #[default] + /// Level 0 payloads. + L0, + /// Level 1 payloads. + L1, + /// Level 2 payloads. + L2, +} +impl PayloadLevel { + fn as_str(self) -> &'static str { + match self { + Self::L0 => "l0", + Self::L1 => "l1", + Self::L2 => "l2", + } } - async fn embed_single_query(&self, query: &str) -> ServiceResult<Vec<f32>> { - let embeddings = self - .providers - .embedding - .embed(&self.cfg.providers.embedding, slice::from_ref(&query.to_string())) - .await?; - let query_vec = embeddings.into_iter().next().ok_or_else(|| ServiceError::Provider { - message: "Embedding provider returned no vectors.".to_string(), - })?; - if query_vec.len() != self.cfg.storage.qdrant.vector_dim as usize { - return Err(ServiceError::Provider { - message: "Embedding vector dimension mismatch.".to_string(), - }); + fn parse(raw: &str) -> Option<Self> { + match raw.to_ascii_lowercase().as_str() { + "l0" => Some(Self::L0), + "l1" => Some(Self::L1), + "l2" => Some(Self::L2), + _ => None, } - Ok(query_vec) } +} - async fn embed_queries( - &self, - queries: &[String], - original_query: &str, +impl Serialize for PayloadLevel { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: Serializer, + { + self.as_str().serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for PayloadLevel { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: Deserializer<'de>, + { + let raw = String::deserialize(deserializer)?; + + Self::parse(&raw).ok_or_else(|| de::Error::custom("payload_level must be l0, l1, or l2")) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ExpansionMode { + Off, + Always, + Dynamic, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum RawSearchPath { + Quick, + Planned, +} + +#[derive(Clone, Copy, Debug)] +enum CacheKind { + Expansion, + Rerank, +} +impl CacheKind { + fn as_str(self) -> &'static str { + match self { + Self::Expansion => "expansion", + Self::Rerank => "rerank", + } + } +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +enum RetrievalSourceKind { + Fusion, + StructuredField, + Recursive, +} + +impl ElfService { + /// Runs the quick raw-search path and returns ranked items without a query plan. + pub async fn search_raw_quick(&self, req: SearchRequest) -> Result<SearchResponse> { + self.execute_search_raw_path(req, RawSearchPath::Quick).await.map(|response| { + SearchResponse { + trace_id: response.trace_id, + items: response.items, + trajectory_summary: response.trajectory_summary, + } + }) + } + + /// Runs the planned raw-search path and returns ranked items plus a query plan. + pub async fn search_raw_planned(&self, req: SearchRequest) -> Result<SearchRawPlannedResponse> { + self.execute_search_raw_path(req, RawSearchPath::Planned).await + } + + /// Runs the default raw-search path and returns ranked items. + pub async fn search_raw(&self, req: SearchRequest) -> Result<SearchResponse> { + self.search_raw_planned(req).await.map(|response| SearchResponse { + trace_id: response.trace_id, + items: response.items, + trajectory_summary: response.trajectory_summary, + }) + } + + async fn execute_search_raw_path( + &self, + req: SearchRequest, + path: RawSearchPath, + ) -> Result<SearchRawPlannedResponse> { + let context = self.prepare_raw_search_execution(req, path)?; + + if context.allowed_scopes.is_empty() { + return self.execute_search_raw_no_allowed_scopes(&context, path).await; + } + + let dynamic_gate_enabled = + path == RawSearchPath::Planned && context.expansion_mode == ExpansionMode::Dynamic; + + self.execute_search_raw_with_allowed_scopes(&context, path, dynamic_gate_enabled).await + } + + async fn execute_search_raw_no_allowed_scopes( + &self, + context: &RawSearchExecutionContext, + path: RawSearchPath, + ) -> Result<SearchRawPlannedResponse> { + let expanded_queries = vec![context.query.clone()]; + let response = self + .finish_search(FinishSearchArgs { + path, + trace_id: context.trace_id, + query: context.query.as_str(), + tenant_id: context.tenant_id.as_str(), + project_id: context.project_id.as_str(), + agent_id: context.agent_id.as_str(), + token_id: context.token_id.as_deref(), + read_profile: context.read_profile.as_str(), + allowed_scopes: &context.allowed_scopes, + expanded_queries: expanded_queries.clone(), + expansion_mode: context.expansion_mode, + candidates: Vec::new(), + structured_matches: HashMap::new(), + recursive_retrieval: None, + top_k: context.top_k, + record_hits_enabled: context.record_hits_enabled, + ranking_override: context.ranking_override.clone(), + payload_level: context.payload_level, + filter: context.filter.as_ref(), + requested_candidate_k: context.requested_candidate_k, + effective_candidate_k: context.effective_candidate_k, + }) + .await?; + + Ok(self.build_raw_planned_response( + context, + path, + response, + expanded_queries, + DynamicGateSummary::default(), + )) + } + + async fn execute_search_raw_with_allowed_scopes( + &self, + context: &RawSearchExecutionContext, + path: RawSearchPath, + dynamic_gate_enabled: bool, + ) -> Result<SearchRawPlannedResponse> { + let filter = build_search_filter( + context.tenant_id.as_str(), + context.project_id.as_str(), + context.agent_id.as_str(), + &context.allowed_scopes, + ); + let retrieval_candidate_k = if context.filter.is_some() { + context.effective_candidate_k + } else { + context.candidate_k + }; + let (baseline_vector, early_response, dynamic_gate) = self + .maybe_finish_dynamic_search(MaybeDynamicSearchArgs { + path, + enabled: dynamic_gate_enabled, + trace_id: context.trace_id, + query: context.query.as_str(), + tenant_id: context.tenant_id.as_str(), + project_id: context.project_id.as_str(), + agent_id: context.agent_id.as_str(), + token_id: context.token_id.as_deref(), + read_profile: context.read_profile.as_str(), + allowed_scopes: &context.allowed_scopes, + project_context_description: context.project_context_description.as_deref(), + filter: &filter, + service_filter: context.filter.as_ref(), + candidate_k: retrieval_candidate_k, + requested_candidate_k: context.requested_candidate_k, + effective_candidate_k: context.effective_candidate_k, + top_k: context.top_k, + record_hits_enabled: context.record_hits_enabled, + ranking_override: context.ranking_override.as_ref(), + retrieval_sources_policy: &context.retrieval_sources_policy, + payload_level: context.payload_level, + }) + .await?; + + if let Some(response) = early_response { + return Ok(self.build_raw_planned_response( + context, + path, + response, + vec![context.query.clone()], + dynamic_gate, + )); + } + + let retrieval = self + .retrieve_search_candidates(SearchRetrievalArgs { + query: context.query.as_str(), + expansion_mode: context.expansion_mode, + project_context_description: context.project_context_description.as_deref(), + filter: &filter, + candidate_k: retrieval_candidate_k, + baseline_vector: baseline_vector.as_ref(), + tenant_id: context.tenant_id.as_str(), + project_id: context.project_id.as_str(), + agent_id: context.agent_id.as_str(), + allowed_scopes: &context.allowed_scopes, + retrieval_sources_policy: &context.retrieval_sources_policy, + }) + .await?; + let expanded_queries = retrieval.expanded_queries.clone(); + let response = self + .finish_search(FinishSearchArgs { + path, + trace_id: context.trace_id, + query: context.query.as_str(), + tenant_id: context.tenant_id.as_str(), + project_id: context.project_id.as_str(), + agent_id: context.agent_id.as_str(), + token_id: context.token_id.as_deref(), + read_profile: context.read_profile.as_str(), + allowed_scopes: &context.allowed_scopes, + expanded_queries: retrieval.expanded_queries, + expansion_mode: context.expansion_mode, + candidates: retrieval.candidates, + structured_matches: retrieval.structured_matches, + recursive_retrieval: retrieval.recursive, + top_k: context.top_k, + record_hits_enabled: context.record_hits_enabled, + ranking_override: context.ranking_override.clone(), + payload_level: context.payload_level, + filter: context.filter.as_ref(), + requested_candidate_k: context.requested_candidate_k, + effective_candidate_k: context.effective_candidate_k, + }) + .await?; + + Ok(self.build_raw_planned_response(context, path, response, expanded_queries, dynamic_gate)) + } + + fn prepare_raw_search_execution( + &self, + req: SearchRequest, + path: RawSearchPath, + ) -> Result<RawSearchExecutionContext> { + let tenant_id = req.tenant_id.trim().to_string(); + let project_id = req.project_id.trim().to_string(); + let agent_id = req.agent_id.trim().to_string(); + let token_id = req + .token_id + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(|value| value.to_string()); + + validate_search_request_inputs( + tenant_id.as_str(), + project_id.as_str(), + agent_id.as_str(), + req.query.as_str(), + )?; + + let top_k = req.top_k.unwrap_or(self.cfg.memory.top_k).max(1); + let candidate_k = req.candidate_k.unwrap_or(self.cfg.memory.candidate_k).max(top_k); + let requested_candidate_k = candidate_k; + let filter = req + .filter + .as_ref() + .map(SearchFilter::parse) + .transpose() + .map_err(|err| crate::Error::InvalidRequest { message: err.to_string() })?; + let effective_candidate_k = if filter.is_some() { + requested_candidate_k.saturating_mul(3).min(MAX_CANDIDATE_K).max(top_k) + } else { + requested_candidate_k + }; + let query = req.query; + let read_profile = req.read_profile; + let record_hits_enabled = req.record_hits.unwrap_or(false); + let ranking_override = req.ranking; + let retrieval_sources_policy = ranking::resolve_retrieval_sources_policy( + &self.cfg.ranking.retrieval_sources, + ranking_override.as_ref().and_then(|override_| override_.retrieval_sources.as_ref()), + )?; + let expansion_mode = match path { + RawSearchPath::Quick => ExpansionMode::Off, + RawSearchPath::Planned => ranking::resolve_expansion_mode(&self.cfg), + }; + let trace_id = Uuid::new_v4(); + let project_context_description = self + .resolve_project_context_description(tenant_id.as_str(), project_id.as_str()) + .map(|value| value.to_string()); + let allowed_scopes = ranking::resolve_scopes(&self.cfg, read_profile.as_str())?; + let policies = self.resolve_finish_search_policies(ranking_override.as_ref())?; + + Ok(RawSearchExecutionContext { + tenant_id, + project_id, + agent_id, + token_id, + top_k, + candidate_k, + requested_candidate_k, + effective_candidate_k, + filter, + query, + read_profile, + payload_level: req.payload_level, + record_hits_enabled, + ranking_override, + retrieval_sources_policy, + expansion_mode, + trace_id, + project_context_description, + allowed_scopes, + policies, + }) + } + + fn build_raw_planned_response( + &self, + context: &RawSearchExecutionContext, + path: RawSearchPath, + response: SearchResponse, + expanded_queries: Vec<String>, + dynamic_gate: DynamicGateSummary, + ) -> SearchRawPlannedResponse { + let query_plan = self.build_query_plan(BuildQueryPlanArgs { + path, + query: context.query.as_str(), + tenant_id: context.tenant_id.as_str(), + project_id: context.project_id.as_str(), + agent_id: context.agent_id.as_str(), + read_profile: context.read_profile.as_str(), + allowed_scopes: &context.allowed_scopes, + expansion_mode: context.expansion_mode, + expanded_queries, + top_k: context.top_k, + candidate_k: context.candidate_k, + retrieval_sources_policy: &context.retrieval_sources_policy, + recursive_enabled: self.cfg.search.recursive.enabled, + policies: &context.policies, + dynamic_gate, + }); + + SearchRawPlannedResponse { + trace_id: response.trace_id, + items: response.items, + trajectory_summary: response.trajectory_summary, + query_plan, + } + } + + async fn maybe_finish_dynamic_search( + &self, + args: MaybeDynamicSearchArgs<'_>, + ) -> Result<(Option<Vec<f32>>, Option<SearchResponse>, DynamicGateSummary)> { + if !args.enabled { + return Ok((None, None, DynamicGateSummary::default())); + } + + let query_vec = + self.embed_single_query(args.query, args.project_context_description).await?; + let baseline_points = self + .run_fusion_query( + &[QueryEmbedding { text: args.query.to_string(), vector: query_vec.clone() }], + args.filter, + args.candidate_k, + ) + .await?; + let top_score = baseline_points.first().map(|point| point.score).unwrap_or(0.0); + let fusion_candidates = ranking::collect_chunk_candidates( + &baseline_points, + self.cfg.search.prefilter.max_candidates, + args.candidate_k, + ); + let should_expand = ranking::should_expand_dynamic( + baseline_points.len(), + top_score, + &self.cfg.search.dynamic, + ); + let dynamic_gate = DynamicGateSummary { + considered: true, + should_expand: Some(should_expand), + observed_candidates: Some(baseline_points.len() as u32), + observed_top_score: Some(top_score), + }; + + if should_expand { + return Ok((Some(query_vec), None, dynamic_gate)); + } + + let StructuredFieldRetrievalResult { + candidates: structured_candidates, + structured_matches, + } = self + .retrieve_structured_field_candidates(StructuredFieldRetrievalArgs { + tenant_id: args.tenant_id, + project_id: args.project_id, + agent_id: args.agent_id, + allowed_scopes: args.allowed_scopes, + query_vec: query_vec.as_slice(), + candidate_k: args.candidate_k, + now: OffsetDateTime::now_utc(), + }) + .await?; + let mut seed_candidates = + Vec::with_capacity(fusion_candidates.len() + structured_candidates.len()); + + seed_candidates.extend_from_slice(fusion_candidates.as_slice()); + seed_candidates.extend_from_slice(structured_candidates.as_slice()); + + let recursive = self + .run_recursive_retrieval(RecursiveRetrievalArgs { + query: args.query, + query_vec: query_vec.as_slice(), + filter: args.filter, + candidate_k: args.candidate_k, + retrieval_sources_policy: args.retrieval_sources_policy, + seed_candidates: seed_candidates.as_slice(), + }) + .await?; + let mut retrieval_sources = vec![ + RetrievalSourceCandidates { + source: RetrievalSourceKind::Fusion, + candidates: fusion_candidates, + }, + RetrievalSourceCandidates { + source: RetrievalSourceKind::StructuredField, + candidates: structured_candidates, + }, + ]; + + if recursive.enabled { + retrieval_sources.push(RetrievalSourceCandidates { + source: RetrievalSourceKind::Recursive, + candidates: recursive.candidates.clone(), + }); + } + + let merged_candidates = ranking::merge_retrieval_candidates( + retrieval_sources, + args.retrieval_sources_policy, + args.candidate_k, + ); + let response = self + .finish_search(FinishSearchArgs { + path: args.path, + trace_id: args.trace_id, + query: args.query, + tenant_id: args.tenant_id, + project_id: args.project_id, + agent_id: args.agent_id, + token_id: args.token_id, + read_profile: args.read_profile, + allowed_scopes: args.allowed_scopes, + expanded_queries: vec![args.query.to_string()], + expansion_mode: ExpansionMode::Dynamic, + candidates: merged_candidates, + structured_matches, + recursive_retrieval: Some(recursive), + top_k: args.top_k, + record_hits_enabled: args.record_hits_enabled, + ranking_override: args.ranking_override.cloned(), + payload_level: args.payload_level, + filter: args.service_filter, + requested_candidate_k: args.requested_candidate_k, + effective_candidate_k: args.effective_candidate_k, + }) + .await?; + + Ok((Some(query_vec), Some(response), dynamic_gate)) + } + + async fn retrieve_search_candidates( + &self, + args: SearchRetrievalArgs<'_>, + ) -> Result<SearchRetrievalResult> { + let queries = match args.expansion_mode { + ExpansionMode::Off => vec![args.query.to_string()], + ExpansionMode::Always | ExpansionMode::Dynamic => self.expand_queries(args.query).await, + }; + let expanded_queries = queries.clone(); + let query_embeddings = self + .embed_queries( + queries.as_slice(), + args.query, + args.baseline_vector, + args.project_context_description, + ) + .await?; + let fusion_points = + self.run_fusion_query(&query_embeddings, args.filter, args.candidate_k).await?; + let fusion_candidates = ranking::collect_chunk_candidates( + &fusion_points, + self.cfg.search.prefilter.max_candidates, + args.candidate_k, + ); + let original_query_vec = query_embeddings + .iter() + .find(|embedded| embedded.text == args.query) + .map(|embedded| embedded.vector.clone()) + .unwrap_or_else(Vec::new); + let original_query_vec = if original_query_vec.is_empty() { + self.embed_single_query(args.query, args.project_context_description).await? + } else { + original_query_vec + }; + let StructuredFieldRetrievalResult { + candidates: structured_candidates, + structured_matches, + } = self + .retrieve_structured_field_candidates(StructuredFieldRetrievalArgs { + tenant_id: args.tenant_id, + project_id: args.project_id, + agent_id: args.agent_id, + allowed_scopes: args.allowed_scopes, + query_vec: original_query_vec.as_slice(), + candidate_k: args.candidate_k, + now: OffsetDateTime::now_utc(), + }) + .await?; + let mut seed_candidates = + Vec::with_capacity(fusion_candidates.len() + structured_candidates.len()); + + seed_candidates.extend_from_slice(fusion_candidates.as_slice()); + seed_candidates.extend_from_slice(structured_candidates.as_slice()); + + let recursive = self + .run_recursive_retrieval(RecursiveRetrievalArgs { + query: args.query, + query_vec: original_query_vec.as_slice(), + filter: args.filter, + candidate_k: args.candidate_k, + retrieval_sources_policy: args.retrieval_sources_policy, + seed_candidates: seed_candidates.as_slice(), + }) + .await?; + let mut retrieval_sources = vec![ + RetrievalSourceCandidates { + source: RetrievalSourceKind::Fusion, + candidates: fusion_candidates, + }, + RetrievalSourceCandidates { + source: RetrievalSourceKind::StructuredField, + candidates: structured_candidates, + }, + ]; + + if recursive.enabled { + retrieval_sources.push(RetrievalSourceCandidates { + source: RetrievalSourceKind::Recursive, + candidates: recursive.candidates.clone(), + }); + } + + let merged_candidates = ranking::merge_retrieval_candidates( + retrieval_sources, + args.retrieval_sources_policy, + args.candidate_k, + ); + + Ok(SearchRetrievalResult { + expanded_queries, + candidates: merged_candidates, + structured_matches, + recursive: Some(recursive), + }) + } + + async fn run_recursive_retrieval( + &self, + args: RecursiveRetrievalArgs<'_>, + ) -> Result<RecursiveRetrievalResult> { + let recursive_config = &self.cfg.search.recursive; + let mut result = RecursiveRetrievalResult { + enabled: recursive_config.enabled + && args.retrieval_sources_policy.recursive_weight > 0.0, + ..Default::default() + }; + + if !result.enabled { + result.stop_reason = Some("disabled".to_string()); + + return Ok(result); + } + if args.query_vec.is_empty() { + result.stop_reason = Some("missing_query_vector".to_string()); + + return Ok(result); + } + + let mut seed_scopes = HashSet::<String>::new(); + + for candidate in args.seed_candidates { + if let Some(scope) = candidate.scope.as_deref() + && !scope.trim().is_empty() + { + seed_scopes.insert(scope.to_string()); + } + } + + result.scopes_seeded = seed_scopes.len(); + result.candidates_before = args.seed_candidates.len(); + + if seed_scopes.is_empty() { + result.stop_reason = Some("no_scope_seed".to_string()); + + return Ok(result); + } + + let max_depth = recursive_config.max_depth; + let max_children_per_node = + usize::try_from(recursive_config.max_children_per_node).unwrap_or(usize::MAX); + let max_nodes_per_scope = + usize::try_from(recursive_config.max_nodes_per_scope).unwrap_or(usize::MAX); + let max_total_nodes = + usize::try_from(recursive_config.max_total_nodes).unwrap_or(usize::MAX); + let child_query_embedding = + QueryEmbedding { text: args.query.to_string(), vector: args.query_vec.to_vec() }; + let per_query_candidate_k = + args.candidate_k.min(recursive_config.max_nodes_per_scope).max(1); + let (candidates, queried_scopes, rounds_executed, stop_reason) = self + .collect_recursive_candidates( + &args, + seed_scopes, + child_query_embedding, + max_depth, + max_children_per_node, + max_nodes_per_scope, + max_total_nodes, + per_query_candidate_k, + self.cfg.search.prefilter.max_candidates, + ) + .await?; + + result.scopes_queried = queried_scopes; + result.rounds_executed = rounds_executed; + result.total_queries = rounds_executed; + result.candidates = candidates; + result.candidates_added = result.candidates.len(); + result.candidates_after = result.candidates_before + result.candidates_added; + result.stop_reason = stop_reason.or(Some("converged".to_string())); + + Ok(result) + } + + #[allow(clippy::too_many_arguments)] + async fn collect_recursive_candidates( + &self, + args: &RecursiveRetrievalArgs<'_>, + seed_scopes: HashSet<String>, + child_query_embedding: QueryEmbedding, + max_depth: u32, + max_children_per_node: usize, + max_nodes_per_scope: usize, + max_total_nodes: usize, + per_query_candidate_k: u32, + prefilter_max_candidates: u32, + ) -> Result<(Vec<ChunkCandidate>, usize, u32, Option<String>)> { + let mut queued_scopes: VecDeque<(String, u32)> = VecDeque::new(); + let mut discovered_scopes = seed_scopes.clone(); + let mut recursion_candidates = Vec::<ChunkCandidate>::new(); + let mut seen_chunks = + args.seed_candidates.iter().map(|candidate| candidate.chunk_id).collect::<HashSet<_>>(); + let mut scope_counts: HashMap<String, u32> = HashMap::new(); + let mut queried_scopes = 0_usize; + let mut rounds_executed = 0_u32; + let mut stop_reason: Option<String> = None; + + for scope in seed_scopes { + queued_scopes.push_back((scope, 1)); + } + + while let Some((scope, depth)) = queued_scopes.pop_front() { + if depth > max_depth { + stop_reason = Some("max_depth".to_string()); + + break; + } + + queried_scopes = queried_scopes.saturating_add(1); + rounds_executed = rounds_executed.saturating_add(1); + + let mut scoped_filter = args.filter.clone(); + + scoped_filter.must.push(Condition::matches("scope", scope.clone())); + + let recursive_points = self + .run_fusion_query( + slice::from_ref(&child_query_embedding), + &scoped_filter, + per_query_candidate_k, + ) + .await?; + let scope_query_limit = per_query_candidate_k.min(max_nodes_per_scope as u32); + let recursive_candidates_for_scope = ranking::collect_chunk_candidates( + &recursive_points, + prefilter_max_candidates.min(scope_query_limit), + scope_query_limit, + ); + let mut child_scopes = HashSet::<String>::new(); + + for mut candidate in recursive_candidates_for_scope { + if recursion_candidates.len() >= max_total_nodes { + stop_reason = Some("max_total_nodes".to_string()); + + break; + } + + let scope_key = candidate.scope.clone().unwrap_or_else(|| scope.clone()); + let scope_count = scope_counts.entry(scope_key.clone()).or_default(); + + if (*scope_count as usize) >= max_nodes_per_scope { + continue; + } + if !seen_chunks.insert(candidate.chunk_id) { + continue; + } + + *scope_count = scope_count.saturating_add(1); + candidate.scope = Some(scope_key.clone()); + + recursion_candidates.push(candidate); + + if depth < max_depth + && child_scopes.len() < max_children_per_node + && !scope_key.is_empty() + && discovered_scopes.insert(scope_key.clone()) + { + child_scopes.insert(scope_key.clone()); + queued_scopes.push_back((scope_key.clone(), depth.saturating_add(1))); + } + } + + if stop_reason.is_some() { + break; + } + } + + Ok((recursion_candidates, queried_scopes, rounds_executed, stop_reason)) + } + + fn resolve_project_context_description<'a>( + &'a self, + tenant_id: &str, + project_id: &str, + ) -> Option<&'a str> { + let context = self.cfg.context.as_ref()?; + let descriptions = context.project_descriptions.as_ref()?; + let key = format!("{tenant_id}:{project_id}"); + let mut saw_non_english = false; + + if let Some(value) = descriptions.get(&key) { + let trimmed = value.trim(); + + if !trimmed.is_empty() { + if !english_gate::is_english_natural_language(trimmed) { + saw_non_english = true; + } else { + return Some(trimmed); + } + } + } + if let Some(value) = descriptions.get(project_id) { + let trimmed = value.trim(); + + if !trimmed.is_empty() { + if !english_gate::is_english_natural_language(trimmed) { + saw_non_english = true; + } else { + return Some(trimmed); + } + } + } + + if saw_non_english { + tracing::warn!( + tenant_id = %tenant_id, + project_id = %project_id, + "Project context description is non-English. Skipping context." + ); + } + + None + } + + /// Loads the explain payload for one result handle. + pub async fn search_explain(&self, req: SearchExplainRequest) -> Result<SearchExplainResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + + let row = sqlx::query_as::<_, SearchExplainTraceRow>( + "\ +SELECT + t.trace_id, + t.tenant_id, + t.project_id, + t.agent_id, + t.read_profile, + t.query, + t.expansion_mode, + t.expanded_queries, + t.allowed_scopes, + t.candidate_count, + t.top_k, + t.config_snapshot, + t.trace_version, + t.created_at, + i.item_id, + i.note_id, + i.chunk_id, + i.rank, + i.explain +FROM search_trace_items i +JOIN search_traces t ON i.trace_id = t.trace_id + +WHERE i.item_id = $1 AND t.tenant_id = $2 AND t.project_id = $3", + ) + .bind(req.result_handle) + .bind(tenant_id) + .bind(project_id) + .fetch_optional(&self.db.pool) + .await?; + let Some(row) = row else { + return Err(crate::Error::InvalidRequest { + message: "Unknown result_handle or trace not yet persisted.".to_string(), + }); + }; + let expanded_queries: Vec<String> = + ranking::decode_json(row.expanded_queries, "expanded_queries")?; + let allowed_scopes: Vec<String> = + ranking::decode_json(row.allowed_scopes, "allowed_scopes")?; + let config_snapshot = row.config_snapshot; + let explain: SearchExplain = ranking::decode_json(row.explain, "explain")?; + let trace = SearchTrace { + trace_id: row.trace_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + read_profile: row.read_profile, + query: row.query, + expansion_mode: row.expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count: row.candidate_count as u32, + top_k: row.top_k as u32, + config_snapshot, + created_at: row.created_at, + trace_version: row.trace_version, + }; + let item = SearchExplainItem { + result_handle: row.item_id, + note_id: row.note_id, + chunk_id: row.chunk_id, + rank: row.rank as u32, + explain, + }; + let trajectory = load_item_trajectory( + &self.db.pool, + row.trace_id, + row.item_id, + row.note_id, + row.chunk_id, + ) + .await?; + + Ok(SearchExplainResponse { trace, item, trajectory }) + } + + /// Loads trace metadata and explained items for one trace. + pub async fn trace_get(&self, req: TraceGetRequest) -> Result<TraceGetResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + + if req.agent_id.trim().is_empty() { + return Err(crate::Error::InvalidRequest { + message: "agent_id is required.".to_string(), + }); + } + if tenant_id.is_empty() || project_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + + let row = sqlx::query_as::<_, SearchTraceRow>( + "\ +SELECT + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at +FROM search_traces +WHERE trace_id = $1 AND tenant_id = $2 AND project_id = $3", + ) + .bind(req.trace_id) + .bind(tenant_id) + .bind(project_id) + .fetch_optional(&self.db.pool) + .await?; + let Some(row) = row else { + return Err(crate::Error::InvalidRequest { message: "Unknown trace_id.".to_string() }); + }; + let expanded_queries: Vec<String> = + ranking::decode_json(row.expanded_queries, "expanded_queries")?; + let allowed_scopes: Vec<String> = + ranking::decode_json(row.allowed_scopes, "allowed_scopes")?; + let config_snapshot = row.config_snapshot; + let trace = SearchTrace { + trace_id: row.trace_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + read_profile: row.read_profile, + query: row.query, + expansion_mode: row.expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count: row.candidate_count as u32, + top_k: row.top_k as u32, + config_snapshot, + created_at: row.created_at, + trace_version: row.trace_version, + }; + let item_rows = sqlx::query_as::<_, SearchTraceItemRow>( + "\ +SELECT + item_id, + note_id, + chunk_id, + rank, + explain +FROM search_trace_items +WHERE trace_id = $1 +ORDER BY rank ASC", + ) + .bind(req.trace_id) + .fetch_all(&self.db.pool) + .await?; + let mut items = Vec::with_capacity(item_rows.len()); + + for row in item_rows { + let explain: SearchExplain = ranking::decode_json(row.explain, "explain")?; + + items.push(SearchExplainItem { + result_handle: row.item_id, + note_id: row.note_id, + chunk_id: row.chunk_id, + rank: row.rank as u32, + explain, + }); + } + + let trajectory_summary = load_trace_trajectory_summary(&self.db.pool, req.trace_id).await?; + + Ok(TraceGetResponse { trace, items, trajectory_summary }) + } + + /// Loads full trajectory stages for one trace. + pub async fn trace_trajectory_get( + &self, + req: TraceTrajectoryGetRequest, + ) -> Result<SearchTrajectoryResponse> { + let base = self + .trace_get(TraceGetRequest { + tenant_id: req.tenant_id, + project_id: req.project_id, + agent_id: req.agent_id, + trace_id: req.trace_id, + }) + .await?; + let stages = load_trace_trajectory_stages(&self.db.pool, req.trace_id).await?; + let trajectory = build_trajectory_summary_from_stages(stages.as_slice()); + + Ok(SearchTrajectoryResponse { trace: base.trace, trajectory, stages }) + } + + /// Lists recent traces with cursor-based pagination. + pub async fn trace_recent_list( + &self, + req: TraceRecentListRequest, + ) -> Result<TraceRecentListResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let caller_agent_id = req.agent_id.trim(); + let cursor_created_at = req.cursor_created_at; + let cursor_trace_id = req.cursor_trace_id; + let agent_id_filter = req.agent_id_filter.map(|value| value.trim().to_string()); + let read_profile = req.read_profile.map(|value| value.trim().to_string()); + let limit = req.limit.unwrap_or(DEFAULT_RECENT_TRACES_LIMIT); + + if cursor_created_at.is_some() != cursor_trace_id.is_some() { + return Err(crate::Error::InvalidRequest { + message: "cursor_created_at and cursor_trace_id must be both set or both omitted." + .to_string(), + }); + } + if caller_agent_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "agent_id is required.".to_string(), + }); + } + if tenant_id.is_empty() || project_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + if limit == 0 || limit > MAX_RECENT_TRACES_LIMIT { + return Err(crate::Error::InvalidRequest { + message: format!("limit must be between 1 and {MAX_RECENT_TRACES_LIMIT}."), + }); + } + + if let (Some(created_after), Some(created_before)) = (req.created_after, req.created_before) + && created_after >= created_before + { + return Err(crate::Error::InvalidRequest { + message: "created_after must be before created_before.".to_string(), + }); + } + + let agent_id_filter = agent_id_filter.as_deref(); + let read_profile = read_profile.as_deref(); + let fetch_limit = (limit + 1).min(MAX_RECENT_TRACES_LIMIT + 1); + let rows = sqlx::query_as::<_, SearchRecentTraceRow>( + "\ +SELECT + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + created_at +FROM search_traces +WHERE tenant_id = $1 + AND project_id = $2 + AND ($3::text IS NULL OR agent_id = $3) + AND ($4::text IS NULL OR read_profile = $4) + AND ($5::timestamptz IS NULL OR created_at > $5) + AND ($6::timestamptz IS NULL OR created_at < $6) + AND ($7::timestamptz IS NULL OR $8::uuid IS NULL OR (created_at, trace_id) < ($7, $8)) +ORDER BY created_at DESC, trace_id DESC +LIMIT $9 +", + ) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id_filter) + .bind(read_profile) + .bind(req.created_after) + .bind(req.created_before) + .bind(cursor_created_at) + .bind(cursor_trace_id) + .bind(fetch_limit as i64) + .fetch_all(&self.db.pool) + .await?; + let next_cursor = if rows.len() > limit as usize { + let cursor_row = &rows[limit as usize - 1]; + + Some(TraceRecentCursor { + created_at: cursor_row.created_at, + trace_id: cursor_row.trace_id, + }) + } else { + None + }; + let mut response_rows = rows; + + response_rows.truncate(limit as usize); + + let mut traces = Vec::with_capacity(response_rows.len()); + + for row in response_rows { + traces.push(RecentTraceHeader { + trace_id: row.trace_id, + tenant_id: row.tenant_id, + project_id: row.project_id, + agent_id: row.agent_id, + read_profile: row.read_profile, + query: row.query, + created_at: row.created_at, + }); + } + + Ok(TraceRecentListResponse { + schema: RECENT_TRACES_SCHEMA_V1.to_string(), + traces, + next_cursor, + }) + } + + /// Loads a trace bundle with optional trajectory and replay candidates. + pub async fn trace_bundle_get( + &self, + req: TraceBundleGetRequest, + ) -> Result<TraceBundleResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + + if req.agent_id.trim().is_empty() { + return Err(crate::Error::InvalidRequest { + message: "agent_id is required.".to_string(), + }); + } + if tenant_id.is_empty() || project_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id and project_id are required.".to_string(), + }); + } + + let base = self + .trace_get(TraceGetRequest { + tenant_id: tenant_id.to_string(), + project_id: project_id.to_string(), + agent_id: req.agent_id.trim().to_string(), + trace_id: req.trace_id, + }) + .await?; + let default_stage_items_limit = match req.mode { + TraceBundleMode::Bounded => DEFAULT_BOUNDED_STAGE_ITEMS_LIMIT, + TraceBundleMode::Full => DEFAULT_FULL_STAGE_ITEMS_LIMIT, + }; + let default_candidates_limit = match req.mode { + TraceBundleMode::Bounded => DEFAULT_BOUNDED_CANDIDATES_LIMIT, + TraceBundleMode::Full => DEFAULT_FULL_CANDIDATES_LIMIT, + }; + let stage_items_limit = req + .stage_items_limit + .unwrap_or(default_stage_items_limit) + .min(MAX_TRACE_BUNDLE_ITEMS_LIMIT); + let candidates_limit = req + .candidates_limit + .unwrap_or(default_candidates_limit) + .min(MAX_TRACE_BUNDLE_CANDIDATES_LIMIT); + let mut stages = load_trace_trajectory_stages(&self.db.pool, req.trace_id).await?; + + for stage in stages.iter_mut() { + stage.items.truncate(stage_items_limit as usize); + } + + let candidates = if candidates_limit == 0 { + None + } else { + let candidate_rows = sqlx::query_as::<_, TraceCandidateSnapshotRow>( + "\ +SELECT candidate_snapshot +FROM search_trace_candidates +WHERE trace_id = $1 +ORDER BY retrieval_rank ASC, candidate_id ASC +LIMIT $2 +", + ) + .bind(req.trace_id) + .bind(candidates_limit as i32) + .fetch_all(&self.db.pool) + .await?; + let mut candidates = Vec::with_capacity(candidate_rows.len()); + + for row in candidate_rows { + candidates + .push(ranking::decode_json(row.candidate_snapshot, "candidate_snapshot")?); + } + + if candidates.is_empty() { None } else { Some(candidates) } + }; + + Ok(TraceBundleResponse { + schema: TRACE_BUNDLE_SCHEMA_V1.to_string(), + generated_at: OffsetDateTime::now_utc(), + trace: base.trace, + items: base.items, + trajectory_summary: base.trajectory_summary, + stages, + candidates, + }) + } + + async fn embed_single_query( + &self, + query: &str, + project_context_description: Option<&str>, + ) -> Result<Vec<f32>> { + let input = ranking::build_dense_embedding_input(query, project_context_description); + let embeddings = self + .providers + .embedding + .embed(&self.cfg.providers.embedding, slice::from_ref(&input)) + .await?; + let query_vec = embeddings.into_iter().next().ok_or_else(|| crate::Error::Provider { + message: "Embedding provider returned no vectors.".to_string(), + })?; + + if query_vec.len() != self.cfg.storage.qdrant.vector_dim as usize { + return Err(crate::Error::Provider { + message: "Embedding vector dimension mismatch.".to_string(), + }); + } + + Ok(query_vec) + } + + async fn embed_queries( + &self, + queries: &[String], + original_query: &str, baseline_vector: Option<&Vec<f32>>, - ) -> ServiceResult<Vec<QueryEmbedding>> { + project_context_description: Option<&str>, + ) -> Result<Vec<QueryEmbedding>> { let mut extra_queries = Vec::new(); + let mut extra_inputs = Vec::new(); + for query in queries { if baseline_vector.is_some() && query == original_query { continue; } - extra_queries.push(query.clone()); + + extra_queries.push(query.clone()); + extra_inputs + .push(ranking::build_dense_embedding_input(query, project_context_description)); + } + + let mut embedded_iter = if extra_queries.is_empty() { + Vec::new().into_iter() + } else { + let embedded = self + .providers + .embedding + .embed(&self.cfg.providers.embedding, &extra_inputs) + .await?; + + if embedded.len() != extra_queries.len() { + return Err(crate::Error::Provider { + message: "Embedding provider returned mismatched vector count.".to_string(), + }); + } + + embedded.into_iter() + }; + let mut out = Vec::with_capacity(queries.len()); + + for query in queries { + let vector = if baseline_vector.is_some() && query == original_query { + baseline_vector + .ok_or_else(|| crate::Error::Provider { + message: "Embedding baseline vector is missing.".to_string(), + })? + .clone() + } else { + embedded_iter.next().ok_or_else(|| crate::Error::Provider { + message: "Embedding provider returned no vectors.".to_string(), + })? + }; + + if vector.len() != self.cfg.storage.qdrant.vector_dim as usize { + return Err(crate::Error::Provider { + message: "Embedding vector dimension mismatch.".to_string(), + }); + } + + out.push(QueryEmbedding { text: query.clone(), vector }); + } + + Ok(out) + } + + async fn run_fusion_query( + &self, + queries: &[QueryEmbedding], + filter: &Filter, + candidate_k: u32, + ) -> Result<Vec<ScoredPoint>> { + let mut search = QueryPointsBuilder::new(self.qdrant.collection.clone()); + + for query in queries { + let dense_prefetch = PrefetchQueryBuilder::default() + .query(Query::new_nearest(query.vector.clone())) + .using(DENSE_VECTOR_NAME) + .filter(filter.clone()) + .limit(candidate_k as u64); + let bm25_prefetch = PrefetchQueryBuilder::default() + .query(Query::new_nearest(Document::new(query.text.clone(), BM25_MODEL))) + .using(BM25_VECTOR_NAME) + .filter(filter.clone()) + .limit(candidate_k as u64); + + search = search.add_prefetch(dense_prefetch).add_prefetch(bm25_prefetch); + } + + let search = search.with_payload(true).query(Fusion::Rrf).limit(candidate_k as u64); + let response = self + .qdrant + .client + .query(search) + .await + .map_err(|err| crate::Error::Qdrant { message: err.to_string() })?; + + Ok(response.result) + } + + async fn expand_queries(&self, query: &str) -> Vec<String> { + let cfg = &self.cfg.search.expansion; + let cache_cfg = &self.cfg.search.cache; + let now = OffsetDateTime::now_utc(); + let cache_key = if cache_cfg.enabled { + match ranking::build_expansion_cache_key( + query, + cfg.max_queries, + cfg.include_original, + self.cfg.providers.llm_extractor.provider_id.as_str(), + self.cfg.providers.llm_extractor.model.as_str(), + self.cfg.providers.llm_extractor.temperature, + ) { + Ok(key) => Some(key), + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Expansion.as_str(), + "Cache key build failed." + ); + + None + }, + } + } else { + None + }; + + if let Some(key) = cache_key.as_ref() + && let Some(queries) = self.read_expansion_cache_queries(key, cache_cfg, now).await + { + return queries; + } + + let messages = + ranking::build_expansion_messages(query, cfg.max_queries, cfg.include_original); + let raw = match self + .providers + .extractor + .extract(&self.cfg.providers.llm_extractor, &messages) + .await + { + Ok(value) => value, + Err(err) => { + tracing::warn!(error = %err, "Query expansion failed; falling back to original query."); + + return vec![query.to_string()]; + }, + }; + let parsed: ExpansionOutput = match serde_json::from_value(raw) { + Ok(value) => value, + Err(err) => { + tracing::warn!(error = %err, "Query expansion returned invalid JSON; falling back to original query."); + + return vec![query.to_string()]; + }, + }; + let normalized = ranking::normalize_queries( + parsed.queries, + query, + cfg.include_original, + cfg.max_queries, + ); + let result = if normalized.is_empty() { vec![query.to_string()] } else { normalized }; + + if let Some(key) = cache_key { + self.store_expansion_cache_queries(&key, &result, cache_cfg).await; + } + + result + } + + async fn read_expansion_cache_queries( + &self, + key: &str, + cache_cfg: &SearchCache, + now: OffsetDateTime, + ) -> Option<Vec<String>> { + match fetch_cache_payload(&self.db.pool, CacheKind::Expansion, key, now).await { + Ok(Some(payload)) => { + tracing::info!( + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = true, + payload_size = payload.size_bytes, + ttl_days = cache_cfg.expansion_ttl_days, + "Cache hit." + ); + + let cached: ExpansionCachePayload = match serde_json::from_value(payload.value) { + Ok(value) => value, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache payload decode failed." + ); + + ExpansionCachePayload { queries: Vec::new() } + }, + }; + + (!cached.queries.is_empty()).then_some(cached.queries) + }, + Ok(None) => { + tracing::info!( + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size = 0_u64, + ttl_days = cache_cfg.expansion_ttl_days, + "Cache miss." + ); + + None + }, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache read failed." + ); + + None + }, + } + } + + async fn store_expansion_cache_queries( + &self, + key: &str, + queries: &[String], + cache_cfg: &SearchCache, + ) { + let payload = ExpansionCachePayload { queries: queries.to_vec() }; + let payload_json = match serde_json::to_value(&payload) { + Ok(value) => value, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache payload encode failed." + ); + + return; + }, + }; + let stored_at = OffsetDateTime::now_utc(); + let expires_at = stored_at + Duration::days(cache_cfg.expansion_ttl_days); + + match store_cache_payload( + &self.db.pool, + CacheKind::Expansion, + key, + payload_json, + stored_at, + expires_at, + cache_cfg.max_payload_bytes, + ) + .await + { + Ok(Some(payload_size)) => { + tracing::info!( + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size, + ttl_days = cache_cfg.expansion_ttl_days, + "Cache stored." + ); + }, + Ok(None) => { + tracing::warn!( + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size = 0_u64, + ttl_days = cache_cfg.expansion_ttl_days, + "Cache payload skipped due to size." + ); + }, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Expansion.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache write failed." + ); + }, + } + } + + async fn retrieve_structured_field_candidates( + &self, + args: StructuredFieldRetrievalArgs<'_>, + ) -> Result<StructuredFieldRetrievalResult> { + let StructuredFieldRetrievalArgs { + tenant_id, + project_id, + agent_id, + allowed_scopes, + query_vec, + candidate_k, + now, + } = args; + + if query_vec.is_empty() { + return Ok(StructuredFieldRetrievalResult { + candidates: Vec::new(), + structured_matches: HashMap::new(), + }); + } + + let embed_version = crate::embedding_version(&self.cfg); + let vec_text = crate::vector_to_pg(query_vec); + let private_allowed = allowed_scopes.iter().any(|scope| scope == "agent_private"); + let non_private_scopes: Vec<String> = + allowed_scopes.iter().filter(|scope| *scope != "agent_private").cloned().collect(); + let retrieval_limit = i64::from(candidate_k.saturating_mul(4).clamp(16, 400)); + let rows = self + .fetch_structured_field_hits(StructuredFieldHitArgs { + embed_version: embed_version.as_str(), + tenant_id, + project_id, + agent_id, + now, + vec_text: vec_text.as_str(), + retrieval_limit, + private_allowed, + non_private_scopes: non_private_scopes.as_slice(), + }) + .await?; + let (ordered_note_ids, structured_matches_out) = build_structured_field_matches(rows); + + if ordered_note_ids.is_empty() { + return Ok(StructuredFieldRetrievalResult { + candidates: Vec::new(), + structured_matches: structured_matches_out, + }); + } + + let best_by_note = self + .fetch_best_chunks_for_notes( + embed_version.as_str(), + ordered_note_ids.as_slice(), + vec_text.as_str(), + ) + .await?; + let structured_candidates = build_structured_field_candidates( + candidate_k, + ordered_note_ids, + best_by_note, + embed_version.as_str(), + ); + + Ok(StructuredFieldRetrievalResult { + candidates: structured_candidates, + structured_matches: structured_matches_out, + }) + } + + async fn fetch_structured_field_hits( + &self, + args: StructuredFieldHitArgs<'_>, + ) -> Result<Vec<FieldHit>> { + if args.private_allowed && args.non_private_scopes.is_empty() { + self.fetch_structured_field_hits_private_only(args).await + } else if !args.private_allowed { + self.fetch_structured_field_hits_non_private_only(args).await + } else { + self.fetch_structured_field_hits_mixed(args).await + } + } + + async fn fetch_structured_field_hits_private_only( + &self, + args: StructuredFieldHitArgs<'_>, + ) -> Result<Vec<FieldHit>> { + let rows = sqlx::query_as::<_, StructuredFieldHitRow>( + "\ +SELECT + f.note_id, + f.field_kind +FROM memory_note_fields f +JOIN note_field_embeddings e + ON e.field_id = f.field_id + AND e.embedding_version = $1 +JOIN memory_notes n + ON n.note_id = f.note_id +WHERE n.tenant_id = $2 + AND n.project_id = $3 + AND n.status = 'active' + AND (n.expires_at IS NULL OR n.expires_at > $4) + AND n.scope = 'agent_private' + AND n.agent_id = $5 +ORDER BY e.vec <=> $6::text::vector ASC +LIMIT $7", + ) + .bind(args.embed_version) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.now) + .bind(args.agent_id) + .bind(args.vec_text) + .bind(args.retrieval_limit) + .fetch_all(&self.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|row| FieldHit { note_id: row.note_id, field_kind: row.field_kind }) + .collect()) + } + + async fn fetch_structured_field_hits_non_private_only( + &self, + args: StructuredFieldHitArgs<'_>, + ) -> Result<Vec<FieldHit>> { + let rows = sqlx::query_as::<_, StructuredFieldHitRow>( + "\ +SELECT + f.note_id, + f.field_kind +FROM memory_note_fields f +JOIN note_field_embeddings e + ON e.field_id = f.field_id + AND e.embedding_version = $1 +JOIN memory_notes n + ON n.note_id = f.note_id +WHERE n.tenant_id = $2 + AND (n.project_id = $3 OR (n.project_id = $8 AND n.scope = 'org_shared')) + AND n.status = 'active' + AND (n.expires_at IS NULL OR n.expires_at > $4) + AND n.scope = ANY($5::text[]) +ORDER BY e.vec <=> $6::text::vector ASC +LIMIT $7", + ) + .bind(args.embed_version) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.now) + .bind(args.non_private_scopes) + .bind(args.vec_text) + .bind(args.retrieval_limit) + .bind(ORG_PROJECT_ID) + .fetch_all(&self.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|row| FieldHit { note_id: row.note_id, field_kind: row.field_kind }) + .collect()) + } + + async fn fetch_structured_field_hits_mixed( + &self, + args: StructuredFieldHitArgs<'_>, + ) -> Result<Vec<FieldHit>> { + let rows = sqlx::query_as::<_, StructuredFieldHitRow>( + "\ +SELECT + f.note_id, + f.field_kind +FROM memory_note_fields f +JOIN note_field_embeddings e + ON e.field_id = f.field_id + AND e.embedding_version = $1 +JOIN memory_notes n + ON n.note_id = f.note_id +WHERE n.tenant_id = $2 + AND (n.project_id = $3 OR (n.project_id = $9 AND n.scope = 'org_shared')) + AND n.status = 'active' + AND (n.expires_at IS NULL OR n.expires_at > $4) + AND ( + (n.scope = 'agent_private' AND n.agent_id = $5) + OR n.scope = ANY($6::text[]) + ) +ORDER BY e.vec <=> $7::text::vector ASC +LIMIT $8", + ) + .bind(args.embed_version) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.now) + .bind(args.agent_id) + .bind(args.non_private_scopes) + .bind(args.vec_text) + .bind(args.retrieval_limit) + .bind(ORG_PROJECT_ID) + .fetch_all(&self.db.pool) + .await?; + + Ok(rows + .into_iter() + .map(|row| FieldHit { note_id: row.note_id, field_kind: row.field_kind }) + .collect()) + } + + async fn fetch_best_chunks_for_notes( + &self, + embed_version: &str, + ordered_note_ids: &[Uuid], + vec_text: &str, + ) -> Result<HashMap<Uuid, (Uuid, i32)>> { + let best_chunks = sqlx::query_as::<_, BestChunkForNoteRow>( + "\ +SELECT DISTINCT ON (c.note_id) + c.note_id, + c.chunk_id, + c.chunk_index +FROM memory_note_chunks c +JOIN note_chunk_embeddings e + ON e.chunk_id = c.chunk_id + AND e.embedding_version = $1 +WHERE c.note_id = ANY($2::uuid[]) +ORDER BY c.note_id ASC, e.vec <=> $3::text::vector ASC", + ) + .bind(embed_version) + .bind(ordered_note_ids) + .bind(vec_text) + .fetch_all(&self.db.pool) + .await?; + let mut best_by_note = HashMap::new(); + + for row in best_chunks { + best_by_note.insert(row.note_id, (row.chunk_id, row.chunk_index)); + } + + Ok(best_by_note) + } + + async fn finish_search(&self, args: FinishSearchArgs<'_>) -> Result<SearchResponse> { + let now = OffsetDateTime::now_utc(); + let candidate_count = args.candidates.len(); + let candidate_note_ids: Vec<Uuid> = + args.candidates.iter().map(|candidate| candidate.note_id).collect(); + let policies = self.resolve_finish_search_policies(args.ranking_override.as_ref())?; + let note_meta = self + .fetch_note_meta_for_candidates( + args.tenant_id, + args.project_id, + args.agent_id, + args.allowed_scopes, + candidate_note_ids.as_slice(), + now, + ) + .await?; + let scoring = self + .build_finish_search_scoring( + args.query, + args.candidates, + ¬e_meta, + &policies, + args.top_k, + candidate_count, + args.filter, + args.requested_candidate_k, + args.effective_candidate_k, + now, + args.path == RawSearchPath::Quick, + ) + .await?; + let FinishSearchScoringResult { + query_tokens, + filtered_candidates, + scored_count, + snippet_count, + filtered_candidate_count, + filter_impact, + mut trace_candidates, + fused_results, + selected_results, + diversity_decisions, + selected_count, + } = scoring; + let relation_contexts = self + .build_relation_context_for_selected_results( + &selected_results, + args.tenant_id, + args.project_id, + args.agent_id, + args.allowed_scopes, + now, + ) + .await?; + + ranking::attach_diversity_decisions_to_trace_candidates( + &mut trace_candidates, + &diversity_decisions, + ); + + self.record_hits_if_enabled(args.record_hits_enabled, args.query, &selected_results, now) + .await?; + + let (items, trajectory_summary) = self + .build_items_and_write_trace(BuildTraceArgs { + path: args.path, + trace_id: args.trace_id, + query: args.query, + tenant_id: args.tenant_id, + project_id: args.project_id, + agent_id: args.agent_id, + token_id: args.token_id, + read_profile: args.read_profile, + expansion_mode: args.expansion_mode, + expanded_queries: args.expanded_queries, + allowed_scopes: args.allowed_scopes, + candidate_count, + filtered_candidate_count, + snippet_count, + scored_count, + fused_count: fused_results.len(), + selected_count, + top_k: args.top_k, + query_tokens: query_tokens.as_slice(), + structured_matches: &args.structured_matches, + policies: &policies, + diversity_decisions: &diversity_decisions, + recall_candidates: filtered_candidates, + fused_results, + selected_results, + relation_contexts, + trace_candidates, + recursive_retrieval: args.recursive_retrieval.as_ref(), + now, + ranking_override: &args.ranking_override, + filter_impact, + payload_level: args.payload_level, + }) + .await?; + + Ok(SearchResponse { + trace_id: args.trace_id, + items, + trajectory_summary: Some(trajectory_summary), + }) + } + + async fn build_items_and_write_trace( + &self, + args: BuildTraceArgs<'_>, + ) -> Result<(Vec<SearchItem>, SearchTrajectorySummary)> { + let trace_id = args.trace_id; + let (items, trajectory_summary, trace_payload) = self.build_items_and_trace_payload(args); + + self.write_trace_payload(trace_id, trace_payload).await?; + + Ok((items, trajectory_summary)) + } + + #[allow(clippy::too_many_arguments)] + async fn build_finish_search_scoring( + &self, + query: &str, + candidates: Vec<ChunkCandidate>, + note_meta: &HashMap<Uuid, NoteMeta>, + policies: &FinishSearchPolicies, + top_k: u32, + candidate_count: usize, + filter: Option<&SearchFilter>, + requested_candidate_k: u32, + effective_candidate_k: u32, + now: OffsetDateTime, + skip_rerank: bool, + ) -> Result<FinishSearchScoringResult> { + let (filtered_candidates, filter_impact) = self.apply_filter_to_candidates( + candidates, + note_meta, + filter, + requested_candidate_k, + effective_candidate_k, + ); + let filtered_candidate_count = filtered_candidates.len(); + let snippet_items = self.build_snippet_items(&filtered_candidates, note_meta).await?; + let snippet_count = snippet_items.len(); + let query_tokens = ranking::tokenize_query(query, MAX_MATCHED_TERMS); + let scope_context_boost_by_scope = + ranking::build_scope_context_boost_by_scope(&query_tokens, self.cfg.context.as_ref()); + let det_query_tokens = build_deterministic_query_tokens(&self.cfg, query); + let scored = self + .score_snippet_items(ScoreSnippetArgs { + query, + snippet_items, + scope_context_boost_by_scope: &scope_context_boost_by_scope, + det_query_tokens: det_query_tokens.as_slice(), + blend_policy: &policies.blend_policy, + cache_cfg: &self.cfg.search.cache, + now, + candidate_count, + skip_rerank, + }) + .await?; + let scored_count = scored.len(); + let trace_candidates = self.build_trace_candidates(&scored, now); + let results = select_best_scored_chunks(scored); + let fused_results = results.clone(); + let (selected_results, diversity_decisions) = + self.apply_diversity_policy(results, top_k, &policies.diversity_policy).await?; + let selected_count = selected_results.len(); + + Ok(FinishSearchScoringResult { + query_tokens, + filtered_candidates, + scored_count, + snippet_count, + filtered_candidate_count, + filter_impact, + trace_candidates, + fused_results, + selected_results, + diversity_decisions, + selected_count, + }) + } + + fn apply_filter_to_candidates( + &self, + candidates: Vec<ChunkCandidate>, + note_meta: &HashMap<Uuid, NoteMeta>, + filter: Option<&SearchFilter>, + requested_candidate_k: u32, + effective_candidate_k: u32, + ) -> (Vec<ChunkCandidate>, Option<SearchFilterImpact>) { + let filtered_candidates: Vec<ChunkCandidate> = candidates + .into_iter() + .filter(|candidate| ranking::candidate_matches_note(note_meta, candidate)) + .collect(); + + match filter { + Some(filter) => { + let (candidates, filter_impact) = filter.eval( + filtered_candidates, + note_meta, + requested_candidate_k, + effective_candidate_k, + ); + + (candidates, Some(filter_impact)) + }, + None => (filtered_candidates, None), + } + } + + async fn build_relation_context_for_selected_results( + &self, + selected_results: &[ScoredChunk], + tenant_id: &str, + project_id: &str, + agent_id: &str, + allowed_scopes: &[String], + now: OffsetDateTime, + ) -> Result<HashMap<Uuid, Vec<SearchExplainRelationContext>>> { + if !self.cfg.search.graph_context.enabled { + return Ok(HashMap::new()); + } + + let selected_note_ids: Vec<Uuid> = + selected_results.iter().map(|chunk| chunk.item.note.note_id).collect(); + + if selected_note_ids.is_empty() { + return Ok(HashMap::new()); + } + + self.fetch_relation_contexts_for_notes( + selected_note_ids.as_slice(), + tenant_id, + project_id, + agent_id, + allowed_scopes, + now, + ) + .await + } + + fn resolve_finish_search_policies( + &self, + ranking_override: Option<&RankingRequestOverride>, + ) -> Result<FinishSearchPolicies> { + let blend_policy = ranking::resolve_blend_policy( + &self.cfg.ranking.blend, + ranking_override.and_then(|override_| override_.blend.as_ref()), + )?; + let diversity_policy = ranking::resolve_diversity_policy( + &self.cfg.ranking.diversity, + ranking_override.and_then(|override_| override_.diversity.as_ref()), + )?; + let retrieval_sources_policy = ranking::resolve_retrieval_sources_policy( + &self.cfg.ranking.retrieval_sources, + ranking_override.and_then(|override_| override_.retrieval_sources.as_ref()), + )?; + let policy_snapshot = ranking::build_policy_snapshot( + &self.cfg, + &blend_policy, + &diversity_policy, + &retrieval_sources_policy, + ranking_override, + ); + let policy_hash = ranking::hash_policy_snapshot(&policy_snapshot)?; + let policy_id = format!("ranking_v2:{}", &policy_hash[..12.min(policy_hash.len())]); + + Ok(FinishSearchPolicies { + blend_policy, + diversity_policy, + retrieval_sources_policy, + policy_snapshot, + policy_id, + }) + } + + fn build_query_plan(&self, args: BuildQueryPlanArgs<'_>) -> QueryPlan { + let allowed_scopes = sorted_unique_strings(args.allowed_scopes.to_vec()); + let expanded_queries = sorted_unique_strings(args.expanded_queries); + let retrieval_stages = self.build_query_plan_retrieval_stages( + args.candidate_k, + args.retrieval_sources_policy, + args.recursive_enabled, + ); + let rewrite = + self.build_query_plan_rewrite(args.expansion_mode, expanded_queries, args.dynamic_gate); + let fusion_policy = self.build_query_plan_fusion_policy(args.retrieval_sources_policy); + let rerank_policy = self.build_query_plan_rerank_policy(args.policies); + let budget = self.build_query_plan_budget(args.top_k, args.candidate_k); + let stages = Self::build_query_plan_stages(QueryPlanStagesArgs { + path: args.path, + query: args.query, + read_profile: args.read_profile, + allowed_scope_count: allowed_scopes.len(), + rewrite: &rewrite, + retrieval_stages: &retrieval_stages, + fusion_policy: &fusion_policy, + rerank_policy: &rerank_policy, + budget: &budget, + }); + + QueryPlan { + schema: QUERY_PLAN_SCHEMA.to_string(), + version: QUERY_PLAN_VERSION.to_string(), + stages, + intent: QueryPlanIntent { + query: args.query.to_string(), + tenant_id: args.tenant_id.to_string(), + project_id: args.project_id.to_string(), + agent_id: args.agent_id.to_string(), + read_profile: args.read_profile.to_string(), + allowed_scopes, + }, + rewrite, + retrieval_stages, + fusion_policy, + rerank_policy, + budget, + } + } + + fn build_query_plan_retrieval_stages( + &self, + candidate_k: u32, + retrieval_sources_policy: &ResolvedRetrievalSourcesPolicy, + recursive_enabled: bool, + ) -> Vec<QueryPlanRetrievalStage> { + let mut stages = vec![ + QueryPlanRetrievalStage { + name: "fusion_dense_bm25".to_string(), + source: "qdrant_fusion".to_string(), + enabled: true, + candidate_limit: candidate_k, + }, + QueryPlanRetrievalStage { + name: "structured_field_vector".to_string(), + source: "postgres_vector".to_string(), + enabled: retrieval_sources_policy.structured_field_weight > 0.0, + candidate_limit: candidate_k, + }, + ]; + + if recursive_enabled { + stages.push(QueryPlanRetrievalStage { + name: "recursive_scope".to_string(), + source: "scope_graph".to_string(), + enabled: retrieval_sources_policy.recursive_weight > 0.0, + candidate_limit: candidate_k, + }); + } + + stages + } + + fn build_query_plan_rewrite( + &self, + expansion_mode: ExpansionMode, + expanded_queries: Vec<String>, + dynamic_gate: DynamicGateSummary, + ) -> QueryPlanRewrite { + QueryPlanRewrite { + expansion_mode: ranking::expansion_mode_label(expansion_mode).to_string(), + expanded_queries, + dynamic_gate: QueryPlanDynamicGate { + considered: dynamic_gate.considered, + should_expand: dynamic_gate.should_expand, + observed_candidates: dynamic_gate.observed_candidates, + observed_top_score: dynamic_gate.observed_top_score, + min_candidates: self.cfg.search.dynamic.min_candidates, + min_top_score: self.cfg.search.dynamic.min_top_score, + }, + } + } + + fn build_query_plan_fusion_policy( + &self, + retrieval_sources_policy: &ResolvedRetrievalSourcesPolicy, + ) -> QueryPlanFusionPolicy { + QueryPlanFusionPolicy { + strategy: "weighted_merge".to_string(), + fusion_weight: retrieval_sources_policy.fusion_weight, + structured_field_weight: retrieval_sources_policy.structured_field_weight, + recursive_weight: retrieval_sources_policy.recursive_weight, + fusion_priority: retrieval_sources_policy.fusion_priority, + structured_field_priority: retrieval_sources_policy.structured_field_priority, + recursive_priority: retrieval_sources_policy.recursive_priority, + } + } + + fn build_query_plan_rerank_policy( + &self, + policies: &FinishSearchPolicies, + ) -> QueryPlanRerankPolicy { + QueryPlanRerankPolicy { + provider_id: self.cfg.providers.rerank.provider_id.clone(), + model: self.cfg.providers.rerank.model.clone(), + blend_enabled: policies.blend_policy.enabled, + rerank_normalization: policies.blend_policy.rerank_normalization.as_str().to_string(), + retrieval_normalization: policies + .blend_policy + .retrieval_normalization + .as_str() + .to_string(), + blend_segments: policies + .blend_policy + .segments + .iter() + .map(|segment| QueryPlanBlendSegment { + max_retrieval_rank: segment.max_retrieval_rank, + retrieval_weight: segment.retrieval_weight, + }) + .collect(), + diversity_enabled: policies.diversity_policy.enabled, + diversity_sim_threshold: policies.diversity_policy.sim_threshold, + diversity_mmr_lambda: policies.diversity_policy.mmr_lambda, + diversity_max_skips: policies.diversity_policy.max_skips, + } + } + + fn build_query_plan_budget(&self, top_k: u32, candidate_k: u32) -> QueryPlanBudget { + QueryPlanBudget { + top_k, + candidate_k, + prefilter_max_candidates: self.cfg.search.prefilter.max_candidates, + expansion_max_queries: self.cfg.search.expansion.max_queries, + cache_enabled: self.cfg.search.cache.enabled, } + } - let mut embedded_iter = if extra_queries.is_empty() { - Vec::new().into_iter() + fn build_query_plan_stages(args: QueryPlanStagesArgs<'_>) -> Vec<QueryPlanStage> { + vec![ + QueryPlanStage { + name: "intent".to_string(), + details: serde_json::json!({ + "path": raw_search_path_label(args.path), + "query": args.query, + "read_profile": args.read_profile, + "allowed_scope_count": args.allowed_scope_count, + }), + }, + QueryPlanStage { + name: "rewrite".to_string(), + details: serde_json::json!({ + "expansion_mode": args.rewrite.expansion_mode.as_str(), + "expanded_query_count": args.rewrite.expanded_queries.len(), + "dynamic_gate_considered": args.rewrite.dynamic_gate.considered, + "dynamic_gate_should_expand": args.rewrite.dynamic_gate.should_expand, + }), + }, + QueryPlanStage { + name: "retrieval".to_string(), + details: serde_json::json!({ + "stages": args.retrieval_stages, + }), + }, + QueryPlanStage { + name: "fusion".to_string(), + details: serde_json::json!({ + "strategy": args.fusion_policy.strategy.as_str(), + "fusion_weight": args.fusion_policy.fusion_weight, + "structured_field_weight": args.fusion_policy.structured_field_weight, + }), + }, + QueryPlanStage { + name: "rerank".to_string(), + details: serde_json::json!({ + "provider_id": args.rerank_policy.provider_id.as_str(), + "model": args.rerank_policy.model.as_str(), + "blend_enabled": args.rerank_policy.blend_enabled, + "diversity_enabled": args.rerank_policy.diversity_enabled, + }), + }, + QueryPlanStage { + name: "budget".to_string(), + details: serde_json::json!({ + "top_k": args.budget.top_k, + "candidate_k": args.budget.candidate_k, + "prefilter_max_candidates": args.budget.prefilter_max_candidates, + "expansion_max_queries": args.budget.expansion_max_queries, + "cache_enabled": args.budget.cache_enabled, + }), + }, + ] + } + + async fn score_snippet_items( + &self, + args: ScoreSnippetArgs<'_, '_>, + ) -> Result<Vec<ScoredChunk>> { + let ScoreSnippetArgs { + query, + snippet_items, + scope_context_boost_by_scope, + det_query_tokens, + blend_policy, + cache_cfg, + now, + candidate_count, + skip_rerank, + } = args; + + if snippet_items.is_empty() { + return Ok(Vec::new()); + } + + let scores = if skip_rerank { + Self::build_quick_find_rerank_scores(&snippet_items) } else { - let embedded = self - .providers - .embedding - .embed(&self.cfg.providers.embedding, &extra_queries) - .await?; - if embedded.len() != extra_queries.len() { - return Err(ServiceError::Provider { - message: "Embedding provider returned mismatched vector count.".to_string(), - }); - } - embedded.into_iter() + self.rerank_snippet_items(query, snippet_items.as_slice(), cache_cfg, now).await? }; - let mut out = Vec::with_capacity(queries.len()); - for query in queries { - let vector = if baseline_vector.is_some() && query == original_query { - baseline_vector - .ok_or_else(|| ServiceError::Provider { - message: "Embedding baseline vector is missing.".to_string(), - })? - .clone() - } else { - embedded_iter.next().ok_or_else(|| ServiceError::Provider { - message: "Embedding provider returned no vectors.".to_string(), - })? - }; - if vector.len() != self.cfg.storage.qdrant.vector_dim as usize { - return Err(ServiceError::Provider { - message: "Embedding vector dimension mismatch.".to_string(), - }); + let rerank_ranks = ranking::build_rerank_ranks(&snippet_items, &scores); + let total_rerank = u32::try_from(scores.len()).unwrap_or(1).max(1); + let total_retrieval = u32::try_from(candidate_count).unwrap_or(1).max(1); + let score_ctx = ScoreCandidateCtx { + cfg: &self.cfg, + blend_policy, + scope_context_boost_by_scope, + det_query_tokens, + now, + total_rerank, + total_retrieval, + }; + let mut scored = Vec::with_capacity(snippet_items.len()); + + for ((item, rerank_score), rerank_rank) in + snippet_items.into_iter().zip(scores).zip(rerank_ranks) + { + scored.push(score_chunk_candidate(&score_ctx, item, rerank_score, rerank_rank)); + } + + Ok(scored) + } + + fn build_quick_find_rerank_scores(snippet_items: &[ChunkSnippet]) -> Vec<f32> { + let mut idxs: Vec<usize> = (0..snippet_items.len()).collect(); + + idxs.sort_by(|&a, &b| { + let ord = snippet_items[a].retrieval_rank.cmp(&snippet_items[b].retrieval_rank); + + if ord != Ordering::Equal { + return ord; } - out.push(QueryEmbedding { text: query.clone(), vector }); + + let ord = snippet_items[a].chunk.chunk_index.cmp(&snippet_items[b].chunk.chunk_index); + + if ord != Ordering::Equal { + return ord; + } + + snippet_items[a].chunk.chunk_id.cmp(&snippet_items[b].chunk.chunk_id) + }); + + let total = idxs.len(); + + if total == 0 { + return Vec::new(); } - Ok(out) + + let mut scores = vec![0_f32; total]; + + for (rank, idx) in idxs.into_iter().enumerate() { + scores[idx] = 1.0 / (rank as f32 + 1.0); + } + + scores } - async fn run_fusion_query( + fn build_trace_candidates( &self, - queries: &[QueryEmbedding], - filter: &Filter, - candidate_k: u32, - ) -> ServiceResult<Vec<ScoredPoint>> { - let mut search = QueryPointsBuilder::new(self.qdrant.collection.clone()); - for query in queries { - let dense_prefetch = PrefetchQueryBuilder::default() - .query(Query::new_nearest(query.vector.clone())) - .using(DENSE_VECTOR_NAME) - .filter(filter.clone()) - .limit(candidate_k as u64); - let bm25_prefetch = PrefetchQueryBuilder::default() - .query(Query::new_nearest(Document::new(query.text.clone(), BM25_MODEL))) - .using(BM25_VECTOR_NAME) - .filter(filter.clone()) - .limit(candidate_k as u64); - search = search.add_prefetch(dense_prefetch).add_prefetch(bm25_prefetch); + scored: &[ScoredChunk], + now: OffsetDateTime, + ) -> Vec<TraceCandidateRecord> { + if !self.cfg.search.explain.capture_candidates || scored.is_empty() { + return Vec::new(); } - let search = search.with_payload(true).query(Fusion::Rrf).limit(candidate_k as u64); - let response = self - .qdrant - .client - .query(search) - .await - .map_err(|err| ServiceError::Qdrant { message: err.to_string() })?; - Ok(response.result) + let candidate_expires_at = + now + Duration::days(self.cfg.search.explain.candidate_retention_days); + + scored + .iter() + .map(|scored_chunk| { + build_trace_candidate_record(scored_chunk, now, candidate_expires_at) + }) + .collect() } - async fn expand_queries(&self, query: &str) -> Vec<String> { - let cfg = &self.cfg.search.expansion; - let cache_cfg = &self.cfg.search.cache; - let now = OffsetDateTime::now_utc(); - let cache_key = if cache_cfg.enabled { - match build_expansion_cache_key( - query, - cache_cfg.expansion_version.as_str(), - cfg.max_queries, - cfg.include_original, - self.cfg.providers.llm_extractor.provider_id.as_str(), - self.cfg.providers.llm_extractor.model.as_str(), - self.cfg.providers.llm_extractor.temperature, - ) { - Ok(key) => Some(key), - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Expansion.as_str(), - "Cache key build failed." - ); - None - }, - } + async fn apply_diversity_policy( + &self, + results: Vec<ScoredChunk>, + top_k: u32, + diversity_policy: &ResolvedDiversityPolicy, + ) -> Result<(Vec<ScoredChunk>, HashMap<Uuid, DiversityDecision>)> { + let note_vectors = if diversity_policy.enabled { + fetch_note_vectors_for_diversity(&self.db.pool, results.as_slice()).await? } else { - None + HashMap::new() }; + let (selected_results, diversity_decisions) = + ranking::select_diverse_results(results, top_k, diversity_policy, ¬e_vectors); - if let Some(key) = cache_key.as_ref() { - match fetch_cache_payload(&self.db.pool, CacheKind::Expansion, key, now).await { - Ok(Some(payload)) => { - tracing::info!( - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(key), - hit = true, - payload_size = payload.size_bytes, - ttl_days = cache_cfg.expansion_ttl_days, - "Cache hit." - ); - let cached: ExpansionCachePayload = match serde_json::from_value(payload.value) - { - Ok(value) => value, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(key), - "Cache payload decode failed." - ); - ExpansionCachePayload { queries: Vec::new() } - }, - }; - if !cached.queries.is_empty() { - return cached.queries; - } - }, - Ok(None) => { - tracing::info!( - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(key), - hit = false, - payload_size = 0_u64, - ttl_days = cache_cfg.expansion_ttl_days, - "Cache miss." - ); - }, - Err(err) => { - tracing::warn!( + Ok((selected_results, diversity_decisions)) + } + + async fn record_hits_if_enabled( + &self, + enabled: bool, + query: &str, + selected_results: &[ScoredChunk], + now: OffsetDateTime, + ) -> Result<()> { + if !enabled || selected_results.is_empty() { + return Ok(()); + } + + let mut tx = self.db.pool.begin().await?; + + record_hits(&mut *tx, query, selected_results, now).await?; + + tx.commit().await?; + + Ok(()) + } + + fn build_items_and_trace_payload( + &self, + args: BuildTraceArgs<'_>, + ) -> (Vec<SearchItem>, SearchTrajectorySummary, TracePayload) { + let mut trajectory_stages = build_trace_trajectory_stages(&args); + let trace_context = TraceContext { + trace_id: args.trace_id, + tenant_id: args.tenant_id, + project_id: args.project_id, + agent_id: args.agent_id, + read_profile: args.read_profile, + query: args.query, + expansion_mode: args.expansion_mode, + expanded_queries: args.expanded_queries.clone(), + allowed_scopes: args.allowed_scopes, + candidate_count: args.candidate_count, + top_k: args.top_k, + }; + let mut config_snapshot = ranking::build_config_snapshot( + &self.cfg, + &args.policies.blend_policy, + &args.policies.diversity_policy, + &args.policies.retrieval_sources_policy, + args.ranking_override.as_ref(), + args.policies.policy_id.as_str(), + &args.policies.policy_snapshot, + ); + + if let Some(object) = config_snapshot.as_object_mut() { + object.insert("audit".to_string(), build_trace_audit(args.agent_id, args.token_id)); + } + + let mut items = Vec::with_capacity(args.selected_results.len()); + let mut trace_builder = SearchTraceBuilder::new( + trace_context, + config_snapshot, + self.cfg.search.explain.retention_days, + args.now, + ); + let mut final_stage_items = Vec::new(); + + for candidate in args.trace_candidates { + trace_builder.push_candidate(candidate); + } + for (idx, scored_chunk) in args.selected_results.into_iter().enumerate() { + let rank = idx as u32 + 1; + let (item, trace_item) = build_search_item_and_trace_item(BuildSearchItemArgs { + cfg: &self.cfg, + policy_id: args.policies.policy_id.as_str(), + blend_policy: &args.policies.blend_policy, + diversity_policy: &args.policies.diversity_policy, + diversity_decisions: args.diversity_decisions, + query_tokens: args.query_tokens, + structured_matches: args.structured_matches, + relation_contexts: &args.relation_contexts, + scored_chunk, + rank, + }); + let item = apply_payload_level_to_search_item(item, args.payload_level); + + final_stage_items.push(TraceTrajectoryStageItemRecord { + id: Uuid::new_v4(), + item_id: Some(item.result_handle), + note_id: Some(item.note_id), + chunk_id: Some(item.chunk_id), + metrics: serde_json::json!({ + "rank": rank, + "final_score": item.final_score, + }), + }); + items.push(item); + trace_builder.push_item(trace_item); + } + + if let Some(stage) = + trajectory_stages.iter_mut().find(|stage| stage.stage_name == "selection.final") + { + stage.items = final_stage_items; + } + + let trajectory_summary = build_trajectory_summary_from_stages( + &trajectory_stages + .iter() + .map(|stage| SearchTrajectoryStage { + stage_order: stage.stage_order, + stage_name: stage.stage_name.clone(), + stage_payload: stage.stage_payload.clone(), + items: stage + .items + .iter() + .map(|item| SearchTrajectoryStageItem { + item_id: item.item_id, + note_id: item.note_id, + chunk_id: item.chunk_id, + metrics: item.metrics.clone(), + }) + .collect(), + }) + .collect::<Vec<_>>(), + ); + + for stage in trajectory_stages { + trace_builder.push_stage(stage); + } + + (items, trajectory_summary, trace_builder.build()) + } + + async fn write_trace_payload(&self, trace_id: Uuid, trace_payload: TracePayload) -> Result<()> { + match self.cfg.search.explain.write_mode.trim().to_ascii_lowercase().as_str() { + "inline" => { + let mut tx = self.db.pool.begin().await?; + + persist_trace_inline(&mut tx, trace_payload).await?; + + tx.commit().await?; + }, + _ => + if let Err(err) = enqueue_trace(&self.db.pool, trace_payload).await { + tracing::error!( error = %err, - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(key), - "Cache read failed." + trace_id = %trace_id, + "Failed to enqueue search trace." ); }, + } + + Ok(()) + } + + async fn build_snippet_items( + &self, + filtered_candidates: &[ChunkCandidate], + note_meta: &HashMap<Uuid, NoteMeta>, + ) -> Result<Vec<ChunkSnippet>> { + if filtered_candidates.is_empty() { + return Ok(Vec::new()); + } + + let pairs = ranking::collect_neighbor_pairs(filtered_candidates); + let chunk_rows = fetch_chunks_by_pair(&self.db.pool, &pairs).await?; + let mut chunk_by_id = HashMap::new(); + let mut chunk_by_note_index = HashMap::new(); + + for row in chunk_rows { + chunk_by_note_index.insert((row.note_id, row.chunk_index), row.clone()); + chunk_by_id.insert(row.chunk_id, row); + } + + let mut items = Vec::new(); + + for candidate in filtered_candidates { + let Some(chunk_row) = chunk_by_id.get(&candidate.chunk_id) else { + tracing::warn!( + chunk_id = %candidate.chunk_id, + "Chunk metadata missing for candidate." + ); + + continue; + }; + let snippet = ranking::stitch_snippet( + candidate.note_id, + chunk_row.chunk_index, + &chunk_by_note_index, + ); + + if snippet.is_empty() { + continue; } + + let Some(note) = note_meta.get(&candidate.note_id) else { continue }; + let chunk = ChunkMeta { + chunk_id: chunk_row.chunk_id, + chunk_index: chunk_row.chunk_index, + start_offset: chunk_row.start_offset, + end_offset: chunk_row.end_offset, + }; + + items.push(ChunkSnippet { + note: note.clone(), + chunk, + snippet, + retrieval_rank: candidate.retrieval_rank, + retrieval_score: candidate.retrieval_score, + }); } - let messages = build_expansion_messages(query, cfg.max_queries, cfg.include_original); - let raw = match self - .providers - .extractor - .extract(&self.cfg.providers.llm_extractor, &messages) - .await - { - Ok(value) => value, - Err(err) => { - tracing::warn!(error = %err, "Query expansion failed; falling back to original query."); - return vec![query.to_string()]; - }, - }; + Ok(items) + } - let parsed: ExpansionOutput = match serde_json::from_value(raw) { - Ok(value) => value, - Err(err) => { - tracing::warn!(error = %err, "Query expansion returned invalid JSON; falling back to original query."); - return vec![query.to_string()]; - }, - }; + async fn rerank_snippet_items( + &self, + query: &str, + snippet_items: &[ChunkSnippet], + cache_cfg: &SearchCache, + now: OffsetDateTime, + ) -> Result<Vec<f32>> { + if snippet_items.is_empty() { + return Ok(Vec::new()); + } - let normalized = - normalize_queries(parsed.queries, query, cfg.include_original, cfg.max_queries); - let result = if normalized.is_empty() { vec![query.to_string()] } else { normalized }; + let (cache_candidates, signature) = Self::build_rerank_cache_signature(snippet_items); + let mut cache_key: Option<String> = None; + let mut cached_scores: Option<Vec<f32>> = None; - if let Some(key) = cache_key { - let payload = ExpansionCachePayload { queries: result.clone() }; - let payload_json = match serde_json::to_value(&payload) { - Ok(value) => value, + if cache_cfg.enabled { + match ranking::build_rerank_cache_key( + query, + self.cfg.providers.rerank.provider_id.as_str(), + self.cfg.providers.rerank.model.as_str(), + &signature, + ) { + Ok(key) => { + cache_key = Some(key.clone()); + cached_scores = self + .read_rerank_cache_scores(&key, cache_candidates.as_slice(), cache_cfg, now) + .await; + }, Err(err) => { tracing::warn!( error = %err, - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(&key), - "Cache payload encode failed." + cache_kind = CacheKind::Rerank.as_str(), + "Cache key build failed." ); - return result; }, - }; - let stored_at = OffsetDateTime::now_utc(); - let expires_at = stored_at + Duration::days(cache_cfg.expansion_ttl_days); - match store_cache_payload( - &self.db.pool, - CacheKind::Expansion, - &key, - payload_json, - stored_at, - expires_at, - cache_cfg.max_payload_bytes, + } + } + + if let Some(scores) = cached_scores { + return Ok(scores); + } + + let docs: Vec<String> = snippet_items.iter().map(|item| item.snippet.clone()).collect(); + let scores = self.providers.rerank.rerank(&self.cfg.providers.rerank, query, &docs).await?; + + if scores.len() != snippet_items.len() { + return Err(crate::Error::Provider { + message: "Rerank provider returned mismatched score count.".to_string(), + }); + } + if cache_cfg.enabled + && let Some(key) = cache_key.as_ref() + && !cache_candidates.is_empty() + { + self.store_rerank_cache_scores( + key, + cache_candidates.as_slice(), + scores.as_slice(), + cache_cfg, ) - .await - { - Ok(Some(payload_size)) => { + .await; + } + + Ok(scores) + } + + fn build_rerank_cache_signature( + snippet_items: &[ChunkSnippet], + ) -> (Vec<RerankCacheCandidate>, Vec<(Uuid, OffsetDateTime)>) { + let candidates: Vec<RerankCacheCandidate> = snippet_items + .iter() + .map(|item| RerankCacheCandidate { + chunk_id: item.chunk.chunk_id, + updated_at: item.note.updated_at, + }) + .collect(); + let signature: Vec<(Uuid, OffsetDateTime)> = + candidates.iter().map(|candidate| (candidate.chunk_id, candidate.updated_at)).collect(); + + (candidates, signature) + } + + async fn read_rerank_cache_scores( + &self, + key: &str, + cache_candidates: &[RerankCacheCandidate], + cache_cfg: &SearchCache, + now: OffsetDateTime, + ) -> Option<Vec<f32>> { + match fetch_cache_payload(&self.db.pool, CacheKind::Rerank, key, now).await { + Ok(Some(payload)) => { + let decoded: RerankCachePayload = match serde_json::from_value(payload.value) { + Ok(value) => value, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache payload decode failed." + ); + + RerankCachePayload { items: Vec::new() } + }, + }; + + if let Some(scores) = ranking::build_cached_scores(&decoded, cache_candidates) { tracing::info!( - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(&key), - hit = false, - payload_size, - ttl_days = cache_cfg.expansion_ttl_days, - "Cache stored." + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = true, + payload_size = payload.size_bytes, + ttl_days = cache_cfg.rerank_ttl_days, + "Cache hit." ); - }, - Ok(None) => { + + Some(scores) + } else { tracing::warn!( - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(&key), + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), hit = false, - payload_size = 0_u64, - ttl_days = cache_cfg.expansion_ttl_days, - "Cache payload skipped due to size." - ); - }, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Expansion.as_str(), - cache_key_prefix = cache_key_prefix(&key), - "Cache write failed." + payload_size = payload.size_bytes, + ttl_days = cache_cfg.rerank_ttl_days, + "Cache payload did not match candidates." ); - }, - } + + None + } + }, + Ok(None) => { + tracing::info!( + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size = 0_u64, + ttl_days = cache_cfg.rerank_ttl_days, + "Cache miss." + ); + + None + }, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache read failed." + ); + + None + }, } + } - result + async fn store_rerank_cache_scores( + &self, + key: &str, + cache_candidates: &[RerankCacheCandidate], + scores: &[f32], + cache_cfg: &SearchCache, + ) { + let payload = RerankCachePayload { + items: cache_candidates + .iter() + .zip(scores.iter()) + .map(|(candidate, score)| RerankCacheItem { + chunk_id: candidate.chunk_id, + updated_at: candidate.updated_at, + score: *score, + }) + .collect(), + }; + + match serde_json::to_value(&payload) { + Ok(payload_json) => { + let stored_at = OffsetDateTime::now_utc(); + let expires_at = stored_at + Duration::days(cache_cfg.rerank_ttl_days); + + match store_cache_payload( + &self.db.pool, + CacheKind::Rerank, + key, + payload_json, + stored_at, + expires_at, + cache_cfg.max_payload_bytes, + ) + .await + { + Ok(Some(payload_size)) => { + tracing::info!( + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size, + ttl_days = cache_cfg.rerank_ttl_days, + "Cache stored." + ); + }, + Ok(None) => { + tracing::warn!( + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + hit = false, + payload_size = 0_u64, + ttl_days = cache_cfg.rerank_ttl_days, + "Cache payload skipped due to size." + ); + }, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache write failed." + ); + }, + } + }, + Err(err) => { + tracing::warn!( + error = %err, + cache_kind = CacheKind::Rerank.as_str(), + cache_key_prefix = ranking::cache_key_prefix(key), + "Cache payload encode failed." + ); + }, + } } - async fn finish_search(&self, args: FinishSearchArgs<'_>) -> ServiceResult<SearchResponse> { - let FinishSearchArgs { - trace_id, - query, + async fn fetch_note_meta_for_candidates( + &self, + tenant_id: &str, + project_id: &str, + agent_id: &str, + allowed_scopes: &[String], + candidate_note_ids: &[Uuid], + now: OffsetDateTime, + ) -> Result<HashMap<Uuid, NoteMeta>> { + if candidate_note_ids.is_empty() { + return Ok(HashMap::new()); + } + + let org_shared_allowed = allowed_scopes.iter().any(|scope| scope == "org_shared"); + let shared_grants = access::load_shared_read_grants_with_org_shared( + &self.db.pool, tenant_id, project_id, agent_id, - read_profile, - allowed_scopes, - expanded_queries, - expansion_mode, - candidates, - top_k, - record_hits_enabled, - } = args; - let now = OffsetDateTime::now_utc(); - let cache_cfg = &self.cfg.search.cache; - let candidate_count = candidates.len(); - let retrieval_map: HashMap<Uuid, RetrievalInfo> = candidates - .iter() - .map(|candidate| { - ( - candidate.chunk_id, - RetrievalInfo { - score: candidate.retrieval_score, - rank: candidate.retrieval_rank, - }, - ) - }) - .collect(); - - let candidate_note_ids: Vec<Uuid> = - candidates.iter().map(|candidate| candidate.note_id).collect(); - let mut notes: Vec<MemoryNote> = if candidate_note_ids.is_empty() { - Vec::new() - } else { - sqlx::query_as( - "SELECT * FROM memory_notes WHERE note_id = ANY($1) AND tenant_id = $2 AND project_id = $3", - ) - .bind(&candidate_note_ids) - .bind(tenant_id) - .bind(project_id) - .fetch_all(&self.db.pool) - .await? - }; - + org_shared_allowed, + ) + .await?; + let notes: Vec<MemoryNote> = sqlx::query_as( + "\ +SELECT * +FROM memory_notes +WHERE note_id = ANY($1::uuid[]) + AND tenant_id = $2 + AND ( + project_id = $3 + OR (project_id = $4 AND scope = 'org_shared') + )", + ) + .bind(candidate_note_ids) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_all(&self.db.pool) + .await?; let mut note_meta = HashMap::new(); - for note in notes.drain(..) { - if note.tenant_id != tenant_id || note.project_id != project_id { - continue; - } - if note.scope == "agent_private" && note.agent_id != agent_id { - continue; - } - if note.status != "active" { - continue; - } - if !allowed_scopes.contains(¬e.scope) { - continue; - } - if note.expires_at.map(|ts| ts <= now).unwrap_or(false) { + + for note in notes { + if !access::note_read_allowed(¬e, agent_id, allowed_scopes, &shared_grants, now) { continue; } + note_meta.insert( note.note_id, NoteMeta { @@ -906,525 +4643,1336 @@ impl ElfService { note_type: note.r#type, key: note.key, scope: note.scope, + agent_id: note.agent_id, importance: note.importance, confidence: note.confidence, updated_at: note.updated_at, expires_at: note.expires_at, source_ref: note.source_ref, + embedding_version: note.embedding_version, + hit_count: note.hit_count, + last_hit_at: note.last_hit_at, }, ); } - let filtered_candidates: Vec<ChunkCandidate> = candidates - .into_iter() - .filter(|candidate| note_meta.contains_key(&candidate.note_id)) - .collect(); - let snippet_items = if filtered_candidates.is_empty() { - Vec::new() - } else { - let pairs = collect_neighbor_pairs(&filtered_candidates); - let chunk_rows = fetch_chunks_by_pair(&self.db.pool, &pairs).await?; - let mut chunk_by_id = HashMap::new(); - let mut chunk_by_note_index = HashMap::new(); - for row in chunk_rows { - chunk_by_note_index.insert((row.note_id, row.chunk_index), row.clone()); - chunk_by_id.insert(row.chunk_id, row); - } + Ok(note_meta) + } - let mut items = Vec::new(); - for candidate in &filtered_candidates { - let Some(chunk_row) = chunk_by_id.get(&candidate.chunk_id) else { - tracing::warn!( - chunk_id = %candidate.chunk_id, - "Chunk metadata missing for candidate." - ); - continue; - }; - let snippet = - stitch_snippet(candidate.note_id, chunk_row.chunk_index, &chunk_by_note_index); - if snippet.is_empty() { - continue; + async fn fetch_relation_contexts_for_notes( + &self, + note_ids: &[Uuid], + tenant_id: &str, + project_id: &str, + agent_id: &str, + allowed_scopes: &[String], + now: OffsetDateTime, + ) -> Result<HashMap<Uuid, Vec<SearchExplainRelationContext>>> { + if note_ids.is_empty() { + return Ok(HashMap::new()); + } + + let private_allowed = allowed_scopes.iter().any(|scope| scope == "agent_private"); + let non_private_scopes: Vec<String> = + allowed_scopes.iter().filter(|scope| *scope != "agent_private").cloned().collect(); + let (max_evidence_notes_per_fact, max_facts_per_item) = self.relation_context_bounds(); + let rows = self + .fetch_relation_context_rows( + note_ids, + tenant_id, + project_id, + agent_id, + &non_private_scopes, + private_allowed, + now, + max_evidence_notes_per_fact, + max_facts_per_item, + ) + .await?; + + Ok(Self::group_relation_context_rows(rows)) + } + + fn relation_context_bounds(&self) -> (i32, i32) { + let max_evidence_notes_per_fact = + i32::try_from(self.cfg.search.graph_context.max_evidence_notes_per_fact) + .unwrap_or(i32::MAX); + let max_facts_per_item = + i32::try_from(self.cfg.search.graph_context.max_facts_per_item).unwrap_or(i32::MAX); + + (max_evidence_notes_per_fact, max_facts_per_item) + } + + #[allow(clippy::too_many_arguments)] + async fn fetch_relation_context_rows( + &self, + note_ids: &[Uuid], + tenant_id: &str, + project_id: &str, + agent_id: &str, + non_private_scopes: &[String], + private_allowed: bool, + now: OffsetDateTime, + max_evidence_notes_per_fact: i32, + max_facts_per_item: i32, + ) -> Result<Vec<SearchRelationContextRow>> { + Ok(sqlx::query_as::<_, SearchRelationContextRow>(RELATION_CONTEXT_SQL) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(now) + .bind(private_allowed) + .bind(non_private_scopes) + .bind(note_ids) + .bind(max_evidence_notes_per_fact) + .bind(max_facts_per_item) + .fetch_all(&self.db.pool) + .await?) + } + + fn group_relation_context_rows( + rows: Vec<SearchRelationContextRow>, + ) -> HashMap<Uuid, Vec<SearchExplainRelationContext>> { + let mut relation_context_by_note: HashMap<Uuid, Vec<SearchExplainRelationContext>> = + HashMap::new(); + + for row in rows { + let object = if row.object_entity_id.is_some() { + SearchExplainRelationContextObject { + entity: Some(SearchExplainRelationEntityRef { + canonical: row.object_canonical, + kind: row.object_kind, + }), + value: None, } - let Some(note) = note_meta.get(&candidate.note_id) else { - continue; - }; - let chunk = ChunkMeta { - chunk_id: chunk_row.chunk_id, - chunk_index: chunk_row.chunk_index, - start_offset: chunk_row.start_offset, - end_offset: chunk_row.end_offset, - }; - items.push(ChunkSnippet { note: note.clone(), chunk, snippet }); - } - items - }; + } else { + SearchExplainRelationContextObject { entity: None, value: row.object_value } + }; - let mut scored: Vec<ScoredChunk> = Vec::new(); - if !snippet_items.is_empty() { - let mut cached_scores: Option<Vec<f32>> = None; - let mut cache_key: Option<String> = None; - let mut cache_candidates: Vec<RerankCacheCandidate> = Vec::new(); - - if cache_cfg.enabled { - let candidates: Vec<RerankCacheCandidate> = snippet_items - .iter() - .map(|item| RerankCacheCandidate { - chunk_id: item.chunk.chunk_id, - updated_at: item.note.updated_at, - }) - .collect(); - let signature: Vec<(Uuid, OffsetDateTime)> = candidates - .iter() - .map(|candidate| (candidate.chunk_id, candidate.updated_at)) - .collect(); - match build_rerank_cache_key( - query, - cache_cfg.rerank_version.as_str(), - self.cfg.providers.rerank.provider_id.as_str(), - self.cfg.providers.rerank.model.as_str(), - &signature, - ) { - Ok(key) => { - cache_key = Some(key.clone()); - cache_candidates = candidates; - match fetch_cache_payload(&self.db.pool, CacheKind::Rerank, &key, now).await - { - Ok(Some(payload)) => { - let decoded: RerankCachePayload = - match serde_json::from_value(payload.value) { - Ok(value) => value, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(&key), - "Cache payload decode failed." - ); - RerankCachePayload { items: Vec::new() } - }, - }; - if let Some(scores) = - build_cached_scores(&decoded, &cache_candidates) - { - tracing::info!( - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(&key), - hit = true, - payload_size = payload.size_bytes, - ttl_days = cache_cfg.rerank_ttl_days, - "Cache hit." - ); - cached_scores = Some(scores); - } else { - tracing::warn!( - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(&key), - hit = false, - payload_size = payload.size_bytes, - ttl_days = cache_cfg.rerank_ttl_days, - "Cache payload did not match candidates." - ); - } - }, - Ok(None) => { - tracing::info!( - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(&key), - hit = false, - payload_size = 0_u64, - ttl_days = cache_cfg.rerank_ttl_days, - "Cache miss." - ); - }, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(&key), - "Cache read failed." - ); - }, - } + relation_context_by_note.entry(row.note_id).or_default().push( + SearchExplainRelationContext { + fact_id: row.fact_id, + scope: row.scope, + subject: SearchExplainRelationEntityRef { + canonical: row.subject_canonical, + kind: row.subject_kind, }, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Rerank.as_str(), - "Cache key build failed." - ); + predicate: row.predicate, + object, + valid_from: row.valid_from, + valid_to: row.valid_to, + temporal_status: if row.is_current { + RelationTemporalStatus::Current + } else { + RelationTemporalStatus::Historical }, - } + evidence_note_ids: row.evidence_note_ids, + }, + ); + } + + relation_context_by_note + } +} + +pub(crate) fn resolve_read_profile_scopes(cfg: &Config, profile: &str) -> Result<Vec<String>> { + ranking::resolve_scopes(cfg, profile) +} + +/// Computes the stable ranking-policy identifier for a search configuration. +pub fn ranking_policy_id( + cfg: &Config, + ranking_override: Option<&RankingRequestOverride>, +) -> Result<String> { + let blend_policy = ranking::resolve_blend_policy( + &cfg.ranking.blend, + ranking_override.and_then(|value| value.blend.as_ref()), + )?; + let diversity_policy = ranking::resolve_diversity_policy( + &cfg.ranking.diversity, + ranking_override.and_then(|value| value.diversity.as_ref()), + )?; + let retrieval_sources_policy = ranking::resolve_retrieval_sources_policy( + &cfg.ranking.retrieval_sources, + ranking_override.and_then(|value| value.retrieval_sources.as_ref()), + )?; + let snapshot = ranking::build_policy_snapshot( + cfg, + &blend_policy, + &diversity_policy, + &retrieval_sources_policy, + ranking_override, + ); + let hash = ranking::hash_policy_snapshot(&snapshot)?; + let prefix = &hash[..12.min(hash.len())]; + + Ok(format!("ranking_v2:{prefix}")) +} + +/// Replays ranking against stored trace candidates and returns the final top-k items. +pub fn replay_ranking_from_candidates( + cfg: &Config, + trace: &TraceReplayContext, + ranking_override: Option<&RankingRequestOverride>, + candidates: &[TraceReplayCandidate], + top_k: u32, +) -> Result<Vec<TraceReplayItem>> { + let query_tokens = ranking::tokenize_query(trace.query.as_str(), MAX_MATCHED_TERMS); + let scope_context_boost_by_scope = + ranking::build_scope_context_boost_by_scope(&query_tokens, cfg.context.as_ref()); + let det_query_tokens = build_deterministic_query_tokens(cfg, trace.query.as_str()); + let blend_policy = ranking::resolve_blend_policy( + &cfg.ranking.blend, + ranking_override.and_then(|override_| override_.blend.as_ref()), + )?; + let diversity_policy = ranking::resolve_diversity_policy( + &cfg.ranking.diversity, + ranking_override.and_then(|override_| override_.diversity.as_ref()), + )?; + let policy_id = ranking_policy_id(cfg, ranking_override)?; + let now = trace.created_at; + let total_rerank = u32::try_from(candidates.len()).unwrap_or(1).max(1); + let total_retrieval = trace.candidate_count.max(1); + let rerank_ranks = ranking::build_rerank_ranks_for_replay(candidates); + let replay_diversity_decisions = ranking::extract_replay_diversity_decisions(candidates); + let score_ctx = ScoreCandidateCtx { + cfg, + blend_policy: &blend_policy, + scope_context_boost_by_scope: &scope_context_boost_by_scope, + det_query_tokens: det_query_tokens.as_slice(), + now, + total_rerank, + total_retrieval, + }; + let mut best_by_note: BTreeMap<Uuid, ScoredReplay> = BTreeMap::new(); + + for (candidate, rerank_rank) in candidates.iter().zip(rerank_ranks) { + let scored = score_replay_candidate(&score_ctx, candidate, rerank_rank); + let replace = match best_by_note.get(&candidate.note_id) { + None => true, + Some(existing) => should_replace_replay_best(existing, &scored), + }; + + if replace { + best_by_note.insert(candidate.note_id, scored); + } + } + + let mut results: Vec<ScoredReplay> = best_by_note.into_values().collect(); + + results.sort_by(cmp_scored_replay); + + let results = apply_replay_diversity_selection( + results, + top_k, + diversity_policy.enabled, + &replay_diversity_decisions, + ); + + Ok(build_replay_items( + cfg, + &blend_policy, + &diversity_policy, + policy_id.as_str(), + &replay_diversity_decisions, + results, + )) +} + +fn apply_payload_level_to_search_item( + mut item: SearchItem, + payload_level: PayloadLevel, +) -> SearchItem { + if payload_level == PayloadLevel::L2 { + return item; + } + + item.source_ref = serde_json::json!({}); + + item +} + +fn validate_search_request_inputs( + tenant_id: &str, + project_id: &str, + agent_id: &str, + query: &str, +) -> Result<()> { + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(crate::Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + if !english_gate::is_english_natural_language(query) { + return Err(crate::Error::NonEnglishInput { field: "$.query".to_string() }); + } + + Ok(()) +} + +fn raw_search_path_label(path: RawSearchPath) -> &'static str { + match path { + RawSearchPath::Quick => "quick", + RawSearchPath::Planned => "planned", + } +} + +fn sorted_unique_strings(mut values: Vec<String>) -> Vec<String> { + values.sort(); + values.dedup(); + + values +} + +fn build_trajectory_summary_from_stages( + stages: &[SearchTrajectoryStage], +) -> SearchTrajectorySummary { + let summary_stages = stages + .iter() + .map(|stage| { + let stats = + stage.stage_payload.get("stats").cloned().unwrap_or_else(|| serde_json::json!({})); + + SearchTrajectorySummaryStage { + stage_order: stage.stage_order, + stage_name: stage.stage_name.clone(), + item_count: stage.items.len() as u32, + stats, } + }) + .collect(); - let scores = if let Some(scores) = cached_scores { - scores - } else { - let docs: Vec<String> = - snippet_items.iter().map(|item| item.snippet.clone()).collect(); - let scores = - self.providers.rerank.rerank(&self.cfg.providers.rerank, query, &docs).await?; - if scores.len() != snippet_items.len() { - return Err(ServiceError::Provider { - message: "Rerank provider returned mismatched score count.".to_string(), - }); - } + SearchTrajectorySummary { + schema: SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1.to_string(), + stages: summary_stages, + } +} + +fn build_search_filter( + tenant_id: &str, + project_id: &str, + agent_id: &str, + allowed_scopes: &[String], +) -> Filter { + let private_scope = "agent_private".to_string(); + let non_private_scopes: Vec<String> = + allowed_scopes.iter().filter(|scope| *scope != "agent_private").cloned().collect(); + let mut scope_should_conditions = Vec::new(); + + if allowed_scopes.iter().any(|scope| scope == "agent_private") { + let private_filter = Filter::all([ + Condition::matches("scope", private_scope), + Condition::matches("agent_id", agent_id.to_string()), + ]); + + scope_should_conditions.push(Condition::from(private_filter)); + } + if !non_private_scopes.is_empty() { + scope_should_conditions.push(Condition::matches("scope", non_private_scopes)); + } + + let scope_min_should = if scope_should_conditions.is_empty() { + None + } else { + Some(MinShould { min_count: 1, conditions: scope_should_conditions }) + }; + let mut project_or_org_branches = vec![Condition::from(Filter { + must: vec![Condition::matches("project_id", project_id.to_string())], + should: Vec::new(), + must_not: Vec::new(), + min_should: scope_min_should, + })]; + + if allowed_scopes.iter().any(|scope| scope == "org_shared") { + let org_filter = Filter::all([ + Condition::matches("project_id", ORG_PROJECT_ID.to_string()), + Condition::matches("scope", "org_shared".to_string()), + ]); + + project_or_org_branches.push(Condition::from(org_filter)); + } + + Filter { + must: vec![ + Condition::matches("tenant_id", tenant_id.to_string()), + Condition::matches("status", "active".to_string()), + ], + should: Vec::new(), + must_not: Vec::new(), + min_should: Some(MinShould { min_count: 1, conditions: project_or_org_branches }), + } +} + +fn select_best_scored_chunks(scored: Vec<ScoredChunk>) -> Vec<ScoredChunk> { + let mut best_by_note: HashMap<Uuid, ScoredChunk> = HashMap::new(); + + for scored_item in scored { + let note_id = scored_item.item.note.note_id; + let replace = match best_by_note.get(¬e_id) { + Some(existing) => scored_item.final_score > existing.final_score, + None => true, + }; + + if replace { + best_by_note.insert(note_id, scored_item); + } + } + + let mut results: Vec<ScoredChunk> = best_by_note.into_values().collect(); + + results.sort_by(cmp_scored_chunk); + + results +} + +fn cmp_scored_chunk(a: &ScoredChunk, b: &ScoredChunk) -> Ordering { + let ord = ranking::cmp_f32_desc(a.final_score, b.final_score); + + if ord != Ordering::Equal { + return ord; + } + + let ord = a.item.retrieval_rank.cmp(&b.item.retrieval_rank); + + if ord != Ordering::Equal { + return ord; + } + + let ord = a.item.note.note_id.cmp(&b.item.note.note_id); + + if ord != Ordering::Equal { + return ord; + } + + a.item.chunk.chunk_id.cmp(&b.item.chunk.chunk_id) +} + +fn score_chunk_candidate( + ctx: &ScoreCandidateCtx<'_, '_>, + item: ChunkSnippet, + rerank_score: f32, + rerank_rank: u32, +) -> ScoredChunk { + let importance = item.note.importance; + let retrieval_rank = item.retrieval_rank; + let age_days = (ctx.now - item.note.updated_at).as_seconds_f32() / 86_400.0; + let decay = if ctx.cfg.ranking.recency_tau_days > 0.0 { + (-age_days / ctx.cfg.ranking.recency_tau_days).exp() + } else { + 1.0 + }; + let base = (1.0 + 0.6 * importance) * decay; + let tie_breaker_score = ctx.cfg.ranking.tie_breaker_weight * base; + let scope_context_boost = + ctx.scope_context_boost_by_scope.get(item.note.scope.as_str()).copied().unwrap_or(0.0); + let rerank_norm = match ctx.blend_policy.rerank_normalization { + NormalizationKind::Rank => ranking::rank_normalize(rerank_rank, ctx.total_rerank), + }; + let retrieval_norm = match ctx.blend_policy.retrieval_normalization { + NormalizationKind::Rank => ranking::rank_normalize(retrieval_rank, ctx.total_retrieval), + }; + let blend_retrieval_weight = if ctx.blend_policy.enabled { + ranking::retrieval_weight_for_rank(retrieval_rank, &ctx.blend_policy.segments) + } else { + 0.0 + }; + let retrieval_term = blend_retrieval_weight * retrieval_norm; + let rerank_term = (1.0 - blend_retrieval_weight) * rerank_norm; + let det_terms = ranking::compute_deterministic_ranking_terms( + ctx.cfg, + ctx.det_query_tokens, + item.snippet.as_str(), + item.note.hit_count, + item.note.last_hit_at, + age_days, + ctx.now, + ); + let final_score = retrieval_term + + rerank_term + + tie_breaker_score + + scope_context_boost + + det_terms.lexical_bonus + + det_terms.hit_boost + + det_terms.decay_penalty; + + ScoredChunk { + item, + final_score, + rerank_score, + rerank_rank, + rerank_norm, + retrieval_norm, + blend_retrieval_weight, + retrieval_term, + rerank_term, + tie_breaker_score, + scope_context_boost, + age_days, + importance, + deterministic_lexical_overlap_ratio: det_terms.lexical_overlap_ratio, + deterministic_lexical_bonus: det_terms.lexical_bonus, + deterministic_hit_count: det_terms.hit_count, + deterministic_last_hit_age_days: det_terms.last_hit_age_days, + deterministic_hit_boost: det_terms.hit_boost, + deterministic_decay_penalty: det_terms.decay_penalty, + } +} + +fn build_trace_candidate_record( + scored_chunk: &ScoredChunk, + now: OffsetDateTime, + expires_at: OffsetDateTime, +) -> TraceCandidateRecord { + let note = &scored_chunk.item.note; + + TraceCandidateRecord { + candidate_id: Uuid::new_v4(), + note_id: note.note_id, + chunk_id: scored_chunk.item.chunk.chunk_id, + chunk_index: scored_chunk.item.chunk.chunk_index, + snippet: scored_chunk.item.snippet.clone(), + candidate_snapshot: serde_json::to_value(TraceReplayCandidate { + note_id: note.note_id, + chunk_id: scored_chunk.item.chunk.chunk_id, + chunk_index: scored_chunk.item.chunk.chunk_index, + snippet: scored_chunk.item.snippet.clone(), + retrieval_rank: scored_chunk.item.retrieval_rank, + retrieval_score: scored_chunk.item.retrieval_score, + rerank_score: scored_chunk.rerank_score, + note_scope: note.scope.clone(), + note_importance: note.importance, + note_updated_at: note.updated_at, + note_hit_count: note.hit_count, + note_last_hit_at: note.last_hit_at, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }) + .unwrap_or_else(|_| serde_json::json!({})), + retrieval_rank: scored_chunk.item.retrieval_rank, + rerank_score: scored_chunk.rerank_score, + note_scope: note.scope.clone(), + note_importance: note.importance, + note_updated_at: note.updated_at, + note_hit_count: note.hit_count, + note_last_hit_at: note.last_hit_at, + created_at: now, + expires_at, + } +} + +fn build_search_item_and_trace_item( + args: BuildSearchItemArgs<'_>, +) -> (SearchItem, TraceItemRecord) { + let (matched_terms, matched_fields) = ranking::match_terms_in_text( + args.query_tokens, + args.scored_chunk.item.snippet.as_str(), + args.scored_chunk.item.note.key.as_deref(), + MAX_MATCHED_TERMS, + ); + let matched_fields = ranking::merge_matched_fields( + matched_fields, + args.structured_matches.get(&args.scored_chunk.item.note.note_id), + ); + let trace_terms = ranking_explain_v2::build_trace_terms_v2(TraceTermsArgs { + cfg: args.cfg, + blend_enabled: args.blend_policy.enabled, + retrieval_normalization: args.blend_policy.retrieval_normalization.as_str(), + rerank_normalization: args.blend_policy.rerank_normalization.as_str(), + blend_retrieval_weight: args.scored_chunk.blend_retrieval_weight, + retrieval_rank: args.scored_chunk.item.retrieval_rank, + retrieval_norm: args.scored_chunk.retrieval_norm, + retrieval_term: args.scored_chunk.retrieval_term, + rerank_score: args.scored_chunk.rerank_score, + rerank_rank: args.scored_chunk.rerank_rank, + rerank_norm: args.scored_chunk.rerank_norm, + rerank_term: args.scored_chunk.rerank_term, + tie_breaker_score: args.scored_chunk.tie_breaker_score, + importance: args.scored_chunk.importance, + age_days: args.scored_chunk.age_days, + scope: args.scored_chunk.item.note.scope.as_str(), + scope_context_boost: args.scored_chunk.scope_context_boost, + deterministic_lexical_overlap_ratio: args.scored_chunk.deterministic_lexical_overlap_ratio, + deterministic_lexical_bonus: args.scored_chunk.deterministic_lexical_bonus, + deterministic_hit_count: args.scored_chunk.deterministic_hit_count, + deterministic_last_hit_age_days: args.scored_chunk.deterministic_last_hit_age_days, + deterministic_hit_boost: args.scored_chunk.deterministic_hit_boost, + deterministic_decay_penalty: args.scored_chunk.deterministic_decay_penalty, + }); + let response_terms = ranking_explain_v2::strip_term_inputs(&trace_terms); + let relation_context = + args.relation_contexts.get(&args.scored_chunk.item.note.note_id).cloned(); + let diversity = if args.diversity_policy.enabled { + args.diversity_decisions + .get(&args.scored_chunk.item.note.note_id) + .map(ranking::build_diversity_explain) + } else { + None + }; + let response_explain = SearchExplain { + r#match: SearchMatchExplain { + matched_terms: matched_terms.clone(), + matched_fields: matched_fields.clone(), + }, + ranking: SearchRankingExplain { + schema: SEARCH_RANKING_EXPLAIN_SCHEMA_V2.to_string(), + policy_id: args.policy_id.to_string(), + final_score: args.scored_chunk.final_score, + terms: response_terms, + }, + relation_context: relation_context.clone(), + diversity: diversity.clone(), + }; + let trace_explain = SearchExplain { + r#match: SearchMatchExplain { matched_terms, matched_fields }, + ranking: SearchRankingExplain { + schema: SEARCH_RANKING_EXPLAIN_SCHEMA_V2.to_string(), + policy_id: args.policy_id.to_string(), + final_score: args.scored_chunk.final_score, + terms: trace_terms, + }, + relation_context, + diversity, + }; + let result_handle = Uuid::new_v4(); + let note = &args.scored_chunk.item.note; + let chunk = &args.scored_chunk.item.chunk; + let item = SearchItem { + result_handle, + note_id: note.note_id, + chunk_id: chunk.chunk_id, + chunk_index: chunk.chunk_index, + start_offset: chunk.start_offset, + end_offset: chunk.end_offset, + snippet: args.scored_chunk.item.snippet.clone(), + r#type: note.note_type.clone(), + key: note.key.clone(), + scope: note.scope.clone(), + importance: note.importance, + confidence: note.confidence, + updated_at: note.updated_at, + expires_at: note.expires_at, + final_score: args.scored_chunk.final_score, + source_ref: note.source_ref.clone(), + explain: response_explain, + }; + let trace_item = TraceItemRecord { + item_id: result_handle, + note_id: note.note_id, + chunk_id: Some(chunk.chunk_id), + rank: args.rank, + final_score: args.scored_chunk.final_score, + explain: trace_explain, + }; + + (item, trace_item) +} - if cache_cfg.enabled - && let Some(key) = cache_key.as_ref() - && !cache_candidates.is_empty() - { - let payload = RerankCachePayload { - items: cache_candidates - .iter() - .zip(scores.iter()) - .map(|(candidate, score)| RerankCacheItem { - chunk_id: candidate.chunk_id, - updated_at: candidate.updated_at, - score: *score, - }) - .collect(), - }; - match serde_json::to_value(&payload) { - Ok(payload_json) => { - let stored_at = OffsetDateTime::now_utc(); - let expires_at = stored_at + Duration::days(cache_cfg.rerank_ttl_days); - match store_cache_payload( - &self.db.pool, - CacheKind::Rerank, - key, - payload_json, - stored_at, - expires_at, - cache_cfg.max_payload_bytes, - ) - .await - { - Ok(Some(payload_size)) => { - tracing::info!( - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(key), - hit = false, - payload_size, - ttl_days = cache_cfg.rerank_ttl_days, - "Cache stored." - ); - }, - Ok(None) => { - tracing::warn!( - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(key), - hit = false, - payload_size = 0_u64, - ttl_days = cache_cfg.rerank_ttl_days, - "Cache payload skipped due to size." - ); - }, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(key), - "Cache write failed." - ); - }, - } - }, - Err(err) => { - tracing::warn!( - error = %err, - cache_kind = CacheKind::Rerank.as_str(), - cache_key_prefix = cache_key_prefix(key), - "Cache payload encode failed." - ); - }, - } - } +fn build_structured_field_matches(rows: Vec<FieldHit>) -> (Vec<Uuid>, HashMap<Uuid, Vec<String>>) { + let mut structured_matches: HashMap<Uuid, HashSet<String>> = HashMap::new(); + let mut ordered_note_ids = Vec::new(); + let mut seen_notes = HashSet::new(); + + for row in rows { + let label = match row.field_kind.as_str() { + "summary" => "summary", + "fact" => "facts", + "concept" => "concepts", + _ => continue, + }; - scores - }; + structured_matches.entry(row.note_id).or_default().insert(label.to_string()); - scored = Vec::with_capacity(snippet_items.len()); - for (item, rerank_score) in snippet_items.into_iter().zip(scores.into_iter()) { - let age_days = (now - item.note.updated_at).as_seconds_f32() / 86_400.0; - let decay = if self.cfg.ranking.recency_tau_days > 0.0 { - (-age_days / self.cfg.ranking.recency_tau_days).exp() - } else { - 1.0 - }; - let base = (1.0 + 0.6 * item.note.importance) * decay; - let tie_breaker_score = self.cfg.ranking.tie_breaker_weight * base; - let final_score = rerank_score + tie_breaker_score; - scored.push(ScoredChunk { item, rerank_score, tie_breaker_score, final_score }); - } + if seen_notes.insert(row.note_id) { + ordered_note_ids.push(row.note_id); } + } - let mut best_by_note: HashMap<Uuid, ScoredChunk> = HashMap::new(); - for scored_item in scored { - let note_id = scored_item.item.note.note_id; - let replace = match best_by_note.get(¬e_id) { - Some(existing) => scored_item.final_score > existing.final_score, - None => true, - }; - if replace { - best_by_note.insert(note_id, scored_item); - } - } - let mut results: Vec<ScoredChunk> = best_by_note.into_values().collect(); - results.sort_by(|a, b| { - b.final_score.partial_cmp(&a.final_score).unwrap_or(std::cmp::Ordering::Equal) - }); - results.truncate(top_k as usize); + let mut structured_matches_out: HashMap<Uuid, Vec<String>> = HashMap::new(); - if record_hits_enabled && !results.is_empty() { - record_hits(&self.db.pool, query, &results, now).await?; - } + for (note_id, fields) in structured_matches { + let mut fields: Vec<String> = fields.into_iter().collect(); - let query_tokens = tokenize_query(query, MAX_MATCHED_TERMS); - let mut items = Vec::with_capacity(results.len()); - let trace_context = TraceContext { - trace_id, - tenant_id, - project_id, - agent_id, - read_profile, - query, - expansion_mode, - expanded_queries, - allowed_scopes, - candidate_count, - top_k, - }; - let mut trace_builder = SearchTraceBuilder::new(trace_context, &self.cfg, now); - for (idx, scored_chunk) in results.into_iter().enumerate() { - let rank = idx as u32 + 1; - let retrieval = retrieval_map.get(&scored_chunk.item.chunk.chunk_id).copied(); - let (matched_terms, matched_fields) = match_terms_in_text( - &query_tokens, - &scored_chunk.item.snippet, - scored_chunk.item.note.key.as_deref(), - MAX_MATCHED_TERMS, - ); - let boosts = vec![SearchBoost { - name: "recency_importance".to_string(), - score: scored_chunk.tie_breaker_score, - }]; - let explain = SearchExplain { - retrieval_score: retrieval.map(|entry| entry.score), - retrieval_rank: retrieval.map(|entry| entry.rank), - rerank_score: scored_chunk.rerank_score, - tie_breaker_score: scored_chunk.tie_breaker_score, - final_score: scored_chunk.final_score, - boosts: boosts.clone(), - matched_terms: matched_terms.clone(), - matched_fields: matched_fields.clone(), - }; - let result_handle = Uuid::new_v4(); - let note = &scored_chunk.item.note; - let chunk = &scored_chunk.item.chunk; - items.push(SearchItem { - result_handle, - note_id: note.note_id, - chunk_id: chunk.chunk_id, - chunk_index: chunk.chunk_index, - start_offset: chunk.start_offset, - end_offset: chunk.end_offset, - snippet: scored_chunk.item.snippet.clone(), - note_type: note.note_type.clone(), - key: note.key.clone(), - scope: note.scope.clone(), - importance: note.importance, - confidence: note.confidence, - updated_at: note.updated_at, - expires_at: note.expires_at, - final_score: scored_chunk.final_score, - source_ref: note.source_ref.clone(), - explain, - }); - trace_builder.push_item(TraceItemRecord { - item_id: result_handle, - note_id: note.note_id, - chunk_id: Some(chunk.chunk_id), - rank, - retrieval_score: retrieval.map(|entry| entry.score), - retrieval_rank: retrieval.map(|entry| entry.rank), - rerank_score: scored_chunk.rerank_score, - tie_breaker_score: scored_chunk.tie_breaker_score, - final_score: scored_chunk.final_score, - boosts, - matched_terms, - matched_fields, - }); - } + fields.sort(); + structured_matches_out.insert(note_id, fields); + } + + (ordered_note_ids, structured_matches_out) +} + +fn build_structured_field_candidates( + candidate_k: u32, + ordered_note_ids: Vec<Uuid>, + best_by_note: HashMap<Uuid, (Uuid, i32)>, + embed_version: &str, +) -> Vec<ChunkCandidate> { + let mut structured_candidates = Vec::new(); + let mut next_rank = 1_u32; - let trace_payload = trace_builder.build(); - if let Err(err) = enqueue_trace(&self.db.pool, trace_payload).await { - tracing::error!(error = %err, trace_id = %trace_id, "Failed to enqueue search trace."); + for note_id in ordered_note_ids { + if structured_candidates.len() >= candidate_k as usize { + break; } - Ok(SearchResponse { trace_id, items }) + let Some((chunk_id, chunk_index)) = best_by_note.get(¬e_id) else { continue }; + + structured_candidates.push(ChunkCandidate { + chunk_id: *chunk_id, + note_id, + chunk_index: *chunk_index, + retrieval_rank: next_rank, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: Some(embed_version.to_string()), + }); + + next_rank = next_rank.saturating_add(1); } + + structured_candidates } -#[derive(Debug, serde::Deserialize)] -struct ExpansionOutput { - queries: Vec<String>, +fn build_deterministic_query_tokens(cfg: &Config, query: &str) -> Vec<String> { + if cfg.ranking.deterministic.enabled + && cfg.ranking.deterministic.lexical.enabled + && cfg.ranking.deterministic.lexical.max_query_terms > 0 + { + ranking::tokenize_query(query, cfg.ranking.deterministic.lexical.max_query_terms as usize) + } else { + Vec::new() + } } -fn resolve_expansion_mode(cfg: &elf_config::Config) -> ExpansionMode { - match cfg.search.expansion.mode.as_str() { - "off" => ExpansionMode::Off, - "always" => ExpansionMode::Always, - "dynamic" => ExpansionMode::Dynamic, - _ => ExpansionMode::Off, +fn build_trace_audit(actor_id: &str, token_id: Option<&str>) -> Value { + match token_id.map(str::trim).filter(|value| !value.is_empty()) { + Some(token_id) => serde_json::json!({ "actor_id": actor_id, "token_id": token_id }), + None => serde_json::json!({ "actor_id": actor_id }), } } -fn should_expand_dynamic( - candidate_count: usize, - top_score: f32, - cfg: &elf_config::SearchDynamic, -) -> bool { - candidate_count < cfg.min_candidates as usize || top_score < cfg.min_top_score +fn build_trace_trajectory_stages(args: &BuildTraceArgs<'_>) -> Vec<TraceTrajectoryStageRecord> { + let path_label = raw_search_path_label(args.path); + + vec![ + build_trace_rewrite_stage(args, path_label), + build_trace_recall_stage(args, path_label), + build_trace_fusion_stage(args, path_label), + build_trace_rerank_stage(args, path_label), + build_trace_final_stage(args, path_label), + ] } -fn normalize_queries( - queries: Vec<String>, - original: &str, - include_original: bool, - max_queries: u32, -) -> Vec<String> { - let mut out = Vec::new(); - let mut seen = HashSet::new(); +fn build_trace_rewrite_stage( + args: &BuildTraceArgs<'_>, + path_label: &str, +) -> TraceTrajectoryStageRecord { + let expanded_queries = sorted_unique_strings(args.expanded_queries.clone()); + + TraceTrajectoryStageRecord { + stage_id: Uuid::new_v4(), + stage_order: 1, + stage_name: "rewrite.expansion".to_string(), + stage_payload: serde_json::json!({ + "schema": SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1, + "path": path_label, + "inputs": { + "query": args.query, + "expansion_mode": ranking::expansion_mode_label(args.expansion_mode), + }, + "outputs": { + "expanded_queries": expanded_queries, + }, + "stats": { + "expanded_query_count": args.expanded_queries.len(), + }, + }), + created_at: args.now, + items: Vec::new(), + } +} + +fn build_trace_recall_stage( + args: &BuildTraceArgs<'_>, + path_label: &str, +) -> TraceTrajectoryStageRecord { + let mut stage_payload = serde_json::json!({ + "schema": SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1, + "path": path_label, + "stats": { + "candidate_count_before_filter": args.candidate_count, + "candidate_count_after_filter": args.filtered_candidate_count, + "snippet_count": args.snippet_count, + }, + }); - if include_original { - push_query(&mut out, &mut seen, original); + if let Some(filter_impact) = &args.filter_impact + && let Some(payload) = stage_payload.as_object_mut() + { + payload.insert("filter_impact".to_string(), filter_impact.to_stage_payload()); } - for query in queries { - if out.len() >= max_queries as usize { - break; - } - push_query(&mut out, &mut seen, &query); + if let Some(recursive_retrieval) = args.recursive_retrieval + && recursive_retrieval.enabled + && let Some(payload) = stage_payload.as_object_mut() + { + payload.insert( + "recursive".to_string(), + serde_json::json!({ + "enabled": true, + "scopes_seeded": recursive_retrieval.scopes_seeded, + "scopes_queried": recursive_retrieval.scopes_queried, + "candidates_before": recursive_retrieval.candidates_before, + "candidates_added": recursive_retrieval.candidates_added, + "candidates_after": recursive_retrieval.candidates_after, + "rounds_executed": recursive_retrieval.rounds_executed, + "total_queries": recursive_retrieval.total_queries, + "stop_reason": recursive_retrieval + .stop_reason + .clone() + .unwrap_or_else(|| "converged".to_string()), + }), + ); + } + + let items: Vec<TraceTrajectoryStageItemRecord> = args + .recall_candidates + .iter() + .take(MAX_TRAJECTORY_STAGE_ITEMS) + .map(|candidate| TraceTrajectoryStageItemRecord { + id: Uuid::new_v4(), + item_id: None, + note_id: Some(candidate.note_id), + chunk_id: Some(candidate.chunk_id), + metrics: serde_json::json!({ + "retrieval_rank": candidate.retrieval_rank, + "chunk_index": candidate.chunk_index, + }), + }) + .collect(); + + TraceTrajectoryStageRecord { + stage_id: Uuid::new_v4(), + stage_order: 2, + stage_name: "recall.candidates".to_string(), + stage_payload, + created_at: args.now, + items, } - out.truncate(max_queries as usize); - out } -fn push_query(out: &mut Vec<String>, seen: &mut HashSet<String>, value: &str) { - let trimmed = value.trim(); - if trimmed.is_empty() || cjk::contains_cjk(trimmed) { - return; +fn build_trace_fusion_stage( + args: &BuildTraceArgs<'_>, + path_label: &str, +) -> TraceTrajectoryStageRecord { + let items: Vec<TraceTrajectoryStageItemRecord> = args + .fused_results + .iter() + .take(MAX_TRAJECTORY_STAGE_ITEMS) + .map(|scored| TraceTrajectoryStageItemRecord { + id: Uuid::new_v4(), + item_id: None, + note_id: Some(scored.item.note.note_id), + chunk_id: Some(scored.item.chunk.chunk_id), + metrics: serde_json::json!({ + "retrieval_rank": scored.item.retrieval_rank, + "final_score": scored.final_score, + }), + }) + .collect(); + + TraceTrajectoryStageRecord { + stage_id: Uuid::new_v4(), + stage_order: 3, + stage_name: "fusion.merge".to_string(), + stage_payload: serde_json::json!({ + "schema": SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1, + "path": path_label, + "stats": { + "scored_count": args.scored_count, + "fused_count": args.fused_count, + }, + "decisions": { + "fusion_weight": args.policies.retrieval_sources_policy.fusion_weight, + "structured_field_weight": args.policies.retrieval_sources_policy.structured_field_weight, + "fusion_priority": args.policies.retrieval_sources_policy.fusion_priority, + "structured_field_priority": args.policies.retrieval_sources_policy.structured_field_priority, + }, + }), + created_at: args.now, + items, } - let key = trimmed.to_lowercase(); - if seen.insert(key) { - out.push(trimmed.to_string()); +} + +fn build_trace_rerank_stage( + args: &BuildTraceArgs<'_>, + path_label: &str, +) -> TraceTrajectoryStageRecord { + let items: Vec<TraceTrajectoryStageItemRecord> = args + .fused_results + .iter() + .take(MAX_TRAJECTORY_STAGE_ITEMS) + .map(|scored| TraceTrajectoryStageItemRecord { + id: Uuid::new_v4(), + item_id: None, + note_id: Some(scored.item.note.note_id), + chunk_id: Some(scored.item.chunk.chunk_id), + metrics: serde_json::json!({ + "rerank_score": scored.rerank_score, + "rerank_rank": scored.rerank_rank, + "rerank_norm": scored.rerank_norm, + "retrieval_norm": scored.retrieval_norm, + "final_score": scored.final_score, + }), + }) + .collect(); + + TraceTrajectoryStageRecord { + stage_id: Uuid::new_v4(), + stage_order: 4, + stage_name: "rerank.score".to_string(), + stage_payload: serde_json::json!({ + "schema": SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1, + "path": path_label, + "stats": { + "reranked_count": args.scored_count, + }, + "decisions": { + "blend_enabled": args.policies.blend_policy.enabled, + "diversity_enabled": args.policies.diversity_policy.enabled, + }, + }), + created_at: args.now, + items, } } -fn build_expansion_messages( - query: &str, - max_queries: u32, - include_original: bool, -) -> Vec<serde_json::Value> { - let schema = serde_json::json!({ - "queries": ["string"] - }); - let schema_text = serde_json::to_string_pretty(&schema) - .unwrap_or_else(|_| "{\"queries\": [\"string\"]}".to_string()); - let system_prompt = "You are a query expansion engine for a memory retrieval system. \ -Output must be valid JSON only and must match the provided schema exactly. \ -Generate short English-only query variations that preserve the original intent. \ -Do not include any CJK characters. Do not add explanations or extra fields."; - let user_prompt = format!( - "Return JSON matching this exact schema:\n{schema}\nConstraints:\n- MAX_QUERIES = {max}\n- INCLUDE_ORIGINAL = {include}\nOriginal query:\n{query}", - schema = schema_text, - max = max_queries, - include = include_original, - query = query - ); - vec![ - serde_json::json!({ "role": "system", "content": system_prompt }), - serde_json::json!({ "role": "user", "content": user_prompt }), - ] +fn build_trace_final_stage( + args: &BuildTraceArgs<'_>, + path_label: &str, +) -> TraceTrajectoryStageRecord { + TraceTrajectoryStageRecord { + stage_id: Uuid::new_v4(), + stage_order: 5, + stage_name: "selection.final".to_string(), + stage_payload: serde_json::json!({ + "schema": SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1, + "path": path_label, + "stats": { + "selected_count": args.selected_count, + "top_k": args.top_k, + }, + }), + created_at: args.now, + items: Vec::new(), + } } -fn collect_chunk_candidates( - points: &[ScoredPoint], - max_candidates: u32, - candidate_k: u32, -) -> Vec<ChunkCandidate> { - let limit = if max_candidates == 0 || max_candidates >= candidate_k { - points.len() +fn score_replay_candidate( + ctx: &ScoreCandidateCtx<'_, '_>, + candidate: &TraceReplayCandidate, + rerank_rank: u32, +) -> ScoredReplay { + let importance = candidate.note_importance; + let retrieval_rank = candidate.retrieval_rank; + let age_days = (ctx.now - candidate.note_updated_at).as_seconds_f32() / 86_400.0; + let decay = if ctx.cfg.ranking.recency_tau_days > 0.0 { + (-age_days / ctx.cfg.ranking.recency_tau_days).exp() } else { - max_candidates as usize + 1.0 }; - let mut out = Vec::new(); - let mut seen = HashSet::new(); - for (idx, point) in points.iter().take(limit).enumerate() { - let chunk_id = point - .id - .as_ref() - .and_then(point_id_to_uuid) - .or_else(|| payload_uuid(&point.payload, "chunk_id")); - let Some(chunk_id) = chunk_id else { - tracing::warn!("Chunk candidate missing chunk_id."); - continue; - }; - if !seen.insert(chunk_id) { - continue; + let base = (1.0 + 0.6 * importance) * decay; + let tie_breaker_score = ctx.cfg.ranking.tie_breaker_weight * base; + let scope_context_boost = + ctx.scope_context_boost_by_scope.get(candidate.note_scope.as_str()).copied().unwrap_or(0.0); + let rerank_norm = match ctx.blend_policy.rerank_normalization { + NormalizationKind::Rank => ranking::rank_normalize(rerank_rank, ctx.total_rerank), + }; + let retrieval_norm = match ctx.blend_policy.retrieval_normalization { + NormalizationKind::Rank => ranking::rank_normalize(retrieval_rank, ctx.total_retrieval), + }; + let blend_retrieval_weight = if ctx.blend_policy.enabled { + ranking::retrieval_weight_for_rank(retrieval_rank, &ctx.blend_policy.segments) + } else { + 0.0 + }; + let retrieval_term = blend_retrieval_weight * retrieval_norm; + let rerank_term = (1.0 - blend_retrieval_weight) * rerank_norm; + let det_terms = ranking::compute_deterministic_ranking_terms( + ctx.cfg, + ctx.det_query_tokens, + candidate.snippet.as_str(), + candidate.note_hit_count, + candidate.note_last_hit_at, + age_days, + ctx.now, + ); + let final_score = retrieval_term + + rerank_term + + tie_breaker_score + + scope_context_boost + + det_terms.lexical_bonus + + det_terms.hit_boost + + det_terms.decay_penalty; + + ScoredReplay { + note_id: candidate.note_id, + chunk_id: candidate.chunk_id, + retrieval_rank, + final_score, + rerank_score: candidate.rerank_score, + rerank_rank, + rerank_norm, + retrieval_norm, + blend_retrieval_weight, + retrieval_term, + rerank_term, + tie_breaker_score, + scope_context_boost, + age_days, + importance, + note_scope: candidate.note_scope.clone(), + deterministic_lexical_overlap_ratio: det_terms.lexical_overlap_ratio, + deterministic_lexical_bonus: det_terms.lexical_bonus, + deterministic_hit_count: det_terms.hit_count, + deterministic_last_hit_age_days: det_terms.last_hit_age_days, + deterministic_hit_boost: det_terms.hit_boost, + deterministic_decay_penalty: det_terms.decay_penalty, + } +} + +fn should_replace_replay_best(existing: &ScoredReplay, scored: &ScoredReplay) -> bool { + let ord = ranking::cmp_f32_desc(scored.final_score, existing.final_score); + + if ord != Ordering::Equal { + ord == Ordering::Less + } else { + scored.retrieval_rank < existing.retrieval_rank + } +} + +fn cmp_scored_replay(a: &ScoredReplay, b: &ScoredReplay) -> Ordering { + let ord = ranking::cmp_f32_desc(a.final_score, b.final_score); + + if ord != Ordering::Equal { + return ord; + } + + let ord = a.retrieval_rank.cmp(&b.retrieval_rank); + + if ord != Ordering::Equal { + return ord; + } + + let ord = a.note_id.cmp(&b.note_id); + + if ord != Ordering::Equal { + return ord; + } + + a.chunk_id.cmp(&b.chunk_id) +} + +fn apply_replay_diversity_selection( + mut results: Vec<ScoredReplay>, + top_k: u32, + diversity_enabled: bool, + replay_diversity_decisions: &HashMap<Uuid, DiversityDecision>, +) -> Vec<ScoredReplay> { + if diversity_enabled && !replay_diversity_decisions.is_empty() { + let mut selected: Vec<ScoredReplay> = results + .iter() + .filter(|scored| { + replay_diversity_decisions + .get(&scored.note_id) + .map(|decision| decision.selected) + .unwrap_or(false) + }) + .cloned() + .collect(); + + selected.sort_by(|a, b| { + let rank_a = replay_diversity_decisions + .get(&a.note_id) + .and_then(|decision| decision.selected_rank) + .unwrap_or(u32::MAX); + let rank_b = replay_diversity_decisions + .get(&b.note_id) + .and_then(|decision| decision.selected_rank) + .unwrap_or(u32::MAX); + let ord = rank_a.cmp(&rank_b); + + if ord != Ordering::Equal { + return ord; + } + + a.note_id.cmp(&b.note_id) + }); + + if !selected.is_empty() { + results = selected; } - let Some(note_id) = payload_uuid(&point.payload, "note_id") else { - tracing::warn!(chunk_id = %chunk_id, "Chunk candidate missing note_id."); - continue; - }; - let Some(chunk_index) = payload_i32(&point.payload, "chunk_index") else { - tracing::warn!(chunk_id = %chunk_id, "Chunk candidate missing chunk_index."); - continue; + } + + results.truncate(top_k.max(1) as usize); + + results +} + +fn build_replay_items( + cfg: &Config, + blend_policy: &ResolvedBlendPolicy, + diversity_policy: &ResolvedDiversityPolicy, + policy_id: &str, + replay_diversity_decisions: &HashMap<Uuid, DiversityDecision>, + results: Vec<ScoredReplay>, +) -> Vec<TraceReplayItem> { + let mut out = Vec::with_capacity(results.len()); + + for scored in results { + let terms = ranking_explain_v2::build_trace_terms_v2(TraceTermsArgs { + cfg, + blend_enabled: blend_policy.enabled, + retrieval_normalization: blend_policy.retrieval_normalization.as_str(), + rerank_normalization: blend_policy.rerank_normalization.as_str(), + blend_retrieval_weight: scored.blend_retrieval_weight, + retrieval_rank: scored.retrieval_rank, + retrieval_norm: scored.retrieval_norm, + retrieval_term: scored.retrieval_term, + rerank_score: scored.rerank_score, + rerank_rank: scored.rerank_rank, + rerank_norm: scored.rerank_norm, + rerank_term: scored.rerank_term, + tie_breaker_score: scored.tie_breaker_score, + importance: scored.importance, + age_days: scored.age_days, + scope: scored.note_scope.as_str(), + scope_context_boost: scored.scope_context_boost, + deterministic_lexical_overlap_ratio: scored.deterministic_lexical_overlap_ratio, + deterministic_lexical_bonus: scored.deterministic_lexical_bonus, + deterministic_hit_count: scored.deterministic_hit_count, + deterministic_last_hit_age_days: scored.deterministic_last_hit_age_days, + deterministic_hit_boost: scored.deterministic_hit_boost, + deterministic_decay_penalty: scored.deterministic_decay_penalty, + }); + let explain = SearchExplain { + r#match: SearchMatchExplain { matched_terms: Vec::new(), matched_fields: Vec::new() }, + ranking: SearchRankingExplain { + schema: SEARCH_RANKING_EXPLAIN_SCHEMA_V2.to_string(), + policy_id: policy_id.to_string(), + final_score: scored.final_score, + terms, + }, + relation_context: None, + diversity: if diversity_policy.enabled { + replay_diversity_decisions + .get(&scored.note_id) + .map(ranking::build_diversity_explain) + } else { + None + }, }; - out.push(ChunkCandidate { - chunk_id, - note_id, - chunk_index, - retrieval_score: point.score, - retrieval_rank: idx as u32 + 1, + + out.push(TraceReplayItem { + note_id: scored.note_id, + chunk_id: scored.chunk_id, + retrieval_rank: scored.retrieval_rank, + final_score: scored.final_score, + explain, }); } + out } -fn collect_neighbor_pairs(candidates: &[ChunkCandidate]) -> Vec<(Uuid, i32)> { - let mut seen = HashSet::new(); - let mut out = Vec::new(); - for candidate in candidates { - let mut indices = Vec::with_capacity(3); - indices.push(candidate.chunk_index); - if let Some(prev) = candidate.chunk_index.checked_sub(1) { - indices.push(prev); - } - if let Some(next) = candidate.chunk_index.checked_add(1) { - indices.push(next); - } - for idx in indices { - let key = (candidate.note_id, idx); - if seen.insert(key) { - out.push(key); - } +async fn load_trace_trajectory_summary( + pool: &PgPool, + trace_id: Uuid, +) -> Result<Option<SearchTrajectorySummary>> { + let stages = load_trace_trajectory_stages(pool, trace_id).await?; + + if stages.is_empty() { + Ok(None) + } else { + Ok(Some(build_trajectory_summary_from_stages(stages.as_slice()))) + } +} + +async fn load_trace_trajectory_stages( + pool: &PgPool, + trace_id: Uuid, +) -> Result<Vec<SearchTrajectoryStage>> { + let rows = sqlx::query( + "\ + SELECT + s.stage_id, + s.stage_order, + s.stage_name, + s.stage_payload, + i.item_id, + i.note_id, + i.chunk_id, + i.metrics +FROM search_trace_stages s +LEFT JOIN search_trace_stage_items i ON i.stage_id = s.stage_id +WHERE s.trace_id = $1 +ORDER BY s.stage_order ASC, i.item_id ASC NULLS LAST, i.note_id ASC NULLS LAST", + ) + .bind(trace_id) + .fetch_all(pool) + .await?; + let mut stages = Vec::new(); + let mut stage_pos_by_id: HashMap<Uuid, usize> = HashMap::new(); + + for row in rows { + let stage_id: Uuid = row.try_get("stage_id")?; + let idx = if let Some(idx) = stage_pos_by_id.get(&stage_id).copied() { + idx + } else { + let stage_order: i32 = row.try_get("stage_order")?; + let stage_name: String = row.try_get("stage_name")?; + let stage_payload: Value = row.try_get("stage_payload")?; + let idx = stages.len(); + + stages.push(SearchTrajectoryStage { + stage_order: stage_order as u32, + stage_name, + stage_payload, + items: Vec::new(), + }); + stage_pos_by_id.insert(stage_id, idx); + + idx + }; + let item_metrics: Option<Value> = row.try_get("metrics")?; + + if let Some(metrics) = item_metrics { + stages[idx].items.push(SearchTrajectoryStageItem { + item_id: row.try_get("item_id")?, + note_id: row.try_get("note_id")?, + chunk_id: row.try_get("chunk_id")?, + metrics, + }); } } - out + + Ok(stages) +} + +async fn load_item_trajectory( + pool: &PgPool, + trace_id: Uuid, + item_id: Uuid, + note_id: Uuid, + trace_item_chunk_id: Option<Uuid>, +) -> Result<Option<SearchExplainTrajectory>> { + let rows = sqlx::query( + "\ +SELECT + s.stage_order, + s.stage_name, + s.stage_payload, + i.item_id, + i.note_id, + i.chunk_id, + i.metrics +FROM search_trace_stages s +LEFT JOIN search_trace_stage_items i + ON i.stage_id = s.stage_id + AND ( + i.item_id = $2 + OR ( + i.item_id IS NULL + AND i.note_id = $3 + AND ($4 IS NULL OR i.chunk_id = $4) + ) + ) +WHERE s.trace_id = $1 +ORDER BY s.stage_order ASC, i.item_id ASC NULLS LAST, i.note_id ASC NULLS LAST", + ) + .bind(trace_id) + .bind(item_id) + .bind(note_id) + .bind(trace_item_chunk_id) + .fetch_all(pool) + .await?; + + if rows.is_empty() { + return Ok(None); + } + + let mut stages = Vec::with_capacity(rows.len()); + let mut stage_pos_by_order: HashMap<u32, usize> = HashMap::new(); + + for row in rows { + let stage_order: i32 = row.try_get("stage_order")?; + let stage_name: String = row.try_get("stage_name")?; + let stage_payload: Value = row.try_get("stage_payload")?; + let stage_order = stage_order as u32; + let idx = if let Some(idx) = stage_pos_by_order.get(&stage_order).copied() { + idx + } else { + let idx = stages.len(); + + stages.push(SearchExplainTrajectoryStage { + stage_order, + stage_name, + stage_payload, + metrics: serde_json::json!({}), + match_info: None, + }); + stage_pos_by_order.insert(stage_order, idx); + + idx + }; + let item_metrics: Option<Value> = row.try_get("metrics")?; + let matched_item_id: Option<Uuid> = row.try_get("item_id")?; + let matched_note_id: Option<Uuid> = row.try_get("note_id")?; + let matched_chunk_id: Option<Uuid> = row.try_get("chunk_id")?; + + if let Some(metrics) = item_metrics { + let match_kind = if matched_item_id.is_some() { + "item_id" + } else if trace_item_chunk_id.is_some() { + "note_chunk" + } else { + "note" + }; + + stages[idx].match_info = Some(SearchExplainTrajectoryMatch { + kind: match_kind.to_string(), + item_id: matched_item_id, + note_id: matched_note_id, + chunk_id: matched_chunk_id, + }); + stages[idx].metrics = metrics; + } + } + + Ok(Some(SearchExplainTrajectory { + schema: SEARCH_RETRIEVAL_TRAJECTORY_SCHEMA_V1.to_string(), + stages, + })) } -async fn fetch_chunks_by_pair( - pool: &sqlx::PgPool, - pairs: &[(Uuid, i32)], -) -> ServiceResult<Vec<ChunkRow>> { +async fn fetch_chunks_by_pair<'e, E>(executor: E, pairs: &[(Uuid, i32)]) -> Result<Vec<ChunkRow>> +where + E: PgExecutor<'e>, +{ if pairs.is_empty() { return Ok(Vec::new()); } + let mut builder = QueryBuilder::new( "SELECT chunk_id, note_id, chunk_index, start_offset, end_offset, text \ - FROM memory_note_chunks WHERE ", + FROM memory_note_chunks WHERE ", ); let mut separated = builder.separated(" OR "); + for (note_id, chunk_index) in pairs { separated.push("("); separated @@ -1434,314 +5982,486 @@ async fn fetch_chunks_by_pair( .push_bind_unseparated(chunk_index) .push_unseparated(")"); } + let query = builder.build_query_as(); - let rows = query.fetch_all(pool).await?; + let rows = query.fetch_all(executor).await?; + Ok(rows) } -fn stitch_snippet( - note_id: Uuid, - chunk_index: i32, - chunks: &HashMap<(Uuid, i32), ChunkRow>, -) -> String { - let mut out = String::new(); - let indices = [chunk_index.checked_sub(1), Some(chunk_index), chunk_index.checked_add(1)]; - for index in indices.into_iter().flatten() { - if let Some(chunk) = chunks.get(&(note_id, index)) { - out.push_str(chunk.text.as_str()); - } +async fn fetch_note_vectors_for_diversity<'e, E>( + executor: E, + scored: &[ScoredChunk], +) -> Result<HashMap<Uuid, Vec<f32>>> +where + E: PgExecutor<'e>, +{ + if scored.is_empty() { + return Ok(HashMap::new()); } - out.trim().to_string() -} -fn expansion_mode_label(mode: ExpansionMode) -> &'static str { - match mode { - ExpansionMode::Off => "off", - ExpansionMode::Always => "always", - ExpansionMode::Dynamic => "dynamic", - } -} + let mut note_ids = Vec::new(); + let mut embedding_versions = Vec::new(); + let mut seen = HashSet::new(); -fn tokenize_query(query: &str, max_terms: usize) -> Vec<String> { - let mut normalized = String::with_capacity(query.len()); - for ch in query.chars() { - if ch.is_ascii_alphanumeric() { - normalized.push(ch.to_ascii_lowercase()); - } else { - normalized.push(' '); + for scored_chunk in scored { + let note_id = scored_chunk.item.note.note_id; + + if seen.insert(note_id) { + note_ids.push(note_id); + embedding_versions.push(scored_chunk.item.note.embedding_version.clone()); } } - let mut out = Vec::new(); - let mut seen = HashSet::new(); - for token in normalized.split_whitespace() { - if token.len() < 2 { - continue; - } - if seen.insert(token) { - out.push(token.to_string()); - } - if out.len() >= max_terms { - break; - } + let rows = sqlx::query_as::<_, NoteVectorRow>( + "\ +WITH expected AS ( + SELECT * + FROM unnest($1::uuid[], $2::text[]) AS t(note_id, embedding_version) +) +SELECT + e.note_id, + n.vec::text AS vec_text +FROM expected e +JOIN note_embeddings n + ON n.note_id = e.note_id + AND n.embedding_version = e.embedding_version", + ) + .bind(note_ids.as_slice()) + .bind(embedding_versions.as_slice()) + .fetch_all(executor) + .await?; + let mut out = HashMap::new(); + + for row in rows { + let vec = crate::parse_pg_vector(row.vec_text.as_str())?; + + out.insert(row.note_id, vec); } - out + + Ok(out) } -fn match_terms_in_text( - tokens: &[String], - text: &str, - key: Option<&str>, - max_terms: usize, -) -> (Vec<String>, Vec<String>) { - if tokens.is_empty() { - return (Vec::new(), Vec::new()); - } - let text = text.to_lowercase(); - let key = key.map(|value| value.to_lowercase()); - let mut matched_terms = Vec::new(); - let mut matched_fields = HashSet::new(); - for token in tokens { - let mut matched = false; - if text.contains(token) { - matched_fields.insert("text"); - matched = true; - } - if let Some(key) = key.as_ref() - && key.contains(token) - { - matched_fields.insert("key"); - matched = true; - } - if matched { - matched_terms.push(token.clone()); - } - if matched_terms.len() >= max_terms { - break; +async fn enqueue_trace<'e, E>(executor: E, payload: TracePayload) -> Result<()> +where + E: PgExecutor<'e>, +{ + let now = OffsetDateTime::now_utc(); + let payload_json = serde_json::to_value(&payload).map_err(|err| crate::Error::Storage { + message: format!("Failed to encode search trace payload: {err}"), + })?; + + sqlx::query( + "\ +INSERT INTO search_trace_outbox ( + outbox_id, + trace_id, + status, + attempts, + last_error, + available_at, + payload, + created_at, + updated_at +) +VALUES ($1, $2, 'PENDING', 0, NULL, $3, $4, $3, $3)", + ) + .bind(Uuid::new_v4()) + .bind(payload.trace.trace_id) + .bind(now) + .bind(payload_json) + .execute(executor) + .await?; + + Ok(()) +} + +async fn persist_trace_inline(executor: &mut PgConnection, payload: TracePayload) -> Result<()> { + let trace = payload.trace; + let items = payload.items; + let candidates = payload.candidates; + let stages = payload.stages; + let trace_id = trace.trace_id; + + persist_trace_inline_header(executor, &trace).await?; + persist_trace_inline_items(executor, trace_id, items).await?; + persist_trace_inline_stages(executor, trace_id, stages).await?; + persist_trace_inline_candidates(executor, trace_id, candidates).await?; + + Ok(()) +} + +async fn persist_trace_inline_stages( + executor: &mut PgConnection, + trace_id: Uuid, + stages: Vec<TraceTrajectoryStageRecord>, +) -> Result<()> { + if stages.is_empty() { + return Ok(()); + } + + let mut item_records = Vec::new(); + let mut stage_builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_stages ( + stage_id, + trace_id, + stage_order, + stage_name, + stage_payload, + created_at +) ", + ); + + stage_builder.push_values(stages, |mut b, stage| { + for item in stage.items { + item_records.push((stage.stage_id, item)); } + + b.push_bind(stage.stage_id) + .push_bind(trace_id) + .push_bind(stage.stage_order as i32) + .push_bind(stage.stage_name) + .push_bind(stage.stage_payload) + .push_bind(stage.created_at); + }); + stage_builder.push(" ON CONFLICT (stage_id) DO NOTHING"); + stage_builder.build().execute(&mut *executor).await?; + + if item_records.is_empty() { + return Ok(()); } - let mut fields: Vec<String> = - matched_fields.into_iter().map(|field| field.to_string()).collect(); - fields.sort(); - (matched_terms, fields) -} -fn decode_json<T>(value: serde_json::Value, label: &str) -> ServiceResult<T> -where - T: DeserializeOwned, -{ - serde_json::from_value(value) - .map_err(|err| ServiceError::Storage { message: format!("Invalid {label} value: {err}") }) -} + let mut item_builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_stage_items ( + id, + stage_id, + item_id, + note_id, + chunk_id, + metrics +) ", + ); -fn build_config_snapshot(cfg: &elf_config::Config) -> serde_json::Value { - serde_json::json!({ - "search": { - "expansion": { - "mode": cfg.search.expansion.mode.as_str(), - "max_queries": cfg.search.expansion.max_queries, - "include_original": cfg.search.expansion.include_original, - }, - "dynamic": { - "min_candidates": cfg.search.dynamic.min_candidates, - "min_top_score": cfg.search.dynamic.min_top_score, - }, - "prefilter": { - "max_candidates": cfg.search.prefilter.max_candidates, - }, - "explain": { - "retention_days": cfg.search.explain.retention_days, - }, - }, - "ranking": { - "recency_tau_days": cfg.ranking.recency_tau_days, - "tie_breaker_weight": cfg.ranking.tie_breaker_weight, - }, - "providers": { - "embedding": { - "provider_id": cfg.providers.embedding.provider_id.as_str(), - "model": cfg.providers.embedding.model.as_str(), - "dimensions": cfg.providers.embedding.dimensions, - }, - "rerank": { - "provider_id": cfg.providers.rerank.provider_id.as_str(), - "model": cfg.providers.rerank.model.as_str(), - }, - }, - "storage": { - "qdrant": { - "vector_dim": cfg.storage.qdrant.vector_dim, - "collection": cfg.storage.qdrant.collection.as_str(), - }, - }, - }) -} + item_builder.push_values(item_records, |mut b, (stage_id, item)| { + b.push_bind(item.id) + .push_bind(stage_id) + .push_bind(item.item_id) + .push_bind(item.note_id) + .push_bind(item.chunk_id) + .push_bind(item.metrics); + }); + item_builder.push(" ON CONFLICT (id) DO NOTHING"); + item_builder.build().execute(executor).await?; -fn resolve_scopes(cfg: &elf_config::Config, profile: &str) -> ServiceResult<Vec<String>> { - match profile { - "private_only" => Ok(cfg.scopes.read_profiles.private_only.clone()), - "private_plus_project" => Ok(cfg.scopes.read_profiles.private_plus_project.clone()), - "all_scopes" => Ok(cfg.scopes.read_profiles.all_scopes.clone()), - _ => Err(ServiceError::InvalidRequest { message: "Unknown read_profile.".to_string() }), - } + Ok(()) } -fn point_id_to_uuid(point_id: &qdrant_client::qdrant::PointId) -> Option<Uuid> { - match &point_id.point_id_options { - Some(PointIdOptions::Uuid(id)) => Uuid::parse_str(id).ok(), - _ => None, - } +async fn persist_trace_inline_header( + executor: &mut PgConnection, + trace: &TraceRecord, +) -> Result<()> { + let expanded_queries_json = serde_json::to_value(&trace.expanded_queries).map_err(|err| { + crate::Error::Storage { message: format!("Failed to encode expanded_queries: {err}") } + })?; + let allowed_scopes_json = serde_json::to_value(&trace.allowed_scopes).map_err(|err| { + crate::Error::Storage { message: format!("Failed to encode allowed_scopes: {err}") } + })?; + + sqlx::query( + "\ +INSERT INTO search_traces ( + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at, + expires_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15 +) + ON CONFLICT (trace_id) DO NOTHING", + ) + .bind(trace.trace_id) + .bind(trace.tenant_id.as_str()) + .bind(trace.project_id.as_str()) + .bind(trace.agent_id.as_str()) + .bind(trace.read_profile.as_str()) + .bind(trace.query.as_str()) + .bind(trace.expansion_mode.as_str()) + .bind(expanded_queries_json) + .bind(allowed_scopes_json) + .bind(trace.candidate_count as i32) + .bind(trace.top_k as i32) + .bind(trace.config_snapshot.clone()) + .bind(trace.trace_version) + .bind(trace.created_at) + .bind(trace.expires_at) + .execute(executor) + .await?; + + Ok(()) } -fn payload_uuid(payload: &HashMap<String, Value>, key: &str) -> Option<Uuid> { - let value = payload.get(key)?; - match &value.kind { - Some(Kind::StringValue(text)) => Uuid::parse_str(text).ok(), - _ => None, +async fn persist_trace_inline_items( + executor: &mut PgConnection, + trace_id: Uuid, + items: Vec<TraceItemRecord>, +) -> Result<()> { + if items.is_empty() { + return Ok(()); } + + let mut builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_items ( + item_id, + trace_id, + note_id, + chunk_id, + rank, + final_score, + explain +) ", + ); + + builder.push_values(items, |mut b, item| { + let explain_json = + serde_json::to_value(item.explain).expect("SearchExplain must be JSON-serializable."); + + b.push_bind(item.item_id) + .push_bind(trace_id) + .push_bind(item.note_id) + .push_bind(item.chunk_id) + .push_bind(item.rank as i32) + .push_bind(item.final_score) + .push_bind(explain_json); + }); + + builder.push(" ON CONFLICT (item_id) DO NOTHING"); + builder.build().execute(executor).await?; + + Ok(()) } -fn payload_i32(payload: &HashMap<String, Value>, key: &str) -> Option<i32> { - let value = payload.get(key)?; - match &value.kind { - Some(Kind::IntegerValue(value)) => i32::try_from(*value).ok(), - Some(Kind::DoubleValue(value)) => - if value.fract() == 0.0 { - i32::try_from(*value as i64).ok() - } else { - None - }, - _ => None, +async fn persist_trace_inline_candidates( + executor: &mut PgConnection, + trace_id: Uuid, + candidates: Vec<TraceCandidateRecord>, +) -> Result<()> { + if candidates.is_empty() { + return Ok(()); } -} -async fn enqueue_trace(pool: &sqlx::PgPool, payload: TracePayload) -> ServiceResult<()> { - let now = OffsetDateTime::now_utc(); - let payload_json = serde_json::to_value(&payload).map_err(|err| ServiceError::Storage { - message: format!("Failed to encode search trace payload: {err}"), - })?; - sqlx::query( - "INSERT INTO search_trace_outbox \ - (outbox_id, trace_id, status, attempts, last_error, available_at, payload, created_at, updated_at) \ - VALUES ($1,$2,'PENDING',0,NULL,$3,$4,$3,$3)", - ) - .bind(Uuid::new_v4()) - .bind(payload.trace.trace_id) - .bind(now) - .bind(payload_json) - .execute(pool) - .await?; + let mut builder = QueryBuilder::new( + "\ +INSERT INTO search_trace_candidates ( + candidate_id, + trace_id, + note_id, + chunk_id, + chunk_index, + snippet, + candidate_snapshot, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at, + created_at, + expires_at +) ", + ); + + builder.push_values(candidates, |mut b, candidate| { + b.push_bind(candidate.candidate_id) + .push_bind(trace_id) + .push_bind(candidate.note_id) + .push_bind(candidate.chunk_id) + .push_bind(candidate.chunk_index) + .push_bind(candidate.snippet) + .push_bind(candidate.candidate_snapshot) + .push_bind(candidate.retrieval_rank as i32) + .push_bind(candidate.rerank_score) + .push_bind(candidate.note_scope) + .push_bind(candidate.note_importance) + .push_bind(candidate.note_updated_at) + .push_bind(candidate.note_hit_count) + .push_bind(candidate.note_last_hit_at) + .push_bind(candidate.created_at) + .push_bind(candidate.expires_at); + }); + builder.push(" ON CONFLICT (candidate_id) DO NOTHING"); + builder.build().execute(executor).await?; + Ok(()) } -async fn record_hits( - pool: &sqlx::PgPool, +async fn record_hits<'e, E>( + executor: E, query: &str, scored: &[ScoredChunk], now: OffsetDateTime, -) -> ServiceResult<()> { - let query_hash = hash_query(query); - let mut tx = pool.begin().await?; - - for (rank, scored_chunk) in scored.iter().enumerate() { - let note = &scored_chunk.item.note; - sqlx::query( - "UPDATE memory_notes SET hit_count = hit_count + 1, last_hit_at = $1 WHERE note_id = $2", - ) - .bind(now) - .bind(note.note_id) - .execute(&mut *tx) - .await?; - - sqlx::query( - "INSERT INTO memory_hits (hit_id, note_id, chunk_id, query_hash, rank, final_score, ts) \ - VALUES ($1,$2,$3,$4,$5,$6,$7)", - ) - .bind(Uuid::new_v4()) - .bind(note.note_id) - .bind(scored_chunk.item.chunk.chunk_id) - .bind(&query_hash) - .bind(rank as i32) - .bind(scored_chunk.final_score) - .bind(now) - .execute(&mut *tx) - .await?; +) -> Result<()> +where + E: PgExecutor<'e>, +{ + if scored.is_empty() { + return Ok(()); } - tx.commit().await?; - Ok(()) -} + let query_hash = ranking::hash_query(query); + let mut hit_ids = Vec::with_capacity(scored.len()); + let mut note_ids = Vec::with_capacity(scored.len()); + let mut chunk_ids = Vec::with_capacity(scored.len()); + let mut ranks = Vec::with_capacity(scored.len()); + let mut final_scores = Vec::with_capacity(scored.len()); -fn hash_query(query: &str) -> String { - let mut hasher = DefaultHasher::new(); - Hash::hash(query, &mut hasher); - format!("{:x}", hasher.finish()) -} + for (rank, scored_chunk) in scored.iter().enumerate() { + hit_ids.push(Uuid::new_v4()); + note_ids.push(scored_chunk.item.note.note_id); + chunk_ids.push(scored_chunk.item.chunk.chunk_id); + ranks.push(rank as i32); + final_scores.push(scored_chunk.final_score); + } -fn hash_cache_key(payload: &serde_json::Value) -> ServiceResult<String> { - let raw = serde_json::to_vec(payload).map_err(|err| ServiceError::Storage { - message: format!("Failed to encode cache key payload: {err}"), - })?; - Ok(blake3::hash(&raw).to_hex().to_string()) -} + sqlx::query( + "\ +WITH hits AS ( + SELECT * + FROM unnest( + $1::uuid[], + $2::uuid[], + $3::uuid[], + $4::int4[], + $5::real[] + ) AS t(hit_id, note_id, chunk_id, rank, final_score) +), +updated AS ( + UPDATE memory_notes + SET + hit_count = hit_count + 1, + last_hit_at = $6 + WHERE note_id = ANY($2) +) +INSERT INTO memory_hits ( + hit_id, + note_id, + chunk_id, + query_hash, + rank, + final_score, + ts +) +SELECT + hit_id, + note_id, + chunk_id, + $7, + rank, + final_score, + $6 + FROM hits", + ) + .bind(&hit_ids) + .bind(¬e_ids) + .bind(&chunk_ids) + .bind(&ranks) + .bind(&final_scores) + .bind(now) + .bind(query_hash.as_str()) + .execute(executor) + .await?; -fn cache_key_prefix(key: &str) -> &str { - let len = key.len().min(12); - &key[..len] + Ok(()) } -async fn fetch_cache_payload( - pool: &sqlx::PgPool, +async fn fetch_cache_payload<'e, E>( + executor: E, kind: CacheKind, key: &str, now: OffsetDateTime, -) -> ServiceResult<Option<CachePayload>> { - let row = sqlx::query( - "SELECT payload FROM llm_cache WHERE cache_kind = $1 AND cache_key = $2 AND expires_at > $3", +) -> Result<Option<CachePayload>> +where + E: PgExecutor<'e>, +{ + let payload: Option<Value> = sqlx::query_scalar( + "\ +WITH updated AS ( + UPDATE llm_cache + SET + last_accessed_at = $3, + hit_count = hit_count + 1 + WHERE + cache_kind = $1 + AND cache_key = $2 + AND expires_at > $3 + RETURNING payload +) + SELECT payload +FROM updated", ) .bind(kind.as_str()) .bind(key) .bind(now) - .fetch_optional(pool) + .fetch_optional(executor) .await?; - let Some(row) = row else { + let Some(payload) = payload else { return Ok(None); }; - - let payload: serde_json::Value = row.try_get("payload")?; let size_bytes = serde_json::to_vec(&payload) - .map_err(|err| ServiceError::Storage { + .map_err(|err| crate::Error::Storage { message: format!("Failed to encode cache payload: {err}"), })? .len(); - sqlx::query( - "UPDATE llm_cache \ - SET last_accessed_at = $1, hit_count = hit_count + 1 \ - WHERE cache_kind = $2 AND cache_key = $3", - ) - .bind(now) - .bind(kind.as_str()) - .bind(key) - .execute(pool) - .await?; - Ok(Some(CachePayload { value: payload, size_bytes })) } -async fn store_cache_payload( - pool: &sqlx::PgPool, +async fn store_cache_payload<'e, E>( + executor: E, kind: CacheKind, key: &str, - payload: serde_json::Value, + payload: Value, now: OffsetDateTime, expires_at: OffsetDateTime, max_payload_bytes: Option<u64>, -) -> ServiceResult<Option<usize>> { - let payload_bytes = serde_json::to_vec(&payload).map_err(|err| ServiceError::Storage { +) -> Result<Option<usize>> +where + E: PgExecutor<'e>, +{ + let payload_bytes = serde_json::to_vec(&payload).map_err(|err| crate::Error::Storage { message: format!("Failed to encode cache payload: {err}"), })?; let payload_size = payload_bytes.len(); + if let Some(max) = max_payload_bytes && payload_size as u64 > max { @@ -1749,14 +6469,23 @@ async fn store_cache_payload( } sqlx::query( - "INSERT INTO llm_cache \ - (cache_id, cache_kind, cache_key, payload, created_at, last_accessed_at, expires_at, hit_count) \ - VALUES ($1,$2,$3,$4,$5,$5,$6,0) \ - ON CONFLICT (cache_kind, cache_key) DO UPDATE SET \ - payload = EXCLUDED.payload, \ - last_accessed_at = EXCLUDED.last_accessed_at, \ - expires_at = EXCLUDED.expires_at, \ - hit_count = 0", + "\ + INSERT INTO llm_cache ( + cache_id, + cache_kind, + cache_key, + payload, + created_at, + last_accessed_at, + expires_at, + hit_count +) +VALUES ($1, $2, $3, $4, $5, $5, $6, 0) +ON CONFLICT (cache_kind, cache_key) DO UPDATE SET +payload = EXCLUDED.payload, + last_accessed_at = EXCLUDED.last_accessed_at, + expires_at = EXCLUDED.expires_at, + hit_count = 0", ) .bind(Uuid::new_v4()) .bind(kind.as_str()) @@ -1764,96 +6493,64 @@ async fn store_cache_payload( .bind(payload) .bind(now) .bind(expires_at) - .execute(pool) + .execute(executor) .await?; Ok(Some(payload_size)) } -fn build_expansion_cache_key( - query: &str, - version: &str, - max_queries: u32, - include_original: bool, - provider_id: &str, - model: &str, - temperature: f32, -) -> ServiceResult<String> { - let payload = serde_json::json!({ - "kind": "expansion", - "query": query.trim(), - "provider_id": provider_id, - "model": model, - "temperature": temperature, - "version": version, - "max_queries": max_queries, - "include_original": include_original, - }); - hash_cache_key(&payload) -} +#[cfg(test)] +mod tests { + use serde_json::Value; + + use crate::search::{ + self, BlendRankingOverride, ChunkCandidate, ChunkMeta, ChunkSnippet, HashMap, NoteMeta, + OffsetDateTime, RankingRequestOverride, RerankCacheCandidate, RerankCacheItem, + RerankCachePayload, RetrievalSourceCandidates, RetrievalSourceKind, + RetrievalSourcesRankingOverride, ScoredChunk, TraceReplayCandidate, TraceReplayContext, + Uuid, ranking, + }; + use elf_config::{Config, SearchDynamic}; -fn build_rerank_cache_key( - query: &str, - version: &str, - provider_id: &str, - model: &str, - candidates: &[(Uuid, OffsetDateTime)], -) -> ServiceResult<String> { - let signature: Vec<serde_json::Value> = candidates - .iter() - .map(|(chunk_id, updated_at)| { - serde_json::json!({ - "chunk_id": chunk_id, - "updated_at": updated_at, - }) - }) - .collect(); - let payload = serde_json::json!({ - "kind": "rerank", - "query": query.trim(), - "provider_id": provider_id, - "model": model, - "version": version, - "candidates": signature, - }); - hash_cache_key(&payload) -} + #[test] + fn dense_embedding_input_includes_project_context_suffix() { + let input = ranking::build_dense_embedding_input( + "Find payments code.", + Some("This is a billing API."), + ); -fn build_cached_scores( - payload: &RerankCachePayload, - candidates: &[RerankCacheCandidate], -) -> Option<Vec<f32>> { - if payload.items.len() != candidates.len() { - return None; + assert!(input.starts_with("Find payments code.\n\nProject context:\n")); + assert!(input.contains("This is a billing API.")); } - let mut map = HashMap::new(); - for item in &payload.items { - let key = (item.chunk_id, item.updated_at.unix_timestamp(), item.updated_at.nanosecond()); - map.insert(key, item.score); + #[test] + fn dense_embedding_input_skips_empty_project_context() { + let input = ranking::build_dense_embedding_input("Find payments code.", Some(" ")); + + assert_eq!(input, "Find payments code."); } - let mut out = Vec::with_capacity(candidates.len()); - for candidate in candidates { - let key = ( - candidate.chunk_id, - candidate.updated_at.unix_timestamp(), - candidate.updated_at.nanosecond(), - ); - let score = map.get(&key)?; - out.push(*score); + #[test] + fn scope_description_boost_matches_whole_tokens_only() { + let tokens = vec!["go".to_string()]; + let boost = ranking::scope_description_boost(&tokens, "MongoDB operational notes.", 0.1); + + assert_eq!(boost, 0.0); } - Some(out) -} -#[cfg(test)] -mod tests { - use super::*; + #[test] + fn scope_description_boost_scales_by_fraction_of_matched_tokens() { + let tokens = vec!["security".to_string(), "policy".to_string(), "deployment".to_string()]; + let boost = ranking::scope_description_boost(&tokens, "Security policy notes.", 0.12); + + assert!((boost - 0.08).abs() < 1e-4, "Unexpected boost: {boost}"); + } #[test] fn normalize_queries_includes_original_and_dedupes() { let queries = vec!["alpha".to_string(), "beta".to_string(), "alpha".to_string()]; - let normalized = normalize_queries(queries, "alpha", true, 4); + let normalized = ranking::normalize_queries(queries, "alpha", true, 4); + assert_eq!(normalized, vec!["alpha".to_string(), "beta".to_string()]); } @@ -1861,24 +6558,205 @@ mod tests { fn normalize_queries_respects_max_queries() { let queries = vec!["one".to_string(), "two".to_string(), "three".to_string(), "four".to_string()]; - let normalized = normalize_queries(queries, "zero", true, 3); + let normalized = ranking::normalize_queries(queries, "zero", true, 3); + assert_eq!(normalized.len(), 3); } #[test] fn dynamic_trigger_checks_candidates_and_score() { - let cfg = elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.2 }; - assert!(should_expand_dynamic(5, 0.9, &cfg)); - assert!(should_expand_dynamic(20, 0.1, &cfg)); - assert!(!should_expand_dynamic(20, 0.9, &cfg)); + let cfg = SearchDynamic { min_candidates: 10, min_top_score: 0.2 }; + + assert!(ranking::should_expand_dynamic(5, 0.9, &cfg)); + assert!(ranking::should_expand_dynamic(20, 0.1, &cfg)); + assert!(!ranking::should_expand_dynamic(20, 0.9, &cfg)); + } + + #[test] + fn rank_normalize_maps_rank_to_unit_interval() { + assert!((ranking::rank_normalize(1, 1) - 1.0).abs() < 1e-6); + assert!((ranking::rank_normalize(1, 5) - 1.0).abs() < 1e-6); + assert!((ranking::rank_normalize(3, 5) - 0.5).abs() < 1e-6); + assert!((ranking::rank_normalize(5, 5) - 0.0).abs() < 1e-6); + assert!((ranking::rank_normalize(0, 5) - 0.0).abs() < 1e-6); + } + + #[test] + fn build_trace_audit_includes_token_id_when_present() { + let audit = search::build_trace_audit("agent-a", Some("tok-123")); + + assert_eq!(audit.get("actor_id"), Some(&Value::from("agent-a"))); + assert_eq!(audit.get("token_id"), Some(&Value::from("tok-123"))); + } + + #[test] + fn build_trace_audit_omits_token_id_when_empty() { + let audit = search::build_trace_audit("agent-a", Some(" ")); + + assert_eq!(audit.get("actor_id"), Some(&Value::from("agent-a"))); + assert!(audit.get("token_id").is_none()); + } + + fn test_chunk_candidate(note_id: Uuid, retrieval_rank: u32) -> ChunkCandidate { + ChunkCandidate { + chunk_id: Uuid::new_v4(), + note_id, + chunk_index: 0, + retrieval_rank, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: Some("v1".to_string()), + } + } + + fn default_retrieval_sources_policy() -> ranking::ResolvedRetrievalSourcesPolicy { + ranking::ResolvedRetrievalSourcesPolicy { + fusion_weight: 1.0, + structured_field_weight: 1.0, + recursive_weight: 0.0, + fusion_priority: 1, + structured_field_priority: 0, + recursive_priority: 0, + } + } + + #[test] + fn merge_retrieval_candidates_keeps_structured_hits_under_full_fusion_capacity() { + let mut fusion = Vec::new(); + + for rank in 1..=10 { + fusion.push(test_chunk_candidate(Uuid::new_v4(), rank)); + } + + let structured = vec![test_chunk_candidate(Uuid::new_v4(), 1)]; + let structured_chunk_id = structured[0].chunk_id; + let merged = ranking::merge_retrieval_candidates( + vec![ + RetrievalSourceCandidates { + source: RetrievalSourceKind::Fusion, + candidates: fusion, + }, + RetrievalSourceCandidates { + source: RetrievalSourceKind::StructuredField, + candidates: structured, + }, + ], + &default_retrieval_sources_policy(), + 10, + ); + let merged_chunk_ids: Vec<Uuid> = + merged.iter().map(|candidate| candidate.chunk_id).collect(); + + assert!( + merged_chunk_ids.contains(&structured_chunk_id), + "Structured candidate was dropped by retrieval fusion." + ); + } + + #[test] + fn merge_retrieval_candidates_prefers_dual_source_signal_on_tie() { + let shared_note_id = Uuid::new_v4(); + let shared_chunk_id = Uuid::new_v4(); + let fusion_only_note_id = Uuid::new_v4(); + let fusion_only_chunk_id = Uuid::new_v4(); + let fusion = vec![ + ChunkCandidate { + chunk_id: shared_chunk_id, + note_id: shared_note_id, + chunk_index: 0, + retrieval_rank: 9, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: Some("v1".to_string()), + }, + ChunkCandidate { + chunk_id: fusion_only_chunk_id, + note_id: fusion_only_note_id, + chunk_index: 0, + retrieval_rank: 1, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: Some("v1".to_string()), + }, + ]; + let structured = vec![ChunkCandidate { + chunk_id: shared_chunk_id, + note_id: shared_note_id, + chunk_index: 0, + retrieval_rank: 1, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: Some("v1".to_string()), + }]; + let merged = ranking::merge_retrieval_candidates( + vec![ + RetrievalSourceCandidates { + source: RetrievalSourceKind::Fusion, + candidates: fusion, + }, + RetrievalSourceCandidates { + source: RetrievalSourceKind::StructuredField, + candidates: structured, + }, + ], + &default_retrieval_sources_policy(), + 1, + ); + let first = merged.first().expect("Expected merged candidate."); + + assert_eq!(first.chunk_id, shared_chunk_id); + } + + #[test] + fn retrieval_weight_for_rank_uses_first_matching_segment_or_last() { + let segments = vec![ + ranking::BlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.7 }, + ranking::BlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.2 }, + ]; + + assert!((ranking::retrieval_weight_for_rank(1, &segments) - 0.7).abs() < 1e-6); + assert!((ranking::retrieval_weight_for_rank(3, &segments) - 0.7).abs() < 1e-6); + assert!((ranking::retrieval_weight_for_rank(4, &segments) - 0.2).abs() < 1e-6); + assert!((ranking::retrieval_weight_for_rank(999, &segments) - 0.2).abs() < 1e-6); + } + + #[test] + fn blend_math_is_linear_and_additive() { + let segments = vec![ + ranking::BlendSegment { max_retrieval_rank: 2, retrieval_weight: 0.7 }, + ranking::BlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.2 }, + ]; + let retrieval_rank = 3; + let rerank_rank = 2; + let retrieval_norm = ranking::rank_normalize(retrieval_rank, 10); + let rerank_norm = ranking::rank_normalize(rerank_rank, 4); + let blend_retrieval_weight = ranking::retrieval_weight_for_rank(retrieval_rank, &segments); + + assert!((blend_retrieval_weight - 0.2).abs() < 1e-6); + assert!((retrieval_norm - (7.0 / 9.0)).abs() < 1e-6); + assert!((rerank_norm - (2.0 / 3.0)).abs() < 1e-6); + + let retrieval_term = blend_retrieval_weight * retrieval_norm; + let rerank_term = (1.0 - blend_retrieval_weight) * rerank_norm; + let tie_breaker_score = 0.1; + let scope_context_boost = 0.0; + let final_score = retrieval_term + rerank_term + tie_breaker_score + scope_context_boost; + let expected = (0.2 * (7.0 / 9.0)) + (0.8 * (2.0 / 3.0)) + 0.1; + + assert!((final_score - expected).abs() < 1e-6, "Unexpected final_score: {final_score}"); } #[test] - fn expansion_cache_key_changes_with_version() { - let key_a = build_expansion_cache_key("alpha", "v1", 4, true, "llm", "model", 0.1_f32) + fn expansion_cache_key_changes_with_max_queries() { + let key_a = ranking::build_expansion_cache_key("alpha", 4, true, "llm", "model", 0.1_f32) .expect("Expected cache key."); - let key_b = build_expansion_cache_key("alpha", "v2", 4, true, "llm", "model", 0.1_f32) + let key_b = ranking::build_expansion_cache_key("alpha", 5, true, "llm", "model", 0.1_f32) .expect("Expected cache key."); + assert_ne!(key_a, key_b); } @@ -1887,10 +6765,11 @@ mod tests { let ts_a = OffsetDateTime::from_unix_timestamp(1).expect("Valid timestamp."); let ts_b = OffsetDateTime::from_unix_timestamp(2).expect("Valid timestamp."); let chunk_id = Uuid::new_v4(); - let key_a = build_rerank_cache_key("q", "v1", "rerank", "model", &[(chunk_id, ts_a)]) + let key_a = ranking::build_rerank_cache_key("q", "rerank", "model", &[(chunk_id, ts_a)]) .expect("Expected cache key."); - let key_b = build_rerank_cache_key("q", "v1", "rerank", "model", &[(chunk_id, ts_b)]) + let key_b = ranking::build_rerank_cache_key("q", "rerank", "model", &[(chunk_id, ts_b)]) .expect("Expected cache key."); + assert_ne!(key_a, key_b); } @@ -1907,12 +6786,518 @@ mod tests { chunk_id: Uuid::new_v4(), updated_at: OffsetDateTime::from_unix_timestamp(1).expect("Valid timestamp."), }]; - assert!(build_cached_scores(&payload, &candidates).is_none()); + + assert!(ranking::build_cached_scores(&payload, &candidates).is_none()); } #[test] fn cache_key_prefix_is_stable() { - let prefix = cache_key_prefix("abcd1234efgh5678"); + let prefix = ranking::cache_key_prefix("abcd1234efgh5678"); + assert_eq!(prefix, "abcd1234efgh"); } + + #[test] + fn lexical_overlap_ratio_is_deterministic_and_bounded() { + let query_tokens = vec!["deploy".to_string(), "steps".to_string()]; + let ratio = ranking::lexical_overlap_ratio(&query_tokens, "Deploy steps for staging.", 128); + + assert!((ratio - 1.0).abs() < 1e-6, "Unexpected ratio: {ratio}"); + + let ratio = ranking::lexical_overlap_ratio(&query_tokens, "Deploy only.", 128); + + assert!((ratio - 0.5).abs() < 1e-6, "Unexpected ratio: {ratio}"); + assert!((0.0..=1.0).contains(&ratio), "Ratio must be in [0, 1]."); + } + + #[test] + fn deterministic_ranking_terms_do_not_apply_when_disabled() { + let mut cfg = parse_example_config(); + + cfg.ranking.deterministic.enabled = false; + cfg.ranking.deterministic.lexical.enabled = true; + cfg.ranking.deterministic.hits.enabled = true; + cfg.ranking.deterministic.decay.enabled = true; + + let now = OffsetDateTime::from_unix_timestamp(1_000_000).expect("Valid timestamp."); + let note = NoteMeta { + note_id: Uuid::new_v4(), + note_type: "fact".to_string(), + key: None, + scope: "project_shared".to_string(), + agent_id: "agent-a".to_string(), + importance: 0.1, + confidence: 0.9, + updated_at: now, + expires_at: None, + source_ref: serde_json::json!({}), + embedding_version: "v1".to_string(), + hit_count: 8, + last_hit_at: Some(now), + }; + let chunk = + ChunkMeta { chunk_id: Uuid::new_v4(), chunk_index: 0, start_offset: 0, end_offset: 10 }; + let item = ChunkSnippet { + note, + chunk, + snippet: "deploy steps".to_string(), + retrieval_rank: 1, + retrieval_score: None, + }; + let mut scored = ScoredChunk { + item, + final_score: 1.0, + rerank_score: 0.5, + rerank_rank: 1, + rerank_norm: 1.0, + retrieval_norm: 1.0, + blend_retrieval_weight: 0.5, + retrieval_term: 0.5, + rerank_term: 0.5, + tie_breaker_score: 0.0, + scope_context_boost: 0.0, + age_days: 30.0, + importance: 0.1, + deterministic_lexical_overlap_ratio: 0.0, + deterministic_lexical_bonus: 0.0, + deterministic_hit_count: 0, + deterministic_last_hit_age_days: None, + deterministic_hit_boost: 0.0, + deterministic_decay_penalty: 0.0, + }; + let terms = ranking::compute_deterministic_ranking_terms( + &cfg, + &ranking::tokenize_query( + "deploy steps", + cfg.ranking.deterministic.lexical.max_query_terms as usize, + ), + scored.item.snippet.as_str(), + scored.item.note.hit_count, + scored.item.note.last_hit_at, + scored.age_days, + now, + ); + + scored.final_score += terms.lexical_bonus + terms.hit_boost + terms.decay_penalty; + scored.deterministic_lexical_overlap_ratio = terms.lexical_overlap_ratio; + scored.deterministic_lexical_bonus = terms.lexical_bonus; + scored.deterministic_hit_count = terms.hit_count; + scored.deterministic_last_hit_age_days = terms.last_hit_age_days; + scored.deterministic_hit_boost = terms.hit_boost; + scored.deterministic_decay_penalty = terms.decay_penalty; + + assert!((scored.final_score - 1.0).abs() < 1e-6, "Score must not change."); + assert!((scored.deterministic_lexical_bonus - 0.0).abs() < 1e-6); + assert!((scored.deterministic_hit_boost - 0.0).abs() < 1e-6); + assert!((scored.deterministic_decay_penalty - 0.0).abs() < 1e-6); + } + + #[test] + fn deterministic_ranking_terms_apply_and_are_bounded() { + let mut cfg = parse_example_config(); + + cfg.ranking.deterministic.enabled = true; + cfg.ranking.deterministic.lexical.enabled = true; + cfg.ranking.deterministic.hits.enabled = true; + cfg.ranking.deterministic.decay.enabled = true; + + let now = OffsetDateTime::from_unix_timestamp(1_000_000).expect("Valid timestamp."); + let note = NoteMeta { + note_id: Uuid::new_v4(), + note_type: "fact".to_string(), + key: None, + scope: "project_shared".to_string(), + agent_id: "agent-a".to_string(), + importance: 0.1, + confidence: 0.9, + updated_at: now, + expires_at: None, + source_ref: serde_json::json!({}), + embedding_version: "v1".to_string(), + hit_count: 8, + last_hit_at: Some(now), + }; + let chunk = + ChunkMeta { chunk_id: Uuid::new_v4(), chunk_index: 0, start_offset: 0, end_offset: 10 }; + let item = ChunkSnippet { + note, + chunk, + snippet: "deploy steps".to_string(), + retrieval_rank: 1, + retrieval_score: None, + }; + let mut scored = ScoredChunk { + item, + final_score: 1.0, + rerank_score: 0.5, + rerank_rank: 1, + rerank_norm: 1.0, + retrieval_norm: 1.0, + blend_retrieval_weight: 0.5, + retrieval_term: 0.5, + rerank_term: 0.5, + tie_breaker_score: 0.0, + scope_context_boost: 0.0, + age_days: 30.0, + importance: 0.1, + deterministic_lexical_overlap_ratio: 0.0, + deterministic_lexical_bonus: 0.0, + deterministic_hit_count: 0, + deterministic_last_hit_age_days: None, + deterministic_hit_boost: 0.0, + deterministic_decay_penalty: 0.0, + }; + let terms = ranking::compute_deterministic_ranking_terms( + &cfg, + &ranking::tokenize_query( + "deploy steps", + cfg.ranking.deterministic.lexical.max_query_terms as usize, + ), + scored.item.snippet.as_str(), + scored.item.note.hit_count, + scored.item.note.last_hit_at, + scored.age_days, + now, + ); + + scored.final_score += terms.lexical_bonus + terms.hit_boost + terms.decay_penalty; + scored.deterministic_lexical_overlap_ratio = terms.lexical_overlap_ratio; + scored.deterministic_lexical_bonus = terms.lexical_bonus; + scored.deterministic_hit_count = terms.hit_count; + scored.deterministic_last_hit_age_days = terms.last_hit_age_days; + scored.deterministic_hit_boost = terms.hit_boost; + scored.deterministic_decay_penalty = terms.decay_penalty; + + assert!(scored.final_score.is_finite(), "Score must be finite."); + assert!((0.0..=1.0).contains(&scored.deterministic_lexical_overlap_ratio)); + assert!(scored.deterministic_lexical_bonus >= 0.0); + assert!(scored.deterministic_hit_boost >= 0.0); + assert!(scored.deterministic_decay_penalty <= 0.0); + + let expected_lex = cfg.ranking.deterministic.lexical.weight; + + assert!((scored.deterministic_lexical_bonus - expected_lex).abs() < 1e-6); + + let expected_hit = cfg.ranking.deterministic.hits.weight * 0.5; + + assert!((scored.deterministic_hit_boost - expected_hit).abs() < 1e-6); + } + + fn test_scored_chunk(note_id: Uuid, retrieval_rank: u32, now: OffsetDateTime) -> ScoredChunk { + let note = NoteMeta { + note_id, + note_type: "fact".to_string(), + key: None, + scope: "project_shared".to_string(), + agent_id: "agent-a".to_string(), + importance: 0.1, + confidence: 0.9, + updated_at: now, + expires_at: None, + source_ref: serde_json::json!({}), + embedding_version: "v1".to_string(), + hit_count: 0, + last_hit_at: None, + }; + let chunk = ChunkMeta { + chunk_id: Uuid::new_v4(), + chunk_index: i32::try_from(retrieval_rank.saturating_sub(1)).unwrap_or(0), + start_offset: 0, + end_offset: 16, + }; + let item = ChunkSnippet { + note, + chunk, + snippet: format!("snippet-{retrieval_rank}"), + retrieval_rank, + retrieval_score: None, + }; + + ScoredChunk { + item, + final_score: 0.0, + rerank_score: 0.0, + rerank_rank: retrieval_rank, + rerank_norm: 0.0, + retrieval_norm: 0.0, + blend_retrieval_weight: 0.5, + retrieval_term: 0.0, + rerank_term: 0.0, + tie_breaker_score: 0.0, + scope_context_boost: 0.0, + age_days: 0.0, + importance: 0.1, + deterministic_lexical_overlap_ratio: 0.0, + deterministic_lexical_bonus: 0.0, + deterministic_hit_count: 0, + deterministic_last_hit_age_days: None, + deterministic_hit_boost: 0.0, + deterministic_decay_penalty: 0.0, + } + } + + #[test] + fn diversity_selection_skips_high_similarity_when_alternative_exists() { + let now = OffsetDateTime::from_unix_timestamp(0).expect("Valid timestamp."); + let note_a = Uuid::new_v4(); + let note_b = Uuid::new_v4(); + let note_c = Uuid::new_v4(); + let candidates = vec![ + test_scored_chunk(note_a, 1, now), + test_scored_chunk(note_b, 2, now), + test_scored_chunk(note_c, 3, now), + ]; + let mut vectors = HashMap::new(); + + vectors.insert(note_a, vec![1.0, 0.0]); + vectors.insert(note_b, vec![0.99, 0.01]); + vectors.insert(note_c, vec![0.0, 1.0]); + + let policy = ranking::ResolvedDiversityPolicy { + enabled: true, + sim_threshold: 0.9, + mmr_lambda: 0.7, + max_skips: 64, + }; + let (selected, decisions) = + ranking::select_diverse_results(candidates, 2, &policy, &vectors); + let selected_ids: Vec<Uuid> = selected.iter().map(|item| item.item.note.note_id).collect(); + + assert_eq!(selected_ids, vec![note_a, note_c]); + assert_eq!( + decisions.get(¬e_b).and_then(|decision| decision.skipped_reason.as_deref()), + Some("similarity_threshold") + ); + } + + #[test] + fn diversity_selection_backfills_when_max_skips_is_reached() { + let now = OffsetDateTime::from_unix_timestamp(0).expect("Valid timestamp."); + let note_a = Uuid::new_v4(); + let note_b = Uuid::new_v4(); + let candidates = vec![test_scored_chunk(note_a, 1, now), test_scored_chunk(note_b, 2, now)]; + let mut vectors = HashMap::new(); + + vectors.insert(note_a, vec![1.0, 0.0]); + vectors.insert(note_b, vec![0.99, 0.01]); + + let policy = ranking::ResolvedDiversityPolicy { + enabled: true, + sim_threshold: 0.9, + mmr_lambda: 0.7, + max_skips: 0, + }; + let (selected, decisions) = + ranking::select_diverse_results(candidates, 2, &policy, &vectors); + let selected_ids: Vec<Uuid> = selected.iter().map(|item| item.item.note.note_id).collect(); + let selected_reason = + decisions.get(¬e_b).map(|decision| decision.selected_reason.as_str()); + + assert_eq!(selected_ids, vec![note_a, note_b]); + assert_eq!(selected_reason, Some("max_skips_backfill")); + } + + #[test] + fn replay_diversity_decisions_prefer_selected_entry_for_same_note() { + let now = OffsetDateTime::from_unix_timestamp(0).expect("Valid timestamp."); + let note_id = Uuid::new_v4(); + let first = TraceReplayCandidate { + note_id, + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "first".to_string(), + retrieval_rank: 2, + retrieval_score: None, + rerank_score: 0.2, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: Some(false), + diversity_selected_rank: None, + diversity_selected_reason: Some("not_selected".to_string()), + diversity_skipped_reason: Some("lower_mmr".to_string()), + diversity_nearest_selected_note_id: None, + diversity_similarity: Some(0.95), + diversity_mmr_score: Some(0.12), + diversity_missing_embedding: Some(false), + }; + let second = TraceReplayCandidate { + note_id, + chunk_id: Uuid::new_v4(), + chunk_index: 1, + snippet: "second".to_string(), + retrieval_rank: 1, + retrieval_score: None, + rerank_score: 0.3, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: Some(true), + diversity_selected_rank: Some(2), + diversity_selected_reason: Some("mmr".to_string()), + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: Some(0.35), + diversity_mmr_score: Some(0.44), + diversity_missing_embedding: Some(false), + }; + let decisions = ranking::extract_replay_diversity_decisions(&[first, second]); + let decision = decisions.get(¬e_id).expect("Expected merged decision."); + + assert!(decision.selected); + assert_eq!(decision.selected_rank, Some(2)); + assert_eq!(decision.selected_reason, "mmr"); + } + + fn parse_example_config() -> Config { + let root_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + let path = root_dir.join("elf.example.toml"); + + elf_config::load(&path).expect("elf.example.toml must remain parseable and valid.") + } + + #[test] + fn ranking_policy_id_is_stable_and_has_expected_format() { + let cfg = parse_example_config(); + let id_a = search::ranking_policy_id(&cfg, None).expect("Expected policy id."); + let id_b = search::ranking_policy_id(&cfg, None).expect("Expected policy id."); + + assert_eq!(id_a, id_b); + assert!(id_a.starts_with("ranking_v2:"), "Unexpected policy id: {id_a}"); + assert_eq!(id_a.len(), "ranking_v2:".len() + 12, "Unexpected policy id: {id_a}"); + } + + #[test] + fn ranking_policy_id_changes_with_override() { + let cfg = parse_example_config(); + let base = search::ranking_policy_id(&cfg, None).expect("Expected base policy id."); + let override_ = RankingRequestOverride { + blend: Some(BlendRankingOverride { + enabled: Some(false), + rerank_normalization: None, + retrieval_normalization: None, + segments: None, + }), + diversity: None, + retrieval_sources: None, + }; + let overridden = search::ranking_policy_id(&cfg, Some(&override_)) + .expect("Expected overridden policy id."); + + assert_ne!(base, overridden); + } + + #[test] + fn ranking_policy_id_changes_with_retrieval_source_override() { + let cfg = parse_example_config(); + let base = search::ranking_policy_id(&cfg, None).expect("Expected base policy id."); + let override_ = RankingRequestOverride { + blend: None, + diversity: None, + retrieval_sources: Some(RetrievalSourcesRankingOverride { + fusion_weight: Some(0.75), + structured_field_weight: Some(1.25), + recursive_weight: Some(0.0), + fusion_priority: Some(2), + structured_field_priority: Some(1), + recursive_priority: Some(0), + }), + }; + let overridden = search::ranking_policy_id(&cfg, Some(&override_)) + .expect("Expected overridden policy id."); + + assert_ne!(base, overridden); + } + + #[test] + fn replay_ranking_policy_id_matches_ranking_policy_id() { + let cfg = parse_example_config(); + let expected = search::ranking_policy_id(&cfg, None).expect("Expected policy id."); + let now = OffsetDateTime::from_unix_timestamp(0).expect("Valid timestamp."); + let trace = TraceReplayContext { + trace_id: Uuid::new_v4(), + query: "deployment steps".to_string(), + candidate_count: 3, + top_k: 2, + created_at: now, + }; + let candidates = vec![ + TraceReplayCandidate { + note_id: Uuid::new_v4(), + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "deployment steps".to_string(), + retrieval_rank: 1, + retrieval_score: None, + rerank_score: 0.1, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + TraceReplayCandidate { + note_id: Uuid::new_v4(), + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "deployment steps".to_string(), + retrieval_rank: 2, + retrieval_score: None, + rerank_score: 0.9, + note_scope: "project_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + TraceReplayCandidate { + note_id: Uuid::new_v4(), + chunk_id: Uuid::new_v4(), + chunk_index: 0, + snippet: "deployment steps".to_string(), + retrieval_rank: 3, + retrieval_score: None, + rerank_score: 0.2, + note_scope: "org_shared".to_string(), + note_importance: 0.1, + note_updated_at: now, + note_hit_count: 0, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }, + ]; + let out = search::replay_ranking_from_candidates(&cfg, &trace, None, &candidates, 2) + .expect("Expected replay output."); + + for item in out { + assert_eq!(item.explain.ranking.policy_id, expected); + } + } } diff --git a/packages/elf-service/src/search/filter.rs b/packages/elf-service/src/search/filter.rs new file mode 100644 index 00000000..7e94077e --- /dev/null +++ b/packages/elf-service/src/search/filter.rs @@ -0,0 +1,1133 @@ +use std::{ + cmp::Ordering, + collections::HashMap, + fmt::{Display, Formatter}, +}; + +use serde::Serialize; +use serde_json::{Map, Value}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use uuid::Uuid; + +use crate::search::{ChunkCandidate, NoteMeta, SEARCH_FILTER_IMPACT_SCHEMA_V1}; + +const SEARCH_FILTER_EXPR_SCHEMA_V1: &str = "search_filter_expr/v1"; +const MAX_FILTER_DEPTH: usize = 8; +const MAX_FILTER_NODES: usize = 128; +const MAX_IN_LIST_ITEMS: usize = 128; +const MAX_STRING_BYTES: usize = 512; + +#[derive(Clone, Debug)] +pub(crate) struct FilterParseError { + path: String, + message: String, +} +impl Display for FilterParseError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}", self.path, self.message) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct SearchFilter { + expr: FilterExpr, + json: Value, +} +impl SearchFilter { + fn as_value(&self) -> Value { + self.json.clone() + } + + fn evaluate(&self, note: &NoteMeta) -> (bool, Option<String>) { + self.expr.evaluate(note) + } + + pub(crate) fn parse(raw: &Value) -> Result<Self, FilterParseError> { + let path = "$.filter"; + let obj = raw.as_object().ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "filter must be an object.".to_string(), + })?; + let schema = obj.get("schema").and_then(Value::as_str).ok_or_else(|| FilterParseError { + path: format!("{path}.schema"), + message: "filter.schema is required.".to_string(), + })?; + + if schema != SEARCH_FILTER_EXPR_SCHEMA_V1 { + return Err(FilterParseError { + path: format!("{path}.schema"), + message: format!( + "unsupported filter schema '{schema}', expected '{SEARCH_FILTER_EXPR_SCHEMA_V1}'." + ), + }); + } + + let expr = obj.get("expr").ok_or_else(|| FilterParseError { + path: format!("{path}.expr"), + message: "filter.expr is required.".to_string(), + })?; + let mut state = FilterParseState::default(); + let parsed = parse_expr(expr, "$.filter.expr", 1, &mut state)?; + + Ok(Self { + expr: parsed.clone(), + json: serde_json::json!({"schema": SEARCH_FILTER_EXPR_SCHEMA_V1, "expr": parsed.to_value()}), + }) + } + + pub(crate) fn eval( + &self, + candidates: Vec<ChunkCandidate>, + note_meta: &HashMap<Uuid, NoteMeta>, + requested_candidate_k: u32, + effective_candidate_k: u32, + ) -> (Vec<ChunkCandidate>, SearchFilterImpact) { + let impact = SearchFilterImpact::from_eval( + self, + candidates.as_slice(), + note_meta, + requested_candidate_k, + effective_candidate_k, + ); + let pre = candidates.len(); + let mut kept = Vec::with_capacity(impact.candidate_count_post); + + for candidate in candidates { + let Some(note) = note_meta.get(&candidate.note_id) else { + continue; + }; + + if self.expr.evaluate(note).0 { + kept.push(candidate); + } + } + + let post = kept.len(); + + ( + kept, + SearchFilterImpact { + candidate_count_post: post, + dropped_total: pre.saturating_sub(post), + ..impact + }, + ) + } +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct SearchFilterImpact { + requested_candidate_k: u32, + effective_candidate_k: u32, + candidate_count_pre: usize, + candidate_count_post: usize, + dropped_total: usize, + top_drop_reasons: Vec<SearchFilterDropReason>, + filter: Value, +} +impl SearchFilterImpact { + pub(crate) fn from_eval( + filter: &SearchFilter, + note_candidates: &[ChunkCandidate], + note_meta: &HashMap<Uuid, NoteMeta>, + requested_candidate_k: u32, + effective_candidate_k: u32, + ) -> Self { + let pre = note_candidates.len(); + let mut kept: Vec<ChunkCandidate> = Vec::new(); + let mut dropped_reason_counts: HashMap<String, usize> = HashMap::new(); + + for candidate in note_candidates { + let Some(note) = note_meta.get(&candidate.note_id) else { + dropped_reason_counts + .entry("note_meta_missing".to_string()) + .and_modify(|count| *count += 1) + .or_insert(1); + + continue; + }; + let (keep, reason) = filter.evaluate(note); + + if keep { + kept.push(candidate.clone()); + } else { + dropped_reason_counts + .entry(reason.unwrap_or_else(|| "filter.no_match".to_string())) + .and_modify(|count| *count += 1) + .or_insert(1); + } + } + + let mut top_drop_reasons: Vec<_> = dropped_reason_counts + .into_iter() + .map(|(reason, count)| SearchFilterDropReason { reason, count }) + .collect(); + + top_drop_reasons.sort_by(|a, b| match b.count.cmp(&a.count) { + Ordering::Equal => a.reason.cmp(&b.reason), + other => other, + }); + top_drop_reasons.truncate(5); + + let post = kept.len(); + + Self { + requested_candidate_k, + effective_candidate_k, + candidate_count_pre: pre, + candidate_count_post: post, + dropped_total: pre.saturating_sub(post), + top_drop_reasons, + filter: filter.as_value(), + } + } + + pub(crate) fn to_stage_payload(&self) -> Value { + serde_json::json!({ + "schema": SEARCH_FILTER_IMPACT_SCHEMA_V1, + "requested_candidate_k": self.requested_candidate_k, + "effective_candidate_k": self.effective_candidate_k, + "candidate_count_pre": self.candidate_count_pre, + "candidate_count_post": self.candidate_count_post, + "dropped_total": self.dropped_total, + "top_drop_reasons": self.top_drop_reasons, + "filter": self.filter, + }) + } +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct SearchFilterDropReason { + reason: String, + count: usize, +} + +#[derive(Default)] +struct FilterParseState { + nodes: usize, + max_depth: usize, +} + +#[derive(Clone, Debug)] +enum FilterField { + Type, + Key, + Scope, + AgentId, + Importance, + Confidence, + UpdatedAt, + ExpiresAt, + HitCount, + LastHitAt, +} +impl FilterField { + fn as_str(&self) -> &'static str { + match self { + Self::Type => "type", + Self::Key => "key", + Self::Scope => "scope", + Self::AgentId => "agent_id", + Self::Importance => "importance", + Self::Confidence => "confidence", + Self::UpdatedAt => "updated_at", + Self::ExpiresAt => "expires_at", + Self::HitCount => "hit_count", + Self::LastHitAt => "last_hit_at", + } + } + + fn parse(path: &str, raw: &Value) -> Result<Self, FilterParseError> { + let field = raw + .as_str() + .ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "filter field must be a string.".to_string(), + })? + .to_ascii_lowercase(); + + match field.as_str() { + "type" => Ok(Self::Type), + "key" => Ok(Self::Key), + "scope" => Ok(Self::Scope), + "agent_id" => Ok(Self::AgentId), + "importance" => Ok(Self::Importance), + "confidence" => Ok(Self::Confidence), + "updated_at" => Ok(Self::UpdatedAt), + "expires_at" => Ok(Self::ExpiresAt), + "hit_count" => Ok(Self::HitCount), + "last_hit_at" => Ok(Self::LastHitAt), + _ => Err(FilterParseError { + path: path.to_string(), + message: format!( + "field '{}' is not in allowlist: type, key, scope, agent_id, importance, confidence, updated_at, expires_at, hit_count, last_hit_at", + field, + ), + }), + } + } + + fn lookup_note_value(&self, note: &NoteMeta) -> FilterNodeValue { + FilterExpr::lookup_note_value(self, note) + } +} + +#[derive(Clone, Debug)] +enum FilterExpr { + And(Vec<Self>), + Or(Vec<Self>), + Not(Box<Self>), + Eq { field: FilterField, value: FilterValue }, + Neq { field: FilterField, value: FilterValue }, + In { field: FilterField, values: Vec<FilterValue> }, + Contains { field: FilterField, value: String }, + Gt { field: FilterField, value: FilterValue }, + Gte { field: FilterField, value: FilterValue }, + Lt { field: FilterField, value: FilterValue }, + Lte { field: FilterField, value: FilterValue }, +} +impl FilterExpr { + fn to_value(&self) -> Value { + match self { + Self::And(exprs) => { + serde_json::json!({ "op": "and", "args": Value::Array(exprs.iter().map(Self::to_value).collect()) }) + }, + Self::Or(exprs) => { + serde_json::json!({ "op": "or", "args": Value::Array(exprs.iter().map(Self::to_value).collect()) }) + }, + Self::Not(expr) => { + serde_json::json!({ "op": "not", "expr": expr.to_value() }) + }, + Self::Eq { field, value } => { + serde_json::json!({ "op": "eq", "field": field.as_str(), "value": value.to_value() }) + }, + Self::Neq { field, value } => { + serde_json::json!({ "op": "neq", "field": field.as_str(), "value": value.to_value() }) + }, + Self::In { field, values } => { + serde_json::json!({ + "op": "in", + "field": field.as_str(), + "value": Value::Array(values.iter().map(FilterValue::to_value).collect()) + }) + }, + Self::Contains { field, value } => { + serde_json::json!({ "op": "contains", "field": field.as_str(), "value": value }) + }, + Self::Gt { field, value } => { + serde_json::json!({ "op": "gt", "field": field.as_str(), "value": value.to_value() }) + }, + Self::Gte { field, value } => { + serde_json::json!({ "op": "gte", "field": field.as_str(), "value": value.to_value() }) + }, + Self::Lt { field, value } => { + serde_json::json!({ "op": "lt", "field": field.as_str(), "value": value.to_value() }) + }, + Self::Lte { field, value } => { + serde_json::json!({ "op": "lte", "field": field.as_str(), "value": value.to_value() }) + }, + } + } + + fn evaluate(&self, note: &NoteMeta) -> (bool, Option<String>) { + match self { + Self::And(nodes) => Self::evaluate_and(nodes, note), + Self::Or(nodes) => Self::evaluate_or(nodes, note), + Self::Not(node) => Self::evaluate_not(node, note), + Self::Eq { field, value } => Self::evaluate_eq(field, value, note), + Self::Neq { field, value } => Self::evaluate_neq(field, value, note), + Self::In { field, values } => Self::evaluate_in(field, values, note), + Self::Contains { field, value } => Self::evaluate_contains(field, value, note), + Self::Gt { field, value } => Self::evaluate_gt(field, value, note), + Self::Gte { field, value } => Self::evaluate_gte(field, value, note), + Self::Lt { field, value } => Self::evaluate_lt(field, value, note), + Self::Lte { field, value } => Self::evaluate_lte(field, value, note), + } + } + + fn evaluate_and(nodes: &[Self], note: &NoteMeta) -> (bool, Option<String>) { + for node in nodes { + let (passed, reason) = node.evaluate(note); + + if !passed { + return (false, reason); + } + } + + (true, None) + } + + fn evaluate_or(nodes: &[Self], note: &NoteMeta) -> (bool, Option<String>) { + let mut first_reason = None; + + for node in nodes { + let (passed, reason) = node.evaluate(note); + + if passed { + return (true, None); + } + if first_reason.is_none() { + first_reason = reason; + } + } + + (false, first_reason.or_else(|| Some("or.no_match".to_string()))) + } + + fn evaluate_not(node: &Self, note: &NoteMeta) -> (bool, Option<String>) { + let (passed, reason) = node.evaluate(note); + + if passed { (false, Some("not.true".to_string())) } else { (true, reason) } + } + + fn evaluate_eq( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + let note_value = field.lookup_note_value(note); + let filter_value = value.to_node_value(); + let matches = note_value == filter_value; + + (matches, Some(format!("eq:{}", field.as_str())).filter(|_| !matches)) + } + + fn evaluate_neq( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + let note_value = field.lookup_note_value(note); + let filter_value = value.to_node_value(); + let matches = note_value != filter_value; + + (matches, Some(format!("neq:{}", field.as_str())).filter(|_| !matches)) + } + + fn evaluate_in( + field: &FilterField, + values: &[FilterValue], + note: &NoteMeta, + ) -> (bool, Option<String>) { + let note_value = field.lookup_note_value(note); + let matches = values.iter().any(|value| note_value == FilterNodeValue::from(value)); + + (matches, Some(format!("in:{}", field.as_str())).filter(|_| !matches)) + } + + fn evaluate_contains( + field: &FilterField, + value: &str, + note: &NoteMeta, + ) -> (bool, Option<String>) { + let note_value = field.lookup_note_value(note); + let note_text = match note_value { + FilterNodeValue::String(s) => s, + _ => { + return (false, Some(format!("contains:{}", field.as_str()))); + }, + }; + let matches = note_text.contains(value); + + (matches, Some(format!("contains:{}", field.as_str())).filter(|_| !matches)) + } + + fn evaluate_gt( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + match field.lookup_note_value(note) { + FilterNodeValue::Number(note_value) => { + let matches = note_value > value.to_numeric(); + + (matches, Some(format!("gt:{}", field.as_str())).filter(|_| !matches)) + }, + FilterNodeValue::DateTime(note_value) => { + let matches = match value { + FilterValue::DateTime(filter_value) => note_value > *filter_value, + _ => false, + }; + + (matches, Some(format!("gt:{}", field.as_str())).filter(|_| !matches)) + }, + _ => (false, Some(format!("gt:{}", field.as_str()))), + } + } + + fn evaluate_gte( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + match field.lookup_note_value(note) { + FilterNodeValue::Number(note_value) => { + let matches = note_value >= value.to_numeric(); + + (matches, Some(format!("gte:{}", field.as_str())).filter(|_| !matches)) + }, + FilterNodeValue::DateTime(note_value) => { + let matches = match value { + FilterValue::DateTime(filter_value) => note_value >= *filter_value, + _ => false, + }; + + (matches, Some(format!("gte:{}", field.as_str())).filter(|_| !matches)) + }, + _ => (false, Some(format!("gte:{}", field.as_str()))), + } + } + + fn evaluate_lt( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + match field.lookup_note_value(note) { + FilterNodeValue::Number(note_value) => { + let matches = note_value < value.to_numeric(); + + (matches, Some(format!("lt:{}", field.as_str())).filter(|_| !matches)) + }, + FilterNodeValue::DateTime(note_value) => { + let matches = match value { + FilterValue::DateTime(filter_value) => note_value < *filter_value, + _ => false, + }; + + (matches, Some(format!("lt:{}", field.as_str())).filter(|_| !matches)) + }, + _ => (false, Some(format!("lt:{}", field.as_str()))), + } + } + + fn evaluate_lte( + field: &FilterField, + value: &FilterValue, + note: &NoteMeta, + ) -> (bool, Option<String>) { + match field.lookup_note_value(note) { + FilterNodeValue::Number(note_value) => { + let matches = note_value <= value.to_numeric(); + + (matches, Some(format!("lte:{}", field.as_str())).filter(|_| !matches)) + }, + FilterNodeValue::DateTime(note_value) => { + let matches = match value { + FilterValue::DateTime(filter_value) => note_value <= *filter_value, + _ => false, + }; + + (matches, Some(format!("lte:{}", field.as_str())).filter(|_| !matches)) + }, + _ => (false, Some(format!("lte:{}", field.as_str()))), + } + } + + fn lookup_note_value(field: &FilterField, note: &NoteMeta) -> FilterNodeValue { + match field { + FilterField::Type => FilterNodeValue::String(note.note_type.clone()), + FilterField::Key => FilterNodeValue::String(note.key.clone().unwrap_or_default()), + FilterField::Scope => FilterNodeValue::String(note.scope.clone()), + FilterField::AgentId => FilterNodeValue::String(note.agent_id.clone()), + FilterField::Importance => FilterNodeValue::Number(note.importance as f64), + FilterField::Confidence => FilterNodeValue::Number(note.confidence as f64), + FilterField::HitCount => FilterNodeValue::Number(note.hit_count as f64), + FilterField::UpdatedAt => FilterNodeValue::DateTime(note.updated_at), + FilterField::ExpiresAt => + note.expires_at.map_or(FilterNodeValue::Null, FilterNodeValue::DateTime), + FilterField::LastHitAt => + note.last_hit_at.map_or(FilterNodeValue::Null, FilterNodeValue::DateTime), + } + } + + fn parse_args( + value: &Value, + path: &str, + depth: usize, + state: &mut FilterParseState, + ) -> Result<Vec<Self>, FilterParseError> { + let nodes = value.as_array().ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "op args must be an array.".to_string(), + })?; + + if nodes.is_empty() { + return Err(FilterParseError { + path: path.to_string(), + message: "op args must contain at least one node.".to_string(), + }); + } + + nodes + .iter() + .enumerate() + .map(|(index, node)| { + let child_path = format!("{path}[{index}]"); + + parse_expr(node, &child_path, depth.saturating_add(1), state) + }) + .collect() + } + + fn parse_in_values( + field: &FilterField, + value: &Value, + path: &str, + ) -> Result<Vec<FilterValue>, FilterParseError> { + let values = value.as_array().ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "in value must be an array.".to_string(), + })?; + + if values.len() > MAX_IN_LIST_ITEMS { + return Err(FilterParseError { + path: path.to_string(), + message: format!( + "in list exceeds maximum size ({}/{})", + values.len(), + MAX_IN_LIST_ITEMS + ), + }); + } + + values + .iter() + .enumerate() + .map(|(index, raw)| { + let item_path = format!("{path}[{index}]"); + + parse_value(field, raw, &item_path) + }) + .collect() + } + + fn validate_metrics( + path: &str, + depth: usize, + state: &mut FilterParseState, + ) -> Result<(), FilterParseError> { + state.nodes = state.nodes.saturating_add(1); + state.max_depth = state.max_depth.max(depth); + + if state.nodes > MAX_FILTER_NODES { + return Err(FilterParseError { + path: path.to_string(), + message: format!( + "filter exceeds node limit ({}/{})", + state.nodes, MAX_FILTER_NODES + ), + }); + } + if state.max_depth > MAX_FILTER_DEPTH { + return Err(FilterParseError { + path: path.to_string(), + message: format!( + "filter exceeds depth limit ({}/{})", + state.max_depth, MAX_FILTER_DEPTH + ), + }); + } + + Ok(()) + } + + fn parse_leaf( + raw: &Map<String, Value>, + op: &str, + path: &str, + ) -> Result<Self, FilterParseError> { + let field = FilterField::parse( + &format!("{path}.field"), + raw.get("field").ok_or_else(|| FilterParseError { + path: format!("{path}.field"), + message: "op node is missing required field 'field'.".to_string(), + })?, + )?; + let path_value = format!("{path}.value"); + let value_raw = raw.get("value").ok_or_else(|| FilterParseError { + path: format!("{path}.value"), + message: "op node is missing required field 'value'.".to_string(), + })?; + let value = parse_value(&field, value_raw, &path_value)?; + + match op { + "eq" => Ok(Self::Eq { field, value }), + "neq" => Ok(Self::Neq { field, value }), + "contains" => match value { + FilterValue::String(value) => Ok(Self::Contains { field, value }), + _ => Err(FilterParseError { + path: path_value, + message: "contains requires a string value.".to_string(), + }), + }, + "gt" => Ok(Self::Gt { field, value }), + "gte" => Ok(Self::Gte { field, value }), + "lt" => Ok(Self::Lt { field, value }), + "lte" => Ok(Self::Lte { field, value }), + "in" => { + let values = Self::parse_in_values(&field, value_raw, &path_value)?; + + Ok(Self::In { field, values }) + }, + _ => Err(FilterParseError { + path: path.to_string(), + message: format!("unsupported leaf op '{op}'."), + }), + } + } +} + +impl Default for FilterExpr { + fn default() -> Self { + Self::Eq { field: FilterField::Type, value: FilterValue::Null } + } +} + +#[derive(Clone, Debug)] +enum FilterValue { + String(String), + Number(f64), + DateTime(OffsetDateTime), + Null, +} +impl FilterValue { + fn to_node_value(&self) -> FilterNodeValue { + match self { + Self::String(value) => FilterNodeValue::String(value.clone()), + Self::Number(value) => FilterNodeValue::Number(*value), + Self::DateTime(value) => FilterNodeValue::DateTime(*value), + Self::Null => FilterNodeValue::Null, + } + } + + fn to_value(&self) -> Value { + match self { + Self::String(value) => Value::String(value.clone()), + Self::Number(value) => serde_json::json!(value), + Self::DateTime(value) => Value::String(value.format(&Rfc3339).unwrap_or_default()), + Self::Null => Value::Null, + } + } + + fn to_numeric(&self) -> f64 { + match self { + Self::Number(value) => *value, + _ => 0.0, + } + } +} + +impl PartialEq for FilterValue { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::String(lhs), Self::String(rhs)) => lhs == rhs, + (Self::Number(lhs), Self::Number(rhs)) => lhs == rhs, + (Self::DateTime(lhs), Self::DateTime(rhs)) => lhs == rhs, + (Self::Null, Self::Null) => true, + _ => false, + } + } +} + +#[derive(Clone, Debug)] +enum FilterNodeValue { + String(String), + Number(f64), + DateTime(OffsetDateTime), + Null, +} +impl From<&FilterValue> for FilterNodeValue { + fn from(value: &FilterValue) -> Self { + match value { + FilterValue::String(value) => Self::String(value.clone()), + FilterValue::Number(value) => Self::Number(*value), + FilterValue::DateTime(value) => Self::DateTime(*value), + FilterValue::Null => Self::Null, + } + } +} + +impl PartialEq for FilterNodeValue { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::String(lhs), Self::String(rhs)) => lhs == rhs, + (Self::Number(lhs), Self::Number(rhs)) => lhs == rhs, + (Self::DateTime(lhs), Self::DateTime(rhs)) => lhs == rhs, + (Self::Null, Self::Null) => true, + _ => false, + } + } +} + +fn parse_expr( + value: &Value, + path: &str, + depth: usize, + state: &mut FilterParseState, +) -> Result<FilterExpr, FilterParseError> { + FilterExpr::validate_metrics(path, depth, state)?; + + let Some(map) = value.as_object() else { + return Err(FilterParseError { + path: path.to_string(), + message: "filter node must be an object.".to_string(), + }); + }; + let op = map.get("op").and_then(Value::as_str).ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "filter node is missing required string op.".to_string(), + })?; + + match op { + "and" => { + let args = map.get("args").ok_or_else(|| FilterParseError { + path: format!("{path}.args"), + message: "and node requires args.".to_string(), + })?; + let args = FilterExpr::parse_args(args, &format!("{path}.args"), depth, state)?; + + Ok(FilterExpr::And(args)) + }, + "or" => { + let args = map.get("args").ok_or_else(|| FilterParseError { + path: format!("{path}.args"), + message: "or node requires args.".to_string(), + })?; + let args = FilterExpr::parse_args(args, &format!("{path}.args"), depth, state)?; + + Ok(FilterExpr::Or(args)) + }, + "not" => { + let expr = map.get("expr").ok_or_else(|| FilterParseError { + path: format!("{path}.expr"), + message: "not node requires expr.".to_string(), + })?; + let child = parse_expr(expr, &format!("{path}.expr"), depth.saturating_add(1), state)?; + + Ok(FilterExpr::Not(Box::new(child))) + }, + "in" => FilterExpr::parse_leaf(map, op, path), + "eq" | "neq" | "gt" | "gte" | "lt" | "lte" | "contains" => + FilterExpr::parse_leaf(map, op, path), + _ => Err(FilterParseError { + path: path.to_string(), + message: format!("unsupported filter op '{op}'."), + }), + } +} + +fn parse_string(path: &str, raw: &Value) -> Result<String, FilterParseError> { + let value = raw.as_str().ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "string value expected.".to_string(), + })?; + + if value.len() > MAX_STRING_BYTES { + return Err(FilterParseError { + path: path.to_string(), + message: format!("string value exceeds maximum bytes ({}).", MAX_STRING_BYTES), + }); + } + + Ok(value.to_string()) +} + +fn parse_value( + field: &FilterField, + raw: &Value, + path: &str, +) -> Result<FilterValue, FilterParseError> { + match field { + FilterField::Type | FilterField::Key | FilterField::Scope | FilterField::AgentId => + match raw { + Value::String(_) | Value::Null if matches!(field, FilterField::Key) => { + if raw.is_null() { + Ok(FilterValue::Null) + } else { + parse_string(path, raw).map(FilterValue::String) + } + }, + _ => parse_string(path, raw).map(FilterValue::String), + }, + FilterField::Importance | FilterField::Confidence | FilterField::HitCount => { + let value = raw.as_f64().ok_or_else(|| FilterParseError { + path: path.to_string(), + message: "numeric value expected.".to_string(), + })?; + + Ok(FilterValue::Number(value)) + }, + FilterField::UpdatedAt => + OffsetDateTime::parse(parse_string(path, raw)?.as_str(), &Rfc3339) + .map(FilterValue::DateTime) + .map_err(|_| FilterParseError { + path: path.to_string(), + message: "datetime value must be RFC3339.".to_string(), + }), + FilterField::ExpiresAt | FilterField::LastHitAt => + if raw.is_null() { + Ok(FilterValue::Null) + } else { + OffsetDateTime::parse(parse_string(path, raw)?.as_str(), &Rfc3339) + .map(FilterValue::DateTime) + .map_err(|_| FilterParseError { + path: path.to_string(), + message: "datetime value must be RFC3339.".to_string(), + }) + }, + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use serde_json::{Map, Value}; + use time::OffsetDateTime; + use uuid::Uuid; + + use crate::search::filter::{ + ChunkCandidate, MAX_FILTER_NODES, MAX_IN_LIST_ITEMS, MAX_STRING_BYTES, NoteMeta, + SEARCH_FILTER_EXPR_SCHEMA_V1, SearchFilter, + }; + + fn note_meta() -> NoteMeta { + NoteMeta { + note_id: Uuid::new_v4(), + note_type: "fact".to_string(), + key: Some("foo".to_string()), + scope: "project_shared".to_string(), + agent_id: "agent-a".to_string(), + importance: 0.9, + confidence: 0.8, + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).expect("timestamp"), + expires_at: None, + source_ref: Value::Object(Map::new()), + embedding_version: "provider:model:1".to_string(), + hit_count: 4, + last_hit_at: None, + } + } + + #[test] + fn parse_requires_known_schema() { + let raw = serde_json::json!({ "schema": "bad", "expr": { "op": "eq", "field": "scope", "value": "project_shared" } }); + + assert!(SearchFilter::parse(&raw).is_err()); + } + + #[test] + fn parse_and_validate_depth_limit() { + let mut expr = + serde_json::json!({ "op": "eq", "field": "scope", "value": "project_shared" }); + + for _ in 0..9 { + expr = serde_json::json!({ "op": "not", "expr": expr }); + } + + let raw = serde_json::json!({ "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, "expr": expr }); + + assert!(SearchFilter::parse(&raw).is_err()); + } + + #[test] + fn parse_and_validate_node_limit() { + let leaf = serde_json::json!({ "op": "eq", "field": "scope", "value": "project_shared" }); + let mut args = Vec::with_capacity(MAX_FILTER_NODES); + + for _ in 0..(MAX_FILTER_NODES - 1) { + args.push(leaf.clone()); + } + + let expr = serde_json::json!({ "op": "and", "args": args }); + let raw = serde_json::json!({ "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, "expr": expr }); + + assert!(SearchFilter::parse(&raw).is_ok()); + + let expr = serde_json::json!({ "op": "and", "args": [expr, leaf] }); + let raw = serde_json::json!({ "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, "expr": expr }); + + assert!( + SearchFilter::parse(&raw).is_err(), + "expected parse failure when node count is greater than limit" + ); + } + + #[test] + fn parse_in_list_limit() { + let values = (0_i32..=MAX_IN_LIST_ITEMS as i32) + .map(|value| serde_json::json!(value)) + .collect::<Vec<_>>(); + let raw = serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { + "op": "in", + "field": "importance", + "value": values, + }, + }); + + assert!(SearchFilter::parse(&raw).is_err()); + } + + #[test] + fn parse_rejects_unknown_field_with_json_path() { + let raw = serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { "op": "eq", "field": "bad_field", "value": "project_shared" }, + }); + let err = SearchFilter::parse(&raw).expect_err("expected unknown field error"); + + assert!(err.to_string().contains("$.filter.expr")); + assert!(err.to_string().contains("not in allowlist")); + } + + #[test] + fn parse_rejects_invalid_value_type_with_json_path() { + let raw = serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { "op": "eq", "field": "importance", "value": "wrong" }, + }); + let err = SearchFilter::parse(&raw).expect_err("expected invalid value type error"); + + assert!(err.to_string().contains("$.filter.expr.value")); + } + + #[test] + fn parse_rejects_oversize_string_with_json_path() { + let value = "x".repeat(MAX_STRING_BYTES + 1); + let raw = serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { "op": "eq", "field": "scope", "value": value }, + }); + let err = SearchFilter::parse(&raw).expect_err("expected string too long error"); + + assert!(err.to_string().contains("$.filter.expr.value")); + } + + #[test] + fn eval_filters_note_metadata() { + let raw = serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { + "op": "and", + "args": [ + { "op": "eq", "field": "scope", "value": "project_shared" }, + { "op": "gte", "field": "importance", "value": 0.5 }, + ], + }, + }); + let filter = SearchFilter::parse(&raw).expect("valid filter"); + let meta = note_meta(); + let note_meta = HashMap::from([(meta.note_id, meta)]); + let candidate = ChunkCandidate { + note_id: Uuid::new_v4(), + chunk_id: Uuid::new_v4(), + chunk_index: 0, + retrieval_rank: 1, + retrieval_score: None, + scope: Some("project_shared".to_string()), + updated_at: None, + embedding_version: None, + }; + let (result, impact) = filter.eval(vec![candidate], ¬e_meta, 10, 12); + + assert_eq!(result.len(), 0); + assert_eq!(impact.requested_candidate_k, 10); + assert_eq!(impact.effective_candidate_k, 12); + } + + #[test] + fn filter_impact_lists_top_drop_reasons_deterministically() { + let filter = SearchFilter::parse(&serde_json::json!({ + "schema": SEARCH_FILTER_EXPR_SCHEMA_V1, + "expr": { "op": "eq", "field": "scope", "value": "project_shared" }, + })) + .expect("valid filter"); + let first = Uuid::new_v4(); + let second = Uuid::new_v4(); + let third = Uuid::new_v4(); + let mut note_meta = HashMap::new(); + + note_meta.insert( + first, + NoteMeta { + note_id: first, + note_type: "fact".to_string(), + key: Some("k1".to_string()), + scope: "agent_private".to_string(), + agent_id: "a".to_string(), + importance: 0.9, + confidence: 0.9, + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).expect("timestamp"), + expires_at: None, + source_ref: Value::Object(Map::new()), + embedding_version: "provider:model:1".to_string(), + hit_count: 0, + last_hit_at: None, + }, + ); + note_meta.insert( + second, + NoteMeta { + note_id: second, + note_type: "fact".to_string(), + key: Some("k2".to_string()), + scope: "agent_private".to_string(), + agent_id: "a".to_string(), + importance: 0.9, + confidence: 0.9, + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_001).expect("timestamp"), + expires_at: None, + source_ref: Value::Object(Map::new()), + embedding_version: "provider:model:1".to_string(), + hit_count: 0, + last_hit_at: None, + }, + ); + + let candidates = vec![ + ChunkCandidate { + note_id: first, + chunk_id: Uuid::new_v4(), + chunk_index: 0, + retrieval_rank: 1, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: None, + }, + ChunkCandidate { + note_id: second, + chunk_id: Uuid::new_v4(), + chunk_index: 1, + retrieval_rank: 2, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: None, + }, + ChunkCandidate { + note_id: third, + chunk_id: Uuid::new_v4(), + chunk_index: 2, + retrieval_rank: 3, + retrieval_score: None, + scope: None, + updated_at: None, + embedding_version: None, + }, + ]; + let (_, impact) = filter.eval(candidates, ¬e_meta, 10, 20); + + assert_eq!(impact.candidate_count_pre, 3); + assert_eq!(impact.candidate_count_post, 0); + assert_eq!(impact.dropped_total, 3); + assert_eq!(impact.top_drop_reasons.len(), 2); + assert_eq!(impact.top_drop_reasons[0].reason, "eq:scope"); + assert_eq!(impact.top_drop_reasons[0].count, 2); + assert_eq!(impact.top_drop_reasons[1].reason, "note_meta_missing"); + assert_eq!(impact.top_drop_reasons[1].count, 1); + } +} diff --git a/packages/elf-service/src/search/ranking.rs b/packages/elf-service/src/search/ranking.rs new file mode 100644 index 00000000..e5397982 --- /dev/null +++ b/packages/elf-service/src/search/ranking.rs @@ -0,0 +1,42 @@ +mod cache; +mod diversity; +mod policy; +mod query; +mod retrieval; +mod text; + +pub(super) use self::{ + cache::{ + build_cached_scores, build_expansion_cache_key, build_rerank_cache_key, cache_key_prefix, + decode_json, hash_query, + }, + diversity::{ + attach_diversity_decisions_to_trace_candidates, build_diversity_explain, + build_rerank_ranks, build_rerank_ranks_for_replay, extract_replay_diversity_decisions, + select_diverse_results, + }, + policy::{ + NormalizationKind, ResolvedBlendPolicy, ResolvedDiversityPolicy, + ResolvedRetrievalSourcesPolicy, build_config_snapshot, build_policy_snapshot, + hash_policy_snapshot, resolve_blend_policy, resolve_diversity_policy, + resolve_retrieval_sources_policy, resolve_scopes, retrieval_weight_for_rank, + }, + query::{ + build_expansion_messages, expansion_mode_label, normalize_queries, resolve_expansion_mode, + should_expand_dynamic, + }, + retrieval::{ + candidate_matches_note, cmp_f32_desc, collect_chunk_candidates, collect_neighbor_pairs, + merge_retrieval_candidates, rank_normalize, stitch_snippet, + }, + text::{ + build_dense_embedding_input, build_scope_context_boost_by_scope, + compute_deterministic_ranking_terms, match_terms_in_text, merge_matched_fields, + tokenize_query, + }, +}; +#[cfg(test)] +pub(super) use self::{ + policy::BlendSegment, + text::{lexical_overlap_ratio, scope_description_boost}, +}; diff --git a/packages/elf-service/src/search/ranking/cache.rs b/packages/elf-service/src/search/ranking/cache.rs new file mode 100644 index 00000000..fb3fa8a4 --- /dev/null +++ b/packages/elf-service/src/search/ranking/cache.rs @@ -0,0 +1,128 @@ +use std::{ + collections::{HashMap, hash_map::DefaultHasher}, + hash::{Hash, Hasher}, +}; + +use serde::de::DeserializeOwned; +use serde_json::Value; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + Error, Result, + search::{RerankCacheCandidate, RerankCachePayload}, +}; + +const EXPANSION_CACHE_SCHEMA_VERSION: i32 = 1; +const RERANK_CACHE_SCHEMA_VERSION: i32 = 1; + +pub fn decode_json<T>(value: Value, label: &str) -> Result<T> +where + T: DeserializeOwned, +{ + serde_json::from_value(value) + .map_err(|err| Error::Storage { message: format!("Invalid {label} value: {err}") }) +} + +pub fn hash_query(query: &str) -> String { + let mut hasher = DefaultHasher::new(); + + Hash::hash(query, &mut hasher); + + format!("{:x}", hasher.finish()) +} + +pub fn hash_cache_key(payload: &Value) -> Result<String> { + let raw = serde_json::to_vec(payload).map_err(|err| Error::Storage { + message: format!("Failed to encode cache key payload: {err}"), + })?; + + Ok(blake3::hash(&raw).to_hex().to_string()) +} + +pub fn cache_key_prefix(key: &str) -> &str { + let len = key.len().min(12); + + &key[..len] +} + +pub fn build_expansion_cache_key( + query: &str, + max_queries: u32, + include_original: bool, + provider_id: &str, + model: &str, + temperature: f32, +) -> Result<String> { + let payload = serde_json::json!({ + "kind": "expansion", + "schema_version": EXPANSION_CACHE_SCHEMA_VERSION, + "query": query.trim(), + "provider_id": provider_id, + "model": model, + "temperature": temperature, + "max_queries": max_queries, + "include_original": include_original, + }); + + hash_cache_key(&payload) +} + +pub fn build_rerank_cache_key( + query: &str, + provider_id: &str, + model: &str, + candidates: &[(Uuid, OffsetDateTime)], +) -> Result<String> { + let signature: Vec<Value> = candidates + .iter() + .map(|(chunk_id, updated_at)| { + serde_json::json!({ + "chunk_id": chunk_id, + "updated_at": updated_at, + }) + }) + .collect(); + let payload = serde_json::json!({ + "kind": "rerank", + "schema_version": RERANK_CACHE_SCHEMA_VERSION, + "query": query.trim(), + "provider_id": provider_id, + "model": model, + "candidates": signature, + }); + + hash_cache_key(&payload) +} + +pub fn build_cached_scores( + payload: &RerankCachePayload, + candidates: &[RerankCacheCandidate], +) -> Option<Vec<f32>> { + if payload.items.len() != candidates.len() { + return None; + } + + let mut map = HashMap::new(); + + for item in &payload.items { + let key = (item.chunk_id, item.updated_at.unix_timestamp(), item.updated_at.nanosecond()); + + map.insert(key, item.score); + } + + let mut out = Vec::with_capacity(candidates.len()); + + for candidate in candidates { + let key = ( + candidate.chunk_id, + candidate.updated_at.unix_timestamp(), + candidate.updated_at.nanosecond(), + ); + let score = map.get(&key)?; + + out.push(*score); + } + + Some(out) +} diff --git a/packages/elf-service/src/search/ranking/diversity.rs b/packages/elf-service/src/search/ranking/diversity.rs new file mode 100644 index 00000000..ea09085f --- /dev/null +++ b/packages/elf-service/src/search/ranking/diversity.rs @@ -0,0 +1,498 @@ +use std::{cmp::Ordering, collections::HashMap}; + +use uuid::Uuid; + +use crate::search::{ + ChunkSnippet, DiversityDecision, ScoredChunk, SearchDiversityExplain, TraceCandidateRecord, + TraceReplayCandidate, + ranking::{policy::ResolvedDiversityPolicy, retrieval}, +}; + +#[derive(Clone, Copy)] +struct DiversityPick { + remaining_pos: usize, + mmr_score: f32, + nearest_note_id: Option<Uuid>, + similarity: Option<f32>, + missing_embedding: bool, + retrieval_rank: u32, +} +impl DiversityPick { + fn better_than(self, other: &Self) -> bool { + self.mmr_score > other.mmr_score + || (self.mmr_score == other.mmr_score && self.retrieval_rank < other.retrieval_rank) + } +} + +pub fn build_diversity_explain(decision: &DiversityDecision) -> SearchDiversityExplain { + SearchDiversityExplain { + enabled: true, + selected_reason: decision.selected_reason.clone(), + skipped_reason: decision.skipped_reason.clone(), + nearest_selected_note_id: decision.nearest_selected_note_id, + similarity: decision.similarity, + mmr_score: decision.mmr_score, + missing_embedding: decision.missing_embedding, + } +} + +pub fn cosine_similarity(lhs: &[f32], rhs: &[f32]) -> Option<f32> { + if lhs.is_empty() || lhs.len() != rhs.len() { + return None; + } + + let mut dot = 0.0_f32; + let mut lhs_norm = 0.0_f32; + let mut rhs_norm = 0.0_f32; + + for (l, r) in lhs.iter().zip(rhs.iter()) { + dot += l * r; + lhs_norm += l * l; + rhs_norm += r * r; + } + + if lhs_norm <= f32::EPSILON || rhs_norm <= f32::EPSILON { + return None; + } + + Some((dot / (lhs_norm.sqrt() * rhs_norm.sqrt())).clamp(-1.0, 1.0)) +} + +pub fn nearest_selected_similarity( + note_id: Uuid, + candidates: &[ScoredChunk], + selected_indices: &[usize], + note_vectors: &HashMap<Uuid, Vec<f32>>, +) -> (Option<f32>, Option<Uuid>, bool) { + let Some(candidate_vec) = note_vectors.get(¬e_id) else { + return (None, None, true); + }; + let mut best_similarity: Option<f32> = None; + let mut nearest_note_id: Option<Uuid> = None; + + for selected_idx in selected_indices { + let selected_note_id = candidates[*selected_idx].item.note.note_id; + let Some(selected_vec) = note_vectors.get(&selected_note_id) else { + continue; + }; + let Some(similarity) = cosine_similarity(candidate_vec, selected_vec) else { + continue; + }; + + if best_similarity.map(|value| similarity > value).unwrap_or(true) { + best_similarity = Some(similarity); + nearest_note_id = Some(selected_note_id); + } + } + + (best_similarity, nearest_note_id, false) +} + +pub fn select_diverse_results( + candidates: Vec<ScoredChunk>, + top_k: u32, + policy: &ResolvedDiversityPolicy, + note_vectors: &HashMap<Uuid, Vec<f32>>, +) -> (Vec<ScoredChunk>, HashMap<Uuid, DiversityDecision>) { + if candidates.is_empty() || top_k == 0 { + return (Vec::new(), HashMap::new()); + } + if !policy.enabled { + return select_diverse_results_disabled(candidates, top_k, note_vectors); + } + + select_diverse_results_enabled(candidates, top_k, policy, note_vectors) +} + +pub fn attach_diversity_decisions_to_trace_candidates( + candidates: &mut [TraceCandidateRecord], + decisions: &HashMap<Uuid, DiversityDecision>, +) { + for candidate in candidates { + let Some(decision) = decisions.get(&candidate.note_id) else { continue }; + let mut snapshot = candidate.candidate_snapshot.clone(); + let Some(object) = snapshot.as_object_mut() else { continue }; + + object.insert("diversity_selected".to_string(), serde_json::json!(decision.selected)); + object.insert( + "diversity_selected_rank".to_string(), + serde_json::json!(decision.selected_rank), + ); + object.insert( + "diversity_selected_reason".to_string(), + serde_json::json!(decision.selected_reason), + ); + object.insert( + "diversity_skipped_reason".to_string(), + serde_json::json!(decision.skipped_reason), + ); + object.insert( + "diversity_nearest_selected_note_id".to_string(), + serde_json::json!(decision.nearest_selected_note_id), + ); + object.insert("diversity_similarity".to_string(), serde_json::json!(decision.similarity)); + object.insert("diversity_mmr_score".to_string(), serde_json::json!(decision.mmr_score)); + object.insert( + "diversity_missing_embedding".to_string(), + serde_json::json!(decision.missing_embedding), + ); + + candidate.candidate_snapshot = snapshot; + } +} + +pub fn extract_replay_diversity_decisions( + candidates: &[TraceReplayCandidate], +) -> HashMap<Uuid, DiversityDecision> { + let mut out: HashMap<Uuid, DiversityDecision> = HashMap::new(); + + for candidate in candidates { + let has_diversity = candidate.diversity_selected.is_some() + || candidate.diversity_selected_rank.is_some() + || candidate.diversity_selected_reason.is_some() + || candidate.diversity_skipped_reason.is_some() + || candidate.diversity_nearest_selected_note_id.is_some() + || candidate.diversity_similarity.is_some() + || candidate.diversity_mmr_score.is_some() + || candidate.diversity_missing_embedding.is_some(); + + if !has_diversity { + continue; + } + + let selected = candidate.diversity_selected.unwrap_or(false); + let decision = DiversityDecision { + selected, + selected_rank: candidate.diversity_selected_rank, + selected_reason: candidate + .diversity_selected_reason + .clone() + .unwrap_or_else(|| "replay_selected".to_string()), + skipped_reason: candidate.diversity_skipped_reason.clone(), + nearest_selected_note_id: candidate.diversity_nearest_selected_note_id, + similarity: candidate.diversity_similarity, + mmr_score: candidate.diversity_mmr_score, + missing_embedding: candidate.diversity_missing_embedding.unwrap_or(false), + }; + let replace = match out.get(&candidate.note_id) { + None => true, + Some(existing) => + if decision.selected != existing.selected { + decision.selected + } else { + let lhs = decision.selected_rank.unwrap_or(u32::MAX); + let rhs = existing.selected_rank.unwrap_or(u32::MAX); + + lhs < rhs + }, + }; + + if replace { + out.insert(candidate.note_id, decision); + } + } + + out +} + +pub fn build_rerank_ranks(items: &[ChunkSnippet], scores: &[f32]) -> Vec<u32> { + let n = items.len(); + + if n == 0 { + return Vec::new(); + } + + let mut idxs: Vec<usize> = (0..n).collect(); + + idxs.sort_by(|&a, &b| { + let score_a = scores.get(a).copied().unwrap_or(f32::NAN); + let score_b = scores.get(b).copied().unwrap_or(f32::NAN); + let ord = retrieval::cmp_f32_desc(score_a, score_b); + + if ord != Ordering::Equal { + return ord; + } + if items[a].note.note_id == items[b].note.note_id { + let ord = items[a].chunk.chunk_index.cmp(&items[b].chunk.chunk_index); + + if ord != Ordering::Equal { + return ord; + } + } + + let ord = items[a].retrieval_rank.cmp(&items[b].retrieval_rank); + + if ord != Ordering::Equal { + return ord; + } + + items[a].chunk.chunk_id.cmp(&items[b].chunk.chunk_id) + }); + + let mut ranks = vec![0_u32; n]; + + for (pos, idx) in idxs.into_iter().enumerate() { + ranks[idx] = pos as u32 + 1; + } + + ranks +} + +pub fn build_rerank_ranks_for_replay(candidates: &[TraceReplayCandidate]) -> Vec<u32> { + let n = candidates.len(); + + if n == 0 { + return Vec::new(); + } + + let mut idxs: Vec<usize> = (0..n).collect(); + + idxs.sort_by(|&a, &b| { + let score_a = candidates.get(a).map(|candidate| candidate.rerank_score).unwrap_or(f32::NAN); + let score_b = candidates.get(b).map(|candidate| candidate.rerank_score).unwrap_or(f32::NAN); + let ord = retrieval::cmp_f32_desc(score_a, score_b); + + if ord != Ordering::Equal { + return ord; + } + + let ra = candidates.get(a).map(|candidate| candidate.retrieval_rank).unwrap_or(0); + let rb = candidates.get(b).map(|candidate| candidate.retrieval_rank).unwrap_or(0); + let ord = ra.cmp(&rb); + + if ord != Ordering::Equal { + return ord; + } + + let na = candidates.get(a).map(|candidate| candidate.note_id).unwrap_or(Uuid::nil()); + let nb = candidates.get(b).map(|candidate| candidate.note_id).unwrap_or(Uuid::nil()); + let ord = na.cmp(&nb); + + if ord != Ordering::Equal { + return ord; + } + + let ca = candidates.get(a).map(|candidate| candidate.chunk_id).unwrap_or(Uuid::nil()); + let cb = candidates.get(b).map(|candidate| candidate.chunk_id).unwrap_or(Uuid::nil()); + + ca.cmp(&cb) + }); + + let mut ranks = vec![0_u32; n]; + + for (pos, idx) in idxs.into_iter().enumerate() { + ranks[idx] = pos as u32 + 1; + } + + ranks +} + +fn select_diverse_results_disabled( + candidates: Vec<ScoredChunk>, + top_k: u32, + note_vectors: &HashMap<Uuid, Vec<f32>>, +) -> (Vec<ScoredChunk>, HashMap<Uuid, DiversityDecision>) { + let mut decisions = HashMap::new(); + let mut selected = Vec::new(); + + for (idx, candidate) in candidates.into_iter().enumerate() { + let selected_rank = (idx < top_k as usize).then_some(idx as u32 + 1); + let is_selected = selected_rank.is_some(); + let note_id = candidate.item.note.note_id; + let missing_embedding = !note_vectors.contains_key(¬e_id); + + decisions.insert( + note_id, + DiversityDecision { + selected: is_selected, + selected_rank, + selected_reason: if is_selected { + "disabled_passthrough".to_string() + } else { + "disabled_truncate".to_string() + }, + skipped_reason: if is_selected { + None + } else { + Some("disabled_truncate".to_string()) + }, + nearest_selected_note_id: None, + similarity: None, + mmr_score: None, + missing_embedding, + }, + ); + + if is_selected { + selected.push(candidate); + } + } + + (selected, decisions) +} + +fn select_diverse_results_enabled( + candidates: Vec<ScoredChunk>, + top_k: u32, + policy: &ResolvedDiversityPolicy, + note_vectors: &HashMap<Uuid, Vec<f32>>, +) -> (Vec<ScoredChunk>, HashMap<Uuid, DiversityDecision>) { + let total = u32::try_from(candidates.len()).unwrap_or(1).max(1); + let relevance_by_idx: Vec<f32> = + (0..candidates.len()).map(|idx| retrieval::rank_normalize(idx as u32 + 1, total)).collect(); + let mut remaining_indices: Vec<usize> = (0..candidates.len()).collect(); + let mut selected_indices: Vec<usize> = Vec::new(); + let mut decisions: HashMap<Uuid, DiversityDecision> = HashMap::new(); + let first_idx = remaining_indices.remove(0); + let first_note_id = candidates[first_idx].item.note.note_id; + let first_missing_embedding = !note_vectors.contains_key(&first_note_id); + + selected_indices.push(first_idx); + decisions.insert( + first_note_id, + DiversityDecision { + selected: true, + selected_rank: Some(1), + selected_reason: "top_relevance".to_string(), + skipped_reason: None, + nearest_selected_note_id: None, + similarity: None, + mmr_score: Some(relevance_by_idx[first_idx]), + missing_embedding: first_missing_embedding, + }, + ); + + while selected_indices.len() < top_k as usize && !remaining_indices.is_empty() { + let Some((selected_pick, selected_reason)) = pick_next_candidate( + &remaining_indices, + &candidates, + &selected_indices, + note_vectors, + &relevance_by_idx, + policy, + ) else { + break; + }; + let picked_idx = remaining_indices.remove(selected_pick.remaining_pos); + + selected_indices.push(picked_idx); + + let selected_note_id = candidates[picked_idx].item.note.note_id; + + decisions.insert( + selected_note_id, + DiversityDecision { + selected: true, + selected_rank: Some(selected_indices.len() as u32), + selected_reason: selected_reason.to_string(), + skipped_reason: None, + nearest_selected_note_id: selected_pick.nearest_note_id, + similarity: selected_pick.similarity, + mmr_score: Some(selected_pick.mmr_score), + missing_embedding: selected_pick.missing_embedding, + }, + ); + } + + for candidate_idx in remaining_indices { + let note_id = candidates[candidate_idx].item.note.note_id; + let (similarity, nearest_note_id, missing_embedding) = + nearest_selected_similarity(note_id, &candidates, &selected_indices, note_vectors); + let skipped_reason = + if similarity.map(|value| value > policy.sim_threshold).unwrap_or(false) { + "similarity_threshold" + } else { + "lower_mmr" + }; + let redundancy = similarity.unwrap_or(0.0); + let mmr_score = policy.mmr_lambda * relevance_by_idx[candidate_idx] + - (1.0 - policy.mmr_lambda) * redundancy; + + decisions.insert( + note_id, + DiversityDecision { + selected: false, + selected_rank: None, + selected_reason: "not_selected".to_string(), + skipped_reason: Some(skipped_reason.to_string()), + nearest_selected_note_id: nearest_note_id, + similarity, + mmr_score: Some(mmr_score), + missing_embedding, + }, + ); + } + + let selected = selected_indices.into_iter().map(|idx| candidates[idx].clone()).collect(); + + (selected, decisions) +} + +fn pick_next_candidate( + remaining_indices: &[usize], + candidates: &[ScoredChunk], + selected_indices: &[usize], + note_vectors: &HashMap<Uuid, Vec<f32>>, + relevance_by_idx: &[f32], + policy: &ResolvedDiversityPolicy, +) -> Option<(DiversityPick, &'static str)> { + let mut best_non_filtered: Option<DiversityPick> = None; + let mut best_filtered: Option<DiversityPick> = None; + let mut best_any: Option<DiversityPick> = None; + let mut filtered_count = 0_u32; + + for (remaining_pos, candidate_idx) in remaining_indices.iter().copied().enumerate() { + let note_id = candidates[candidate_idx].item.note.note_id; + let (similarity, nearest_note_id, missing_embedding) = + nearest_selected_similarity(note_id, candidates, selected_indices, note_vectors); + let redundancy = similarity.unwrap_or(0.0); + let mmr_score = policy.mmr_lambda * relevance_by_idx[candidate_idx] + - (1.0 - policy.mmr_lambda) * redundancy; + let high_similarity = similarity.map(|value| value > policy.sim_threshold).unwrap_or(false); + + if high_similarity { + filtered_count += 1; + } + + let candidate_pick = DiversityPick { + remaining_pos, + mmr_score, + nearest_note_id, + similarity, + missing_embedding, + retrieval_rank: candidates[candidate_idx].item.retrieval_rank, + }; + + if best_any.as_ref().map(|current| candidate_pick.better_than(current)).unwrap_or(true) { + best_any = Some(candidate_pick); + } + if high_similarity { + if best_filtered + .as_ref() + .map(|current| candidate_pick.better_than(current)) + .unwrap_or(true) + { + best_filtered = Some(candidate_pick); + } + + continue; + } + if best_non_filtered + .as_ref() + .map(|current| candidate_pick.better_than(current)) + .unwrap_or(true) + { + best_non_filtered = Some(candidate_pick); + } + } + + if let Some(best) = best_non_filtered { + return Some((best, "mmr")); + } + + if filtered_count >= policy.max_skips { + return best_any.map(|best| (best, "max_skips_backfill")); + } + + best_filtered.map(|best| (best, "threshold_backfill")) +} diff --git a/packages/elf-service/src/search/ranking/policy.rs b/packages/elf-service/src/search/ranking/policy.rs new file mode 100644 index 00000000..86f51d93 --- /dev/null +++ b/packages/elf-service/src/search/ranking/policy.rs @@ -0,0 +1,461 @@ +use serde_json::Value; + +use crate::{ + Error, Result, + search::{ + BlendRankingOverride, DiversityRankingOverride, RankingRequestOverride, + RetrievalSourcesRankingOverride, + }, +}; +use elf_config::{Config, RankingBlend, RankingDiversity, RankingRetrievalSources}; + +#[derive(Clone, Copy, Debug)] +pub enum NormalizationKind { + Rank, +} +impl NormalizationKind { + pub fn as_str(self) -> &'static str { + match self { + Self::Rank => "rank", + } + } +} + +#[derive(Clone, Debug)] +pub struct BlendSegment { + pub max_retrieval_rank: u32, + pub retrieval_weight: f32, +} + +#[derive(Clone, Debug)] +pub struct ResolvedBlendPolicy { + pub enabled: bool, + pub rerank_normalization: NormalizationKind, + pub retrieval_normalization: NormalizationKind, + pub segments: Vec<BlendSegment>, +} + +#[derive(Clone, Debug)] +pub struct ResolvedDiversityPolicy { + pub enabled: bool, + pub sim_threshold: f32, + pub mmr_lambda: f32, + pub max_skips: u32, +} + +#[derive(Clone, Debug)] +pub struct ResolvedRetrievalSourcesPolicy { + pub fusion_weight: f32, + pub structured_field_weight: f32, + pub recursive_weight: f32, + pub fusion_priority: u32, + pub structured_field_priority: u32, + pub recursive_priority: u32, +} + +pub fn build_config_snapshot( + cfg: &Config, + blend_policy: &ResolvedBlendPolicy, + diversity_policy: &ResolvedDiversityPolicy, + retrieval_sources_policy: &ResolvedRetrievalSourcesPolicy, + ranking_override: Option<&RankingRequestOverride>, + policy_id: &str, + policy_snapshot: &Value, +) -> Value { + let override_json = ranking_override.and_then(|value| serde_json::to_value(value).ok()); + + serde_json::json!({ + "search": { + "expansion": { + "mode": cfg.search.expansion.mode.as_str(), + "max_queries": cfg.search.expansion.max_queries, + "include_original": cfg.search.expansion.include_original, + }, + "dynamic": { + "min_candidates": cfg.search.dynamic.min_candidates, + "min_top_score": cfg.search.dynamic.min_top_score, + }, + "prefilter": { + "max_candidates": cfg.search.prefilter.max_candidates, + }, + "explain": { + "retention_days": cfg.search.explain.retention_days, + }, + }, + "ranking": { + "policy_id": policy_id, + "policy_snapshot": policy_snapshot.clone(), + "recency_tau_days": cfg.ranking.recency_tau_days, + "tie_breaker_weight": cfg.ranking.tie_breaker_weight, + "deterministic": { + "enabled": cfg.ranking.deterministic.enabled, + "lexical": { + "enabled": cfg.ranking.deterministic.lexical.enabled, + "weight": cfg.ranking.deterministic.lexical.weight, + "min_ratio": cfg.ranking.deterministic.lexical.min_ratio, + "max_query_terms": cfg.ranking.deterministic.lexical.max_query_terms, + "max_text_terms": cfg.ranking.deterministic.lexical.max_text_terms, + }, + "hits": { + "enabled": cfg.ranking.deterministic.hits.enabled, + "weight": cfg.ranking.deterministic.hits.weight, + "half_saturation": cfg.ranking.deterministic.hits.half_saturation, + "last_hit_tau_days": cfg.ranking.deterministic.hits.last_hit_tau_days, + }, + "decay": { + "enabled": cfg.ranking.deterministic.decay.enabled, + "weight": cfg.ranking.deterministic.decay.weight, + "tau_days": cfg.ranking.deterministic.decay.tau_days, + }, + }, + "blend": { + "enabled": blend_policy.enabled, + "rerank_normalization": blend_policy.rerank_normalization.as_str(), + "retrieval_normalization": blend_policy.retrieval_normalization.as_str(), + "segments": blend_policy + .segments + .iter() + .map(|segment| { + serde_json::json!({ + "max_retrieval_rank": segment.max_retrieval_rank, + "retrieval_weight": segment.retrieval_weight, + }) + }) + .collect::<Vec<_>>(), + }, + "diversity": { + "enabled": diversity_policy.enabled, + "sim_threshold": diversity_policy.sim_threshold, + "mmr_lambda": diversity_policy.mmr_lambda, + "max_skips": diversity_policy.max_skips, + }, + "retrieval_sources": { + "fusion_weight": retrieval_sources_policy.fusion_weight, + "structured_field_weight": retrieval_sources_policy.structured_field_weight, + "recursive_weight": retrieval_sources_policy.recursive_weight, + "fusion_priority": retrieval_sources_policy.fusion_priority, + "structured_field_priority": retrieval_sources_policy.structured_field_priority, + "recursive_priority": retrieval_sources_policy.recursive_priority, + }, + "override": override_json, + }, + "providers": { + "embedding": { + "provider_id": cfg.providers.embedding.provider_id.as_str(), + "model": cfg.providers.embedding.model.as_str(), + "dimensions": cfg.providers.embedding.dimensions, + }, + "rerank": { + "provider_id": cfg.providers.rerank.provider_id.as_str(), + "model": cfg.providers.rerank.model.as_str(), + }, + }, + "storage": { + "qdrant": { + "vector_dim": cfg.storage.qdrant.vector_dim, + "collection": cfg.storage.qdrant.collection.as_str(), + }, + }, + "context": { + "scope_boost_weight": cfg.context.as_ref().and_then(|ctx| ctx.scope_boost_weight), + "project_description_count": cfg + .context + .as_ref() + .and_then(|ctx| ctx.project_descriptions.as_ref()) + .map(|descriptions| descriptions.len()) + .unwrap_or(0), + "scope_description_count": cfg + .context + .as_ref() + .and_then(|ctx| ctx.scope_descriptions.as_ref()) + .map(|descriptions| descriptions.len()) + .unwrap_or(0), + }, + }) +} + +pub fn build_policy_snapshot( + cfg: &Config, + blend_policy: &ResolvedBlendPolicy, + diversity_policy: &ResolvedDiversityPolicy, + retrieval_sources_policy: &ResolvedRetrievalSourcesPolicy, + ranking_override: Option<&RankingRequestOverride>, +) -> Value { + let override_json = ranking_override.and_then(|value| serde_json::to_value(value).ok()); + + serde_json::json!({ + "ranking": { + "recency_tau_days": cfg.ranking.recency_tau_days, + "tie_breaker_weight": cfg.ranking.tie_breaker_weight, + "deterministic": { + "enabled": cfg.ranking.deterministic.enabled, + "lexical": { + "enabled": cfg.ranking.deterministic.lexical.enabled, + "weight": cfg.ranking.deterministic.lexical.weight, + "min_ratio": cfg.ranking.deterministic.lexical.min_ratio, + "max_query_terms": cfg.ranking.deterministic.lexical.max_query_terms, + "max_text_terms": cfg.ranking.deterministic.lexical.max_text_terms, + }, + "hits": { + "enabled": cfg.ranking.deterministic.hits.enabled, + "weight": cfg.ranking.deterministic.hits.weight, + "half_saturation": cfg.ranking.deterministic.hits.half_saturation, + "last_hit_tau_days": cfg.ranking.deterministic.hits.last_hit_tau_days, + }, + "decay": { + "enabled": cfg.ranking.deterministic.decay.enabled, + "weight": cfg.ranking.deterministic.decay.weight, + "tau_days": cfg.ranking.deterministic.decay.tau_days, + }, + }, + "blend": { + "enabled": blend_policy.enabled, + "rerank_normalization": blend_policy.rerank_normalization.as_str(), + "retrieval_normalization": blend_policy.retrieval_normalization.as_str(), + "segments": blend_policy + .segments + .iter() + .map(|segment| { + serde_json::json!({ + "max_retrieval_rank": segment.max_retrieval_rank, + "retrieval_weight": segment.retrieval_weight, + }) + }) + .collect::<Vec<_>>(), + }, + "diversity": { + "enabled": diversity_policy.enabled, + "sim_threshold": diversity_policy.sim_threshold, + "mmr_lambda": diversity_policy.mmr_lambda, + "max_skips": diversity_policy.max_skips, + }, + "retrieval_sources": { + "fusion_weight": retrieval_sources_policy.fusion_weight, + "structured_field_weight": retrieval_sources_policy.structured_field_weight, + "recursive_weight": retrieval_sources_policy.recursive_weight, + "fusion_priority": retrieval_sources_policy.fusion_priority, + "structured_field_priority": retrieval_sources_policy.structured_field_priority, + "recursive_priority": retrieval_sources_policy.recursive_priority, + }, + "override": override_json, + }, + "context": { + "scope_boost_weight": cfg.context.as_ref().and_then(|ctx| ctx.scope_boost_weight), + "project_description_count": cfg + .context + .as_ref() + .and_then(|ctx| ctx.project_descriptions.as_ref()) + .map(|descriptions| descriptions.len()) + .unwrap_or(0), + "scope_description_count": cfg + .context + .as_ref() + .and_then(|ctx| ctx.scope_descriptions.as_ref()) + .map(|descriptions| descriptions.len()) + .unwrap_or(0), + }, + }) +} + +pub fn hash_policy_snapshot(payload: &Value) -> Result<String> { + let raw = serde_json::to_vec(payload).map_err(|err| Error::Storage { + message: format!("Failed to encode policy snapshot: {err}"), + })?; + + Ok(blake3::hash(&raw).to_hex().to_string()) +} + +pub fn resolve_blend_policy( + cfg: &RankingBlend, + override_: Option<&BlendRankingOverride>, +) -> Result<ResolvedBlendPolicy> { + let enabled = override_.and_then(|value| value.enabled).unwrap_or(cfg.enabled); + let rerank_norm = override_ + .and_then(|value| value.rerank_normalization.as_deref()) + .unwrap_or(cfg.rerank_normalization.as_str()); + let retrieval_norm = override_ + .and_then(|value| value.retrieval_normalization.as_deref()) + .unwrap_or(cfg.retrieval_normalization.as_str()); + let rerank_normalization = + parse_normalization_kind(rerank_norm, "ranking.blend.rerank_normalization")?; + let retrieval_normalization = + parse_normalization_kind(retrieval_norm, "ranking.blend.retrieval_normalization")?; + let segments: Vec<BlendSegment> = + if let Some(override_segments) = override_.and_then(|value| value.segments.as_ref()) { + override_segments + .iter() + .map(|segment| BlendSegment { + max_retrieval_rank: segment.max_retrieval_rank, + retrieval_weight: segment.retrieval_weight, + }) + .collect::<Vec<_>>() + } else { + cfg.segments + .iter() + .map(|segment| BlendSegment { + max_retrieval_rank: segment.max_retrieval_rank, + retrieval_weight: segment.retrieval_weight, + }) + .collect::<Vec<_>>() + }; + + validate_blend_segments(&segments)?; + + Ok(ResolvedBlendPolicy { enabled, rerank_normalization, retrieval_normalization, segments }) +} + +pub fn resolve_diversity_policy( + cfg: &RankingDiversity, + override_: Option<&DiversityRankingOverride>, +) -> Result<ResolvedDiversityPolicy> { + let enabled = override_.and_then(|value| value.enabled).unwrap_or(cfg.enabled); + let sim_threshold = + override_.and_then(|value| value.sim_threshold).unwrap_or(cfg.sim_threshold); + let mmr_lambda = override_.and_then(|value| value.mmr_lambda).unwrap_or(cfg.mmr_lambda); + let max_skips = override_.and_then(|value| value.max_skips).unwrap_or(cfg.max_skips); + + if !sim_threshold.is_finite() { + return Err(Error::InvalidRequest { + message: "ranking.diversity.sim_threshold must be a finite number.".to_string(), + }); + } + if !(0.0..=1.0).contains(&sim_threshold) { + return Err(Error::InvalidRequest { + message: "ranking.diversity.sim_threshold must be in the range 0.0-1.0.".to_string(), + }); + } + if !mmr_lambda.is_finite() { + return Err(Error::InvalidRequest { + message: "ranking.diversity.mmr_lambda must be a finite number.".to_string(), + }); + } + if !(0.0..=1.0).contains(&mmr_lambda) { + return Err(Error::InvalidRequest { + message: "ranking.diversity.mmr_lambda must be in the range 0.0-1.0.".to_string(), + }); + } + + Ok(ResolvedDiversityPolicy { enabled, sim_threshold, mmr_lambda, max_skips }) +} + +pub fn resolve_retrieval_sources_policy( + cfg: &RankingRetrievalSources, + override_: Option<&RetrievalSourcesRankingOverride>, +) -> Result<ResolvedRetrievalSourcesPolicy> { + let fusion_weight = + override_.and_then(|value| value.fusion_weight).unwrap_or(cfg.fusion_weight); + let structured_field_weight = override_ + .and_then(|value| value.structured_field_weight) + .unwrap_or(cfg.structured_field_weight); + let recursive_weight = + override_.and_then(|value| value.recursive_weight).unwrap_or(structured_field_weight); + let fusion_priority = + override_.and_then(|value| value.fusion_priority).unwrap_or(cfg.fusion_priority); + let structured_field_priority = override_ + .and_then(|value| value.structured_field_priority) + .unwrap_or(cfg.structured_field_priority); + let recursive_priority = override_ + .and_then(|value| value.recursive_priority) + .unwrap_or(structured_field_priority.saturating_add(1)); + + for (path, value) in [ + ("ranking.retrieval_sources.fusion_weight", fusion_weight), + ("ranking.retrieval_sources.structured_field_weight", structured_field_weight), + ("ranking.retrieval_sources.recursive_weight", recursive_weight), + ] { + if !value.is_finite() { + return Err(Error::InvalidRequest { + message: format!("{path} must be a finite number."), + }); + } + if value < 0.0 { + return Err(Error::InvalidRequest { + message: format!("{path} must be zero or greater."), + }); + } + } + + if fusion_weight <= 0.0 && structured_field_weight <= 0.0 && recursive_weight <= 0.0 { + return Err(Error::InvalidRequest { + message: "At least one retrieval source weight must be greater than zero.".to_string(), + }); + } + + Ok(ResolvedRetrievalSourcesPolicy { + fusion_weight, + structured_field_weight, + recursive_weight, + fusion_priority, + structured_field_priority, + recursive_priority, + }) +} + +pub fn parse_normalization_kind(value: &str, label: &str) -> Result<NormalizationKind> { + match value.trim().to_ascii_lowercase().as_str() { + "rank" => Ok(NormalizationKind::Rank), + other => Err(Error::InvalidRequest { + message: format!("{label} must be one of: rank. Got {other}."), + }), + } +} + +pub fn validate_blend_segments(segments: &[BlendSegment]) -> Result<()> { + if segments.is_empty() { + return Err(Error::InvalidRequest { + message: "ranking.blend.segments must be non-empty.".to_string(), + }); + } + + let mut last_max = 0_u32; + + for (idx, segment) in segments.iter().enumerate() { + if segment.max_retrieval_rank == 0 { + return Err(Error::InvalidRequest { + message: "ranking.blend.segments.max_retrieval_rank must be greater than zero." + .to_string(), + }); + } + if idx > 0 && segment.max_retrieval_rank <= last_max { + return Err(Error::InvalidRequest { + message: "ranking.blend.segments.max_retrieval_rank must be strictly increasing." + .to_string(), + }); + } + if !segment.retrieval_weight.is_finite() { + return Err(Error::InvalidRequest { + message: "ranking.blend.segments.retrieval_weight must be a finite number." + .to_string(), + }); + } + if !(0.0..=1.0).contains(&segment.retrieval_weight) { + return Err(Error::InvalidRequest { + message: "ranking.blend.segments.retrieval_weight must be in the range 0.0-1.0." + .to_string(), + }); + } + + last_max = segment.max_retrieval_rank; + } + + Ok(()) +} + +pub fn retrieval_weight_for_rank(rank: u32, segments: &[BlendSegment]) -> f32 { + for segment in segments { + if rank <= segment.max_retrieval_rank { + return segment.retrieval_weight; + } + } + + segments.last().map(|segment| segment.retrieval_weight).unwrap_or(0.5) +} + +pub fn resolve_scopes(cfg: &Config, profile: &str) -> Result<Vec<String>> { + match profile { + "private_only" => Ok(cfg.scopes.read_profiles.private_only.clone()), + "private_plus_project" => Ok(cfg.scopes.read_profiles.private_plus_project.clone()), + "all_scopes" => Ok(cfg.scopes.read_profiles.all_scopes.clone()), + _ => Err(Error::InvalidRequest { message: "Unknown read_profile.".to_string() }), + } +} diff --git a/packages/elf-service/src/search/ranking/query.rs b/packages/elf-service/src/search/ranking/query.rs new file mode 100644 index 00000000..a67bf427 --- /dev/null +++ b/packages/elf-service/src/search/ranking/query.rs @@ -0,0 +1,96 @@ +use std::collections::HashSet; + +use serde_json::Value; + +use crate::search::ExpansionMode; +use elf_config::{Config, SearchDynamic}; +use elf_domain::english_gate; + +pub fn resolve_expansion_mode(cfg: &Config) -> ExpansionMode { + match cfg.search.expansion.mode.as_str() { + "off" => ExpansionMode::Off, + "always" => ExpansionMode::Always, + "dynamic" => ExpansionMode::Dynamic, + _ => ExpansionMode::Off, + } +} + +pub fn should_expand_dynamic(candidate_count: usize, top_score: f32, cfg: &SearchDynamic) -> bool { + candidate_count < cfg.min_candidates as usize || top_score < cfg.min_top_score +} + +pub fn normalize_queries( + queries: Vec<String>, + original: &str, + include_original: bool, + max_queries: u32, +) -> Vec<String> { + let mut out = Vec::new(); + let mut seen = HashSet::new(); + + if include_original { + push_query(&mut out, &mut seen, original); + } + + for query in queries { + if out.len() >= max_queries as usize { + break; + } + + push_query(&mut out, &mut seen, &query); + } + + out.truncate(max_queries as usize); + + out +} + +pub fn push_query(out: &mut Vec<String>, seen: &mut HashSet<String>, value: &str) { + let trimmed = value.trim(); + + if trimmed.is_empty() || !english_gate::is_english_natural_language(trimmed) { + return; + } + + let key = trimmed.to_lowercase(); + + if seen.insert(key) { + out.push(trimmed.to_string()); + } +} + +pub fn build_expansion_messages( + query: &str, + max_queries: u32, + include_original: bool, +) -> Vec<Value> { + let schema = serde_json::json!({ + "queries": ["string"] + }); + let schema_text = serde_json::to_string_pretty(&schema) + .unwrap_or_else(|_| "{\"queries\": [\"string\"]}".to_string()); + let system_prompt = "You are a query expansion engine for a memory retrieval system. \ +Output must be valid JSON only and must match the provided schema exactly. \ +Generate short English-only query variations that preserve the original intent. \ +Do not include any non-English text. Do not add explanations or extra fields."; + let user_prompt = format!( + "Return JSON matching this exact schema:\n{schema}\nConstraints:\n- MAX_QUERIES = {max}\n- INCLUDE_ORIGINAL = {include}\nOriginal query:\n{query}", + schema = schema_text, + max = max_queries, + include = include_original, + query = query + ); + + vec![ + serde_json::json!({ "role": "system", "content": system_prompt }), + serde_json::json!({ "role": "user", "content": user_prompt }), + ] +} + +pub fn expansion_mode_label(mode: ExpansionMode) -> &'static str { + match mode { + ExpansionMode::Off => "off", + ExpansionMode::Always => "always", + ExpansionMode::Dynamic => "dynamic", + } +} diff --git a/packages/elf-service/src/search/ranking/retrieval.rs b/packages/elf-service/src/search/ranking/retrieval.rs new file mode 100644 index 00000000..1f7d826f --- /dev/null +++ b/packages/elf-service/src/search/ranking/retrieval.rs @@ -0,0 +1,355 @@ +use std::{ + cmp::Ordering, + collections::{HashMap, HashSet}, +}; + +use qdrant_client::qdrant::{PointId, ScoredPoint, Value, point_id::PointIdOptions, value::Kind}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use uuid::Uuid; + +use crate::search::{ + ChunkCandidate, ChunkRow, NoteMeta, RetrievalSourceCandidates, RetrievalSourceKind, + ranking::policy::ResolvedRetrievalSourcesPolicy, +}; + +pub fn collect_chunk_candidates( + points: &[ScoredPoint], + max_candidates: u32, + candidate_k: u32, +) -> Vec<ChunkCandidate> { + let limit = if max_candidates == 0 || max_candidates >= candidate_k { + points.len() + } else { + max_candidates as usize + }; + let mut out = Vec::new(); + let mut seen = HashSet::new(); + + for (idx, point) in points.iter().take(limit).enumerate() { + let chunk_id = point + .id + .as_ref() + .and_then(point_id_to_uuid) + .or_else(|| payload_uuid(&point.payload, "chunk_id")); + let Some(chunk_id) = chunk_id else { + tracing::warn!("Chunk candidate missing chunk_id."); + + continue; + }; + + if !seen.insert(chunk_id) { + continue; + } + + let Some(note_id) = payload_uuid(&point.payload, "note_id") else { + tracing::warn!(chunk_id = %chunk_id, "Chunk candidate missing note_id."); + + continue; + }; + let Some(chunk_index) = payload_i32(&point.payload, "chunk_index") else { + tracing::warn!(chunk_id = %chunk_id, "Chunk candidate missing chunk_index."); + + continue; + }; + let updated_at = payload_rfc3339(&point.payload, "updated_at"); + let embedding_version = payload_string(&point.payload, "embedding_version"); + let scope = payload_string(&point.payload, "scope"); + + out.push(ChunkCandidate { + chunk_id, + note_id, + chunk_index, + retrieval_rank: idx as u32 + 1, + retrieval_score: Some(point.score), + updated_at, + embedding_version, + scope, + }); + } + + out +} + +pub fn retrieval_source_weight( + policy: &ResolvedRetrievalSourcesPolicy, + source: RetrievalSourceKind, +) -> f32 { + match source { + RetrievalSourceKind::Fusion => policy.fusion_weight, + RetrievalSourceKind::StructuredField => policy.structured_field_weight, + RetrievalSourceKind::Recursive => policy.recursive_weight, + } +} + +pub fn retrieval_source_priority( + policy: &ResolvedRetrievalSourcesPolicy, + source: RetrievalSourceKind, +) -> u32 { + match source { + RetrievalSourceKind::StructuredField => policy.structured_field_priority, + RetrievalSourceKind::Fusion => policy.fusion_priority, + RetrievalSourceKind::Recursive => policy.recursive_priority, + } +} + +pub fn retrieval_source_kind_order(source: RetrievalSourceKind) -> u8 { + match source { + RetrievalSourceKind::StructuredField => 0, + RetrievalSourceKind::Fusion => 1, + RetrievalSourceKind::Recursive => 2, + } +} + +pub fn merge_retrieval_candidates( + sources: Vec<RetrievalSourceCandidates>, + policy: &ResolvedRetrievalSourcesPolicy, + candidate_k: u32, +) -> Vec<ChunkCandidate> { + if candidate_k == 0 { + return Vec::new(); + } + + #[derive(Debug)] + struct MergedRetrievalCandidate { + candidate: ChunkCandidate, + source_ranks: HashMap<RetrievalSourceKind, u32>, + combined_score: f32, + } + + let mut by_chunk: HashMap<Uuid, MergedRetrievalCandidate> = HashMap::new(); + let mut source_totals: HashMap<RetrievalSourceKind, u32> = HashMap::new(); + + for source in sources { + let mut seen_for_source = HashSet::new(); + + for candidate in &source.candidates { + if seen_for_source.insert(candidate.chunk_id) { + *source_totals.entry(source.source).or_insert(0) += 1; + } + } + for candidate in source.candidates { + let chunk_id = candidate.chunk_id; + let rank = candidate.retrieval_rank; + + match by_chunk.get_mut(&chunk_id) { + Some(existing) => { + let entry = existing.source_ranks.entry(source.source).or_insert(rank); + + *entry = (*entry).min(rank); + }, + None => { + let mut source_ranks = HashMap::new(); + + source_ranks.insert(source.source, rank); + by_chunk.insert( + chunk_id, + MergedRetrievalCandidate { candidate, source_ranks, combined_score: 0.0 }, + ); + }, + } + } + } + + if by_chunk.is_empty() { + return Vec::new(); + } + + for total in source_totals.values_mut() { + *total = (*total).max(1); + } + + let mut source_order: Vec<RetrievalSourceKind> = source_totals.keys().copied().collect(); + + source_order.sort_by(|left, right| { + retrieval_source_priority(policy, *left) + .cmp(&retrieval_source_priority(policy, *right)) + .then_with(|| { + retrieval_source_kind_order(*left).cmp(&retrieval_source_kind_order(*right)) + }) + }); + + let mut merged: Vec<MergedRetrievalCandidate> = by_chunk.into_values().collect(); + + for candidate in &mut merged { + let mut combined_score = 0.0_f32; + + for (source, rank) in &candidate.source_ranks { + let total = source_totals.get(source).copied().unwrap_or(1); + + combined_score += + retrieval_source_weight(policy, *source) * rank_normalize(*rank, total); + } + + candidate.combined_score = combined_score; + } + + merged.sort_by(|left, right| { + cmp_f32_desc(left.combined_score, right.combined_score) + .then_with(|| right.source_ranks.len().cmp(&left.source_ranks.len())) + .then_with(|| { + for source in &source_order { + let lhs = left.source_ranks.get(source).copied(); + let rhs = right.source_ranks.get(source).copied(); + let ord = rank_asc(lhs, rhs); + + if ord != Ordering::Equal { + return ord; + } + } + + Ordering::Equal + }) + .then_with(|| left.candidate.chunk_id.cmp(&right.candidate.chunk_id)) + }); + + let mut out = Vec::new(); + + for (idx, mut candidate) in merged.into_iter().take(candidate_k as usize).enumerate() { + candidate.candidate.retrieval_rank = idx as u32 + 1; + candidate.candidate.retrieval_score = Some(candidate.combined_score); + + out.push(candidate.candidate); + } + + out +} + +pub fn rank_asc(left: Option<u32>, right: Option<u32>) -> Ordering { + let lhs = left.unwrap_or(u32::MAX); + let rhs = right.unwrap_or(u32::MAX); + + lhs.cmp(&rhs) +} + +pub fn candidate_matches_note( + note_meta: &HashMap<Uuid, NoteMeta>, + candidate: &ChunkCandidate, +) -> bool { + let Some(note) = note_meta.get(&candidate.note_id) else { return false }; + + if let Some(version) = candidate.embedding_version.as_deref() + && version != note.embedding_version.as_str() + { + return false; + } + if let Some(ts) = candidate.updated_at + && ts != note.updated_at + { + return false; + } + + true +} + +pub fn collect_neighbor_pairs(candidates: &[ChunkCandidate]) -> Vec<(Uuid, i32)> { + let mut seen = HashSet::new(); + let mut out = Vec::new(); + + for candidate in candidates { + let mut indices = Vec::with_capacity(3); + + indices.push(candidate.chunk_index); + + if let Some(prev) = candidate.chunk_index.checked_sub(1) { + indices.push(prev); + } + if let Some(next) = candidate.chunk_index.checked_add(1) { + indices.push(next); + } + + for idx in indices { + let key = (candidate.note_id, idx); + + if seen.insert(key) { + out.push(key); + } + } + } + + out +} + +pub fn stitch_snippet( + note_id: Uuid, + chunk_index: i32, + chunks: &HashMap<(Uuid, i32), ChunkRow>, +) -> String { + let indices = [chunk_index.checked_sub(1), Some(chunk_index), chunk_index.checked_add(1)]; + let mut out = String::new(); + + for index in indices.into_iter().flatten() { + if let Some(chunk) = chunks.get(&(note_id, index)) { + out.push_str(chunk.text.as_str()); + } + } + + out.trim().to_string() +} + +pub fn rank_normalize(rank: u32, total: u32) -> f32 { + if total <= 1 { + return 1.0; + } + if rank == 0 { + return 0.0; + } + + let denom = (total - 1) as f32; + let pos = (rank.saturating_sub(1)) as f32; + + (1.0 - pos / denom).clamp(0.0, 1.0) +} + +pub fn cmp_f32_desc(a: f32, b: f32) -> Ordering { + match (a.is_nan(), b.is_nan()) { + (true, true) => Ordering::Equal, + (true, false) => Ordering::Greater, + (false, true) => Ordering::Less, + (false, false) => b.partial_cmp(&a).unwrap_or(Ordering::Equal), + } +} +pub fn point_id_to_uuid(point_id: &PointId) -> Option<Uuid> { + match &point_id.point_id_options { + Some(PointIdOptions::Uuid(id)) => Uuid::parse_str(id).ok(), + _ => None, + } +} + +pub fn payload_uuid(payload: &HashMap<String, Value>, key: &str) -> Option<Uuid> { + let value = payload.get(key)?; + + match &value.kind { + Some(Kind::StringValue(text)) => Uuid::parse_str(text).ok(), + _ => None, + } +} + +pub fn payload_string(payload: &HashMap<String, Value>, key: &str) -> Option<String> { + let value = payload.get(key)?; + + match &value.kind { + Some(Kind::StringValue(text)) => Some(text.to_string()), + _ => None, + } +} + +pub fn payload_rfc3339(payload: &HashMap<String, Value>, key: &str) -> Option<OffsetDateTime> { + let text = payload_string(payload, key)?; + + OffsetDateTime::parse(text.as_str(), &Rfc3339).ok() +} + +pub fn payload_i32(payload: &HashMap<String, Value>, key: &str) -> Option<i32> { + let value = payload.get(key)?; + + match &value.kind { + Some(Kind::IntegerValue(value)) => i32::try_from(*value).ok(), + Some(Kind::DoubleValue(value)) => + if value.fract() == 0.0 { + i32::try_from(*value as i64).ok() + } else { + None + }, + _ => None, + } +} diff --git a/packages/elf-service/src/search/ranking/text.rs b/packages/elf-service/src/search/ranking/text.rs new file mode 100644 index 00000000..f37807fe --- /dev/null +++ b/packages/elf-service/src/search/ranking/text.rs @@ -0,0 +1,315 @@ +use std::collections::{HashMap, HashSet}; + +use time::OffsetDateTime; + +use crate::search::DeterministicRankingTerms; +use elf_config::{Config, Context}; +use elf_domain::english_gate; + +pub fn build_dense_embedding_input( + query: &str, + project_context_description: Option<&str>, +) -> String { + let Some(description) = project_context_description else { return query.to_string() }; + let trimmed = description.trim(); + + if trimmed.is_empty() { + return query.to_string(); + } + + format!("{query}\n\nProject context:\n{trimmed}") +} + +pub fn build_scope_context_boost_by_scope<'a>( + tokens: &[String], + context: Option<&'a Context>, +) -> HashMap<&'a str, f32> { + let Some(context) = context else { return HashMap::new() }; + let Some(weight) = context.scope_boost_weight else { return HashMap::new() }; + + if weight <= 0.0 || tokens.is_empty() { + return HashMap::new(); + } + + let Some(descriptions) = context.scope_descriptions.as_ref() else { return HashMap::new() }; + let mut out = HashMap::new(); + + for (scope, description) in descriptions { + let boost = scope_description_boost(tokens, description, weight); + + if boost > 0.0 { + out.insert(scope.as_str(), boost); + } + } + + out +} + +pub fn scope_description_boost(tokens: &[String], description: &str, weight: f32) -> f32 { + if weight <= 0.0 || tokens.is_empty() { + return 0.0; + } + + let trimmed = description.trim(); + + if trimmed.is_empty() || !english_gate::is_english_natural_language(trimmed) { + return 0.0; + } + + let mut normalized = String::with_capacity(trimmed.len()); + + for ch in trimmed.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + let mut description_tokens = HashSet::new(); + + for token in normalized.split_whitespace() { + if token.len() < 2 { + continue; + } + + description_tokens.insert(token); + } + + if description_tokens.is_empty() { + return 0.0; + } + + let mut matched = 0_usize; + + for token in tokens { + if description_tokens.contains(token.as_str()) { + matched += 1; + } + } + + if matched == 0 { + return 0.0; + } + + weight * (matched as f32 / tokens.len() as f32) +} + +pub fn tokenize_query(query: &str, max_terms: usize) -> Vec<String> { + let mut normalized = String::with_capacity(query.len()); + + for ch in query.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + let mut out = Vec::new(); + let mut seen = HashSet::new(); + + for token in normalized.split_whitespace() { + if token.len() < 2 { + continue; + } + if seen.insert(token) { + out.push(token.to_string()); + } + if out.len() >= max_terms { + break; + } + } + + out +} + +pub fn tokenize_text_terms(text: &str, max_terms: usize) -> HashSet<String> { + if max_terms == 0 { + return HashSet::new(); + } + + let mut normalized = String::with_capacity(text.len()); + + for ch in text.chars() { + if ch.is_ascii_alphanumeric() { + normalized.push(ch.to_ascii_lowercase()); + } else { + normalized.push(' '); + } + } + + let mut out = HashSet::new(); + + for token in normalized.split_whitespace() { + if token.len() < 2 { + continue; + } + + out.insert(token.to_string()); + + if out.len() >= max_terms { + break; + } + } + + out +} + +pub fn lexical_overlap_ratio(query_tokens: &[String], text: &str, max_text_terms: usize) -> f32 { + if query_tokens.is_empty() { + return 0.0; + } + + let text_terms = tokenize_text_terms(text, max_text_terms); + + if text_terms.is_empty() { + return 0.0; + } + + let mut matched = 0_usize; + + for token in query_tokens { + if text_terms.contains(token.as_str()) { + matched += 1; + } + } + + matched as f32 / query_tokens.len() as f32 +} + +pub fn compute_deterministic_ranking_terms( + cfg: &Config, + query_tokens: &[String], + snippet: &str, + note_hit_count: i64, + note_last_hit_at: Option<OffsetDateTime>, + age_days: f32, + now: OffsetDateTime, +) -> DeterministicRankingTerms { + let det = &cfg.ranking.deterministic; + + if !det.enabled { + return DeterministicRankingTerms::default(); + } + + let mut out = DeterministicRankingTerms::default(); + + if det.lexical.enabled && det.lexical.weight > 0.0 && !query_tokens.is_empty() { + let ratio = + lexical_overlap_ratio(query_tokens, snippet, det.lexical.max_text_terms as usize); + + out.lexical_overlap_ratio = ratio; + + let min_ratio = det.lexical.min_ratio.clamp(0.0, 1.0); + let scaled = if ratio >= min_ratio && min_ratio < 1.0 { + ((ratio - min_ratio) / (1.0 - min_ratio)).clamp(0.0, 1.0) + } else if ratio >= 1.0 && min_ratio >= 1.0 { + 1.0 + } else { + 0.0 + }; + + out.lexical_bonus = det.lexical.weight * scaled; + } + if det.hits.enabled && det.hits.weight > 0.0 { + let hit_count = note_hit_count.max(0); + + out.hit_count = hit_count; + + let half = det.hits.half_saturation; + let hit_saturation = if half > 0.0 && hit_count > 0 { + let hc = hit_count as f32; + + (hc / (hc + half)).clamp(0.0, 1.0) + } else { + 0.0 + }; + let last_hit_age_days = + note_last_hit_at.map(|ts| ((now - ts).as_seconds_f32() / 86_400.0).max(0.0)); + + out.last_hit_age_days = last_hit_age_days; + + let tau = det.hits.last_hit_tau_days; + let recency = if tau > 0.0 { + match last_hit_age_days { + Some(days) => (-days / tau).exp(), + None => 1.0, + } + } else { + 1.0 + }; + + out.hit_boost = det.hits.weight * hit_saturation * recency; + } + if det.decay.enabled && det.decay.weight > 0.0 { + let age_days = age_days.max(0.0); + let tau = det.decay.tau_days; + let staleness = if tau > 0.0 { 1.0 - (-age_days / tau).exp() } else { 0.0 }; + + out.decay_penalty = -det.decay.weight * staleness.clamp(0.0, 1.0); + } + + out +} + +pub fn match_terms_in_text( + tokens: &[String], + text: &str, + key: Option<&str>, + max_terms: usize, +) -> (Vec<String>, Vec<String>) { + if tokens.is_empty() { + return (Vec::new(), Vec::new()); + } + + let text = text.to_lowercase(); + let key = key.map(|value| value.to_lowercase()); + let mut matched_terms = Vec::new(); + let mut matched_fields = HashSet::new(); + + for token in tokens { + let mut matched = false; + + if text.contains(token) { + matched_fields.insert("text"); + + matched = true; + } + + if let Some(key) = key.as_ref() + && key.contains(token) + { + matched_fields.insert("key"); + + matched = true; + } + + if matched { + matched_terms.push(token.clone()); + } + if matched_terms.len() >= max_terms { + break; + } + } + + let mut fields: Vec<String> = + matched_fields.into_iter().map(|field| field.to_string()).collect(); + + fields.sort(); + + (matched_terms, fields) +} + +pub fn merge_matched_fields(mut base: Vec<String>, extra: Option<&Vec<String>>) -> Vec<String> { + if let Some(extra) = extra { + for field in extra { + base.push(field.clone()); + } + + base.sort(); + base.dedup(); + } + + base +} diff --git a/packages/elf-service/src/sharing.rs b/packages/elf-service/src/sharing.rs new file mode 100644 index 00000000..7687f723 --- /dev/null +++ b/packages/elf-service/src/sharing.rs @@ -0,0 +1,723 @@ +//! Cross-agent sharing APIs. + +use std::fmt::{Display, Formatter}; + +use serde::{Deserialize, Serialize}; +use sqlx::FromRow; +use uuid::Uuid; + +use crate::{ + ElfService, Error, InsertVersionArgs, + access::{self, ORG_PROJECT_ID}, +}; +use elf_storage::models::MemoryNote; + +const PROJECT_SPACE_GRANT_UPSERT_SQL: &str = "\ +INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id, + granted_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9 +) +ON CONFLICT (tenant_id, project_id, scope, space_owner_agent_id) +WHERE revoked_at IS NULL AND grantee_kind = 'project' +DO UPDATE +SET + granted_by_agent_id = EXCLUDED.granted_by_agent_id, + granted_at = EXCLUDED.granted_at, + revoked_at = NULL, + revoked_by_agent_id = NULL"; +const AGENT_SPACE_GRANT_UPSERT_SQL: &str = "\ +INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id, + granted_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9 +) +ON CONFLICT (tenant_id, project_id, scope, space_owner_agent_id, grantee_agent_id) +WHERE revoked_at IS NULL AND grantee_kind = 'agent' +DO UPDATE +SET + granted_by_agent_id = EXCLUDED.granted_by_agent_id, + granted_at = EXCLUDED.granted_at, + revoked_at = NULL, + revoked_by_agent_id = NULL"; + +/// Shareable scopes that can be published or granted. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ShareScope { + /// Project-shared scope. + ProjectShared, + /// Organization-shared scope. + OrgShared, +} +impl ShareScope { + fn as_str(&self) -> &'static str { + match self { + Self::ProjectShared => "project_shared", + Self::OrgShared => "org_shared", + } + } +} + +impl Display for ShareScope { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +/// Grantee classes supported by space grants. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum GranteeKind { + /// Grant the scope to all project readers. + Project, + /// Grant the scope to one named agent. + Agent, +} + +/// Request payload for publishing a note into a shared scope. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PublishNoteRequest { + /// Tenant that owns the note. + pub tenant_id: String, + /// Project that owns the note. + pub project_id: String, + /// Agent requesting the publish operation. + pub agent_id: String, + /// Identifier of the note to publish. + pub note_id: Uuid, + /// Target shared scope. + pub scope: ShareScope, +} + +/// Response payload for note publishing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PublishNoteResponse { + /// Identifier of the affected note. + pub note_id: Uuid, + /// Effective scope after publishing. + pub scope: String, +} + +/// Request payload for returning a note to its non-shared scope. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct UnpublishNoteRequest { + /// Tenant that owns the note. + pub tenant_id: String, + /// Project that owns the note. + pub project_id: String, + /// Agent requesting the unpublish operation. + pub agent_id: String, + /// Identifier of the note to unpublish. + pub note_id: Uuid, +} + +/// Response payload for note unpublishing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct UnpublishNoteResponse { + /// Identifier of the affected note. + pub note_id: Uuid, + /// Effective scope after unpublishing. + pub scope: String, +} + +/// Request payload for granting a shared scope. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantUpsertRequest { + /// Tenant that owns the scope. + pub tenant_id: String, + /// Project that owns the scope. + pub project_id: String, + /// Agent requesting the grant. + pub agent_id: String, + /// Shared scope to grant. + pub scope: ShareScope, + /// Grantee class. + pub grantee_kind: GranteeKind, + /// Grantee agent identifier when `grantee_kind` is `agent`. + pub grantee_agent_id: Option<String>, +} + +/// Response payload for grant upsert. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantUpsertResponse { + /// Granted scope. + pub scope: String, + /// Grantee class. + pub grantee_kind: GranteeKind, + /// Grantee agent identifier when applicable. + pub grantee_agent_id: Option<String>, + /// Whether a grant row is active after the operation. + pub granted: bool, +} + +/// Request payload for revoking a shared-scope grant. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantRevokeRequest { + /// Tenant that owns the scope. + pub tenant_id: String, + /// Project that owns the scope. + pub project_id: String, + /// Agent requesting the revoke operation. + pub agent_id: String, + /// Shared scope to revoke. + pub scope: ShareScope, + /// Grantee class. + pub grantee_kind: GranteeKind, + /// Grantee agent identifier when `grantee_kind` is `agent`. + pub grantee_agent_id: Option<String>, +} + +/// Response payload for grant revocation. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantRevokeResponse { + /// Whether an active grant was revoked. + pub revoked: bool, +} + +/// Request payload for listing shared-scope grants. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantsListRequest { + /// Tenant that owns the scope. + pub tenant_id: String, + /// Project that owns the scope. + pub project_id: String, + /// Agent requesting the list. + pub agent_id: String, + /// Shared scope to inspect. + pub scope: ShareScope, +} + +/// One active space grant returned by `space_grants_list`. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantItem { + /// Granted scope. + pub scope: ShareScope, + /// Grantee class. + pub grantee_kind: GranteeKind, + /// Grantee agent identifier when applicable. + pub grantee_agent_id: Option<String>, + /// Agent that created the grant. + pub granted_by_agent_id: String, + /// Grant creation timestamp. + pub granted_at: time::OffsetDateTime, +} + +/// Response payload for grant listing. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceGrantsListResponse { + /// Active grants visible to the caller. + pub grants: Vec<SpaceGrantItem>, +} + +impl ElfService { + /// Publishes an owned note into a shared scope. + pub async fn publish_note( + &self, + req: PublishNoteRequest, + ) -> crate::Result<PublishNoteResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let mut tx = self.db.pool.begin().await?; + let mut note: MemoryNote = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 + AND tenant_id = $2 + AND project_id IN ($3, $4) +FOR UPDATE", + ) + .bind(req.note_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(&mut *tx) + .await? + .ok_or_else(|| Error::InvalidRequest { message: "Note not found.".to_string() })?; + + if note.agent_id != agent_id { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if note.status != "active" { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if note.expires_at.map(|ts| ts <= time::OffsetDateTime::now_utc()).unwrap_or(false) { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + + let scope = req.scope.as_str(); + let scope_allowed = match scope { + "project_shared" => self.cfg.scopes.write_allowed.project_shared, + "org_shared" => self.cfg.scopes.write_allowed.org_shared, + _ => false, + }; + + if !scope_allowed { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + + let target_project_id = if scope == "org_shared" { ORG_PROJECT_ID } else { project_id }; + + access::ensure_active_project_scope_grant( + &mut *tx, + tenant_id, + target_project_id, + scope, + agent_id, + ) + .await?; + + if note.scope == scope && note.project_id == target_project_id { + return Ok(PublishNoteResponse { note_id: note.note_id, scope: note.scope }); + } + + let now = time::OffsetDateTime::now_utc(); + let prev_snapshot = crate::note_snapshot(¬e); + + note.scope = scope.to_string(); + note.project_id = target_project_id.to_string(); + note.updated_at = now; + + crate::insert_version( + &mut *tx, + InsertVersionArgs { + note_id: note.note_id, + op: "PUBLISH", + prev_snapshot: Some(prev_snapshot), + new_snapshot: Some(crate::note_snapshot(¬e)), + reason: "publish_note", + actor: agent_id, + ts: now, + }, + ) + .await?; + sqlx::query( + "UPDATE memory_notes SET scope = $1, project_id = $2, updated_at = $3 WHERE note_id = $4", + ) + .bind(scope) + .bind(note.project_id.as_str()) + .bind(now) + .bind(note.note_id) + .execute(&mut *tx) + .await?; + crate::enqueue_outbox_tx(&mut *tx, note.note_id, "UPSERT", ¬e.embedding_version, now) + .await?; + + tx.commit().await?; + + Ok(PublishNoteResponse { note_id: note.note_id, scope: note.scope }) + } + + /// Returns a previously published note to its non-shared scope. + pub async fn unpublish_note( + &self, + req: UnpublishNoteRequest, + ) -> crate::Result<UnpublishNoteResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let mut tx = self.db.pool.begin().await?; + let mut note: MemoryNote = sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 + AND tenant_id = $2 + AND project_id IN ($3, $4) +FOR UPDATE", + ) + .bind(req.note_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(&mut *tx) + .await? + .ok_or_else(|| Error::InvalidRequest { message: "Note not found.".to_string() })?; + + if note.agent_id != agent_id { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if note.status != "active" { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if note.expires_at.map(|ts| ts <= time::OffsetDateTime::now_utc()).unwrap_or(false) { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if !self.cfg.scopes.write_allowed.agent_private { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + if note.scope == "agent_private" { + return Ok(UnpublishNoteResponse { note_id: note.note_id, scope: note.scope }); + } + + let now = time::OffsetDateTime::now_utc(); + let prev_snapshot = crate::note_snapshot(¬e); + + if note.scope == "org_shared" && note.project_id == ORG_PROJECT_ID { + note.project_id = project_id.to_string(); + } + + note.scope = "agent_private".to_string(); + note.updated_at = now; + + crate::insert_version( + &mut *tx, + InsertVersionArgs { + note_id: note.note_id, + op: "UNPUBLISH", + prev_snapshot: Some(prev_snapshot), + new_snapshot: Some(crate::note_snapshot(¬e)), + reason: "unpublish_note", + actor: agent_id, + ts: now, + }, + ) + .await?; + sqlx::query( + "UPDATE memory_notes SET scope = $1, project_id = $2, updated_at = $3 WHERE note_id = $4", + ) + .bind(note.scope.as_str()) + .bind(note.project_id.as_str()) + .bind(now) + .bind(note.note_id) + .execute(&mut *tx) + .await?; + crate::enqueue_outbox_tx(&mut *tx, note.note_id, "UPSERT", ¬e.embedding_version, now) + .await?; + + tx.commit().await?; + + Ok(UnpublishNoteResponse { note_id: note.note_id, scope: note.scope }) + } + + /// Creates or reactivates a shared-scope grant. + pub async fn space_grant_upsert( + &self, + req: SpaceGrantUpsertRequest, + ) -> crate::Result<SpaceGrantUpsertResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let scope = req.scope.as_str(); + let scope_allowed = match scope { + "project_shared" => self.cfg.scopes.write_allowed.project_shared, + "org_shared" => self.cfg.scopes.write_allowed.org_shared, + _ => false, + }; + + if !scope_allowed { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + if req.grantee_kind == GranteeKind::Agent + && req.grantee_agent_id.as_ref().is_none_or(|id| id.trim().is_empty()) + { + return Err(Error::InvalidRequest { + message: "grantee_agent_id is required for agent grantee_kind.".to_string(), + }); + } + + let grantee_agent_id = req + .grantee_agent_id + .as_ref() + .map(|value| value.trim()) + .filter(|value| !value.is_empty()) + .map(ToString::to_string); + + if req.grantee_kind == GranteeKind::Project && grantee_agent_id.is_some() { + return Err(Error::InvalidRequest { + message: "grantee_agent_id must be empty for project grantee_kind.".to_string(), + }); + } + + let grantee_agent_id_ref = grantee_agent_id.as_deref(); + let now = time::OffsetDateTime::now_utc(); + let effective_project_id = if scope == "org_shared" { ORG_PROJECT_ID } else { project_id }; + + if req.grantee_kind == GranteeKind::Project { + self.upsert_project_grant(tenant_id, effective_project_id, scope, agent_id, now) + .await?; + } else { + self.upsert_agent_grant( + tenant_id, + effective_project_id, + scope, + agent_id, + grantee_agent_id_ref, + now, + ) + .await?; + } + + Ok(SpaceGrantUpsertResponse { + scope: scope.to_string(), + grantee_kind: req.grantee_kind, + grantee_agent_id, + granted: true, + }) + } + + async fn upsert_project_grant( + &self, + tenant_id: &str, + project_id: &str, + scope: &str, + agent_id: &str, + now: time::OffsetDateTime, + ) -> crate::Result<()> { + sqlx::query(PROJECT_SPACE_GRANT_UPSERT_SQL) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(scope) + .bind(agent_id) + .bind("project") + .bind::<Option<&str>>(None) + .bind(agent_id) + .bind(now) + .execute(&self.db.pool) + .await?; + + Ok(()) + } + + async fn upsert_agent_grant( + &self, + tenant_id: &str, + project_id: &str, + scope: &str, + agent_id: &str, + grantee_agent_id: Option<&str>, + now: time::OffsetDateTime, + ) -> crate::Result<()> { + sqlx::query(AGENT_SPACE_GRANT_UPSERT_SQL) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(scope) + .bind(agent_id) + .bind("agent") + .bind(grantee_agent_id) + .bind(agent_id) + .bind(now) + .execute(&self.db.pool) + .await?; + + Ok(()) + } + + /// Revokes a shared-scope grant. + pub async fn space_grant_revoke( + &self, + req: SpaceGrantRevokeRequest, + ) -> crate::Result<SpaceGrantRevokeResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let scope = req.scope.as_str(); + let grantee_agent_id = req + .grantee_agent_id + .as_deref() + .map(|value| value.trim()) + .filter(|value| !value.is_empty()); + + if req.grantee_kind == GranteeKind::Agent && grantee_agent_id.is_none() { + return Err(Error::InvalidRequest { + message: "grantee_agent_id is required for agent grantee_kind.".to_string(), + }); + } + if req.grantee_kind == GranteeKind::Project && grantee_agent_id.is_some() { + return Err(Error::InvalidRequest { + message: "grantee_agent_id must be empty for project grantee_kind.".to_string(), + }); + } + + let scope_allowed = match scope { + "project_shared" => self.cfg.scopes.write_allowed.project_shared, + "org_shared" => self.cfg.scopes.write_allowed.org_shared, + _ => false, + }; + + if !scope_allowed { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + + let effective_project_id = if scope == "org_shared" { ORG_PROJECT_ID } else { project_id }; + let revocation = sqlx::query( + "\ +UPDATE memory_space_grants +SET revoked_at = $7, + revoked_by_agent_id = $8 +WHERE tenant_id = $1 + AND project_id = $2 + AND scope = $3 + AND space_owner_agent_id = $4 + AND grantee_kind = $5 + AND ((grantee_kind = 'project' AND grantee_agent_id IS NULL) + OR (grantee_kind = 'agent' AND grantee_agent_id = $6)) + AND revoked_at IS NULL", + ) + .bind(tenant_id) + .bind(effective_project_id) + .bind(scope) + .bind(agent_id) + .bind(match req.grantee_kind { + GranteeKind::Project => "project", + GranteeKind::Agent => "agent", + }) + .bind(grantee_agent_id) + .bind(time::OffsetDateTime::now_utc()) + .bind(agent_id) + .execute(&self.db.pool) + .await?; + + if revocation.rows_affected() == 0 { + return Err(Error::InvalidRequest { message: "No active grant found.".to_string() }); + } + + Ok(SpaceGrantRevokeResponse { revoked: true }) + } + + /// Lists active grants for a shared scope. + pub async fn space_grants_list( + &self, + req: SpaceGrantsListRequest, + ) -> crate::Result<SpaceGrantsListResponse> { + let tenant_id = req.tenant_id.trim(); + let project_id = req.project_id.trim(); + let agent_id = req.agent_id.trim(); + + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { + return Err(Error::InvalidRequest { + message: "tenant_id, project_id, and agent_id are required.".to_string(), + }); + } + + let scope = req.scope.as_str(); + let scope_allowed = match scope { + "project_shared" => self.cfg.scopes.write_allowed.project_shared, + "org_shared" => self.cfg.scopes.write_allowed.org_shared, + _ => false, + }; + + if !scope_allowed { + return Err(Error::ScopeDenied { message: "Scope is not allowed.".to_string() }); + } + + let effective_project_id = if scope == "org_shared" { ORG_PROJECT_ID } else { project_id }; + + #[derive(FromRow)] + struct Row { + scope: String, + grantee_kind: String, + grantee_agent_id: Option<String>, + granted_by_agent_id: String, + granted_at: time::OffsetDateTime, + } + + let rows = sqlx::query_as::<_, Row>( + "\ +SELECT scope, grantee_kind, grantee_agent_id, granted_by_agent_id, granted_at +FROM memory_space_grants +WHERE tenant_id = $1 + AND project_id = $2 + AND space_owner_agent_id = $3 + AND scope = $4 + AND revoked_at IS NULL +ORDER BY granted_at DESC", + ) + .bind(tenant_id) + .bind(effective_project_id) + .bind(agent_id) + .bind(scope) + .fetch_all(&self.db.pool) + .await?; + let mut grants = Vec::with_capacity(rows.len()); + + for row in rows { + let grantee_kind = match row.grantee_kind.as_str() { + "agent" => GranteeKind::Agent, + "project" => GranteeKind::Project, + _ => continue, + }; + let scope = match row.scope.as_str() { + "project_shared" => ShareScope::ProjectShared, + "org_shared" => ShareScope::OrgShared, + _ => continue, + }; + + grants.push(SpaceGrantItem { + scope, + grantee_kind, + grantee_agent_id: row.grantee_agent_id, + granted_by_agent_id: row.granted_by_agent_id, + granted_at: row.granted_at, + }); + } + + Ok(SpaceGrantsListResponse { grants }) + } +} diff --git a/packages/elf-service/src/structured_fields.rs b/packages/elf-service/src/structured_fields.rs new file mode 100644 index 00000000..075de2bd --- /dev/null +++ b/packages/elf-service/src/structured_fields.rs @@ -0,0 +1,716 @@ +//! Structured-field validation and persistence helpers. + +use std::{collections::HashMap, slice}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{PgConnection, PgPool}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{Error, Result}; +use elf_domain::{english_gate, evidence}; + +const MAX_LIST_ITEMS: usize = 64; +const MAX_ENTITIES: usize = 32; +const MAX_RELATIONS: usize = 64; +const MAX_ALIASES: usize = 16; +const MAX_ITEM_CHARS: usize = 1_000; + +/// Structured note fields emitted by extraction and stored alongside a note. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct StructuredFields { + /// Optional one-paragraph summary. + pub summary: Option<String>, + /// Optional fact statements grounded in the note text. + pub facts: Option<Vec<String>>, + /// Optional concept labels grounded in the note text. + pub concepts: Option<Vec<String>>, + /// Optional graph entities extracted from the note. + pub entities: Option<Vec<StructuredEntity>>, + /// Optional graph relations extracted from the note. + pub relations: Option<Vec<StructuredRelation>>, +} +impl StructuredFields { + /// Returns `true` when no persisted summary, fact, or concept content is present. + pub fn is_effectively_empty(&self) -> bool { + let summary_empty = self.summary.as_ref().map(|v| v.trim().is_empty()).unwrap_or(true); + let facts_empty = self + .facts + .as_ref() + .map(|items| items.iter().all(|v| v.trim().is_empty())) + .unwrap_or(true); + let concepts_empty = self + .concepts + .as_ref() + .map(|items| items.iter().all(|v| v.trim().is_empty())) + .unwrap_or(true); + + summary_empty && facts_empty && concepts_empty + } + + /// Returns `true` when graph entities or relations are present. + pub fn has_graph_fields(&self) -> bool { + self.entities.as_ref().is_some_and(|entities| !entities.is_empty()) + || self.relations.as_ref().is_some_and(|relations| !relations.is_empty()) + } +} + +/// One extracted entity candidate. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct StructuredEntity { + /// Canonical surface for the entity. + pub canonical: Option<String>, + /// Optional entity kind such as person or organization. + pub kind: Option<String>, + /// Optional alternate surfaces for the entity. + pub aliases: Option<Vec<String>>, +} + +/// One extracted relation candidate. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(default)] +pub struct StructuredRelation { + /// Relation subject entity. + pub subject: Option<StructuredEntity>, + /// Predicate surface for the relation. + pub predicate: Option<String>, + /// Relation object, either an entity or scalar value. + pub object: Option<StructuredRelationObject>, + #[serde(with = "crate::time_serde::option")] + /// Optional validity-window start. + pub valid_from: Option<OffsetDateTime>, + #[serde(with = "crate::time_serde::option")] + /// Optional validity-window end. + pub valid_to: Option<OffsetDateTime>, +} + +/// Extracted relation object. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct StructuredRelationObject { + /// Entity-shaped object value. + pub entity: Option<StructuredEntity>, + /// Scalar object value. + pub value: Option<String>, +} + +#[derive(Clone, Debug, Deserialize)] +struct SourceRefEvidenceQuote { + quote: String, +} + +/// Validates structured fields against note text, evidence bindings, and size limits. +pub fn validate_structured_fields( + structured: &StructuredFields, + note_text: &str, + source_ref: &Value, + add_event_evidence: Option<&[(usize, String)]>, +) -> Result<()> { + let evidence_quotes: Vec<String> = if let Some(event_evidence) = add_event_evidence { + event_evidence.iter().map(|(_, quote)| quote.clone()).collect() + } else { + extract_source_ref_quotes(source_ref) + }; + + if let Some(summary) = structured.summary.as_ref() { + validate_text_field(summary, "structured.summary")?; + } + if let Some(entities) = structured.entities.as_ref() { + validate_list_field_count(entities.len(), MAX_ENTITIES, "structured.entities")?; + + for (idx, entity) in entities.iter().enumerate() { + let base = format!("structured.entities[{idx}]"); + + validate_structured_entity(entity, &base, true)?; + } + } + if let Some(relations) = structured.relations.as_ref() { + validate_list_field_count(relations.len(), MAX_RELATIONS, "structured.relations")?; + + for (idx, relation) in relations.iter().enumerate() { + validate_structured_relation( + relation, + note_text, + &evidence_quotes, + &format!("structured.relations[{idx}]"), + )?; + } + } + if let Some(facts) = structured.facts.as_ref() { + validate_list_field(facts, "structured.facts")?; + + for (idx, fact) in facts.iter().enumerate() { + validate_text_field(fact, &format!("structured.facts[{idx}]"))?; + + if !fact_is_evidence_bound(fact, note_text, &evidence_quotes) { + return Err(Error::InvalidRequest { + message: format!( + "structured.facts[{idx}] is not supported by note text or evidence quotes." + ), + }); + } + } + } + if let Some(concepts) = structured.concepts.as_ref() { + validate_list_field(concepts, "structured.concepts")?; + + for (idx, concept) in concepts.iter().enumerate() { + validate_text_field(concept, &format!("structured.concepts[{idx}]"))?; + } + } + + Ok(()) +} + +/// Validates event-evidence quotes against their source messages. +pub fn event_evidence_quotes(messages: &[String], evidence: &[(usize, String)]) -> Result<()> { + for (idx, (message_index, quote)) in evidence.iter().enumerate() { + if quote.trim().is_empty() { + return Err(Error::InvalidRequest { + message: format!("evidence[{idx}].quote must not be empty."), + }); + } + if !evidence::evidence_matches(messages, *message_index, quote) { + return Err(Error::InvalidRequest { + message: format!("evidence[{idx}] does not match its source message."), + }); + } + } + + Ok(()) +} + +/// Upserts summary, fact, and concept fields for one note inside an existing transaction. +pub async fn upsert_structured_fields_tx( + executor: &mut PgConnection, + note_id: Uuid, + structured: &StructuredFields, + now: OffsetDateTime, +) -> Result<()> { + if let Some(summary) = structured.summary.as_ref() { + replace_kind(executor, note_id, "summary", slice_single(summary), now).await?; + } + if let Some(facts) = structured.facts.as_ref() { + replace_kind(executor, note_id, "fact", facts.as_slice(), now).await?; + } + if let Some(concepts) = structured.concepts.as_ref() { + replace_kind(executor, note_id, "concept", concepts.as_slice(), now).await?; + } + + Ok(()) +} + +/// Fetches persisted structured fields for the provided note identifiers. +pub async fn fetch_structured_fields( + pool: &PgPool, + note_ids: &[Uuid], +) -> Result<HashMap<Uuid, StructuredFields>> { + if note_ids.is_empty() { + return Ok(HashMap::new()); + } + + let rows = sqlx::query_as::<_, (Uuid, String, i32, String)>( + "\ +SELECT + note_id, + field_kind, + item_index, + text +FROM memory_note_fields +WHERE note_id = ANY($1::uuid[]) +ORDER BY note_id ASC, field_kind ASC, item_index ASC", + ) + .bind(note_ids.to_vec()) + .fetch_all(pool) + .await?; + let mut out: HashMap<Uuid, StructuredFields> = HashMap::new(); + + for row in rows { + let (note_id, field_kind, _item_index, text) = row; + let entry = out.entry(note_id).or_default(); + + match field_kind.as_str() { + "summary" => + if entry.summary.is_none() && !text.trim().is_empty() { + entry.summary = Some(text); + }, + "fact" => { + entry.facts.get_or_insert_with(Vec::new).push(text); + }, + "concept" => { + entry.concepts.get_or_insert_with(Vec::new).push(text); + }, + _ => {}, + } + } + + out.retain(|_, value| !value.is_effectively_empty()); + + Ok(out) +} + +fn validate_structured_entity( + entity: &StructuredEntity, + base: &str, + require_canonical: bool, +) -> Result<()> { + if require_canonical { + validate_required_text_field(entity.canonical.as_ref(), &format!("{base}.canonical"))?; + } + + if let Some(kind) = entity.kind.as_ref() { + validate_text_field(kind, &format!("{base}.kind"))?; + } + if let Some(aliases) = entity.aliases.as_ref() { + validate_list_field_count(aliases.len(), MAX_ALIASES, &format!("{base}.aliases"))?; + + for (alias_idx, alias) in aliases.iter().enumerate() { + validate_text_field(alias, &format!("{base}.aliases[{alias_idx}]"))?; + } + } + + Ok(()) +} + +fn validate_structured_relation( + relation: &StructuredRelation, + note_text: &str, + evidence_quotes: &[String], + base: &str, +) -> Result<()> { + if relation.predicate.is_none() { + return Err(Error::InvalidRequest { message: format!("{base}.predicate is required.") }); + } + + let subject = relation + .subject + .as_ref() + .ok_or_else(|| Error::InvalidRequest { message: format!("{base}.subject is required.") })?; + + validate_structured_entity(subject, &format!("{base}.subject"), true)?; + + let predicate = relation.predicate.as_ref().ok_or_else(|| Error::InvalidRequest { + message: format!("{base}.predicate is required."), + })?; + + validate_text_field(predicate, &format!("{base}.predicate"))?; + + let object = relation + .object + .as_ref() + .ok_or_else(|| Error::InvalidRequest { message: format!("{base}.object is required.") })?; + + match (&object.entity, object.value.as_ref()) { + (Some(entity), None) => { + validate_structured_entity(entity, &format!("{base}.object.entity"), true)?; + + let canonical = entity.canonical.as_deref().ok_or_else(|| Error::InvalidRequest { + message: format!("{base}.object.entity.canonical is required."), + })?; + + if !fact_is_evidence_bound(canonical, note_text, evidence_quotes) { + return Err(Error::InvalidRequest { + message: format!( + "{base}.object.entity.canonical is not supported by note text or evidence quotes." + ), + }); + } + }, + (None, Some(value)) => { + validate_text_field(value, &format!("{base}.object.value"))?; + + if !fact_is_evidence_bound(value, note_text, evidence_quotes) { + return Err(Error::InvalidRequest { + message: format!( + "{base}.object.value is not supported by note text or evidence quotes." + ), + }); + } + }, + (_, _) => { + return Err(Error::InvalidRequest { + message: format!("{base}.object must provide exactly one of entity or value."), + }); + }, + } + + if !fact_is_evidence_bound( + subject.canonical.as_deref().unwrap_or_default(), + note_text, + evidence_quotes, + ) { + return Err(Error::InvalidRequest { + message: format!( + "{base}.subject.canonical is not supported by note text or evidence quotes." + ), + }); + } + if !fact_is_evidence_bound(predicate, note_text, evidence_quotes) { + return Err(Error::InvalidRequest { + message: format!("{base}.predicate is not supported by note text or evidence quotes."), + }); + } + + if let (Some(valid_from), Some(valid_to)) = (relation.valid_from, relation.valid_to) + && valid_to <= valid_from + { + return Err(Error::InvalidRequest { + message: format!("{base}.valid_to must be greater than valid_from."), + }); + } + + Ok(()) +} + +fn validate_list_field(items: &[String], label: &str) -> Result<()> { + if items.len() > MAX_LIST_ITEMS { + return Err(Error::InvalidRequest { + message: format!("{label} must have at most {MAX_LIST_ITEMS} items."), + }); + } + + Ok(()) +} + +fn validate_text_field(value: &str, label: &str) -> Result<()> { + let trimmed = value.trim(); + + if trimmed.is_empty() { + return Err(Error::InvalidRequest { message: format!("{label} must not be empty.") }); + } + if trimmed.chars().count() > MAX_ITEM_CHARS { + return Err(Error::InvalidRequest { + message: format!("{label} must be at most {MAX_ITEM_CHARS} characters."), + }); + } + if !english_gate::is_english_natural_language(trimmed) { + return Err(Error::NonEnglishInput { field: label.to_string() }); + } + + Ok(()) +} + +fn validate_required_text_field(value: Option<&String>, label: &str) -> Result<()> { + let Some(value) = value else { + return Err(Error::InvalidRequest { message: format!("{label} is required.") }); + }; + + validate_text_field(value, label) +} + +fn validate_list_field_count(len: usize, max: usize, label: &str) -> Result<()> { + if len > max { + return Err(Error::InvalidRequest { + message: format!("{label} must have at most {max} items."), + }); + } + + Ok(()) +} + +fn extract_source_ref_quotes(source_ref: &Value) -> Vec<String> { + let Some(evidence) = source_ref.get("evidence") else { return Vec::new() }; + let Ok(quotes) = serde_json::from_value::<Vec<SourceRefEvidenceQuote>>(evidence.clone()) else { + return Vec::new(); + }; + + quotes.into_iter().map(|q| q.quote).collect() +} + +fn fact_is_evidence_bound(fact: &str, note_text: &str, evidence_quotes: &[String]) -> bool { + let trimmed = fact.trim(); + + if trimmed.is_empty() { + return false; + } + if note_text.contains(trimmed) { + return true; + } + + for quote in evidence_quotes { + if quote.contains(trimmed) { + return true; + } + } + + false +} + +fn slice_single(value: &String) -> &[String] { + slice::from_ref(value) +} + +async fn replace_kind( + executor: &mut PgConnection, + note_id: Uuid, + kind: &str, + items: &[String], + now: OffsetDateTime, +) -> Result<()> { + sqlx::query("DELETE FROM memory_note_fields WHERE note_id = $1 AND field_kind = $2") + .bind(note_id) + .bind(kind) + .execute(&mut *executor) + .await?; + + for (idx, value) in items.iter().enumerate() { + let trimmed = value.trim(); + + if trimmed.is_empty() { + continue; + } + + sqlx::query( + "\ +INSERT INTO memory_note_fields ( + field_id, + note_id, + field_kind, + item_index, + text, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7)", + ) + .bind(Uuid::new_v4()) + .bind(note_id) + .bind(kind) + .bind(idx as i32) + .bind(trimmed) + .bind(now) + .bind(now) + .execute(&mut *executor) + .await?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use time::OffsetDateTime; + + use crate::{ + Error, + structured_fields::{ + self, StructuredEntity, StructuredFields, StructuredRelation, StructuredRelationObject, + }, + }; + + fn structured_relation( + subject: &str, + predicate: &str, + object: StructuredRelationObject, + valid_from: Option<OffsetDateTime>, + valid_to: Option<OffsetDateTime>, + ) -> StructuredFields { + StructuredFields { + summary: None, + facts: None, + concepts: None, + entities: None, + relations: Some(vec![StructuredRelation { + subject: Some(StructuredEntity { + canonical: Some(subject.to_string()), + kind: None, + aliases: None, + }), + predicate: Some(predicate.to_string()), + object: Some(object), + valid_from, + valid_to, + }]), + } + } + + #[test] + fn fact_binding_accepts_note_text_substring() { + let structured = StructuredFields { + summary: None, + facts: Some(vec!["Deploy uses reranking".to_string()]), + concepts: None, + entities: None, + relations: None, + }; + let res = structured_fields::validate_structured_fields( + &structured, + "Deploy uses reranking after retrieval.", + &serde_json::json!({}), + None, + ); + + assert!(res.is_ok()); + } + + #[test] + fn fact_binding_rejects_without_text_or_evidence() { + let structured = StructuredFields { + summary: None, + facts: Some(vec!["Nonexistent claim.".to_string()]), + concepts: None, + entities: None, + relations: None, + }; + let res = structured_fields::validate_structured_fields( + &structured, + "Some note.", + &serde_json::json!({}), + None, + ); + + assert!(res.is_err()); + } + + #[test] + fn relation_object_requires_exactly_one_of_entity_or_value() { + let structured = structured_relation( + "alice", + "owns", + StructuredRelationObject { + entity: Some(StructuredEntity { + canonical: Some("Acme".to_string()), + kind: None, + aliases: None, + }), + value: Some("Acme corp".to_string()), + }, + None, + None, + ); + let res = structured_fields::validate_structured_fields( + &structured, + "alice owns Acme corp.", + &serde_json::json!({ + "evidence": [{"quote": "alice owns Acme"}] + }), + None, + ); + let err = res.expect_err("relation should reject object with both entity and value"); + let message = match err { + Error::InvalidRequest { message } => message, + _ => panic!("expected invalid request, got {err:?}"), + }; + + assert_eq!( + message, + "structured.relations[0].object must provide exactly one of entity or value." + ); + } + + #[test] + fn relation_rejects_valid_to_not_after_valid_from() { + let structured = structured_relation( + "alice", + "met", + StructuredRelationObject { entity: None, value: Some("bob".to_string()) }, + Some(OffsetDateTime::from_unix_timestamp(1_700_000_000).expect("valid timestamp")), + Some(OffsetDateTime::from_unix_timestamp(1_700_000_000).expect("valid timestamp")), + ); + let res = structured_fields::validate_structured_fields( + &structured, + "alice met bob", + &serde_json::json!({ + "evidence": [{"quote": "alice met bob"}] + }), + None, + ); + let err = res.expect_err("relation should require valid_to greater than valid_from"); + let message = match err { + Error::InvalidRequest { message } => message, + _ => panic!("expected invalid request, got {err:?}"), + }; + + assert_eq!(message, "structured.relations[0].valid_to must be greater than valid_from."); + } + + #[test] + fn relation_checks_subject_predicate_and_object_value_are_evidence_bound() { + let subject_message = match structured_fields::validate_structured_fields( + &structured_relation( + "alice", + "caused", + StructuredRelationObject { entity: None, value: Some("outage".to_string()) }, + None, + None, + ), + "a critical outage was logged.", + &serde_json::json!({"evidence": [{"quote": "caused an outage"}]}), + None, + ) { + Err(Error::InvalidRequest { message }) => message, + res => panic!("expected invalid request, got {res:?}"), + }; + + assert!( + subject_message.contains("structured.relations[0].subject.canonical is not supported") + ); + + let predicate_message = match structured_fields::validate_structured_fields( + &structured_relation( + "operator", + "discovered", + StructuredRelationObject { entity: None, value: Some("outage".to_string()) }, + None, + None, + ), + "operator monitored a system outage.", + &serde_json::json!({"evidence": [{"quote": "operator saw outage"}]}), + None, + ) { + Err(Error::InvalidRequest { message }) => message, + res => panic!("expected invalid request, got {res:?}"), + }; + + assert!(predicate_message.contains("structured.relations[0].predicate is not supported")); + + let object_message = match structured_fields::validate_structured_fields( + &structured_relation( + "operator", + "noticed", + StructuredRelationObject { + entity: None, + value: Some("service interruption".to_string()), + }, + None, + None, + ), + "The operator noticed service latency during testing.", + &serde_json::json!({"evidence": [{"quote": "The operator noticed service behavior"}]}), + None, + ) { + Err(Error::InvalidRequest { message }) => message, + res => panic!("expected invalid request, got {res:?}"), + }; + + assert!(object_message.contains("structured.relations[0].object.value is not supported")); + } + + #[test] + fn relation_accepts_valid_structured_relation() { + let structured = structured_relation( + "alice", + "works at", + StructuredRelationObject { + entity: Some(StructuredEntity { + canonical: Some("acme corp".to_string()), + kind: None, + aliases: None, + }), + value: None, + }, + Some(OffsetDateTime::from_unix_timestamp(1_699_900_000).expect("valid timestamp")), + Some(OffsetDateTime::from_unix_timestamp(1_700_000_000).expect("valid timestamp")), + ); + let res = structured_fields::validate_structured_fields( + &structured, + "alice works at acme corp and reported progress.", + &serde_json::json!({ + "evidence": [{"quote": "works at acme corp"}] + }), + None, + ); + + assert!(res.is_ok()); + } +} diff --git a/packages/elf-service/src/time_serde.rs b/packages/elf-service/src/time_serde.rs index a9e0a789..dd8dbded 100644 --- a/packages/elf-service/src/time_serde.rs +++ b/packages/elf-service/src/time_serde.rs @@ -1,45 +1,26 @@ -// crates.io -use serde::{Deserialize, Deserializer, Serializer, de::Error as DeError, ser::Error as SerError}; +//! `OffsetDateTime` serde helpers. + +pub mod option; + +use serde::{Deserialize, Deserializer, Serializer}; use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +/// Serializes an `OffsetDateTime` as RFC 3339. pub fn serialize<S>(value: &OffsetDateTime, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer, { - let formatted = value.format(&Rfc3339).map_err(SerError::custom)?; + let formatted = value.format(&Rfc3339).map_err(serde::ser::Error::custom)?; + serializer.serialize_str(&formatted) } +/// Deserializes an RFC 3339 string into an `OffsetDateTime`. pub fn deserialize<'de, D>(deserializer: D) -> Result<OffsetDateTime, D::Error> where D: Deserializer<'de>, { let raw = String::deserialize(deserializer)?; - OffsetDateTime::parse(&raw, &Rfc3339).map_err(DeError::custom) -} - -pub mod option { - use super::*; - - pub fn serialize<S>(value: &Option<OffsetDateTime>, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - match value { - Some(value) => super::serialize(value, serializer), - None => serializer.serialize_none(), - } - } - pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<OffsetDateTime>, D::Error> - where - D: Deserializer<'de>, - { - let raw = Option::<String>::deserialize(deserializer)?; - match raw { - Some(value) => - OffsetDateTime::parse(&value, &Rfc3339).map(Some).map_err(DeError::custom), - None => Ok(None), - } - } + OffsetDateTime::parse(&raw, &Rfc3339).map_err(serde::de::Error::custom) } diff --git a/packages/elf-service/src/time_serde/option.rs b/packages/elf-service/src/time_serde/option.rs new file mode 100644 index 00000000..2dc3e6af --- /dev/null +++ b/packages/elf-service/src/time_serde/option.rs @@ -0,0 +1,30 @@ +//! Optional `OffsetDateTime` serde helpers. + +use serde::{Deserialize as _, Deserializer, Serializer, de::Error}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + +use crate::time_serde; + +/// Serializes an optional `OffsetDateTime` as RFC 3339. +pub fn serialize<S>(value: &Option<OffsetDateTime>, serializer: S) -> Result<S::Ok, S::Error> +where + S: Serializer, +{ + match value { + Some(value) => time_serde::serialize(value, serializer), + None => serializer.serialize_none(), + } +} + +/// Deserializes an optional RFC 3339 string into an `OffsetDateTime`. +pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<OffsetDateTime>, D::Error> +where + D: Deserializer<'de>, +{ + let raw = Option::<String>::deserialize(deserializer)?; + + match raw { + Some(value) => OffsetDateTime::parse(&value, &Rfc3339).map(Some).map_err(Error::custom), + None => Ok(None), + } +} diff --git a/packages/elf-service/src/update.rs b/packages/elf-service/src/update.rs index 82d4b3b3..b508a522 100644 --- a/packages/elf-service/src/update.rs +++ b/packages/elf-service/src/update.rs @@ -1,39 +1,60 @@ -// crates.io +//! Note update APIs. + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use sqlx::{Postgres, Transaction}; use time::OffsetDateTime; use uuid::Uuid; -// self -use elf_domain::{cjk, ttl, writegate}; +use crate::{ElfService, Error, InsertVersionArgs, NoteOp, Result, access::ORG_PROJECT_ID}; +use elf_domain::{ + english_gate, ttl, + writegate::{self, NoteInput}, +}; use elf_storage::models::MemoryNote; -use crate::{ElfService, InsertVersionArgs, NoteOp, ServiceError, ServiceResult}; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Request payload for note updates. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct UpdateRequest { + /// Tenant that owns the note. pub tenant_id: String, + /// Project that owns the note. pub project_id: String, + /// Agent requesting the update. pub agent_id: String, + /// Identifier of the note to update. pub note_id: Uuid, + /// Optional replacement note text. pub text: Option<String>, + /// Optional replacement importance score. pub importance: Option<f32>, + /// Optional replacement confidence score. pub confidence: Option<f32>, + /// Optional TTL override in days. pub ttl_days: Option<i64>, } -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +/// Response payload for note updates. +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct UpdateResponse { + /// Identifier of the affected note. pub note_id: Uuid, + /// Operation that was applied. pub op: NoteOp, + /// Machine-readable rejection code, if the update was rejected. pub reason_code: Option<String>, } impl ElfService { - pub async fn update(&self, req: UpdateRequest) -> ServiceResult<UpdateResponse> { + /// Updates mutable note fields when the caller still owns an active note. + pub async fn update(&self, req: UpdateRequest) -> Result<UpdateResponse> { + let now = OffsetDateTime::now_utc(); let tenant_id = req.tenant_id.trim(); let project_id = req.project_id.trim(); let agent_id = req.agent_id.trim(); + if tenant_id.is_empty() || project_id.is_empty() || agent_id.is_empty() { - return Err(ServiceError::InvalidRequest { + return Err(Error::InvalidRequest { message: "tenant_id, project_id, and agent_id are required.".to_string(), }); } @@ -42,41 +63,31 @@ impl ElfService { && req.confidence.is_none() && req.ttl_days.is_none() { - return Err(ServiceError::InvalidRequest { - message: "No updates provided.".to_string(), - }); + return Err(Error::InvalidRequest { message: "No updates provided.".to_string() }); } + let text_update = req.text.clone(); let mut tx = self.db.pool.begin().await?; - let mut note: MemoryNote = sqlx::query_as( - "SELECT * FROM memory_notes \ - WHERE note_id = $1 AND tenant_id = $2 AND project_id = $3 AND agent_id = $4 \ - FOR UPDATE", - ) - .bind(req.note_id) - .bind(tenant_id) - .bind(project_id) - .bind(agent_id) - .fetch_optional(&mut *tx) - .await? - .ok_or_else(|| ServiceError::InvalidRequest { message: "Note not found.".to_string() })?; + let mut note = load_note_for_update(&mut tx, req.note_id, tenant_id, project_id).await?; - let prev_snapshot = crate::note_snapshot(¬e); + validate_note_is_updatable(¬e, agent_id, now)?; + let prev_snapshot = crate::note_snapshot(¬e); let candidate_text = if let Some(text) = text_update.as_ref() { - if cjk::contains_cjk(text) { - return Err(ServiceError::NonEnglishInput { field: "$.text".to_string() }); + if !english_gate::is_english_natural_language(text) { + return Err(Error::NonEnglishInput { field: "$.text".to_string() }); } + text.clone() } else { note.text.clone() }; - - let gate = writegate::NoteInput { + let gate = NoteInput { note_type: note.r#type.clone(), scope: note.scope.clone(), text: candidate_text, }; + if let Err(code) = writegate::writegate(&gate, &self.cfg) { return Ok(UpdateResponse { note_id: note.note_id, @@ -85,7 +96,6 @@ impl ElfService { }); } - let now = OffsetDateTime::now_utc(); let next_text = text_update.unwrap_or_else(|| note.text.clone()); let next_importance = req.importance.unwrap_or(note.importance); let next_confidence = req.confidence.unwrap_or(note.confidence); @@ -93,7 +103,6 @@ impl ElfService { Some(ttl_days) => ttl::compute_expires_at(Some(ttl_days), ¬e.r#type, &self.cfg, now), None => note.expires_at, }; - let changed = next_text != note.text || (next_importance - note.importance).abs() > f32::EPSILON || (next_confidence - note.confidence).abs() > f32::EPSILON @@ -101,6 +110,7 @@ impl ElfService { if !changed { tx.commit().await?; + return Ok(UpdateResponse { note_id: note.note_id, op: NoteOp::None, @@ -114,43 +124,103 @@ impl ElfService { note.expires_at = next_expires_at; note.updated_at = now; - sqlx::query( - "UPDATE memory_notes SET text = $1, importance = $2, confidence = $3, updated_at = $4, expires_at = $5 WHERE note_id = $6", - ) - .bind(¬e.text) - .bind(note.importance) - .bind(note.confidence) - .bind(note.updated_at) - .bind(note.expires_at) - .bind(note.note_id) - .execute(&mut *tx) - .await?; - - crate::insert_version( - &mut tx, - InsertVersionArgs { - note_id: note.note_id, - op: "UPDATE", - prev_snapshot: Some(prev_snapshot), - new_snapshot: Some(crate::note_snapshot(¬e)), - reason: "update", - actor: "update", - ts: note.updated_at, - }, - ) - .await?; - - crate::enqueue_outbox_tx( - &mut tx, - note.note_id, - "UPSERT", - ¬e.embedding_version, - note.updated_at, - ) - .await?; + persist_note_update(&mut tx, ¬e, prev_snapshot, agent_id).await?; tx.commit().await?; Ok(UpdateResponse { note_id: note.note_id, op: NoteOp::Update, reason_code: None }) } } + +fn validate_note_is_updatable( + note: &MemoryNote, + agent_id: &str, + now: OffsetDateTime, +) -> Result<()> { + if note.agent_id != agent_id { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + if !note.status.eq_ignore_ascii_case("active") { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + + if let Some(expires_at) = note.expires_at + && expires_at <= now + { + return Err(Error::InvalidRequest { message: "Note not found.".to_string() }); + } + + Ok(()) +} + +async fn load_note_for_update( + tx: &mut Transaction<'_, Postgres>, + note_id: Uuid, + tenant_id: &str, + project_id: &str, +) -> Result<MemoryNote> { + sqlx::query_as::<_, MemoryNote>( + "\ +SELECT * +FROM memory_notes +WHERE note_id = $1 AND tenant_id = $2 AND project_id IN ($3, $4) +FOR UPDATE", + ) + .bind(note_id) + .bind(tenant_id) + .bind(project_id) + .bind(ORG_PROJECT_ID) + .fetch_optional(&mut **tx) + .await? + .ok_or_else(|| Error::InvalidRequest { message: "Note not found.".to_string() }) +} + +async fn persist_note_update( + tx: &mut Transaction<'_, Postgres>, + note: &MemoryNote, + prev_snapshot: Value, + request_agent_id: &str, +) -> Result<()> { + sqlx::query( + "\ +UPDATE memory_notes +SET + text = $1, + importance = $2, + confidence = $3, + updated_at = $4, + expires_at = $5 +WHERE note_id = $6", + ) + .bind(note.text.as_str()) + .bind(note.importance) + .bind(note.confidence) + .bind(note.updated_at) + .bind(note.expires_at) + .bind(note.note_id) + .execute(&mut **tx) + .await?; + crate::insert_version( + &mut **tx, + InsertVersionArgs { + note_id: note.note_id, + op: "UPDATE", + prev_snapshot: Some(prev_snapshot), + new_snapshot: Some(crate::note_snapshot(note)), + reason: "update", + actor: request_agent_id, + ts: note.updated_at, + }, + ) + .await?; + crate::enqueue_outbox_tx( + &mut **tx, + note.note_id, + "UPSERT", + ¬e.embedding_version, + note.updated_at, + ) + .await?; + + Ok(()) +} diff --git a/packages/elf-service/tests/acceptance.rs b/packages/elf-service/tests/acceptance.rs index c4138064..7d776b15 100644 --- a/packages/elf-service/tests/acceptance.rs +++ b/packages/elf-service/tests/acceptance.rs @@ -1,270 +1,5 @@ -mod chunking { - pub use elf_chunking::ChunkingConfig; -} +#![allow(unused_crate_dependencies)] -#[path = "acceptance/add_note_no_llm.rs"] mod add_note_no_llm; -#[path = "acceptance/chunk_search.rs"] mod chunk_search; -#[path = "acceptance/english_only_boundary.rs"] mod english_only_boundary; -#[path = "acceptance/evidence_binding.rs"] mod evidence_binding; -#[path = "acceptance/idempotency.rs"] mod idempotency; -#[path = "acceptance/outbox_eventual_consistency.rs"] mod outbox_eventual_consistency; -#[path = "acceptance/rebuild_qdrant.rs"] mod rebuild_qdrant; -#[path = "acceptance/sot_vectors.rs"] mod sot_vectors; +//! Acceptance-test entrypoint for the service package. -// std -use std::{ - env, - sync::{ - Arc, - atomic::{AtomicUsize, Ordering}, - }, -}; - -// crates.io -use serde_json::{Map, Value}; - -// self -use elf_service::{ElfService, EmbeddingProvider, ExtractorProvider, Providers, RerankProvider}; -use elf_storage::{db::Db, qdrant::QdrantStore}; -use elf_testkit::TestDatabase; - -pub fn test_qdrant_url() -> Option<String> { - env::var("ELF_QDRANT_URL").ok() -} - -pub async fn test_db() -> Option<elf_testkit::TestDatabase> { - let base_dsn = elf_testkit::env_dsn()?; - let db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - Some(db) -} - -pub fn test_config( - dsn: String, - qdrant_url: String, - vector_dim: u32, - collection: String, -) -> elf_config::Config { - elf_config::Config { - service: elf_config::Service { - http_bind: "127.0.0.1:0".to_string(), - mcp_bind: "127.0.0.1:0".to_string(), - admin_bind: "127.0.0.1:0".to_string(), - log_level: "info".to_string(), - }, - storage: elf_config::Storage { - postgres: elf_config::Postgres { dsn, pool_max_conns: 2 }, - qdrant: elf_config::Qdrant { url: qdrant_url, collection, vector_dim }, - }, - providers: elf_config::Providers { - embedding: dummy_embedding_provider(), - rerank: dummy_provider(), - llm_extractor: dummy_llm_provider(), - }, - scopes: elf_config::Scopes { - allowed: vec![ - "agent_private".to_string(), - "project_shared".to_string(), - "org_shared".to_string(), - ], - read_profiles: elf_config::ReadProfiles { - private_only: vec!["agent_private".to_string()], - private_plus_project: vec![ - "agent_private".to_string(), - "project_shared".to_string(), - ], - all_scopes: vec![ - "agent_private".to_string(), - "project_shared".to_string(), - "org_shared".to_string(), - ], - }, - precedence: elf_config::ScopePrecedence { - agent_private: 30, - project_shared: 20, - org_shared: 10, - }, - write_allowed: elf_config::ScopeWriteAllowed { - agent_private: true, - project_shared: true, - org_shared: true, - }, - }, - memory: elf_config::Memory { - max_notes_per_add_event: 3, - max_note_chars: 240, - dup_sim_threshold: 0.92, - update_sim_threshold: 0.85, - candidate_k: 60, - top_k: 12, - }, - search: elf_config::Search { - expansion: elf_config::SearchExpansion { - mode: "off".to_string(), - max_queries: 4, - include_original: true, - }, - dynamic: elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, - prefilter: elf_config::SearchPrefilter { max_candidates: 0 }, - cache: elf_config::SearchCache { - enabled: true, - expansion_ttl_days: 7, - rerank_ttl_days: 7, - max_payload_bytes: Some(262_144), - expansion_version: "v1".to_string(), - rerank_version: "v1".to_string(), - }, - explain: elf_config::SearchExplain { retention_days: 7 }, - }, - ranking: elf_config::Ranking { recency_tau_days: 60.0, tie_breaker_weight: 0.1 }, - lifecycle: elf_config::Lifecycle { - ttl_days: elf_config::TtlDays { - plan: 14, - fact: 180, - preference: 0, - constraint: 0, - decision: 0, - profile: 0, - }, - purge_deleted_after_days: 30, - purge_deprecated_after_days: 180, - }, - chunking: elf_config::Chunking { - enabled: true, - max_tokens: 512, - overlap_tokens: 128, - tokenizer_repo: None, - }, - security: elf_config::Security { - bind_localhost_only: true, - reject_cjk: true, - redact_secrets_on_write: true, - evidence_min_quotes: 1, - evidence_max_quotes: 2, - evidence_max_quote_chars: 320, - }, - } -} - -pub async fn build_service( - cfg: elf_config::Config, - providers: Providers, -) -> color_eyre::Result<ElfService> { - let db = Db::connect(&cfg.storage.postgres).await?; - db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; - let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; - Ok(ElfService::with_providers(cfg, db, qdrant, providers)) -} - -pub async fn reset_db(pool: &sqlx::PgPool) -> color_eyre::Result<()> { - sqlx::query( - "TRUNCATE memory_hits, memory_note_versions, note_chunk_embeddings, memory_note_chunks, \ - note_embeddings, search_trace_items, search_traces, search_trace_outbox, indexing_outbox, \ - memory_notes", - ) - .execute(pool) - .await?; - Ok(()) -} - -pub struct StubEmbedding { - pub vector_dim: u32, -} - -impl EmbeddingProvider for StubEmbedding { - fn embed<'a>( - &'a self, - _cfg: &'a elf_config::EmbeddingProviderConfig, - texts: &'a [String], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Vec<Vec<f32>>>> { - let dim = self.vector_dim as usize; - let vectors = texts.iter().map(|_| vec![0.0; dim]).collect(); - Box::pin(async move { Ok(vectors) }) - } -} - -pub struct SpyEmbedding { - pub vector_dim: u32, - pub calls: Arc<AtomicUsize>, -} - -impl EmbeddingProvider for SpyEmbedding { - fn embed<'a>( - &'a self, - _cfg: &'a elf_config::EmbeddingProviderConfig, - texts: &'a [String], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Vec<Vec<f32>>>> { - self.calls.fetch_add(1, Ordering::SeqCst); - let dim = self.vector_dim as usize; - let vectors = texts.iter().map(|_| vec![0.0; dim]).collect(); - Box::pin(async move { Ok(vectors) }) - } -} - -pub struct StubRerank; - -impl RerankProvider for StubRerank { - fn rerank<'a>( - &'a self, - _cfg: &'a elf_config::ProviderConfig, - _query: &'a str, - docs: &'a [String], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Vec<f32>>> { - let scores = vec![0.5; docs.len()]; - Box::pin(async move { Ok(scores) }) - } -} - -pub struct SpyExtractor { - pub calls: Arc<AtomicUsize>, - pub payload: Value, -} - -impl ExtractorProvider for SpyExtractor { - fn extract<'a>( - &'a self, - _cfg: &'a elf_config::LlmProviderConfig, - _messages: &'a [Value], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Value>> { - let payload = self.payload.clone(); - self.calls.fetch_add(1, Ordering::SeqCst); - Box::pin(async move { Ok(payload) }) - } -} - -pub fn dummy_embedding_provider() -> elf_config::EmbeddingProviderConfig { - elf_config::EmbeddingProviderConfig { - provider_id: "test".to_string(), - api_base: "http://127.0.0.1:1".to_string(), - api_key: "test-key".to_string(), - path: "/".to_string(), - model: "test".to_string(), - dimensions: 3, - timeout_ms: 1000, - default_headers: Map::new(), - } -} - -pub fn dummy_provider() -> elf_config::ProviderConfig { - elf_config::ProviderConfig { - provider_id: "test".to_string(), - api_base: "http://127.0.0.1:1".to_string(), - api_key: "test-key".to_string(), - path: "/".to_string(), - model: "test".to_string(), - timeout_ms: 1000, - default_headers: Map::new(), - } -} - -pub fn dummy_llm_provider() -> elf_config::LlmProviderConfig { - elf_config::LlmProviderConfig { - provider_id: "test".to_string(), - api_base: "http://127.0.0.1:1".to_string(), - api_key: "test-key".to_string(), - path: "/".to_string(), - model: "test".to_string(), - temperature: 0.1, - timeout_ms: 1000, - default_headers: Map::new(), - } -} +#[path = "acceptance/suite.rs"] mod acceptance; diff --git a/packages/elf-service/tests/acceptance/add_note_no_llm.rs b/packages/elf-service/tests/acceptance/add_note_no_llm.rs index 9df0db92..c0e224f6 100644 --- a/packages/elf-service/tests/acceptance/add_note_no_llm.rs +++ b/packages/elf-service/tests/acceptance/add_note_no_llm.rs @@ -1,41 +1,45 @@ -// std use std::sync::{ Arc, atomic::{AtomicUsize, Ordering}, }; -// crates.io +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; use elf_service::{AddNoteInput, AddNoteRequest, Providers}; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; - #[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run this test."] async fn add_note_does_not_call_llm() { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping add_note_does_not_call_llm; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping add_note_does_not_call_llm; set ELF_QDRANT_URL to run this test."); + return; }; let calls = Arc::new(AtomicUsize::new(0)); let extractor = SpyExtractor { calls: calls.clone(), payload: serde_json::json!({ "notes": [] }) }; let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), + Arc::new(StubEmbedding { vector_dim: 4_096 }), Arc::new(StubRerank), Arc::new(extractor), ); - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); let request = AddNoteRequest { tenant_id: "t".to_string(), @@ -43,17 +47,20 @@ async fn add_note_does_not_call_llm() { agent_id: "a".to_string(), scope: "agent_private".to_string(), notes: vec![AddNoteInput { - note_type: "preference".to_string(), + r#type: "preference".to_string(), key: Some("preferred_language".to_string()), text: "Preference: Use English.".to_string(), + structured: None, importance: 0.5, confidence: 0.9, ttl_days: None, source_ref: serde_json::json!({}), + write_policy: None, }], }; + let _ = service.add_note(request).await.expect("add_note failed."); - service.add_note(request).await.expect("add_note failed."); assert_eq!(calls.load(Ordering::SeqCst), 0); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/chunk_search.rs b/packages/elf-service/tests/acceptance/chunk_search.rs index 68d1a958..867ba014 100644 --- a/packages/elf-service/tests/acceptance/chunk_search.rs +++ b/packages/elf-service/tests/acceptance/chunk_search.rs @@ -1,28 +1,24 @@ -// std use std::{ collections::HashMap, sync::{Arc, atomic::AtomicUsize}, }; -// crates.io use qdrant_client::{ - client::Payload, - qdrant::{ - CreateCollectionBuilder, Distance, Document, Modifier, PointStruct, - SparseVectorParamsBuilder, SparseVectorsConfigBuilder, UpsertPointsBuilder, Vector, - VectorParamsBuilder, VectorsConfigBuilder, - }, + Payload, + qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, }; use serde_json::Value; -use time::OffsetDateTime; +use sqlx::PgExecutor; +use time::{Duration, OffsetDateTime}; use uuid::Uuid; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; use elf_config::ProviderConfig; -use elf_service::{BoxFuture, ElfService, Providers, RerankProvider, SearchRequest}; +use elf_service::{ + BoxFuture, ElfService, NoteFetchResponse, PayloadLevel, Providers, RelationTemporalStatus, + RerankProvider, Result, SearchDetailsRequest, SearchRequest, SearchTimelineRequest, + TraceTrajectoryGetRequest, +}; use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; use elf_testkit::TestDatabase; @@ -35,15 +31,15 @@ struct TestContext { struct KeywordRerank { keyword: &'static str, } - impl RerankProvider for KeywordRerank { fn rerank<'a>( &'a self, _cfg: &'a ProviderConfig, _query: &'a str, docs: &'a [String], - ) -> BoxFuture<'a, color_eyre::Result<Vec<f32>>> { + ) -> BoxFuture<'a, Result<Vec<f32>>> { let keyword = self.keyword; + Box::pin(async move { Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) }) @@ -55,7 +51,7 @@ where R: RerankProvider + Send + Sync + 'static, { Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), + Arc::new(StubEmbedding { vector_dim: 4_096 }), Arc::new(rerank), Arc::new(SpyExtractor { calls: Arc::new(AtomicUsize::new(0)), @@ -64,20 +60,83 @@ where ) } +fn build_payload( + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, +) -> Payload { + let mut payload = Payload::new(); + + payload.insert("note_id", note_id.to_string()); + payload.insert("chunk_id", chunk_id.to_string()); + payload.insert("chunk_index", Value::from(chunk_index)); + payload.insert("start_offset", Value::from(start_offset)); + payload.insert("end_offset", Value::from(end_offset)); + payload.insert("tenant_id", "t"); + payload.insert("project_id", "p"); + payload.insert("agent_id", "a"); + payload.insert("scope", "agent_private"); + payload.insert("status", "active"); + + payload +} + +fn build_vectors(text: &str) -> HashMap<String, Vector> { + let mut vectors = HashMap::new(); + + vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec![0.0_f32; 4_096])); + vectors.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(text.to_string(), BM25_MODEL)), + ); + + vectors +} + +fn build_payload_shape_search_request(payload_level: PayloadLevel) -> SearchRequest { + SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level, + query: "payload".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + } +} + async fn setup_context(test_name: &str, providers: Providers) -> Option<TestContext> { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + return None; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + return None; }; - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + reset_collection(&service).await; let embedding_version = format!( @@ -86,64 +145,165 @@ async fn setup_context(test_name: &str, providers: Providers) -> Option<TestCont service.cfg.providers.embedding.model, service.cfg.storage.qdrant.vector_dim ); + Some(TestContext { service, test_db, embedding_version }) } async fn reset_collection(service: &ElfService) { - let _ = service.qdrant.client.delete_collection(service.qdrant.collection.clone()).await; - let mut vectors_config = VectorsConfigBuilder::default(); - vectors_config - .add_named_vector_params(DENSE_VECTOR_NAME, VectorParamsBuilder::new(3, Distance::Cosine)); - let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); - sparse_vectors_config.add_named_vector_params( - BM25_VECTOR_NAME, - SparseVectorParamsBuilder::default().modifier(Modifier::Idf as i32), - ); - service - .qdrant - .client - .create_collection( - CreateCollectionBuilder::new(service.qdrant.collection.clone()) - .vectors_config(vectors_config) - .sparse_vectors_config(sparse_vectors_config), - ) - .await - .expect("Failed to create Qdrant collection."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); +} + +async fn insert_note<'e, E>(executor: E, note_id: Uuid, note_text: &str, embedding_version: &str) +where + E: PgExecutor<'e>, +{ + insert_note_with_importance_and_source_ref( + executor, + note_id, + note_text, + embedding_version, + 0.4_f32, + 0.9_f32, + "agent_private", + serde_json::json!({}), + ) + .await; } -async fn insert_note(pool: &sqlx::PgPool, note_id: Uuid, note_text: &str, embedding_version: &str) { +async fn insert_note_with_importance<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, + importance: f32, + confidence: f32, + scope: &str, +) where + E: PgExecutor<'e>, +{ + insert_note_with_importance_and_source_ref( + executor, + note_id, + note_text, + embedding_version, + importance, + confidence, + scope, + serde_json::json!({}), + ) + .await; +} + +#[allow(clippy::too_many_arguments)] +async fn insert_note_with_importance_and_source_ref<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, + importance: f32, + confidence: f32, + scope: &str, + source_ref: Value, +) where + E: PgExecutor<'e>, +{ let now = OffsetDateTime::now_utc(); + sqlx::query( - "INSERT INTO memory_notes \ - (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", ) .bind(note_id) .bind("t") .bind("p") .bind("a") - .bind("agent_private") + .bind(scope) .bind("fact") - .bind::<Option<String>>(None) + .bind(Option::<String>::None) .bind(note_text) - .bind(0.4_f32) - .bind(0.9_f32) + .bind(importance) + .bind(confidence) .bind("active") .bind(now) .bind(now) - .bind::<Option<OffsetDateTime>>(None) + .bind(Option::<OffsetDateTime>::None) .bind(embedding_version) - .bind(serde_json::json!({})) + .bind(source_ref) .bind(0_i64) - .bind::<Option<OffsetDateTime>>(None) - .execute(pool) + .bind(Option::<OffsetDateTime>::None) + .execute(executor) .await .expect("Failed to insert memory note."); } #[allow(clippy::too_many_arguments)] -async fn insert_chunk( - pool: &sqlx::PgPool, +async fn insert_summary_field_row<'e, E>(executor: E, field_id: Uuid, note_id: Uuid, summary: &str) +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) +VALUES ($1, $2, $3, $4, $5)", + ) + .bind(field_id) + .bind(note_id) + .bind("summary") + .bind(0_i32) + .bind(summary) + .execute(executor) + .await + .expect("Failed to insert note summary field."); +} + +#[allow(clippy::too_many_arguments)] +async fn insert_chunk<'e, E>( + executor: E, chunk_id: Uuid, note_id: Uuid, chunk_index: i32, @@ -151,11 +311,21 @@ async fn insert_chunk( end_offset: i32, text: &str, embedding_version: &str, -) { +) where + E: PgExecutor<'e>, +{ sqlx::query( - "INSERT INTO memory_note_chunks \ - (chunk_id, note_id, chunk_index, start_offset, end_offset, text, embedding_version) \ - VALUES ($1,$2,$3,$4,$5,$6,$7)", + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", ) .bind(chunk_id) .bind(note_id) @@ -164,42 +334,11 @@ async fn insert_chunk( .bind(end_offset) .bind(text) .bind(embedding_version) - .execute(pool) + .execute(executor) .await .expect("Failed to insert chunk metadata."); } -fn build_payload( - note_id: Uuid, - chunk_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, -) -> Payload { - let mut payload = Payload::new(); - payload.insert("note_id", note_id.to_string()); - payload.insert("chunk_id", chunk_id.to_string()); - payload.insert("chunk_index", Value::from(chunk_index)); - payload.insert("start_offset", Value::from(start_offset)); - payload.insert("end_offset", Value::from(end_offset)); - payload.insert("tenant_id", "t"); - payload.insert("project_id", "p"); - payload.insert("agent_id", "a"); - payload.insert("scope", "agent_private"); - payload.insert("status", "active"); - payload -} - -fn build_vectors(text: &str) -> HashMap<String, Vector> { - let mut vectors = HashMap::new(); - vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec![0.0; 3])); - vectors.insert( - BM25_VECTOR_NAME.to_string(), - Vector::from(Document::new(text.to_string(), BM25_MODEL)), - ); - vectors -} - async fn upsert_point( service: &ElfService, chunk_id: Uuid, @@ -212,6 +351,7 @@ async fn upsert_point( let payload = build_payload(note_id, chunk_id, chunk_index, start_offset, end_offset); let vectors = build_vectors(text); let point = PointStruct::new(chunk_id.to_string(), vectors, payload); + service .qdrant .client @@ -222,6 +362,314 @@ async fn upsert_point( .expect("Failed to upsert Qdrant point."); } +async fn fetch_raw_source_ref_for_level( + context: &TestContext, + note_id: Uuid, + payload_level: PayloadLevel, +) -> Value { + let response = context + .service + .search_raw(build_payload_shape_search_request(payload_level)) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + + assert_eq!(item.note_id, note_id); + + item.source_ref.clone() +} + +async fn fetch_search_detail_note_for_level( + context: &TestContext, + search_session_id: Uuid, + note_id: Uuid, + payload_level: PayloadLevel, +) -> NoteFetchResponse { + let response = context + .service + .search_details(SearchDetailsRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + search_session_id, + payload_level, + note_ids: vec![note_id], + record_hits: Some(false), + }) + .await + .expect("Search details failed."); + + response + .results + .first() + .and_then(|item| item.note.as_ref()) + .expect("Expected note details.") + .clone() +} + +async fn insert_graph_entity<'e, E>( + executor: E, + entity_id: Uuid, + canonical: &str, + kind: Option<&str>, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_entities ( + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind +) +VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(entity_id) + .bind("t") + .bind("p") + .bind(canonical) + .bind(canonical.to_lowercase()) + .bind(kind) + .execute(executor) + .await + .expect("Failed to insert graph entity."); +} + +async fn insert_graph_predicate<'e, E>(executor: E, predicate_id: Uuid, canonical: &str) +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_predicates ( + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status +) +VALUES ($1, $2, $3, $4, $5, $6, 'single', 'active')", + ) + .bind(predicate_id) + .bind("__project__:p") + .bind("t") + .bind("p") + .bind(canonical) + .bind(canonical.to_lowercase()) + .execute(executor) + .await + .expect("Failed to insert graph predicate."); +} + +#[allow(clippy::too_many_arguments)] +async fn insert_graph_fact<'e, E>( + executor: E, + fact_id: Uuid, + subject_entity_id: Uuid, + predicate: &str, + predicate_id: Uuid, + object_value: &str, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, NULL, $9, $10, $11)", + ) + .bind(fact_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind(subject_entity_id) + .bind(predicate) + .bind(predicate_id) + .bind(object_value) + .bind(valid_from) + .bind(valid_to) + .execute(executor) + .await + .expect("Failed to insert graph fact."); +} + +async fn insert_graph_fact_evidence<'e, E>( + executor: E, + fact_id: Uuid, + note_id: Uuid, + created_at: OffsetDateTime, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) +VALUES ($1, $2, $3, $4)", + ) + .bind(Uuid::new_v4()) + .bind(fact_id) + .bind(note_id) + .bind(created_at) + .execute(executor) + .await + .expect("Failed to insert graph fact evidence."); +} + +async fn setup_graph_context_test( + test_name: &str, + providers: Providers, + max_facts_per_item: u32, + max_evidence_notes_per_fact: u32, +) -> Option<TestContext> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let mut cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_facts_per_item = max_facts_per_item; + cfg.search.graph_context.max_evidence_notes_per_fact = max_evidence_notes_per_fact; + + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + reset_collection(&service).await; + + let embedding_version = format!( + "{}:{}:{}", + service.cfg.providers.embedding.provider_id, + service.cfg.providers.embedding.model, + service.cfg.storage.qdrant.vector_dim + ); + + Some(TestContext { service, test_db, embedding_version }) +} + +async fn seed_relation_context_fixture( + service: &ElfService, + embedding_version: &str, +) -> (Uuid, Uuid, Uuid) { + let now = OffsetDateTime::now_utc(); + let note_id = Uuid::new_v4(); + let note_id_2 = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let chunk_text = "Alice mentors Bob about projects and priorities."; + let subject_id = Uuid::new_v4(); + let newer_fact_id = Uuid::new_v4(); + let predicate_id = Uuid::new_v4(); + let older_fact_id = Uuid::new_v4(); + let older_fact_valid_from = now - Duration::seconds(10); + let newer_fact_valid_from = now - Duration::seconds(5); + let note_1_evidence_created_at = now - Duration::seconds(30); + let note_2_evidence_created_at = now - Duration::seconds(10); + + insert_note(&service.db.pool, note_id, chunk_text, embedding_version).await; + insert_note( + &service.db.pool, + note_id_2, + "Second note for evidence ordering.", + embedding_version, + ) + .await; + insert_chunk( + &service.db.pool, + chunk_id, + note_id, + 0, + 0, + chunk_text.len() as i32, + chunk_text, + embedding_version, + ) + .await; + upsert_point(service, chunk_id, note_id, 0, 0, chunk_text.len() as i32, chunk_text).await; + insert_graph_entity(&service.db.pool, subject_id, "Alice", Some("person")).await; + insert_graph_predicate(&service.db.pool, predicate_id, "mentors").await; + insert_graph_fact( + &service.db.pool, + older_fact_id, + subject_id, + "mentors", + predicate_id, + "Bob", + older_fact_valid_from, + Some(newer_fact_valid_from), + ) + .await; + insert_graph_fact_evidence( + &service.db.pool, + older_fact_id, + note_id, + note_1_evidence_created_at, + ) + .await; + insert_graph_fact( + &service.db.pool, + newer_fact_id, + subject_id, + "mentors", + predicate_id, + "Carol", + newer_fact_valid_from, + None, + ) + .await; + insert_graph_fact_evidence( + &service.db.pool, + newer_fact_id, + note_id, + note_1_evidence_created_at, + ) + .await; + insert_graph_fact_evidence( + &service.db.pool, + newer_fact_id, + note_id_2, + note_2_evidence_created_at, + ) + .await; + + (note_id, newer_fact_id, older_fact_id) +} + #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn search_returns_chunk_items() { @@ -229,10 +677,10 @@ async fn search_returns_chunk_items() { let Some(context) = setup_context("search_returns_chunk_items", providers).await else { return; }; - let note_id = Uuid::new_v4(); let chunk_id = Uuid::new_v4(); let note_text = "First sentence. Second sentence."; + insert_note(&context.service.db.pool, note_id, note_text, &context.embedding_version).await; insert_chunk( &context.service.db.pool, @@ -250,20 +698,24 @@ async fn search_returns_chunk_items() { let response = context .service - .search(SearchRequest { + .search_raw(SearchRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), + token_id: None, read_profile: "private_only".to_string(), + payload_level: Default::default(), query: "First".to_string(), top_k: Some(5), candidate_k: Some(10), + filter: None, record_hits: Some(false), + ranking: None, }) .await .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); + assert_eq!(item.chunk_id, chunk_id); assert!(!item.snippet.is_empty()); @@ -272,23 +724,141 @@ async fn search_returns_chunk_items() { #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_stitches_adjacent_chunks() { +async fn search_raw_quick_includes_relation_context_and_respects_fact_bounds() { let providers = build_providers(StubRerank); - let Some(context) = setup_context("search_stitches_adjacent_chunks", providers).await else { + let Some(context) = setup_graph_context_test( + "search_raw_quick_includes_relation_context_and_respects_fact_bounds", + providers, + 1, + 1, + ) + .await + else { return; }; - - let note_id = Uuid::new_v4(); - let chunk_texts = ["First sentence. ", "Second sentence. ", "Third sentence."]; - let note_text = chunk_texts.concat(); - insert_note(&context.service.db.pool, note_id, ¬e_text, &context.embedding_version).await; - - let mut offset = 0_i32; - let mut chunk_ids = Vec::new(); - for (index, chunk_text) in chunk_texts.iter().enumerate() { - let chunk_id = Uuid::new_v4(); - let start = offset; + let fixture = seed_relation_context_fixture(&context.service, &context.embedding_version).await; + let note_id = fixture.0; + let newer_fact_id = fixture.1; + let response = context + .service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Alice".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + let relation_context = item + .explain + .relation_context + .as_ref() + .expect("Expected relation context in search explain."); + + assert_eq!(relation_context.len(), 1, "Expected relation context to be truncated to one fact."); + assert_eq!( + relation_context[0].fact_id, newer_fact_id, + "Expected the most recent fact after truncation." + ); + assert_eq!(relation_context[0].object.value.as_deref(), Some("Carol")); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + assert!(relation_context[0].valid_to.is_none()); + assert_eq!(relation_context[0].evidence_note_ids.len(), 1); + assert_eq!(relation_context[0].evidence_note_ids[0], note_id); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_raw_quick_marks_historical_relation_context() { + let providers = build_providers(StubRerank); + let Some(context) = setup_graph_context_test( + "search_raw_quick_marks_historical_relation_context", + providers, + 2, + 2, + ) + .await + else { + return; + }; + let fixture = seed_relation_context_fixture(&context.service, &context.embedding_version).await; + let older_fact_id = fixture.2; + let response = context + .service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Alice".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + let relation_context = item + .explain + .relation_context + .as_ref() + .expect("Expected relation context in search explain."); + + assert_eq!( + relation_context.len(), + 2, + "Expected current and historical relation facts in context.", + ); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + + let historical = relation_context + .iter() + .find(|context| context.fact_id == older_fact_id) + .expect("Expected historical fact in relation context."); + + assert_eq!(historical.object.value.as_deref(), Some("Bob")); + assert_eq!(historical.temporal_status, RelationTemporalStatus::Historical); + assert!(historical.valid_to.is_some()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_stitches_adjacent_chunks() { + let providers = build_providers(StubRerank); + let Some(context) = setup_context("search_stitches_adjacent_chunks", providers).await else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_texts = ["First sentence. ", "Second sentence. ", "Third sentence."]; + let note_text = chunk_texts.concat(); + + insert_note(&context.service.db.pool, note_id, ¬e_text, &context.embedding_version).await; + + let mut offset = 0_i32; + let mut chunk_ids = Vec::new(); + + for (index, chunk_text) in chunk_texts.iter().enumerate() { + let chunk_id = Uuid::new_v4(); + let start = offset; let end = start + chunk_text.len() as i32; + insert_chunk( &context.service.db.pool, chunk_id, @@ -300,29 +870,36 @@ async fn search_stitches_adjacent_chunks() { &context.embedding_version, ) .await; + chunk_ids.push((chunk_id, start, end, *chunk_text)); + offset = end; } let (chunk_id, start, end, text) = chunk_ids[1]; + upsert_point(&context.service, chunk_id, note_id, 1, start, end, text).await; let response = context .service - .search(SearchRequest { + .search_raw(SearchRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), + token_id: None, read_profile: "private_only".to_string(), + payload_level: Default::default(), query: "Second".to_string(), top_k: Some(5), candidate_k: Some(10), + filter: None, record_hits: Some(false), + ranking: None, }) .await .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); + assert_eq!(item.chunk_id, chunk_id); assert!(item.snippet.contains("First sentence.")); assert!(item.snippet.contains("Second sentence.")); @@ -339,26 +916,29 @@ async fn search_skips_missing_chunk_metadata() { else { return; }; - let note_id = Uuid::new_v4(); let chunk_id = Uuid::new_v4(); let note_text = "Missing chunk metadata."; - insert_note(&context.service.db.pool, note_id, note_text, &context.embedding_version).await; + insert_note(&context.service.db.pool, note_id, note_text, &context.embedding_version).await; upsert_point(&context.service, chunk_id, note_id, 0, 0, note_text.len() as i32, note_text) .await; let response = context .service - .search(SearchRequest { + .search_raw(SearchRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), + token_id: None, read_profile: "private_only".to_string(), + payload_level: Default::default(), query: "Missing".to_string(), top_k: Some(5), candidate_k: Some(10), + filter: None, record_hits: Some(false), + ranking: None, }) .await .expect("Search failed."); @@ -368,6 +948,274 @@ async fn search_skips_missing_chunk_metadata() { context.test_db.cleanup().await.expect("Failed to cleanup test database."); } +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn progressive_search_returns_index_timeline_and_details() { + let providers = build_providers(StubRerank); + let Some(context) = + setup_context("progressive_search_returns_index_timeline_and_details", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let note_text = "Progressive retrieval works best with staged expansion."; + + insert_note(&context.service.db.pool, note_id, note_text, &context.embedding_version).await; + insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + &context.embedding_version, + ) + .await; + upsert_point(&context.service, chunk_id, note_id, 0, 0, note_text.len() as i32, note_text) + .await; + + let index = context + .service + .search(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Progressive".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search index failed."); + + assert!(!index.items.is_empty()); + + let timeline = context + .service + .search_timeline(SearchTimelineRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + search_session_id: index.search_session_id, + payload_level: Default::default(), + group_by: None, + }) + .await + .expect("Search timeline failed."); + + assert!(!timeline.groups.is_empty()); + + let details = context + .service + .search_details(SearchDetailsRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + search_session_id: index.search_session_id, + payload_level: Default::default(), + note_ids: vec![note_id], + record_hits: Some(false), + }) + .await + .expect("Search details failed."); + let returned = details + .results + .first() + .and_then(|result| result.note.as_ref()) + .expect("Expected note details."); + + assert_eq!(returned.note_id, note_id); + assert_eq!(returned.text, note_text); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_raw_payload_level_shapes_source_ref() { + let providers = build_providers(StubRerank); + let Some(context) = + setup_context("search_raw_payload_level_shapes_source_ref", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let note_text = "Payload shaping should control the raw item source_ref payload."; + let source_ref = serde_json::json!({ + "schema": "note_source_ref/v1", + "locator": { + "doc_id": Uuid::new_v4().to_string(), + "chunk_id": Uuid::new_v4().to_string() + }, + "metadata": { + "long_field": "A long metadata body to represent a heavy source reference shape." + } + }); + + insert_note_with_importance_and_source_ref( + &context.service.db.pool, + note_id, + note_text, + &context.embedding_version, + 0.9_f32, + 1.0, + "agent_private", + source_ref.clone(), + ) + .await; + insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + &context.embedding_version, + ) + .await; + upsert_point(&context.service, chunk_id, note_id, 0, 0, note_text.len() as i32, note_text) + .await; + + let l0 = fetch_raw_source_ref_for_level(&context, note_id, PayloadLevel::L0).await; + let l1 = fetch_raw_source_ref_for_level(&context, note_id, PayloadLevel::L1).await; + let l2 = fetch_raw_source_ref_for_level(&context, note_id, PayloadLevel::L2).await; + + assert_eq!(l0, serde_json::json!({})); + assert_eq!(l1, serde_json::json!({})); + assert_eq!(l2, source_ref); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_details_payload_level_shapes_text_and_fields() { + let providers = build_providers(StubRerank); + let Some(context) = + setup_context("search_details_payload_level_shapes_text_and_fields", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let max_note_chars = context.service.cfg.memory.max_note_chars as usize; + let note_text_seed = + "This is the long note body used for detail shaping and payload truncation. "; + let note_text = note_text_seed.repeat((max_note_chars / note_text_seed.len()) + 2); + let source_ref = serde_json::json!({ + "schema": "note_source_ref/v1", + "locator": { + "document_id": Uuid::new_v4().to_string(), + "chunk_id": Uuid::new_v4().to_string(), + "extra": "field with rich details for l2 retention" + }, + }); + let structured_summary = "Structured summary about payload levels and compact text behavior."; + let field_id = Uuid::new_v4(); + + assert!(note_text.len() > max_note_chars); + + insert_note_with_importance_and_source_ref( + &context.service.db.pool, + note_id, + note_text.as_str(), + &context.embedding_version, + 0.8_f32, + 1.0, + "agent_private", + source_ref.clone(), + ) + .await; + insert_summary_field_row(&context.service.db.pool, field_id, note_id, structured_summary).await; + insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text.as_str(), + &context.embedding_version, + ) + .await; + upsert_point( + &context.service, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text.as_str(), + ) + .await; + + let index = context + .service + .search(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: PayloadLevel::L2, + query: "payload".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search index failed."); + let l0 = fetch_search_detail_note_for_level( + &context, + index.search_session_id, + note_id, + PayloadLevel::L0, + ) + .await; + let l1 = fetch_search_detail_note_for_level( + &context, + index.search_session_id, + note_id, + PayloadLevel::L1, + ) + .await; + let l2 = fetch_search_detail_note_for_level( + &context, + index.search_session_id, + note_id, + PayloadLevel::L2, + ) + .await; + + assert!(l0.text.chars().count() <= max_note_chars + 3); + assert!(l1.text.chars().count() <= max_note_chars + 3); + assert!(l0.text.ends_with("...")); + assert_eq!(l2.text, note_text); + assert_ne!(l0.text, l1.text); + assert_ne!(l0.text, note_text); + assert_ne!(l1.text, note_text); + assert!(l1.text.contains("Structured summary")); + assert_eq!(l0.source_ref, serde_json::json!({})); + assert_eq!(l1.source_ref, serde_json::json!({})); + assert_eq!(l2.source_ref, source_ref); + assert!(l0.structured.is_none()); + assert!(l1.structured.is_some()); + assert!(l2.structured.is_some()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn search_dedupes_note_results() { @@ -375,18 +1223,20 @@ async fn search_dedupes_note_results() { let Some(context) = setup_context("search_dedupes_note_results", providers).await else { return; }; - let note_id = Uuid::new_v4(); let chunk_texts = ["preferred alpha. ", "bridge chunk. ", "other alpha."]; let note_text = chunk_texts.concat(); + insert_note(&context.service.db.pool, note_id, ¬e_text, &context.embedding_version).await; let mut offset = 0_i32; let mut chunk_ids = Vec::new(); + for (index, chunk_text) in chunk_texts.iter().enumerate() { let chunk_id = Uuid::new_v4(); let start = offset; let end = start + chunk_text.len() as i32; + insert_chunk( &context.service.db.pool, chunk_id, @@ -398,33 +1248,223 @@ async fn search_dedupes_note_results() { &context.embedding_version, ) .await; + chunk_ids.push((chunk_id, start, end, *chunk_text)); + offset = end; } let (chunk_id_a, start_a, end_a, text_a) = chunk_ids[0]; let (chunk_id_c, start_c, end_c, text_c) = chunk_ids[2]; + upsert_point(&context.service, chunk_id_a, note_id, 0, start_a, end_a, text_a).await; upsert_point(&context.service, chunk_id_c, note_id, 2, start_c, end_c, text_c).await; let response = context .service - .search(SearchRequest { + .search_raw(SearchRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), + token_id: None, read_profile: "private_only".to_string(), + payload_level: Default::default(), query: "alpha".to_string(), top_k: Some(5), candidate_k: Some(10), + filter: None, record_hits: Some(false), + ranking: None, }) .await .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); + assert_eq!(response.items.len(), 1); - assert_eq!(item.chunk_id, chunk_id_a); + assert_eq!(item.note_id, note_id); + assert!( + item.chunk_id == chunk_id_a || item.chunk_id == chunk_id_c, + "Expected deduped result chunk_id to be one of the ingested chunks." + ); context.test_db.cleanup().await.expect("Failed to cleanup test database."); } + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run this test."] +async fn search_filter_affects_candidate_set_and_records_filter_impact() { + let provider = build_providers(StubRerank); + let low_note_text = "alpha low confidence note"; + let high_note_text = "alpha high confidence note"; + let low_note_id = Uuid::new_v4(); + let high_note_id = Uuid::new_v4(); + let low_chunk_id = Uuid::new_v4(); + let high_chunk_id = Uuid::new_v4(); + let mut context = match setup_context( + "search_filter_affects_candidate_set_and_records_filter_impact", + provider, + ) + .await + { + Some(context) => context, + None => return, + }; + + context.service.cfg.search.explain.write_mode = "inline".to_string(); + + seed_filter_impact_notes( + &context, + low_note_id, + high_note_id, + low_chunk_id, + high_chunk_id, + low_note_text, + high_note_text, + ) + .await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "alpha".to_string(), + top_k: Some(1), + candidate_k: Some(10), + filter: Some(serde_json::json!({ + "schema": "search_filter_expr/v1", + "expr": { "op": "gte", "field": "importance", "value": 0.5 }, + })), + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + + assert_eq!(response.items.len(), 1); + assert_eq!(response.items[0].note_id, high_note_id); + + let filter_impact = load_filter_impact_from_trace(&context, response.trace_id).await; + let filter = filter_impact.get("filter").expect("Expected filter object in filter_impact."); + let requested_candidate_k = filter_impact + .get("requested_candidate_k") + .and_then(Value::as_u64) + .expect("Expected requested_candidate_k."); + let effective_candidate_k = filter_impact + .get("effective_candidate_k") + .and_then(Value::as_u64) + .expect("Expected effective_candidate_k."); + + assert_eq!( + filter_impact.get("schema"), + Some(&Value::String("search_filter_impact/v1".to_string())) + ); + assert_eq!(requested_candidate_k, 10); + assert_eq!(effective_candidate_k, 30); + assert_eq!(filter.get("schema"), Some(&Value::String("search_filter_expr/v1".to_string()))); + assert_eq!(filter_impact.get("candidate_count_pre"), Some(&Value::from(2_u64))); + assert_eq!(filter_impact.get("candidate_count_post"), Some(&Value::from(1_u64))); + assert_eq!(filter_impact.get("dropped_total"), Some(&Value::from(1_u64))); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn seed_filter_impact_notes( + context: &TestContext, + low_note_id: Uuid, + high_note_id: Uuid, + low_chunk_id: Uuid, + high_chunk_id: Uuid, + low_note_text: &str, + high_note_text: &str, +) { + insert_note_with_importance( + &context.service.db.pool, + low_note_id, + low_note_text, + &context.embedding_version, + 0.2, + 0.2, + "agent_private", + ) + .await; + insert_note_with_importance( + &context.service.db.pool, + high_note_id, + high_note_text, + &context.embedding_version, + 0.9, + 0.9, + "agent_private", + ) + .await; + insert_chunk( + &context.service.db.pool, + low_chunk_id, + low_note_id, + 0, + 0, + low_note_text.len() as i32, + low_note_text, + &context.embedding_version, + ) + .await; + insert_chunk( + &context.service.db.pool, + high_chunk_id, + high_note_id, + 0, + 0, + high_note_text.len() as i32, + high_note_text, + &context.embedding_version, + ) + .await; + upsert_point( + &context.service, + low_chunk_id, + low_note_id, + 0, + 0, + low_note_text.len() as i32, + low_note_text, + ) + .await; + upsert_point( + &context.service, + high_chunk_id, + high_note_id, + 0, + 0, + high_note_text.len() as i32, + high_note_text, + ) + .await; +} + +async fn load_filter_impact_from_trace(context: &TestContext, trace_id: Uuid) -> Value { + let trajectory = context + .service + .trace_trajectory_get(TraceTrajectoryGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + trace_id, + }) + .await + .expect("Failed to fetch trace trajectory."); + + trajectory + .stages + .iter() + .find(|stage| stage.stage_name == "recall.candidates") + .expect("Expected recall.candidates stage.") + .stage_payload + .get("filter_impact") + .expect("Expected filter_impact in recall stage.") + .clone() +} diff --git a/packages/elf-service/tests/acceptance/chunking.rs b/packages/elf-service/tests/acceptance/chunking.rs new file mode 100644 index 00000000..c5e8d276 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunking.rs @@ -0,0 +1 @@ +pub use elf_chunking::ChunkingConfig; diff --git a/packages/elf-service/tests/acceptance/consolidation.rs b/packages/elf-service/tests/acceptance/consolidation.rs new file mode 100644 index 00000000..696776e0 --- /dev/null +++ b/packages/elf-service/tests/acceptance/consolidation.rs @@ -0,0 +1,380 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_chunking::ChunkingConfig; +use elf_domain::consolidation::{ + ConsolidationApplyIntent, ConsolidationInputRef, ConsolidationLineage, ConsolidationMarker, + ConsolidationMarkerSeverity, ConsolidationMarkers, ConsolidationProposalDiff, + ConsolidationReviewAction, ConsolidationSourceKind, ConsolidationSourceSnapshot, + ConsolidationUnsupportedClaimFlag, +}; +use elf_service::{ + AddNoteInput, AddNoteRequest, ConsolidationProposalGetRequest, ConsolidationProposalInput, + ConsolidationProposalReviewRequest, ConsolidationProposalsListRequest, + ConsolidationProposalsListResponse, ConsolidationRunCreateRequest, + ConsolidationRunCreateResponse, ConsolidationRunGetRequest, ElfService, Providers, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const TENANT_ID: &str = "tenant_consolidation"; +const PROJECT_ID: &str = "project_consolidation"; +const AGENT_ID: &str = "agent_consolidation"; + +struct ConsolidationFixture { + service: ElfService, + _test_db: TestDatabase, +} + +fn source_ref(note_id: Uuid) -> ConsolidationInputRef { + ConsolidationInputRef { + kind: ConsolidationSourceKind::Note, + id: note_id, + snapshot: ConsolidationSourceSnapshot { + status: Some("active".to_string()), + updated_at: Some(OffsetDateTime::UNIX_EPOCH), + content_hash: Some("blake3:acceptance-source".to_string()), + embedding_version: Some("test:test:4096".to_string()), + trace_version: None, + source_ref: serde_json::json!({ "schema": "acceptance/v1" }), + metadata: serde_json::json!({ "fixture": "consolidation" }), + }, + } +} + +fn lineage(source: &ConsolidationInputRef) -> ConsolidationLineage { + ConsolidationLineage { + source_refs: vec![source.clone()], + parent_run_id: None, + parent_proposal_ids: Vec::new(), + } +} + +fn proposal_input(source: &ConsolidationInputRef, kind: &str) -> ConsolidationProposalInput { + ConsolidationProposalInput { + proposal_kind: kind.to_string(), + apply_intent: ConsolidationApplyIntent::CreateDerivedNote, + source_refs: vec![source.clone()], + source_snapshot: serde_json::json!({ "source_count": 1 }), + lineage: lineage(source), + confidence: 0.82, + unsupported_claim_flags: vec![ConsolidationUnsupportedClaimFlag { + claim_id: Some("unsupported-claim".to_string()), + message: "The source does not prove that source notes may be rewritten.".to_string(), + source: Some(source.clone()), + }], + markers: ConsolidationMarkers { + contradictions: vec![ConsolidationMarker { + severity: ConsolidationMarkerSeverity::High, + message: "Stale rewrite evidence conflicts with the proposal-only rule." + .to_string(), + source: Some(source.clone()), + }], + staleness: Vec::new(), + }, + diff: ConsolidationProposalDiff { + summary: "Create a reviewed derived note without changing source evidence.".to_string(), + before: serde_json::json!({}), + after: serde_json::json!({ + "target": "derived_note", + "text": "Fact: Consolidation proposals are derived and reviewable." + }), + }, + target_ref: serde_json::json!({}), + proposed_payload: serde_json::json!({ + "type": "fact", + "text": "Fact: Consolidation proposals are derived and reviewable." + }), + } +} + +fn proposal_id_by_kind(response: &ConsolidationProposalsListResponse, proposal_kind: &str) -> Uuid { + response + .proposals + .iter() + .find(|proposal| proposal.proposal_kind == proposal_kind) + .map(|proposal| proposal.proposal_id) + .expect("proposal kind should be present") +} + +async fn setup_service(test_name: &str) -> Option<ConsolidationFixture> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let extractor = SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + Some(ConsolidationFixture { service, _test_db: test_db }) +} + +async fn insert_source_note(service: &ElfService, key: &str, text: &str) -> Uuid { + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(key.to_string()), + text: text.to_string(), + structured: None, + importance: 0.7, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({ "schema": "acceptance/v1", "key": key }), + write_policy: None, + }], + }) + .await + .expect("add_note should persist source note"); + + response.results[0].note_id.expect("source note id should be present") +} + +async fn create_run_with_proposals( + service: &ElfService, + source: &ConsolidationInputRef, + proposals: Vec<ConsolidationProposalInput>, +) -> ConsolidationRunCreateResponse { + service + .consolidation_run_create(ConsolidationRunCreateRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + job_kind: "manual".to_string(), + input_refs: vec![source.clone()], + source_snapshot: serde_json::json!({ "source_count": 1 }), + lineage: lineage(source), + proposals, + }) + .await + .expect("consolidation run should be created") +} + +async fn process_consolidation_worker(service: &ElfService) { + let tokenizer = elf_chunking::load_tokenizer(&service.cfg.chunking.tokenizer_repo) + .expect("worker tokenizer should load"); + let mut embedding = acceptance::dummy_embedding_provider(); + + embedding.dimensions = service.cfg.storage.qdrant.vector_dim; + + let worker_state = WorkerState { + db: Db::connect(&service.cfg.storage.postgres).await.expect("Failed to connect worker DB."), + qdrant: QdrantStore::new(&service.cfg.storage.qdrant) + .expect("Failed to build Qdrant store."), + docs_qdrant: QdrantStore::new_with_collection( + &service.cfg.storage.qdrant, + &service.cfg.storage.qdrant.docs_collection, + ) + .expect("Failed to build docs Qdrant store."), + embedding, + chunking: ChunkingConfig { + max_tokens: service.cfg.chunking.max_tokens, + overlap_tokens: service.cfg.chunking.overlap_tokens, + }, + tokenizer, + }; + + worker::process_once(&worker_state).await.expect("consolidation worker should process once"); +} + +async fn materialized_proposals( + service: &ElfService, + run_id: Uuid, +) -> ConsolidationProposalsListResponse { + service + .consolidation_proposals_list(ConsolidationProposalsListRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + run_id: Some(run_id), + review_state: None, + limit: None, + }) + .await + .expect("consolidation proposals should be listed") +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run this test."] +async fn apply_action_is_audited_without_source_rewrite() { + let Some(fixture) = setup_service("apply_action_is_audited_without_source_rewrite").await + else { + return; + }; + let service = &fixture.service; + let source_text = + "Fact: Current consolidation output is derived and never rewrites source notes."; + let note_id = insert_source_note(service, "consolidation_source_rule", source_text).await; + let source = source_ref(note_id); + let created = + create_run_with_proposals(service, &source, vec![proposal_input(&source, "derived_note")]) + .await; + + assert_eq!(created.run.status, "pending"); + assert!(created.proposals.is_empty()); + + process_consolidation_worker(service).await; + + let completed = service + .consolidation_run_get(ConsolidationRunGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + run_id: created.run.run_id, + }) + .await + .expect("consolidation run should remain readable"); + let materialized = materialized_proposals(service, created.run.run_id).await; + let proposal = &materialized.proposals[0]; + let job_status: String = + sqlx::query_scalar("SELECT status FROM consolidation_run_jobs WHERE job_id = $1") + .bind(created.job_id) + .fetch_one(&service.db.pool) + .await + .expect("consolidation job should be queryable"); + + assert_eq!(completed.status, "completed"); + assert_eq!(job_status, "DONE"); + assert_eq!(materialized.proposals.len(), 1); + assert_eq!(proposal.review_state, "proposed"); + assert_eq!(proposal.unsupported_claim_flags.as_array().map(Vec::len), Some(1)); + assert_eq!(proposal.contradiction_markers.as_array().map(Vec::len), Some(1)); + + let reviewed = service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + reviewer_agent_id: AGENT_ID.to_string(), + proposal_id: proposal.proposal_id, + review_action: ConsolidationReviewAction::Apply, + review_comment: Some("Apply reviewed derived proposal.".to_string()), + }) + .await + .expect("review action should apply"); + + assert_eq!(reviewed.review_state, "applied"); + assert_eq!(reviewed.review_events.len(), 2); + assert_eq!(reviewed.review_events[0].action, "approve"); + assert_eq!(reviewed.review_events[0].from_review_state, "proposed"); + assert_eq!(reviewed.review_events[0].to_review_state, "approved"); + assert_eq!(reviewed.review_events[1].action, "apply"); + assert_eq!(reviewed.review_events[1].from_review_state, "approved"); + assert_eq!(reviewed.review_events[1].to_review_state, "applied"); + + let stored_text: String = + sqlx::query_scalar("SELECT text FROM memory_notes WHERE note_id = $1") + .bind(note_id) + .fetch_one(&service.db.pool) + .await + .expect("source note should still exist"); + let version_count: i64 = + sqlx::query_scalar("SELECT count(*) FROM memory_note_versions WHERE note_id = $1") + .bind(note_id) + .fetch_one(&service.db.pool) + .await + .expect("source note versions should be queryable"); + + assert_eq!(stored_text, source_text); + assert_eq!(version_count, 1); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run this test."] +async fn discard_and_defer_actions_remain_auditable() { + let Some(fixture) = setup_service("discard_and_defer_actions_remain_auditable").await else { + return; + }; + let service = &fixture.service; + let note_id = insert_source_note( + service, + "consolidation_review_actions", + "Fact: Discarded and deferred proposals remain auditable.", + ) + .await; + let source = source_ref(note_id); + let created = create_run_with_proposals( + service, + &source, + vec![ + proposal_input(&source, "contradiction_report"), + proposal_input(&source, "preference_candidate"), + ], + ) + .await; + + process_consolidation_worker(service).await; + + let materialized = materialized_proposals(service, created.run.run_id).await; + let discarded_id = proposal_id_by_kind(&materialized, "contradiction_report"); + let deferred_id = proposal_id_by_kind(&materialized, "preference_candidate"); + let discarded = service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + reviewer_agent_id: AGENT_ID.to_string(), + proposal_id: discarded_id, + review_action: ConsolidationReviewAction::Discard, + review_comment: Some("Discard stale synthesis.".to_string()), + }) + .await + .expect("discard should be allowed"); + let deferred = service + .consolidation_proposal_review(ConsolidationProposalReviewRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + reviewer_agent_id: AGENT_ID.to_string(), + proposal_id: deferred_id, + review_action: ConsolidationReviewAction::Defer, + review_comment: Some("Defer until more evidence is available.".to_string()), + }) + .await + .expect("defer should be allowed"); + let deferred_readback = service + .consolidation_proposal_get(ConsolidationProposalGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + proposal_id: deferred_id, + }) + .await + .expect("deferred proposal should remain readable"); + + assert_eq!(discarded.review_state, "rejected"); + assert_eq!(discarded.review_events.len(), 1); + assert_eq!(discarded.review_events[0].action, "discard"); + assert_eq!(deferred.review_state, "archived"); + assert_eq!(deferred.review_events.len(), 1); + assert_eq!(deferred.review_events[0].action, "defer"); + assert_eq!(deferred_readback.review_events.len(), 1); + assert_eq!(deferred_readback.review_events[0].to_review_state, "archived"); +} diff --git a/packages/elf-service/tests/acceptance/docs_extension_v1.rs b/packages/elf-service/tests/acceptance/docs_extension_v1.rs new file mode 100644 index 00000000..9a236c9a --- /dev/null +++ b/packages/elf-service/tests/acceptance/docs_extension_v1.rs @@ -0,0 +1,1940 @@ +use std::{collections::HashSet, future::IntoFuture, string::ToString, sync::Arc, time::Instant}; + +use ahash::AHashMap; +use axum::{Json, Router, extract::State, http::StatusCode, response::IntoResponse, routing}; +use qdrant_client::qdrant::{ + CreateFieldIndexCollection, FieldType, GetPointsBuilder, PayloadSchemaType, RetrievedPoint, + value, +}; +use serde_json::Map; +use sqlx::{FromRow, PgPool}; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; +use tokenizers::{Tokenizer, models::wordlevel::WordLevel}; +use tokio::{ + net::TcpListener, + sync::{oneshot, oneshot::Sender}, + task::JoinHandle, +}; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank, chunking::ChunkingConfig}; +use elf_config::EmbeddingProviderConfig; +use elf_service::{ + AddNoteInput, AddNoteRequest, BoxFuture, DocsExcerptsGetRequest, DocsGetRequest, + DocsPutRequest, DocsPutResponse, DocsSearchL0Request, ElfService, EmbeddingProvider, Error, + PayloadLevel, Providers, Result, SearchRequest, TextQuoteSelector, + docs::DocRetrievalTrajectory, +}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; +use elf_worker::worker::{self, WorkerState}; + +const TEST_CONTENT: &str = + "ELF docs extension v1 stores evidence. Keyword: peregrine.\nSecond sentence for chunking."; +const DOCS_SEARCH_FILTER_INDEXES: [(&str, PayloadSchemaType, FieldType); 9] = [ + ("scope", PayloadSchemaType::Keyword, FieldType::Keyword), + ("status", PayloadSchemaType::Keyword, FieldType::Keyword), + ("doc_type", PayloadSchemaType::Keyword, FieldType::Keyword), + ("agent_id", PayloadSchemaType::Keyword, FieldType::Keyword), + ("updated_at", PayloadSchemaType::Datetime, FieldType::Datetime), + ("doc_ts", PayloadSchemaType::Datetime, FieldType::Datetime), + ("thread_id", PayloadSchemaType::Keyword, FieldType::Keyword), + ("domain", PayloadSchemaType::Keyword, FieldType::Keyword), + ("repo", PayloadSchemaType::Keyword, FieldType::Keyword), +]; + +#[derive(FromRow)] +struct DocOutboxCounts { + total: i64, + done: i64, + failed: i64, +} + +#[derive(FromRow)] +struct NoteOutboxCounts { + total: i64, + done: i64, + failed: i64, +} + +struct DocsContext { + test_db: TestDatabase, + service: ElfService, +} + +struct NonZeroSearchEmbedding; +impl EmbeddingProvider for NonZeroSearchEmbedding { + fn embed<'a>( + &'a self, + cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { + let vector = vec![0.1_f32; cfg.dimensions as usize]; + + Box::pin(async move { Ok(vec![vector; texts.len()]) }) + } +} + +struct DocsFilterFixtureIds { + search_domain_doc_id: Uuid, + search_other_domain_doc_id: Uuid, + repo_doc_id: Uuid, + repo_other_doc_id: Uuid, +} + +fn build_test_tokenizer() -> Tokenizer { + let mut vocab = AHashMap::new(); + + vocab.insert("<unk>".to_string(), 0_u32); + + let model = WordLevel::builder() + .vocab(vocab) + .unk_token("<unk>".to_string()) + .build() + .expect("Failed to build test tokenizer."); + + Tokenizer::new(model) +} + +fn payload_string(payload_value: &qdrant_client::qdrant::Value) -> Option<&str> { + match payload_value.kind.as_ref() { + Some(value::Kind::StringValue(value)) => Some(value.as_str()), + _ => None, + } +} + +fn trajectory_stage_stats<'a>( + trajectory: &'a DocRetrievalTrajectory, + stage_name: &str, +) -> Option<&'a serde_json::Value> { + trajectory.stages.iter().find(|stage| stage.stage_name == stage_name).map(|stage| &stage.stats) +} + +fn configure_recency_bias_settings(service: &mut ElfService) { + service.providers.embedding = Arc::new(NonZeroSearchEmbedding); + service.cfg.ranking.tie_breaker_weight = 1_000.0; + service.cfg.ranking.recency_tau_days = 36_500.0; +} + +async fn wait_for_doc_outbox_done( + pool: &PgPool, + doc_id: Uuid, + timeout: std::time::Duration, +) -> bool { + let deadline = Instant::now() + timeout; + + loop { + let row: Option<DocOutboxCounts> = sqlx::query_as::<_, DocOutboxCounts>( + "\ +SELECT + COUNT(*) AS total, + COUNT(*) FILTER (WHERE status = 'DONE') AS done, + COUNT(*) FILTER (WHERE status = 'FAILED') AS failed +FROM doc_indexing_outbox +WHERE doc_id = $1", + ) + .bind(doc_id) + .fetch_optional(pool) + .await + .ok() + .flatten(); + + if let Some(row) = row.as_ref() + && row.total > 0 + && row.done == row.total + { + return true; + } + if let Some(row) = row.as_ref() + && row.failed > 0 + { + return false; + } + + if Instant::now() >= deadline { + return false; + } + + tokio::time::sleep(std::time::Duration::from_millis(200)).await; + } +} + +async fn wait_for_note_outbox_done( + pool: &PgPool, + note_id: Uuid, + timeout: std::time::Duration, +) -> bool { + let deadline = Instant::now() + timeout; + + loop { + let row: Option<NoteOutboxCounts> = sqlx::query_as::<_, NoteOutboxCounts>( + "\ +SELECT + COUNT(*) AS total, + COUNT(*) FILTER (WHERE status = 'DONE') AS done, + COUNT(*) FILTER (WHERE status = 'FAILED') AS failed +FROM indexing_outbox +WHERE note_id = $1", + ) + .bind(note_id) + .fetch_optional(pool) + .await + .ok() + .flatten(); + + if let Some(row) = row.as_ref() + && row.total > 0 + && row.done == row.total + { + return true; + } + if let Some(row) = row.as_ref() + && row.failed > 0 + { + return false; + } + + if Instant::now() >= deadline { + return false; + } + + tokio::time::sleep(std::time::Duration::from_millis(200)).await; + } +} + +async fn start_embed_server() -> (String, Sender<()>) { + let app = Router::new().route("/embeddings", routing::post(embed_handler)).with_state(()); + let listener = TcpListener::bind("127.0.0.1:0").await.expect("Failed to bind embed server."); + let addr = listener.local_addr().expect("Failed to read embed server address."); + let (tx, rx) = oneshot::channel(); + let server = axum::serve(listener, app).with_graceful_shutdown(async move { + let _ = rx.await; + }); + + tokio::spawn(async move { + let _ = server.into_future().await; + }); + + (format!("http://{addr}"), tx) +} + +async fn embed_handler( + State(()): State<()>, + Json(payload): Json<serde_json::Value>, +) -> impl IntoResponse { + let inputs = + payload.get("input").and_then(|value| value.as_array()).cloned().unwrap_or_default(); + let data: Vec<_> = inputs + .iter() + .enumerate() + .map(|(index, _)| { + let embedding: Vec<f32> = vec![0.1_f32; 4_096]; + + serde_json::json!({ + "index": index, + "embedding": embedding, + }) + }) + .collect(); + + (StatusCode::OK, Json(serde_json::json!({ "data": data }))).into_response() +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_put_get_excerpts_and_search_l0_work_end_to_end() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let put = put_test_doc(&service).await; + + assert_doc_get(&service, put.doc_id).await; + assert_doc_excerpt(&service, put.doc_id, put.content_hash.as_str()).await; + + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, put.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + assert_docs_search_l0(&service, put.doc_id).await; + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_respects_scope_doc_type_agent_id_and_updated_after_filters() { + let Some(ctx) = setup_docs_context().await else { return }; + let ( + test_db, + service, + shared_knowledge_doc, + _older_shared_knowledge_doc, + private_chat_doc, + handle, + shutdown, + ) = create_docs_search_filter_fixture(ctx).await; + let shared_scope_results = search_doc_ids_with_filters( + &service, + Some("project_shared"), + None, + None, + None, + None, + "reader", + ) + .await; + + assert!(shared_scope_results.contains(&shared_knowledge_doc)); + assert!(!shared_scope_results.contains(&private_chat_doc)); + + let chat_results = + search_doc_ids_with_filters(&service, None, Some("chat"), None, None, None, "reader").await; + + assert!(!chat_results.contains(&private_chat_doc)); + assert!(!chat_results.contains(&shared_knowledge_doc)); + + let assistant_chat_results = + search_doc_ids_with_filters(&service, None, Some("chat"), None, None, None, "assistant") + .await; + + assert!(assistant_chat_results.contains(&private_chat_doc)); + assert!(!assistant_chat_results.contains(&shared_knowledge_doc)); + + let assistant_results = + search_doc_ids_with_filters(&service, None, None, Some("assistant"), None, None, "reader") + .await; + + assert!(!assistant_results.contains(&private_chat_doc)); + assert!(!assistant_results.contains(&shared_knowledge_doc)); + + let past = (OffsetDateTime::now_utc() - time::Duration::seconds(60)) + .format(&Rfc3339) + .expect("Failed to format past RFC3339 timestamp."); + let future = (OffsetDateTime::now_utc() + time::Duration::seconds(60)) + .format(&Rfc3339) + .expect("Failed to format future RFC3339 timestamp."); + let updated_after_past_results = + search_doc_ids_with_filters(&service, None, None, None, Some(&past), None, "reader").await; + + assert!(updated_after_past_results.contains(&shared_knowledge_doc)); + assert!(!updated_after_past_results.contains(&private_chat_doc)); + + let updated_after_future_results = + search_doc_ids_with_filters(&service, None, None, None, Some(&future), None, "reader") + .await; + + assert!(updated_after_future_results.is_empty()); + + cleanup_docs_filter_fixture(test_db, handle, shutdown).await; +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_respects_thread_id_filter_for_chat_docs() { + let Some(ctx) = setup_docs_context().await else { return }; + let ( + test_db, + service, + shared_knowledge_doc, + older_shared_knowledge_doc, + private_chat_doc, + handle, + shutdown, + ) = create_docs_search_filter_fixture(ctx).await; + let thread_filter_results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "assistant".to_string(), + scope: None, + status: None, + doc_type: Some("chat".to_string()), + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: Some("shared-chat-thread".to_string()), + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await + .expect("Failed to search docs with thread_id filter."); + let thread_filtered_docs = + thread_filter_results.items.into_iter().map(|item| item.doc_id).collect::<HashSet<_>>(); + + assert!(thread_filtered_docs.contains(&private_chat_doc)); + assert!(!thread_filtered_docs.contains(&shared_knowledge_doc)); + assert!(!thread_filtered_docs.contains(&older_shared_knowledge_doc)); + + cleanup_docs_filter_fixture(test_db, handle, shutdown).await; +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test."] +async fn docs_search_l0_requires_chat_doc_type_for_thread_id() { + let Some(ctx) = setup_docs_context().await else { return }; + let ( + test_db, + service, + _shared_knowledge_doc, + _older_shared_knowledge_doc, + _private_chat_doc, + handle, + shutdown, + ) = create_docs_search_filter_fixture(ctx).await; + let result = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "assistant".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: Some("shared-chat-thread".to_string()), + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await; + + match result { + Err(Error::InvalidRequest { message }) => { + assert!(message.contains("thread_id requires")); + }, + other => + panic!("Expected InvalidRequest for thread_id without chat doc_type, got {other:?}"), + } + + cleanup_docs_filter_fixture(test_db, handle, shutdown).await; +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test."] +async fn docs_put_applies_write_policy_and_excerpt_by_chunk_id_is_verified() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let content = "Alpha normal text then secret sk-abcdef and trailing content."; + let secret = "sk-abcdef"; + let start = content.find(secret).expect("Expected secret in content."); + let end = start + secret.len(); + let write_policy = serde_json::from_value(serde_json::json!({ + "exclusions": [{"start": start, "end": end}], + })) + .expect("Failed to build write_policy."); + let put = service + .docs_put(DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "owner".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("Docs write_policy sample".to_string()), + write_policy: Some(write_policy), + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: content.to_string(), + }) + .await + .expect("Failed to put doc with write policy."); + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, put.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let chunk_id = fetch_first_doc_chunk_id(&service, put.doc_id) + .await + .expect("Expected chunk id from transformed doc."); + let excerpt = service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id: put.doc_id, + level: "L1".to_string(), + chunk_id: Some(chunk_id), + quote: None, + position: None, + explain: None, + }) + .await + .expect("Failed to hydrate excerpt by chunk_id."); + + assert!(excerpt.verification.verified); + assert!(!excerpt.excerpt.is_empty()); + assert!(!excerpt.excerpt.contains(secret)); + assert_eq!(excerpt.verification.content_hash, put.content_hash); + assert!(put.write_policy_audit.is_some()); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_respects_doc_ts_filter() { + let Some(ctx) = setup_docs_context().await else { return }; + let ( + test_db, + service, + shared_knowledge_doc, + older_shared_knowledge_doc, + private_chat_doc, + handle, + shutdown, + ) = create_docs_search_filter_fixture(ctx).await; + let doc_ts_windowed_results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: Some("project_shared".to_string()), + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: Some("2026-01-01T00:00:00Z".to_string()), + ts_lte: Some("2026-12-31T23:59:59Z".to_string()), + read_profile: "all_scopes".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await + .expect("Failed to search docs by doc_ts range."); + let doc_ts_windowed_ids = + doc_ts_windowed_results.items.into_iter().map(|item| item.doc_id).collect::<HashSet<_>>(); + + assert!(doc_ts_windowed_ids.contains(&shared_knowledge_doc)); + assert!(!doc_ts_windowed_ids.contains(&older_shared_knowledge_doc)); + assert!(!doc_ts_windowed_ids.contains(&private_chat_doc)); + + cleanup_docs_filter_fixture(test_db, handle, shutdown).await; +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test."] +async fn docs_search_l0_sparse_mode_records_expected_vector_search_channels() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let doc = put_test_doc(&service).await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, doc.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let cases = [ + ("off", vec!["dense"]), + ("on", vec!["dense", "sparse"]), + ("auto", vec!["dense", "sparse"]), + ]; + + for (sparse_mode, expected_channels) in cases { + let response = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: Some(sparse_mode.to_string()), + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "https://elf.example/docs?query=peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: Some(true), + }) + .await + .expect("Failed to search docs with sparse_mode set."); + let trajectory = response.trajectory.as_ref().expect("Expected explain trajectory."); + let vector_search_stats = trajectory_stage_stats(trajectory, "vector_search") + .expect("Expected vector_search stage in trajectory."); + let vector_search_channels = vector_search_stats + .get("channels") + .and_then(serde_json::Value::as_array) + .expect("Expected vector_search stats channels."); + let observed_channels = vector_search_channels + .iter() + .map(|channel| channel.as_str().expect("Expected channel string.").to_string()) + .collect::<Vec<_>>(); + + assert_eq!(observed_channels, expected_channels); + } + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test."] +async fn docs_search_l0_filters_include_and_exclude_by_doc_type_and_domain_or_repo() { + let Some(ctx) = setup_docs_context().await else { return }; + let docs = seed_docs_filter_fixtures(&ctx).await; + let DocsContext { test_db, service } = ctx; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + for doc_id in [ + docs.search_domain_doc_id, + docs.search_other_domain_doc_id, + docs.repo_doc_id, + docs.repo_other_doc_id, + ] + .iter() + { + assert!( + wait_for_doc_outbox_done(&service.db.pool, *doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected docs outbox to reach DONE." + ); + } + + let search_domain_results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: Some("project_shared".to_string()), + status: None, + doc_type: Some("search".to_string()), + sparse_mode: None, + domain: Some("docs.example.com".to_string()), + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "all_scopes".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await + .expect("Failed to search docs by domain."); + let search_domain_result_ids = + search_domain_results.items.into_iter().map(|item| item.doc_id).collect::<HashSet<_>>(); + + assert!(search_domain_result_ids.contains(&docs.search_domain_doc_id)); + assert!(!search_domain_result_ids.contains(&docs.search_other_domain_doc_id)); + assert!(!search_domain_result_ids.contains(&docs.repo_doc_id)); + assert!(!search_domain_result_ids.contains(&docs.repo_other_doc_id)); + + let repo_results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: Some("project_shared".to_string()), + status: None, + doc_type: Some("dev".to_string()), + sparse_mode: None, + domain: None, + repo: Some("elf-org/docs".to_string()), + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "all_scopes".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await + .expect("Failed to search docs by repo."); + let repo_result_ids = + repo_results.items.into_iter().map(|item| item.doc_id).collect::<HashSet<_>>(); + + assert!(repo_result_ids.contains(&docs.repo_doc_id)); + assert!(!repo_result_ids.contains(&docs.repo_other_doc_id)); + assert!(!repo_result_ids.contains(&docs.search_domain_doc_id)); + assert!(!repo_result_ids.contains(&docs.search_other_domain_doc_id)); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn seed_docs_filter_fixtures(ctx: &DocsContext) -> DocsFilterFixtureIds { + let search_domain_doc = put_test_doc_with( + &ctx.service, + "owner", + "project_shared", + Some("search"), + "Docs domain include sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "search", + "ts": "2026-02-25T12:00:00Z", + "query": "How to fetch docs", + "domain": "docs.example.com", + "url": "https://docs.example.com/guide", + }), + TEST_CONTENT, + ) + .await; + let search_other_domain_doc = put_test_doc_with( + &ctx.service, + "owner", + "project_shared", + Some("search"), + "Docs domain exclude sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "search", + "ts": "2026-02-25T12:00:00Z", + "query": "How to build", + "domain": "api.example.org", + "url": "https://api.example.org/reference", + }), + TEST_CONTENT, + ) + .await; + let repo_doc = put_test_doc_with( + &ctx.service, + "owner", + "project_shared", + Some("dev"), + "Docs repo include sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": "2026-02-25T12:00:00Z", + "repo": "elf-org/docs", + "commit_sha": "9f0a3f4c4eb58bfcf4a5f4f9d0c7be0e13c2f8d19", + }), + TEST_CONTENT, + ) + .await; + let repo_other_doc = put_test_doc_with( + &ctx.service, + "owner", + "project_shared", + Some("dev"), + "Docs repo exclude sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": "2026-02-25T12:00:00Z", + "repo": "other-org/docs", + "commit_sha": "4e3d9ec4d2a59a2f6c7d7f3d4c6e8a5b1f7b9d3f", + }), + TEST_CONTENT, + ) + .await; + + DocsFilterFixtureIds { + search_domain_doc_id: search_domain_doc.doc_id, + search_other_domain_doc_id: search_other_domain_doc.doc_id, + repo_doc_id: repo_doc.doc_id, + repo_other_doc_id: repo_other_doc.doc_id, + } +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test."] +async fn docs_search_l0_recency_bias_orders_newer_doc_first_and_records_projection_signals() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, mut service } = ctx; + + configure_recency_bias_settings(&mut service); + + let (handle, shutdown) = seed_recency_bias_docs_for_search(&service).await; + + assert_docs_search_l0_recency_projection(&service).await; + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn seed_recency_bias_docs_for_search(service: &ElfService) -> (JoinHandle<()>, Sender<()>) { + let newer_doc = put_test_doc_with( + service, + "owner", + "project_shared", + Some("knowledge"), + "Recency newer doc", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-27T12:00:00Z", + }), + TEST_CONTENT, + ) + .await; + let older_doc = put_test_doc_with( + service, + "owner", + "project_shared", + Some("knowledge"), + "Recency older doc", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-20T12:00:00Z", + }), + TEST_CONTENT, + ) + .await; + let (handle, shutdown) = spawn_doc_worker(service).await; + + assert!( + wait_for_doc_outbox_done( + &service.db.pool, + newer_doc.doc_id, + std::time::Duration::from_secs(15), + ) + .await, + "Expected newer doc outbox to reach DONE." + ); + assert!( + wait_for_doc_outbox_done( + &service.db.pool, + older_doc.doc_id, + std::time::Duration::from_secs(15), + ) + .await, + "Expected older doc outbox to reach DONE." + ); + + let older_ts = OffsetDateTime::parse("2020-01-01T00:00:00Z", &Rfc3339) + .expect("Failed to parse older doc timestamp."); + + sqlx::query("UPDATE doc_documents SET updated_at = $1 WHERE doc_id = $2") + .bind(older_ts) + .bind(older_doc.doc_id) + .execute(&service.db.pool) + .await + .expect("Failed to set deterministic updated_at for older doc."); + + (handle, shutdown) +} + +async fn assert_docs_search_l0_recency_projection(service: &ElfService) { + let results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(2), + candidate_k: Some(20), + explain: Some(true), + }) + .await + .expect("Failed to search docs for recency ordering."); + let ordered_ids = results.items.iter().map(|item| item.doc_id).collect::<Vec<_>>(); + + assert!(ordered_ids.len() >= 2); + + let newest_id = results + .items + .iter() + .max_by_key(|item| item.updated_at.unix_timestamp()) + .expect("Expected returned item.") + .doc_id; + + assert_eq!(results.items[0].doc_id, newest_id); + assert!(results.items[0].updated_at > results.items[1].updated_at); + + let trajectory = results.trajectory.as_ref().expect("Expected explain trajectory."); + let result_projection = trajectory_stage_stats(trajectory, "result_projection") + .expect("Expected result_projection stage in trajectory."); + + assert!(result_projection.get("pre_authorization_candidates").is_some()); + assert!(result_projection.get("returned_items").is_some()); + assert!(result_projection.get("recency_tau_days").is_some()); + assert!(result_projection.get("tie_breaker_weight").is_some()); + assert_eq!( + result_projection.get("recency_boost_applied"), + Some(&serde_json::Value::Bool(true)) + ); +} + +async fn create_docs_search_filter_fixture( + ctx: DocsContext, +) -> (TestDatabase, ElfService, Uuid, Uuid, Uuid, JoinHandle<()>, Sender<()>) { + let DocsContext { test_db, service } = ctx; + let shared_knowledge_doc = put_test_doc_with( + &service, + "owner", + "project_shared", + None, + "Docs filter sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + TEST_CONTENT, + ) + .await; + let older_shared_knowledge_doc = put_test_doc_with( + &service, + "owner", + "project_shared", + None, + "Docs old filter sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2025-01-01T10:00:00Z", + }), + TEST_CONTENT, + ) + .await; + let private_chat_doc = put_test_doc_with( + &service, + "assistant", + "agent_private", + Some("chat"), + "Docs chat sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": "2026-02-25T12:00:00Z", + "thread_id": "shared-chat-thread", + "role": "assistant" + }), + TEST_CONTENT, + ) + .await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done( + &service.db.pool, + shared_knowledge_doc.doc_id, + std::time::Duration::from_secs(15) + ) + .await, + "Expected shared docs outbox to reach DONE." + ); + assert!( + wait_for_doc_outbox_done( + &service.db.pool, + older_shared_knowledge_doc.doc_id, + std::time::Duration::from_secs(15) + ) + .await, + "Expected older shared docs outbox to reach DONE." + ); + assert!( + wait_for_doc_outbox_done( + &service.db.pool, + private_chat_doc.doc_id, + std::time::Duration::from_secs(15) + ) + .await, + "Expected private docs outbox to reach DONE." + ); + + ( + test_db, + service, + shared_knowledge_doc.doc_id, + older_shared_knowledge_doc.doc_id, + private_chat_doc.doc_id, + handle, + shutdown, + ) +} + +async fn cleanup_docs_filter_fixture( + test_db: TestDatabase, + _handle: JoinHandle<()>, + shutdown: Sender<()>, +) { + let _ = shutdown.send(()); + + _handle.abort(); + + let _ = _handle.await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_put_rejects_non_english_source_ref() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping docs_extension_v1; set ELF_PG_DSN to run this test."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping docs_extension_v1; set ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test." + ); + + return; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(Default::default()), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + let result = service + .docs_put(DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "owner".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("Docs rejection sample".to_string()), + write_policy: None, + source_ref: serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "notes": "你好" + }), + content: TEST_CONTENT.to_string(), + }) + .await; + + match result { + Err(Error::NonEnglishInput { field }) => { + assert_eq!(field, "$.source_ref[\"notes\"]"); + }, + other => panic!("Expected NonEnglishInput, got {other:?}"), + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_put_rejects_missing_and_invalid_source_ref() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping docs_extension_v1; set ELF_PG_DSN to run this test."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping docs_extension_v1; set ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test." + ); + + return; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(Default::default()), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + let result = service + .docs_put(DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "owner".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("Docs rejection sample".to_string()), + write_policy: None, + source_ref: serde_json::json!("legacy-shape"), + content: TEST_CONTENT.to_string(), + }) + .await; + + match result { + Err(Error::InvalidRequest { message }) => { + assert!(message.contains("source_ref must be a JSON object")); + }, + other => panic!("Expected InvalidRequest for non-object source_ref, got {other:?}"), + } + + let result = service + .docs_put(DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "owner".to_string(), + scope: "project_shared".to_string(), + doc_type: None, + title: Some("Docs rejection sample".to_string()), + write_policy: None, + source_ref: serde_json::json!({ + "schema": "source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + }), + content: TEST_CONTENT.to_string(), + }) + .await; + + match result { + Err(Error::InvalidRequest { message }) => { + assert!(message.contains("doc_source_ref/v1")); + }, + other => panic!("Expected InvalidRequest for wrong source_ref schema, got {other:?}"), + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_requires_qdrant_payload_indexes_for_filters() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let doc = put_test_doc(&service).await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, doc.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + verify_docs_qdrant_filter_indexes(&service).await; + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_projects_source_ref_payload_fields() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let source_ts = "2025-01-01T10:00:00Z"; + let cases = [ + ( + "chat", + "Docs chat source ref sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "chat", + "ts": source_ts, + "thread_id": "thread-42", + "role": "assistant" + }), + ("thread_id", "thread-42"), + ["domain", "repo"], + ), + ( + "search", + "Docs search source ref sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "search", + "ts": source_ts, + "query": "What is payload indexing?", + "url": "https://docs.example.com/search", + "domain": "docs.example.com", + "provider": "web" + }), + ("domain", "docs.example.com"), + ["thread_id", "repo"], + ), + ( + "dev", + "Docs dev source ref sample", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "dev", + "ts": source_ts, + "repo": "elf-org/docs", + "commit_sha": "9f0a3f4c4eb58bfcf4a5f4f9d0c7be0e13c2f8d19" + }), + ("repo", "elf-org/docs"), + ["thread_id", "domain"], + ), + ]; + let mut docs = Vec::new(); + + for (doc_type, title, source_ref, expected_present, expected_absent) in cases { + let doc = put_test_doc_with( + &service, + "owner", + "project_shared", + Some(doc_type), + title, + source_ref, + TEST_CONTENT, + ) + .await; + + docs.push((doc.doc_id, expected_present, expected_absent)); + } + + let (handle, shutdown) = spawn_doc_worker(&service).await; + + for (doc_id, expected_present, expected_absent) in &docs { + assert!( + wait_for_doc_outbox_done(&service.db.pool, *doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let point = fetch_first_doc_chunk_point(&service, *doc_id) + .await + .expect("Expected doc chunk point in Qdrant."); + + assert_eq!(point.payload.get("doc_ts").and_then(payload_string), Some(source_ts)); + assert_eq!( + point.payload.get(expected_present.0).and_then(payload_string), + Some(expected_present.1) + ); + + for key in expected_absent { + assert!(!point.payload.contains_key(*key)); + } + } + + _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn setup_docs_context() -> Option<DocsContext> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping docs_extension_v1; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping docs_extension_v1; set ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run this test." + ); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(Default::default()), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant memory collection."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.cfg.storage.qdrant.docs_collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant docs collection."); + + Some(DocsContext { test_db, service }) +} + +async fn fetch_first_doc_chunk_id(db: &ElfService, doc_id: Uuid) -> Option<Uuid> { + sqlx::query_scalar::<_, Uuid>( + "SELECT chunk_id FROM doc_chunks WHERE doc_id = $1 ORDER BY chunk_index LIMIT 1", + ) + .bind(doc_id) + .fetch_optional(&db.db.pool) + .await + .expect("Failed to fetch doc chunk id.") +} + +async fn fetch_first_doc_chunk_point(service: &ElfService, doc_id: Uuid) -> Option<RetrievedPoint> { + let chunk_id = fetch_first_doc_chunk_id(service, doc_id).await?; + let response = service + .qdrant + .client + .get_points( + GetPointsBuilder::new( + service.cfg.storage.qdrant.docs_collection.clone(), + vec![chunk_id.to_string().into()], + ) + .with_payload(true), + ) + .await + .expect("Failed to fetch doc chunk point from Qdrant."); + + response.result.into_iter().next() +} + +async fn put_test_doc(service: &ElfService) -> DocsPutResponse { + put_test_doc_with( + service, + "owner", + "project_shared", + None, + "Docs v1", + serde_json::json!({ + "schema": "doc_source_ref/v1", + "doc_type": "knowledge", + "ts": "2026-02-25T12:00:00Z", + "uri": "acceptance://knowledge/v1" + }), + TEST_CONTENT, + ) + .await +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_returns_pointer_and_explain_trajectory() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let doc = put_test_doc(&service).await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, doc.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(5), + candidate_k: Some(20), + explain: Some(true), + }) + .await + .expect("Failed to search docs."); + + assert_eq!( + results.trajectory.as_ref().map(|trajectory| trajectory.schema.as_str()), + Some("doc_retrieval_trajectory/v1") + ); + assert!(results.trajectory.is_some()); + assert!(!results.items.is_empty()); + assert!(results.items[0].pointer.schema == "source_ref/v1"); + assert!(!results.items[0].pointer.reference.doc_id.is_nil()); + assert!(!results.items[0].pointer.reference.chunk_id.is_nil()); + assert_eq!(results.items[0].pointer.resolver, "elf_doc_ext/v1"); + assert!(!results.trace_id.is_nil()); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_search_l0_note_pointer_roundtrip_hydrates_doc() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let doc = put_test_doc(&service).await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, doc.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let (source_ref, source_ref_doc_id, source_ref_chunk_id) = + fetch_docs_pointer_source_ref(&service).await; + let note_id = add_note_with_pointer_source_ref(&service, source_ref.clone()).await; + + assert!( + wait_for_note_outbox_done(&service.db.pool, note_id, std::time::Duration::from_secs(15)) + .await, + "Expected note outbox to reach DONE." + ); + + let search_results = service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "agent".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: PayloadLevel::L2, + query: "peregrine".to_string(), + top_k: Some(5), + candidate_k: Some(20), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Failed to search note with doc pointer source_ref."); + let has_pointer_source_ref = + search_results.items.into_iter().any(|item| item.source_ref == source_ref); + + assert!( + has_pointer_source_ref, + "Expected search result to include note with pointer source_ref." + ); + + let excerpt = service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id: source_ref_doc_id, + level: "L1".to_string(), + chunk_id: Some(source_ref_chunk_id), + quote: None, + position: None, + explain: None, + }) + .await + .expect("Failed to hydrate excerpt from pointer source_ref."); + + assert!(excerpt.verification.verified); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn fetch_docs_pointer_source_ref(service: &ElfService) -> (serde_json::Value, Uuid, Uuid) { + let search = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(5), + candidate_k: Some(20), + explain: None, + }) + .await + .expect("Failed to search docs for source_ref pointer."); + + assert!(!search.items.is_empty(), "Expected docs_search_l0 to return source_ref pointer."); + + let pointer = search.items[0].pointer.clone(); + let source_ref = + serde_json::to_value(&pointer).expect("Failed to serialize docs_search_l0 pointer."); + + (source_ref, pointer.reference.doc_id, pointer.reference.chunk_id) +} + +async fn add_note_with_pointer_source_ref( + service: &ElfService, + source_ref: serde_json::Value, +) -> Uuid { + let note = service + .add_note(AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "agent".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("doc_pointer_note".to_string()), + text: "Peregrine note for source_ref hydration check.".to_string(), + structured: None, + importance: 0.5, + confidence: 0.9, + ttl_days: None, + source_ref, + write_policy: None, + }], + }) + .await + .expect("Failed to add note from docs pointer."); + + note.results[0].note_id.expect("Expected note_id in add_note result.") +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL (or ELF_QDRANT_GRPC_URL) to run."] +async fn docs_excerpts_get_supports_l0_and_returns_locator_and_optional_trajectory() { + let Some(ctx) = setup_docs_context().await else { return }; + let DocsContext { test_db, service } = ctx; + let doc = put_test_doc(&service).await; + let (handle, shutdown) = spawn_doc_worker(&service).await; + + assert!( + wait_for_doc_outbox_done(&service.db.pool, doc.doc_id, std::time::Duration::from_secs(15)) + .await, + "Expected doc outbox to reach DONE." + ); + + let excerpt = service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id: doc.doc_id, + level: "L0".to_string(), + chunk_id: None, + quote: Some(TextQuoteSelector { + exact: "Keyword: peregrine.".to_string(), + prefix: Some("evidence. ".to_string()), + suffix: Some("\nSecond".to_string()), + }), + position: None, + explain: Some(true), + }) + .await + .expect("Failed to hydrate excerpt."); + + assert_eq!(excerpt.locator.selector_kind, "quote"); + assert!(excerpt.locator.match_end_offset > excerpt.locator.match_start_offset); + assert!(excerpt.excerpt.len() <= 256); + assert!(excerpt.trajectory.is_some()); + assert_eq!( + excerpt.trajectory.as_ref().map(|trajectory| trajectory.schema.as_str()), + Some("doc_retrieval_trajectory/v1") + ); + assert!(!excerpt.trace_id.is_nil()); + + let no_explain = service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id: doc.doc_id, + level: "L0".to_string(), + chunk_id: None, + quote: Some(TextQuoteSelector { + exact: "Keyword: peregrine.".to_string(), + prefix: Some("evidence. ".to_string()), + suffix: Some("\nSecond".to_string()), + }), + position: None, + explain: Some(false), + }) + .await + .expect("Failed to hydrate excerpt."); + + assert!(no_explain.trajectory.is_none()); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn put_test_doc_with( + service: &ElfService, + agent_id: &str, + scope: &str, + doc_type: Option<&str>, + title: &str, + source_ref: serde_json::Value, + content: &str, +) -> DocsPutResponse { + service + .docs_put(DocsPutRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: agent_id.to_string(), + scope: scope.to_string(), + doc_type: doc_type.map(ToString::to_string), + title: Some(title.to_string()), + write_policy: None, + source_ref, + content: content.to_string(), + }) + .await + .expect("Failed to put doc.") +} + +async fn search_doc_ids_with_filters( + service: &ElfService, + scope: Option<&str>, + doc_type: Option<&str>, + agent_id: Option<&str>, + updated_after: Option<&str>, + updated_before: Option<&str>, + caller_agent_id: &str, +) -> HashSet<Uuid> { + let results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: caller_agent_id.to_string(), + scope: scope.map(str::to_string), + status: None, + doc_type: doc_type.map(str::to_string), + sparse_mode: None, + domain: None, + repo: None, + agent_id: agent_id.map(str::to_string), + thread_id: None, + updated_after: updated_after.map(str::to_string), + updated_before: updated_before.map(str::to_string), + ts_gte: None, + ts_lte: None, + read_profile: "all_scopes".to_string(), + query: "peregrine".to_string(), + top_k: Some(20), + candidate_k: Some(50), + explain: None, + }) + .await + .expect("Failed to search docs."); + + results.items.into_iter().map(|item| item.doc_id).collect() +} + +async fn verify_docs_qdrant_filter_indexes(service: &ElfService) { + let mut payload_schema = service + .qdrant + .client + .collection_info(&service.cfg.storage.qdrant.docs_collection) + .await + .expect("Failed to fetch Qdrant docs collection info.") + .result + .expect("Qdrant collection info is missing.") + .payload_schema; + + for (field_name, payload_type, index_type) in DOCS_SEARCH_FILTER_INDEXES { + let missing_or_wrong = match payload_schema.get(field_name) { + Some(schema) => schema.data_type != payload_type as i32, + None => true, + }; + + if missing_or_wrong { + let request = CreateFieldIndexCollection { + collection_name: service.cfg.storage.qdrant.docs_collection.clone(), + wait: Some(true), + field_name: field_name.to_string(), + field_type: Some(index_type as i32), + field_index_params: None, + ordering: None, + timeout: None, + }; + + service + .qdrant + .client + .create_field_index(request) + .await + .expect("Failed to create required Qdrant payload index."); + } + } + + payload_schema = service + .qdrant + .client + .collection_info(&service.cfg.storage.qdrant.docs_collection) + .await + .expect("Failed to fetch Qdrant docs collection info.") + .result + .expect("Qdrant collection info is missing.") + .payload_schema; + + for (field_name, payload_type, _) in DOCS_SEARCH_FILTER_INDEXES { + let schema = payload_schema.get(field_name).expect("Expected required payload field."); + + assert_eq!( + schema.data_type, payload_type as i32, + "Unexpected payload type for {field_name}." + ); + } +} + +async fn assert_doc_get(service: &ElfService, doc_id: Uuid) { + let get_as_owner = service + .docs_get(DocsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "owner".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id, + }) + .await + .expect("Failed to get doc as owner."); + + assert_eq!(get_as_owner.scope, "project_shared"); + assert_eq!(get_as_owner.doc_type, "knowledge"); + assert_eq!(get_as_owner.agent_id, "owner"); + assert_eq!(get_as_owner.title.as_deref(), Some("Docs v1")); + + let get_as_reader = service + .docs_get(DocsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id, + }) + .await + .expect("Failed to get doc as reader (expected project grant)."); + + assert_eq!(get_as_reader.doc_id, doc_id); +} + +async fn assert_doc_excerpt(service: &ElfService, doc_id: Uuid, content_hash: &str) { + let excerpts = service + .docs_excerpts_get(DocsExcerptsGetRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "reader".to_string(), + read_profile: "private_plus_project".to_string(), + doc_id, + level: "L1".to_string(), + chunk_id: None, + quote: Some(TextQuoteSelector { + exact: "Keyword: peregrine.".to_string(), + prefix: Some("evidence. ".to_string()), + suffix: Some("\nSecond".to_string()), + }), + position: None, + explain: None, + }) + .await + .expect("Failed to get excerpt."); + + assert!(excerpts.verification.verified); + assert!(excerpts.excerpt.contains("Keyword: peregrine.")); + assert_eq!(excerpts.verification.content_hash, content_hash); +} + +async fn spawn_doc_worker(service: &ElfService) -> (JoinHandle<()>, Sender<()>) { + let (api_base, shutdown) = start_embed_server().await; + let worker_state = WorkerState { + db: Db::connect(&service.cfg.storage.postgres).await.expect("Failed to connect worker DB."), + qdrant: QdrantStore::new(&service.cfg.storage.qdrant) + .expect("Failed to build Qdrant store."), + docs_qdrant: QdrantStore::new_with_collection( + &service.cfg.storage.qdrant, + &service.cfg.storage.qdrant.docs_collection, + ) + .expect("Failed to build docs Qdrant store."), + embedding: EmbeddingProviderConfig { + provider_id: "test".to_string(), + api_base, + api_key: "test-key".to_string(), + path: "/embeddings".to_string(), + model: "test".to_string(), + dimensions: 4_096, + timeout_ms: 1_000, + default_headers: Map::new(), + }, + chunking: ChunkingConfig { max_tokens: 64, overlap_tokens: 8 }, + tokenizer: build_test_tokenizer(), + }; + let handle = tokio::spawn(async move { + let _ = worker::run_worker(worker_state).await; + }); + + (handle, shutdown) +} + +async fn assert_docs_search_l0(service: &ElfService, doc_id: Uuid) { + let results = service + .docs_search_l0(DocsSearchL0Request { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + caller_agent_id: "reader".to_string(), + scope: None, + status: None, + doc_type: None, + sparse_mode: None, + domain: None, + repo: None, + agent_id: None, + thread_id: None, + updated_after: None, + updated_before: None, + ts_gte: None, + ts_lte: None, + read_profile: "private_plus_project".to_string(), + query: "peregrine".to_string(), + top_k: Some(5), + candidate_k: Some(20), + explain: None, + }) + .await + .expect("Failed to search docs."); + + assert!(!results.items.is_empty()); + assert_eq!(results.items[0].doc_id, doc_id); + assert_eq!(results.items[0].doc_type, "knowledge"); + assert!(results.items[0].snippet.contains("peregrine")); +} diff --git a/packages/elf-service/tests/acceptance/english_only_boundary.rs b/packages/elf-service/tests/acceptance/english_only_boundary.rs index baae9eb1..09fba084 100644 --- a/packages/elf-service/tests/acceptance/english_only_boundary.rs +++ b/packages/elf-service/tests/acceptance/english_only_boundary.rs @@ -1,156 +1,321 @@ -// std use std::sync::{Arc, atomic::AtomicUsize}; -// crates.io +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; use elf_service::{ - AddEventRequest, AddNoteInput, AddNoteRequest, ElfService, EventMessage, Providers, - SearchRequest, ServiceError, + AddEventRequest, AddNoteInput, AddNoteRequest, ElfService, Error, EventMessage, Providers, + SearchRequest, }; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; +async fn build_test_service( + dsn: String, + qdrant_url: String, + collection: String, + docs_collection: String, +) -> Option<ElfService> { + let extractor = SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let cfg = acceptance::test_config(dsn, qdrant_url, 4_096, collection, docs_collection); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + Some(service) +} #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_add_note() { - let Some(test_db) = test_db().await else { +async fn rejects_non_english_in_add_note() { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); + return; }; let collection = test_db.collection_name("elf_acceptance"); - let Some(service) = build_test_service(test_db.dsn().to_string(), qdrant_url, collection).await + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await else { return; }; - let request = AddNoteRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), scope: "agent_private".to_string(), notes: vec![AddNoteInput { - note_type: "fact".to_string(), + r#type: "fact".to_string(), key: None, text: "你好".to_string(), + structured: None, importance: 0.4, confidence: 0.9, ttl_days: None, source_ref: serde_json::json!({}), + write_policy: None, }], }; - let result = service.add_note(request).await; + match result { - Err(ServiceError::NonEnglishInput { field }) => { + Err(Error::NonEnglishInput { field }) => { assert_eq!(field, "$.notes[0].text"); }, other => panic!("Expected NonEnglishInput, got {other:?}"), } + test_db.cleanup().await.expect("Failed to cleanup test database."); } #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_add_event() { - let Some(test_db) = test_db().await else { +async fn rejects_cyrillic_in_add_note() { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); + return; }; let collection = test_db.collection_name("elf_acceptance"); - let Some(service) = build_test_service(test_db.dsn().to_string(), qdrant_url, collection).await + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await else { return; }; + let request = AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: None, + text: "Привет мир".to_string(), + structured: None, + importance: 0.4, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }], + }; + let result = service.add_note(request).await; + + match result { + Err(Error::NonEnglishInput { field }) => { + assert_eq!(field, "$.notes[0].text"); + }, + other => panic!("Expected NonEnglishInput, got {other:?}"), + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn rejects_non_english_in_add_event() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); + return; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await + else { + return; + }; let request = AddEventRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), scope: Some("agent_private".to_string()), dry_run: Some(true), + ingestion_profile: None, messages: vec![EventMessage { role: "user".to_string(), content: "こんにちは".to_string(), ts: None, msg_id: None, + write_policy: None, }], }; - let result = service.add_event(request).await; + match result { - Err(ServiceError::NonEnglishInput { field }) => { + Err(Error::NonEnglishInput { field }) => { assert_eq!(field, "$.messages[0].content"); }, other => panic!("Expected NonEnglishInput, got {other:?}"), } + test_db.cleanup().await.expect("Failed to cleanup test database."); } #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn rejects_cjk_in_search() { - let Some(test_db) = test_db().await else { +async fn rejects_cyrillic_in_add_event() { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); + return; }; let collection = test_db.collection_name("elf_acceptance"); - let Some(service) = build_test_service(test_db.dsn().to_string(), qdrant_url, collection).await + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await else { return; }; + let request = AddEventRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: Some("agent_private".to_string()), + dry_run: Some(true), + ingestion_profile: None, + messages: vec![EventMessage { + role: "user".to_string(), + content: "Это не английский текст.".to_string(), + ts: None, + msg_id: None, + write_policy: None, + }], + }; + let result = service.add_event(request).await; + + match result { + Err(Error::NonEnglishInput { field }) => { + assert_eq!(field, "$.messages[0].content"); + }, + other => panic!("Expected NonEnglishInput, got {other:?}"), + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn rejects_non_english_in_search() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); + return; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await + else { + return; + }; let request = SearchRequest { tenant_id: "t".to_string(), project_id: "p".to_string(), agent_id: "a".to_string(), + token_id: None, read_profile: "private_only".to_string(), + payload_level: Default::default(), query: "안녕하세요".to_string(), top_k: Some(5), candidate_k: Some(10), + filter: None, record_hits: Some(false), + ranking: None, }; - let result = service.search(request).await; + match result { - Err(ServiceError::NonEnglishInput { field }) => { + Err(Error::NonEnglishInput { field }) => { assert_eq!(field, "$.query"); }, other => panic!("Expected NonEnglishInput, got {other:?}"), } + test_db.cleanup().await.expect("Failed to cleanup test database."); } -async fn build_test_service( - dsn: String, - qdrant_url: String, - collection: String, -) -> Option<ElfService> { - let extractor = SpyExtractor { - calls: Arc::new(AtomicUsize::new(0)), - payload: serde_json::json!({ "notes": [] }), +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn rejects_cyrillic_in_search() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping english_only_boundary; set ELF_PG_DSN to run this test."); + + return; }; - let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), - Arc::new(StubRerank), - Arc::new(extractor), - ); + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping english_only_boundary; set ELF_QDRANT_URL to run this test."); - let cfg = test_config(dsn, qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - Some(service) + return; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let Some(service) = + build_test_service(test_db.dsn().to_string(), qdrant_url, collection, docs_collection) + .await + else { + return; + }; + let request = SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Привет".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }; + let result = service.search(request).await; + + match result { + Err(Error::NonEnglishInput { field }) => { + assert_eq!(field, "$.query"); + }, + other => panic!("Expected NonEnglishInput, got {other:?}"), + } + + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/evidence_binding.rs b/packages/elf-service/tests/acceptance/evidence_binding.rs index c879a544..e46c9e07 100644 --- a/packages/elf-service/tests/acceptance/evidence_binding.rs +++ b/packages/elf-service/tests/acceptance/evidence_binding.rs @@ -1,55 +1,62 @@ -// std use std::sync::{Arc, atomic::AtomicUsize}; -// crates.io -use elf_service::{AddEventRequest, EventMessage, NoteOp, Providers, REJECT_EVIDENCE_MISMATCH}; - -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_domain::memory_policy::MemoryPolicyDecision; +use elf_service::{ + AddEventRequest, EventMessage, NoteOp, Providers, REJECT_EVIDENCE_MISMATCH, + REJECT_WRITE_POLICY_MISMATCH, }; #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn rejects_invalid_evidence_quote() { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping rejects_invalid_evidence_quote; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { - eprintln!("Skipping rejects_invalid_evidence_quote; set ELF_QDRANT_URL to run this test.",); + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping rejects_invalid_evidence_quote; set ELF_QDRANT_URL to run this test."); + return; }; let extractor_payload = serde_json::json!({ - "notes": [ - { - "type": "fact", - "key": "project_workflow", - "text": "Fact: The workflow uses TODO markers.", - "importance": 0.5, - "confidence": 0.8, - "ttl_days": null, - "scope_suggestion": "agent_private", - "evidence": [ - { "message_index": 0, "quote": "This quote does not exist." } - ], - "reason": "test" - } + "notes": [ + { + "type": "fact", + "key": "project_workflow", + "text": "Fact: The workflow uses TODO markers.", + "importance": 0.5, + "confidence": 0.8, + "ttl_days": null, + "scope_suggestion": "agent_private", + "evidence": [ + { "message_index": 0, "quote": "This quote does not exist." } + ], + "reason": "test" + } ] }); - let extractor = SpyExtractor { calls: Arc::new(AtomicUsize::new(0)), payload: extractor_payload }; let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), + Arc::new(StubEmbedding { vector_dim: 4_096 }), Arc::new(StubRerank), Arc::new(extractor), ); - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); let request = AddEventRequest { tenant_id: "t".to_string(), @@ -57,18 +64,108 @@ async fn rejects_invalid_evidence_quote() { agent_id: "a".to_string(), scope: Some("agent_private".to_string()), dry_run: Some(false), + ingestion_profile: None, messages: vec![EventMessage { role: "user".to_string(), content: "This is a message without the expected quote.".to_string(), ts: None, msg_id: None, + write_policy: None, }], }; - let response = service.add_event(request).await.expect("add_event failed."); - assert_eq!(response.results.len(), 1); let result = &response.results[0]; + + assert_eq!(response.results.len(), 1); assert_eq!(result.op, NoteOp::Rejected); assert_eq!(result.reason_code.as_deref(), Some(REJECT_EVIDENCE_MISMATCH)); + assert_eq!(result.policy_decision, MemoryPolicyDecision::Reject); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn rejects_transformed_quote_mismatch_with_write_policy() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!( + "Skipping rejects_transformed_quote_mismatch_with_write_policy; set ELF_PG_DSN to run." + ); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping rejects_transformed_quote_mismatch_with_write_policy; set ELF_QDRANT_URL to run." + ); + + return; + }; + let extractor_payload = serde_json::json!({ + "notes": [ + { + "type": "fact", + "key": "project_workflow", + "text": "Fact: The workflow uses TODO markers.", + "importance": 0.5, + "confidence": 0.8, + "ttl_days": null, + "scope_suggestion": "agent_private", + "evidence": [ + { "message_index": 0, "quote": "Alice mentors Bob." } + ], + "reason": "test" + } + ] + }); + let extractor = + SpyExtractor { calls: Arc::new(AtomicUsize::new(0)), payload: extractor_payload }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let request = AddEventRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: Some("agent_private".to_string()), + dry_run: Some(false), + ingestion_profile: None, + messages: vec![EventMessage { + role: "user".to_string(), + content: "Alice mentors Bob.".to_string(), + ts: None, + msg_id: None, + write_policy: Some( + serde_json::from_value( + serde_json::json!({ "redactions": [{ "kind": "remove", "span": { "start": 0, "end": 5 } }] }), + ) + .expect("Failed to build write_policy."), + ), + }], + }; + let response = service.add_event(request).await.expect("add_event failed."); + let result = &response.results[0]; + + assert_eq!(response.results.len(), 1); + assert_eq!(result.op, NoteOp::Rejected); + assert_eq!(result.reason_code.as_deref(), Some(REJECT_WRITE_POLICY_MISMATCH)); + assert_eq!(result.policy_decision, MemoryPolicyDecision::Reject); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/graph_ingestion.rs b/packages/elf-service/tests/acceptance/graph_ingestion.rs new file mode 100644 index 00000000..511c2195 --- /dev/null +++ b/packages/elf-service/tests/acceptance/graph_ingestion.rs @@ -0,0 +1,812 @@ +use std::{ + collections::hash_map::DefaultHasher, + hash::{Hash, Hasher}, + sync::{Arc, atomic::AtomicUsize}, +}; + +use sqlx::{FromRow, PgPool}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_config::EmbeddingProviderConfig; +use elf_domain::memory_policy::MemoryPolicyDecision; +use elf_service::{ + AddEventRequest, AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, + EventMessage, GraphQueryEntityRef, GraphQueryPredicateRef, GraphQueryRequest, NoteOp, + Providers, RelationTemporalStatus, Result, StructuredFields, +}; + +const TEST_TENANT: &str = "t"; +const TEST_PROJECT: &str = "p"; +const TEST_SCOPE: &str = "agent_private"; +const GRAPH_REL_SUBJECT: &str = "alice"; +const GRAPH_REL_PREDICATE: &str = "mentors"; +const GRAPH_REL_OBJECT: &str = "Bob"; + +#[derive(Debug, FromRow)] +struct GraphFactRow { + fact_id: Uuid, + predicate_id: Option<Uuid>, + object_value: Option<String>, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, +} + +struct HashEmbedding { + vector_dim: u32, +} +impl EmbeddingProvider for HashEmbedding { + fn embed<'a>( + &'a self, + _: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { + let vector_dim = self.vector_dim as usize; + let vectors = texts + .iter() + .map(|text| { + let mut values = Vec::with_capacity(vector_dim); + + for idx in 0..vector_dim { + let mut hasher = DefaultHasher::new(); + + text.hash(&mut hasher); + idx.hash(&mut hasher); + + let raw = hasher.finish(); + let normalized = ((raw % 2_000_000) as f32 / 1_000_000.0) - 1.0; + + values.push(normalized); + } + + values + }) + .collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +fn fact_note(key: &str, text: &str, predicate: &str, object_value: &str) -> AddNoteInput { + let structured = serde_json::from_value::<StructuredFields>(serde_json::json!({ + "relations": [{ + "subject": { "canonical": "Alice" }, + "predicate": predicate, + "object": { "value": object_value } + }] + })) + .expect("Failed to build structured fields."); + + AddNoteInput { + r#type: "fact".to_string(), + key: Some(key.to_string()), + text: text.to_string(), + structured: Some(structured), + importance: 0.8, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + } +} + +fn assert_graph_policy_from_op(op: NoteOp, policy_decision: MemoryPolicyDecision) { + match op { + NoteOp::Add => assert_eq!(policy_decision, MemoryPolicyDecision::Remember), + NoteOp::Update => assert_eq!(policy_decision, MemoryPolicyDecision::Update), + _ => {}, + } +} + +fn duplicate_fact_attaches_multiple_evidence_request() -> AddNoteRequest { + AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![ + AddNoteInput { + r#type: "fact".to_string(), + key: Some("mentorship-a".to_string()), + text: "Alice mentors Bob in 2026.".to_string(), + structured: Some( + serde_json::from_value::<elf_service::structured_fields::StructuredFields>( + serde_json::json!({ + "relations": [{ + "subject": { "canonical": "Alice" }, + "predicate": "mentors", + "object": { "value": "Bob" } + }] + }), + ) + .expect("Failed to build structured fields."), + ), + importance: 0.8, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }, + AddNoteInput { + r#type: "fact".to_string(), + key: Some("mentorship-b".to_string()), + text: "Alice also mentors Bob often.".to_string(), + structured: Some( + serde_json::from_value::<elf_service::structured_fields::StructuredFields>( + serde_json::json!({ + "relations": [{ + "subject": { "canonical": "Alice" }, + "predicate": "mentors", + "object": { "value": "Bob" } + }] + }), + ) + .expect("Failed to build structured fields."), + ), + importance: 0.7, + confidence: 0.8, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }, + ], + } +} + +fn works_at_graph_query_request(as_of: OffsetDateTime) -> GraphQueryRequest { + GraphQueryRequest { + tenant_id: TEST_TENANT.to_string(), + project_id: TEST_PROJECT.to_string(), + agent_id: "a".to_string(), + read_profile: "private_only".to_string(), + subject: GraphQueryEntityRef::Surface { surface: "Alice".to_string() }, + predicate: Some(GraphQueryPredicateRef::Surface { surface: "works at".to_string() }), + scopes: Some(vec![TEST_SCOPE.to_string()]), + as_of: Some(as_of), + limit: Some(10), + explain: Some(true), + } +} + +async fn graph_fact_id(pool: &PgPool) -> Uuid { + sqlx::query_scalar( + "\ +SELECT gf.fact_id +FROM graph_facts gf +JOIN graph_entities ge ON ge.entity_id = gf.subject_entity_id +WHERE ge.canonical_norm = $1 + AND gf.predicate = $2 + AND gf.object_value = $3 + AND gf.tenant_id = $4 + AND gf.project_id = $5 + AND gf.scope = $6", + ) + .bind(GRAPH_REL_SUBJECT) + .bind(GRAPH_REL_PREDICATE) + .bind(GRAPH_REL_OBJECT) + .bind(TEST_TENANT) + .bind(TEST_PROJECT) + .bind(TEST_SCOPE) + .fetch_one(pool) + .await + .expect("Failed to load fact.") +} + +async fn graph_fact_count(pool: &PgPool) -> i64 { + sqlx::query_scalar( + "\ +SELECT COUNT(*) +FROM graph_facts gf +JOIN graph_entities ge ON ge.entity_id = gf.subject_entity_id +WHERE ge.canonical_norm = $1 + AND gf.predicate = $2 + AND gf.object_value = $3 + AND gf.tenant_id = $4 + AND gf.project_id = $5 + AND gf.scope = $6", + ) + .bind(GRAPH_REL_SUBJECT) + .bind(GRAPH_REL_PREDICATE) + .bind(GRAPH_REL_OBJECT) + .bind(TEST_TENANT) + .bind(TEST_PROJECT) + .bind(TEST_SCOPE) + .fetch_one(pool) + .await + .expect("Failed to count fact rows.") +} + +async fn graph_fact_evidence_count(pool: &PgPool, fact_id: Uuid) -> i64 { + sqlx::query_scalar("SELECT COUNT(*) FROM graph_fact_evidence WHERE fact_id = $1") + .bind(fact_id) + .fetch_one(pool) + .await + .expect("Failed to load fact evidence.") +} + +async fn graph_fact_evidence_count_for_note(pool: &PgPool, fact_id: Uuid, note_id: Uuid) -> i64 { + sqlx::query_scalar( + "SELECT COUNT(*) FROM graph_fact_evidence WHERE fact_id = $1 AND note_id = $2", + ) + .bind(fact_id) + .bind(note_id) + .fetch_one(pool) + .await + .expect("Failed to load note evidence.") +} + +async fn graph_fact_row(pool: &PgPool, predicate: &str, object_value: &str) -> GraphFactRow { + sqlx::query_as::<_, GraphFactRow>( + "\ +SELECT + gf.fact_id, + gf.predicate_id, + gf.object_value, + gf.valid_from, + gf.valid_to +FROM graph_facts gf +JOIN graph_entities ge ON ge.entity_id = gf.subject_entity_id +WHERE ge.canonical_norm = $1 + AND gf.predicate = $2 + AND gf.object_value = $3 + AND gf.tenant_id = $4 + AND gf.project_id = $5 + AND gf.scope = $6", + ) + .bind(GRAPH_REL_SUBJECT) + .bind(predicate) + .bind(object_value) + .bind(TEST_TENANT) + .bind(TEST_PROJECT) + .bind(TEST_SCOPE) + .fetch_one(pool) + .await + .expect("Failed to load fact row.") +} + +async fn add_fact_note( + service: &ElfService, + key: &str, + text: &str, + predicate: &str, + object_value: &str, +) -> Uuid { + let response = service + .add_note(AddNoteRequest { + tenant_id: TEST_TENANT.to_string(), + project_id: TEST_PROJECT.to_string(), + agent_id: "a".to_string(), + scope: TEST_SCOPE.to_string(), + notes: vec![fact_note(key, text, predicate, object_value)], + }) + .await + .expect("add_note failed."); + + assert_eq!(response.results.len(), 1); + assert_eq!(response.results[0].op, NoteOp::Add); + + assert_graph_policy_from_op(response.results[0].op, response.results[0].policy_decision); + + response.results[0].note_id.expect("Expected note_id.") +} + +async fn activate_single_predicate(pool: &PgPool, predicate_id: Uuid) { + sqlx::query( + "\ +UPDATE graph_predicates +SET status = 'active', cardinality = 'single', updated_at = now() +WHERE predicate_id = $1", + ) + .bind(predicate_id) + .execute(pool) + .await + .expect("Failed to activate predicate."); +} + +async fn active_object_value_at( + pool: &PgPool, + predicate_id: Uuid, + at: OffsetDateTime, +) -> Option<String> { + sqlx::query_scalar( + "\ +SELECT gf.object_value +FROM graph_facts gf +JOIN graph_entities ge ON ge.entity_id = gf.subject_entity_id +WHERE ge.canonical_norm = $1 + AND gf.tenant_id = $2 + AND gf.project_id = $3 + AND gf.scope = $4 + AND gf.predicate_id = $5 + AND gf.valid_from <= $6 + AND (gf.valid_to IS NULL OR gf.valid_to > $6) +LIMIT 1", + ) + .bind(GRAPH_REL_SUBJECT) + .bind(TEST_TENANT) + .bind(TEST_PROJECT) + .bind(TEST_SCOPE) + .bind(predicate_id) + .bind(at) + .fetch_one(pool) + .await + .expect("Failed to load active fact object_value.") +} + +async fn active_fact_count_at(pool: &PgPool, predicate_id: Uuid, at: OffsetDateTime) -> i64 { + sqlx::query_scalar( + "\ +SELECT COUNT(*) +FROM graph_facts gf +JOIN graph_entities ge ON ge.entity_id = gf.subject_entity_id +WHERE ge.canonical_norm = $1 + AND gf.tenant_id = $2 + AND gf.project_id = $3 + AND gf.scope = $4 + AND gf.predicate_id = $5 + AND gf.valid_from <= $6 + AND (gf.valid_to IS NULL OR gf.valid_to > $6)", + ) + .bind(GRAPH_REL_SUBJECT) + .bind(TEST_TENANT) + .bind(TEST_PROJECT) + .bind(TEST_SCOPE) + .bind(predicate_id) + .bind(at) + .fetch_one(pool) + .await + .expect("Failed to count active facts.") +} + +async fn supersession_count( + pool: &PgPool, + from_fact_id: Uuid, + to_fact_id: Uuid, + note_id: Uuid, +) -> i64 { + sqlx::query_scalar( + "\ +SELECT COUNT(*) +FROM graph_fact_supersessions +WHERE from_fact_id = $1 + AND to_fact_id = $2 + AND note_id = $3", + ) + .bind(from_fact_id) + .bind(to_fact_id) + .bind(note_id) + .fetch_one(pool) + .await + .expect("Failed to count supersessions.") +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn add_note_duplicate_fact_attaches_multiple_evidence() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!( + "Skipping add_note_duplicate_fact_attaches_multiple_evidence; set ELF_PG_DSN to run.", + ); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping add_note_duplicate_fact_attaches_multiple_evidence; set ELF_QDRANT_URL to run.", + ); + + return; + }; + let providers = Providers::new( + Arc::new(HashEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let response = service + .add_note(duplicate_fact_attaches_multiple_evidence_request()) + .await + .expect("add_note failed."); + + assert_eq!(response.results.len(), 2); + assert_eq!(response.results[0].op, NoteOp::Add); + assert_eq!(response.results[1].op, NoteOp::Add); + + assert_graph_policy_from_op(response.results[0].op, response.results[0].policy_decision); + assert_graph_policy_from_op(response.results[1].op, response.results[1].policy_decision); + + let first_note_id = response.results[0].note_id.expect("Expected note_id."); + let second_note_id = response.results[1].note_id.expect("Expected note_id."); + + assert_ne!(first_note_id, second_note_id); + + let fact_id = graph_fact_id(&service.db.pool).await; + let fact_count = graph_fact_count(&service.db.pool).await; + let evidence_count = graph_fact_evidence_count(&service.db.pool, fact_id).await; + + assert_eq!(fact_count, 1); + assert_eq!(evidence_count, 2); + + let first_evidence_count = + graph_fact_evidence_count_for_note(&service.db.pool, fact_id, first_note_id).await; + let second_evidence_count = + graph_fact_evidence_count_for_note(&service.db.pool, fact_id, second_note_id).await; + + assert_eq!(first_evidence_count, 1); + assert_eq!(second_evidence_count, 1); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn add_note_single_predicate_supersedes_conflicting_fact() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!( + "Skipping add_note_single_predicate_supersedes_conflicting_fact; set ELF_PG_DSN to run.", + ); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping add_note_single_predicate_supersedes_conflicting_fact; set ELF_QDRANT_URL to run.", + ); + + return; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let old_note_id = + add_fact_note(&service, "employment-a", "Alice works at Initech.", "works at", "Initech") + .await; + let fact_a = graph_fact_row(&service.db.pool, "works at", "Initech").await; + let predicate_id = fact_a.predicate_id.expect("Expected predicate_id."); + + activate_single_predicate(&service.db.pool, predicate_id).await; + + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + + let note_id = + add_fact_note(&service, "employment-b", "Alice works at Globex.", "works at", "Globex") + .await; + let fact_a = graph_fact_row(&service.db.pool, "works at", "Initech").await; + let fact_b = graph_fact_row(&service.db.pool, "works at", "Globex").await; + + assert_eq!(fact_a.predicate_id, Some(predicate_id)); + assert_eq!(fact_b.predicate_id, Some(predicate_id)); + assert_eq!(fact_a.object_value.as_deref(), Some("Initech")); + assert_eq!(fact_b.object_value.as_deref(), Some("Globex")); + assert_eq!(fact_a.valid_to, Some(fact_b.valid_from)); + assert!(fact_b.valid_to.is_none()); + + let t_before = fact_b.valid_from - time::Duration::microseconds(1); + let active_before = active_object_value_at(&service.db.pool, predicate_id, t_before).await; + + assert_eq!(active_before.as_deref(), Some("Initech")); + + let t_after = fact_b.valid_from + time::Duration::microseconds(1); + let active_after = active_object_value_at(&service.db.pool, predicate_id, t_after).await; + + assert_eq!(active_after.as_deref(), Some("Globex")); + + let historical_replay = service + .graph_query(works_at_graph_query_request(t_before)) + .await + .expect("historical graph query failed."); + + assert_eq!(historical_replay.facts.len(), 1); + assert_eq!(historical_replay.facts[0].object.value.as_deref(), Some("Initech")); + assert_eq!(historical_replay.facts[0].valid_to, Some(fact_b.valid_from)); + assert_eq!(historical_replay.facts[0].temporal_status, RelationTemporalStatus::Historical); + assert_eq!(historical_replay.facts[0].evidence_note_ids, vec![old_note_id]); + + let current_readback = service + .graph_query(works_at_graph_query_request(t_after)) + .await + .expect("current graph query failed."); + + assert_eq!(current_readback.facts.len(), 1); + assert_eq!(current_readback.facts[0].object.value.as_deref(), Some("Globex")); + assert_eq!(current_readback.facts[0].temporal_status, RelationTemporalStatus::Current); + assert_eq!(current_readback.facts[0].evidence_note_ids, vec![note_id]); + + let supersession_count = + supersession_count(&service.db.pool, fact_a.fact_id, fact_b.fact_id, note_id).await; + + assert_eq!(supersession_count, 1); + + let now = OffsetDateTime::now_utc(); + let active_count = active_fact_count_at(&service.db.pool, predicate_id, now).await; + + assert_eq!(active_count, 1); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn add_note_invalid_relation_rejected_has_field_path() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!( + "Skipping add_note_invalid_relation_rejected_has_field_path; set ELF_PG_DSN to run." + ); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping add_note_invalid_relation_rejected_has_field_path; set ELF_QDRANT_URL to run.", + ); + + return; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + let response = service + .add_note(AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("mentorship".to_string()), + text: "Alice mentors Bob.".to_string(), + structured: Some( + serde_json::from_value::<elf_service::structured_fields::StructuredFields>( + serde_json::json!({ + "relations": [{ + "subject": { "canonical": "Alice" }, + "object": { "value": "Bob" } + }] + }), + ) + .expect("Failed to build structured fields."), + ), + importance: 0.8, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }], + }) + .await + .expect("add_note failed."); + + assert_eq!(response.results.len(), 1); + assert_eq!(response.results[0].op, NoteOp::Rejected); + assert_eq!(response.results[0].reason_code.as_deref(), Some("REJECT_STRUCTURED_INVALID")); + assert_eq!( + response.results[0].field_path, + Some("structured.relations[0].predicate".to_string()), + ); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn add_note_persists_graph_relations() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping add_note_persists_graph_relations; set ELF_PG_DSN to run."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping add_note_persists_graph_relations; set ELF_QDRANT_URL to run."); + + return; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let response = service + .add_note(AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("mentorship".to_string()), + text: "Alice mentors Bob.".to_string(), + structured: Some( + serde_json::from_value::<elf_service::structured_fields::StructuredFields>( + serde_json::json!({ + "relations": [{ + "subject": { "canonical": "Alice" }, + "predicate": "mentors", + "object": { "value": "Bob" } + }] + }), + ) + .expect("Failed to build structured fields."), + ), + importance: 0.8, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }], + }) + .await + .expect("add_note failed."); + + assert_eq!(response.results.len(), 1); + assert_eq!(response.results[0].op, NoteOp::Add); + + assert_graph_policy_from_op(response.results[0].op, response.results[0].policy_decision); + + let note_id = response.results[0].note_id.expect("Expected note_id."); + let fact_id = graph_fact_id(&service.db.pool).await; + let fact_count = graph_fact_count(&service.db.pool).await; + let evidence_count = + graph_fact_evidence_count_for_note(&service.db.pool, fact_id, note_id).await; + + assert_eq!(fact_count, 1); + assert_eq!(evidence_count, 1); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn add_event_persists_graph_relations() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping add_event_persists_graph_relations; set ELF_PG_DSN to run."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping add_event_persists_graph_relations; set ELF_QDRANT_URL to run."); + + return; + }; + let extractor_payload = serde_json::json!({ + "notes": [{ + "type": "fact", + "key": "mentorship", + "text": "Alice mentors Bob.", + "structured": { + "relations": [{ + "subject": { "canonical": "Alice" }, + "predicate": "mentors", + "object": { "value": "Bob" } + }] + }, + "importance": 0.8, + "confidence": 0.9, + "ttl_days": null, + "scope_suggestion": "agent_private", + "evidence": [{ "message_index": 0, "quote": "Alice mentors Bob." }], + "reason": "test" + }] + }); + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { calls: Arc::new(AtomicUsize::new(0)), payload: extractor_payload }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let response = service + .add_event(AddEventRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: Some("agent_private".to_string()), + dry_run: Some(false), + ingestion_profile: None, + messages: vec![EventMessage { + role: "user".to_string(), + content: "Alice mentors Bob.".to_string(), + ts: None, + msg_id: None, + write_policy: None, + }], + }) + .await + .expect("add_event failed."); + + assert_eq!(response.results.len(), 1); + assert_eq!(response.results[0].op, NoteOp::Add); + + assert_graph_policy_from_op(response.results[0].op, response.results[0].policy_decision); + + let note_id = response.results[0].note_id.expect("Expected note_id."); + let fact_id = graph_fact_id(&service.db.pool).await; + let fact_count = graph_fact_count(&service.db.pool).await; + let evidence_count = + graph_fact_evidence_count_for_note(&service.db.pool, fact_id, note_id).await; + + assert_eq!(fact_count, 1); + assert_eq!(evidence_count, 1); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/idempotency.rs b/packages/elf-service/tests/acceptance/idempotency.rs index d4cb3e39..4236dc84 100644 --- a/packages/elf-service/tests/acceptance/idempotency.rs +++ b/packages/elf-service/tests/acceptance/idempotency.rs @@ -1,23 +1,20 @@ -// std use std::sync::{Arc, atomic::AtomicUsize}; -// crates.io +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_domain::memory_policy::MemoryPolicyDecision; use elf_service::{AddNoteInput, AddNoteRequest, NoteOp, Providers}; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; - #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn add_note_is_idempotent() { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping add_note_is_idempotent; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping add_note_is_idempotent; set ELF_QDRANT_URL to run this test."); + return; }; let extractor = SpyExtractor { @@ -25,15 +22,23 @@ async fn add_note_is_idempotent() { payload: serde_json::json!({ "notes": [] }), }; let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), + Arc::new(StubEmbedding { vector_dim: 4_096 }), Arc::new(StubRerank), Arc::new(extractor), ); - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); let request = AddNoteRequest { tenant_id: "t".to_string(), @@ -41,21 +46,24 @@ async fn add_note_is_idempotent() { agent_id: "a".to_string(), scope: "agent_private".to_string(), notes: vec![AddNoteInput { - note_type: "preference".to_string(), + r#type: "preference".to_string(), key: Some("preferred_language".to_string()), text: "Preference: Use English.".to_string(), + structured: None, importance: 0.5, confidence: 0.9, ttl_days: None, source_ref: serde_json::json!({}), + write_policy: None, }], }; - let first = service.add_note(request.clone()).await.expect("First add_note failed."); - assert_eq!(first.results.len(), 1); - let second = service.add_note(request).await.expect("Second add_note failed."); + + assert_eq!(first.results.len(), 1); assert_eq!(second.results.len(), 1); assert_eq!(second.results[0].op, NoteOp::None); + assert_eq!(second.results[0].policy_decision, MemoryPolicyDecision::Ignore); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/knowledge_pages.rs b/packages/elf-service/tests/acceptance/knowledge_pages.rs new file mode 100644 index 00000000..81ad83f3 --- /dev/null +++ b/packages/elf-service/tests/acceptance/knowledge_pages.rs @@ -0,0 +1,385 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_domain::knowledge::KnowledgePageKind; +use elf_service::{ + AddNoteInput, AddNoteRequest, ElfService, KnowledgePageLintRequest, + KnowledgePageRebuildRequest, Providers, +}; +use elf_testkit::TestDatabase; + +const TENANT_ID: &str = "tenant_knowledge"; +const PROJECT_ID: &str = "project_knowledge"; +const AGENT_ID: &str = "agent_knowledge"; + +struct KnowledgeFixture { + service: ElfService, + _test_db: TestDatabase, +} + +async fn setup_service(test_name: &str) -> Option<KnowledgeFixture> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let extractor = SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + Some(KnowledgeFixture { service, _test_db: test_db }) +} + +async fn insert_source_note(service: &ElfService, key: &str, text: &str) -> Uuid { + let response = service + .add_note(AddNoteRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some(key.to_string()), + text: text.to_string(), + structured: None, + importance: 0.7, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({ "schema": "acceptance/v1", "key": key }), + write_policy: None, + }], + }) + .await + .expect("add_note should persist source note"); + + response.results[0].note_id.expect("source note id should be present") +} + +async fn insert_event_audit(service: &ElfService, note_id: Uuid) -> Uuid { + let decision_id = Uuid::new_v4(); + + sqlx::query( + "\ +INSERT INTO memory_ingest_decisions ( + decision_id, + tenant_id, + project_id, + agent_id, + scope, + pipeline, + note_type, + note_key, + note_id, + base_decision, + policy_decision, + note_op, + reason_code, + details, + ts +) +VALUES ($1,$2,$3,$4,'agent_private','add_event','fact','knowledge_event',$5,'remember','remember','ADD',NULL,$6,$7)", + ) + .bind(decision_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(note_id) + .bind(serde_json::json!({ "fixture": "knowledge_page_event_audit" })) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("event audit should be inserted"); + + decision_id +} + +async fn insert_relation(service: &ElfService, note_id: Uuid) -> Uuid { + let subject_id = Uuid::new_v4(); + let fact_id = Uuid::new_v4(); + let evidence_id = Uuid::new_v4(); + + sqlx::query( + "\ +INSERT INTO graph_entities ( + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind, + created_at, + updated_at +) +VALUES ($1,$2,$3,'ELF knowledge pages','elf knowledge pages','concept',$4,$4)", + ) + .bind(subject_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("graph entity should be inserted"); + sqlx::query( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,'project_shared',$5,'compile from',NULL,NULL,'authoritative source memory',$6,NULL,$6,$6)", + ) + .bind(fact_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(subject_id) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("graph fact should be inserted"); + sqlx::query( + "\ +INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) +VALUES ($1,$2,$3,$4)", + ) + .bind(evidence_id) + .bind(fact_id) + .bind(note_id) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("graph fact evidence should be inserted"); + + fact_id +} + +async fn insert_applied_proposal(service: &ElfService, note_id: Uuid) -> Uuid { + let run_id = Uuid::new_v4(); + let proposal_id = Uuid::new_v4(); + let source_refs = serde_json::json!([ + { + "kind": "note", + "id": note_id, + "snapshot": { + "status": "active", + "updated_at": "1970-01-01T00:00:00Z", + "metadata": { "fixture": "knowledge_pages" }, + "source_ref": {} + } + } + ]); + let lineage = serde_json::json!({ "source_refs": source_refs }); + + sqlx::query( + "\ +INSERT INTO consolidation_runs ( + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + error, + created_at, + updated_at, + completed_at +) +VALUES ($1,$2,$3,$4,'elf.consolidation/v1','manual','completed',$5,$6,$7,'{}'::jsonb,$8,$8,$8)", + ) + .bind(run_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(&source_refs) + .bind(serde_json::json!({ "source_count": 1 })) + .bind(&lineage) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("consolidation run should be inserted"); + sqlx::query( + "\ +INSERT INTO consolidation_proposals ( + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + unsupported_claim_flags, + contradiction_markers, + staleness_markers, + target_ref, + proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,$5,'elf.consolidation/v1','knowledge_page','create_derived_knowledge_page','applied',$6,$7,$8,$9,0.9,'[]'::jsonb,'[]'::jsonb,'[]'::jsonb,'{}'::jsonb,$10,$5,'Apply derived page proposal.',$11,$11,$11)", + ) + .bind(proposal_id) + .bind(run_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(AGENT_ID) + .bind(&source_refs) + .bind(serde_json::json!({ "source_count": 1 })) + .bind(&lineage) + .bind(serde_json::json!({ + "summary": "Create a derived knowledge page from cited source memory.", + "before": {}, + "after": { "page_key": "knowledge-foundation" } + })) + .bind(serde_json::json!({ "page_key": "knowledge-foundation" })) + .bind(OffsetDateTime::UNIX_EPOCH) + .execute(&service.db.pool) + .await + .expect("consolidation proposal should be inserted"); + + proposal_id +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run this test."] +async fn rebuilds_pages_with_citations_and_detects_stale_sources() { + let Some(fixture) = + setup_service("rebuilds_pages_with_citations_and_detects_stale_sources").await + else { + return; + }; + let service = &fixture.service; + let note_id = insert_source_note( + service, + "knowledge_pages_foundation", + "Fact: Derived knowledge pages are rebuilt from authoritative source memory and keep citations.", + ) + .await; + let event_id = insert_event_audit(service, note_id).await; + let fact_id = insert_relation(service, note_id).await; + let proposal_id = insert_applied_proposal(service, note_id).await; + let first = service + .knowledge_page_rebuild(KnowledgePageRebuildRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + page_kind: KnowledgePageKind::Project, + page_key: "knowledge-foundation".to_string(), + title: Some("Knowledge Foundation".to_string()), + note_ids: vec![note_id], + event_ids: vec![event_id], + relation_ids: vec![fact_id], + proposal_ids: vec![proposal_id], + provider_metadata: serde_json::json!({}), + }) + .await + .expect("knowledge page should rebuild"); + + assert_eq!(first.page.sections.len(), 4); + assert_eq!(first.page.source_refs.len(), 4); + assert!(first.page.sections.iter().all(|section| { + section.citations.as_array().is_some_and(|citations| !citations.is_empty()) + })); + assert_eq!(first.page.page.source_coverage["coverage_complete"], true); + assert_eq!(first.page.page.rebuild_metadata["deterministic"], true); + + let second = service + .knowledge_page_rebuild(KnowledgePageRebuildRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: AGENT_ID.to_string(), + page_kind: KnowledgePageKind::Project, + page_key: "knowledge-foundation".to_string(), + title: Some("Knowledge Foundation".to_string()), + note_ids: vec![note_id], + event_ids: vec![event_id], + relation_ids: vec![fact_id], + proposal_ids: vec![proposal_id], + provider_metadata: serde_json::json!({}), + }) + .await + .expect("knowledge page should rebuild deterministically"); + + assert_eq!(first.page.page.page_id, second.page.page.page_id); + assert_eq!(first.page.page.rebuild_source_hash, second.page.page.rebuild_source_hash); + assert_eq!(first.page.page.content_hash, second.page.page.content_hash); + + sqlx::query( + "\ +UPDATE memory_notes +SET text = $1, updated_at = $2 +WHERE note_id = $3", + ) + .bind("Fact: Derived knowledge pages changed after the page snapshot was rebuilt.") + .bind(OffsetDateTime::now_utc()) + .bind(note_id) + .execute(&service.db.pool) + .await + .expect("source note should update"); + + let lint = service + .knowledge_page_lint(KnowledgePageLintRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + page_id: first.page.page.page_id, + }) + .await + .expect("knowledge page lint should run"); + + assert!(lint.findings.iter().any(|finding| { + finding.finding_type == "stale_source_ref" + && finding.source_kind.as_deref() == Some("note") + && finding.source_id == Some(note_id) + })); +} diff --git a/packages/elf-service/tests/acceptance/memory_history.rs b/packages/elf-service/tests/acceptance/memory_history.rs new file mode 100644 index 00000000..f803067d --- /dev/null +++ b/packages/elf-service/tests/acceptance/memory_history.rs @@ -0,0 +1,138 @@ +use std::{ + collections::HashSet, + sync::{Arc, atomic::AtomicUsize}, +}; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_service::{ + AddNoteInput, AddNoteRequest, MemoryHistoryGetRequest, NoteOp, NoteProvenanceGetRequest, + Providers, +}; + +fn history_request(text: &str, importance: f32) -> AddNoteRequest { + AddNoteRequest { + tenant_id: "tenant-history".to_string(), + project_id: "project-history".to_string(), + agent_id: "agent-history".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("memory_history_target".to_string()), + text: text.to_string(), + structured: None, + importance, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({ "schema": "acceptance/history" }), + write_policy: None, + }], + } +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn memory_history_links_versions_and_ignored_decisions() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping memory_history_links_versions_and_ignored_decisions; set ELF_PG_DSN."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!( + "Skipping memory_history_links_versions_and_ignored_decisions; set ELF_QDRANT_URL." + ); + + return; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_history"); + let docs_collection = test_db.collection_name("elf_history_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + let first = service + .add_note(history_request( + "Fact: Memory history readback starts with original evidence.", + 0.7, + )) + .await + .expect("initial note should be added"); + let note_id = first.results[0].note_id.expect("add should return note id"); + + assert_eq!(first.results[0].op, NoteOp::Add); + + let updated = service + .add_note(history_request("Fact: Memory history readback records updated evidence.", 0.8)) + .await + .expect("second note should update by key"); + let ignored = service + .add_note(history_request("Fact: Memory history readback records updated evidence.", 0.8)) + .await + .expect("third note should be ignored as unchanged"); + + assert_eq!(updated.results[0].op, NoteOp::Update); + assert_eq!(ignored.results[0].op, NoteOp::None); + + let history = service + .memory_history_get(MemoryHistoryGetRequest { + tenant_id: "tenant-history".to_string(), + project_id: "project-history".to_string(), + note_id, + }) + .await + .expect("history should be readable"); + let event_types: HashSet<&str> = + history.events.iter().map(|event| event.event_type.as_str()).collect(); + + assert_eq!(history.schema, "elf.memory_history/v1"); + assert!(event_types.contains("add")); + assert!(event_types.contains("update")); + assert!(event_types.contains("ignore")); + assert!( + history + .events + .iter() + .filter(|event| matches!(event.event_type.as_str(), "add" | "update")) + .all(|event| event.related_decision_id.is_some() + && event.related_note_version_id.is_some()) + ); + + let linked_decision_count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM memory_ingest_decisions WHERE note_id = $1 AND note_version_id IS NOT NULL", + ) + .bind(note_id) + .fetch_one(&service.db.pool) + .await + .expect("linked decision count should be queryable"); + + assert_eq!(linked_decision_count, 2); + + let provenance = service + .note_provenance_get(NoteProvenanceGetRequest { + tenant_id: "tenant-history".to_string(), + project_id: "project-history".to_string(), + note_id, + }) + .await + .expect("provenance should include history"); + + assert_eq!(provenance.history.len(), history.events.len()); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/outbox_eventual_consistency.rs b/packages/elf-service/tests/acceptance/outbox_eventual_consistency.rs index b54752dd..f054ad1d 100644 --- a/packages/elf-service/tests/acceptance/outbox_eventual_consistency.rs +++ b/packages/elf-service/tests/acceptance/outbox_eventual_consistency.rs @@ -1,8 +1,4 @@ -#[path = "../../../../apps/elf-worker/src/worker.rs"] mod worker; - -// std use std::{ - collections::HashMap, future::IntoFuture, sync::{ Arc, @@ -11,172 +7,63 @@ use std::{ time::{Duration, Instant}, }; -// crates.io +use ahash::AHashMap; use axum::{Json, Router, extract::State, http::StatusCode, response::IntoResponse, routing}; -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, Modifier, SparseVectorParamsBuilder, - SparseVectorsConfigBuilder, VectorParamsBuilder, VectorsConfigBuilder, -}; -use serde_json::Map; +use serde_json::{Map, Value}; +use sqlx::{FromRow, PgPool}; use time::OffsetDateTime; use tokenizers::{Tokenizer, models::wordlevel::WordLevel}; -use tokio::{net::TcpListener, sync::oneshot, time as tokio_time}; +use tokio::{ + net::TcpListener, + sync::{oneshot, oneshot::Sender}, + task::JoinHandle, +}; use uuid::Uuid; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank, chunking::ChunkingConfig}; use elf_config::EmbeddingProviderConfig; -use elf_service::{AddNoteInput, AddNoteRequest, Providers}; -use elf_storage::{ - db::Db, - qdrant::{BM25_VECTOR_NAME, DENSE_VECTOR_NAME, QdrantStore}, -}; +use elf_service::{AddNoteInput, AddNoteRequest, ElfService, Providers}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_worker::worker::{self, WorkerState}; -#[derive(sqlx::FromRow)] +#[derive(FromRow)] struct OutboxRow { status: String, attempts: i32, last_error: Option<String>, } -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn outbox_retries_to_done() { - let Some(test_db) = test_db().await else { - eprintln!("Skipping outbox_retries_to_done; set ELF_PG_DSN to run this test."); - return; - }; - let Some(qdrant_url) = test_qdrant_url() else { - eprintln!("Skipping outbox_retries_to_done; set ELF_QDRANT_URL to run this test."); - return; - }; - let request_count = Arc::new(AtomicUsize::new(0)); - let (api_base, shutdown) = start_embed_server(request_count.clone()).await; - - let extractor = SpyExtractor { - calls: Arc::new(AtomicUsize::new(0)), - payload: serde_json::json!({ "notes": [] }), - }; - let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), - Arc::new(StubRerank), - Arc::new(extractor), - ); - - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - - let _ = service.qdrant.client.delete_collection(service.qdrant.collection.clone()).await; - let mut vectors_config = VectorsConfigBuilder::default(); - vectors_config - .add_named_vector_params(DENSE_VECTOR_NAME, VectorParamsBuilder::new(3, Distance::Cosine)); - let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); - sparse_vectors_config.add_named_vector_params( - BM25_VECTOR_NAME, - SparseVectorParamsBuilder::default().modifier(Modifier::Idf as i32), - ); - service - .qdrant - .client - .create_collection( - CreateCollectionBuilder::new(service.qdrant.collection.clone()) - .vectors_config(vectors_config) - .sparse_vectors_config(sparse_vectors_config), - ) - .await - .expect("Failed to create Qdrant collection."); - - let add_response = service - .add_note(AddNoteRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - scope: "agent_private".to_string(), - notes: vec![AddNoteInput { - note_type: "fact".to_string(), - key: Some("outbox_test".to_string()), - text: "Fact: Outbox should retry.".to_string(), - importance: 0.4, - confidence: 0.9, - ttl_days: None, - source_ref: serde_json::json!({}), - }], - }) - .await - .expect("Failed to add note."); - - let note_id = add_response.results[0].note_id.expect("Expected note_id in add_note result."); +fn build_test_tokenizer() -> Tokenizer { + let mut vocab = AHashMap::new(); - let worker_state = worker::WorkerState { - db: Db::connect(&service.cfg.storage.postgres).await.expect("Failed to connect worker DB."), - qdrant: QdrantStore::new(&service.cfg.storage.qdrant) - .expect("Failed to build Qdrant store."), - embedding: EmbeddingProviderConfig { - provider_id: "test".to_string(), - api_base, - api_key: "test-key".to_string(), - path: "/embeddings".to_string(), - model: "test".to_string(), - dimensions: 3, - timeout_ms: 1_000, - default_headers: Map::new(), - }, - chunking: crate::chunking::ChunkingConfig { max_tokens: 64, overlap_tokens: 8 }, - tokenizer: { - let mut vocab = HashMap::new(); - vocab.insert("<unk>".to_string(), 0); - let model = WordLevel::builder() - .vocab(vocab) - .unk_token("<unk>".to_string()) - .build() - .expect("Failed to build test tokenizer."); - Tokenizer::new(model) - }, - }; + vocab.insert("<unk>".to_string(), 0_u32); - let handle = tokio::spawn(async move { - let _ = worker::run_worker(worker_state).await; - }); + let model = WordLevel::builder() + .vocab(vocab) + .unk_token("<unk>".to_string()) + .build() + .expect("Failed to build test tokenizer."); - let failed = wait_for_status(&service.db.pool, note_id, "FAILED", Duration::from_secs(5)) - .await - .expect("Expected FAILED outbox status."); - assert_eq!(failed.attempts, 1); - assert!(failed.last_error.is_some()); - assert!(request_count.load(Ordering::SeqCst) >= 1); - - let now = OffsetDateTime::now_utc(); - sqlx::query("UPDATE indexing_outbox SET available_at = $1 WHERE note_id = $2") - .bind(now) - .bind(note_id) - .execute(&service.db.pool) - .await - .expect("Failed to update available_at."); - - let done = wait_for_status(&service.db.pool, note_id, "DONE", Duration::from_secs(5)) - .await - .expect("Expected DONE outbox status."); - assert!(done.attempts >= 1); - - handle.abort(); - let _ = shutdown.send(()); - test_db.cleanup().await.expect("Failed to cleanup test database."); + Tokenizer::new(model) } async fn wait_for_status( - pool: &sqlx::PgPool, + pool: &PgPool, note_id: Uuid, status: &str, timeout: Duration, ) -> Option<OutboxRow> { let deadline = Instant::now() + timeout; + loop { - let row: Option<OutboxRow> = sqlx::query_as( - "SELECT status, attempts, last_error FROM indexing_outbox WHERE note_id = $1", + let row: Option<OutboxRow> = sqlx::query_as::<_, OutboxRow>( + "\ +SELECT + status, + attempts, + last_error +FROM indexing_outbox +WHERE note_id = $1", ) .bind(note_id) .fetch_optional(pool) @@ -189,14 +76,16 @@ async fn wait_for_status( { return Some(row); } + if Instant::now() >= deadline { return None; } - tokio_time::sleep(Duration::from_millis(200)).await; + + tokio::time::sleep(Duration::from_millis(200)).await; } } -async fn start_embed_server(request_count: Arc<AtomicUsize>) -> (String, oneshot::Sender<()>) { +async fn start_embed_server(request_count: Arc<AtomicUsize>) -> (String, Sender<()>) { let app = Router::new().route("/embeddings", routing::post(embed_handler)).with_state(request_count); let listener = TcpListener::bind("127.0.0.1:0").await.expect("Failed to bind embed server."); @@ -205,17 +94,20 @@ async fn start_embed_server(request_count: Arc<AtomicUsize>) -> (String, oneshot let server = axum::serve(listener, app).with_graceful_shutdown(async move { let _ = rx.await; }); + tokio::spawn(async move { let _ = server.into_future().await; }); + (format!("http://{addr}"), tx) } async fn embed_handler( State(counter): State<Arc<AtomicUsize>>, - Json(payload): Json<serde_json::Value>, + Json(payload): Json<Value>, ) -> impl IntoResponse { let call_index = counter.fetch_add(1, Ordering::SeqCst); + if call_index == 0 { return StatusCode::INTERNAL_SERVER_ERROR.into_response(); } @@ -226,11 +118,144 @@ async fn embed_handler( .iter() .enumerate() .map(|(index, _)| { + let embedding: Vec<f32> = vec![0.1_f32; 4_096]; + serde_json::json!({ "index": index, - "embedding": [0.1, 0.2, 0.3] + "embedding": embedding }) }) .collect(); + (StatusCode::OK, Json(serde_json::json!({ "data": data }))).into_response() } + +async fn spawn_outbox_worker(service: &ElfService, api_base: String) -> JoinHandle<()> { + let worker_state = WorkerState { + db: Db::connect(&service.cfg.storage.postgres).await.expect("Failed to connect worker DB."), + qdrant: QdrantStore::new(&service.cfg.storage.qdrant) + .expect("Failed to build Qdrant store."), + docs_qdrant: QdrantStore::new_with_collection( + &service.cfg.storage.qdrant, + &service.cfg.storage.qdrant.docs_collection, + ) + .expect("Failed to build docs Qdrant store."), + embedding: EmbeddingProviderConfig { + provider_id: "test".to_string(), + api_base, + api_key: "test-key".to_string(), + path: "/embeddings".to_string(), + model: "test".to_string(), + dimensions: 4_096, + timeout_ms: 1_000, + default_headers: Map::new(), + }, + chunking: ChunkingConfig { max_tokens: 64, overlap_tokens: 8 }, + tokenizer: build_test_tokenizer(), + }; + + tokio::spawn(async move { + let _ = worker::run_worker(worker_state).await; + }) +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn outbox_retries_to_done() { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping outbox_retries_to_done; set ELF_PG_DSN to run this test."); + + return; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping outbox_retries_to_done; set ELF_QDRANT_URL to run this test."); + + return; + }; + let request_count = Arc::new(AtomicUsize::new(0)); + let (api_base, shutdown) = start_embed_server(request_count.clone()).await; + let extractor = SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); + + let add_response = service + .add_note(AddNoteRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + scope: "agent_private".to_string(), + notes: vec![AddNoteInput { + r#type: "fact".to_string(), + key: Some("outbox_test".to_string()), + text: "Fact: Outbox should retry.".to_string(), + structured: None, + importance: 0.4, + confidence: 0.9, + ttl_days: None, + source_ref: serde_json::json!({}), + write_policy: None, + }], + }) + .await + .expect("Failed to add note."); + let note_id = add_response.results[0].note_id.expect("Expected note_id in add_note result."); + let handle = spawn_outbox_worker(&service, api_base).await; + let failed = wait_for_status(&service.db.pool, note_id, "FAILED", Duration::from_secs(15)) + .await + .expect("Expected FAILED outbox status."); + + assert_eq!(failed.attempts, 1); + assert!(failed.last_error.is_some()); + assert!(request_count.load(Ordering::SeqCst) >= 1); + + let now = OffsetDateTime::now_utc(); + + sqlx::query("UPDATE indexing_outbox SET available_at = $1 WHERE note_id = $2") + .bind(now) + .bind(note_id) + .execute(&service.db.pool) + .await + .expect("Failed to update available_at."); + + let done = wait_for_status(&service.db.pool, note_id, "DONE", Duration::from_secs(15)) + .await + .expect("Expected DONE outbox status."); + + assert!(done.attempts >= 1); + + let _ = shutdown.send(()); + + handle.abort(); + + let _ = handle.await; + + drop(service); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/rebuild_qdrant.rs b/packages/elf-service/tests/acceptance/rebuild_qdrant.rs index 752a8df9..d303797b 100644 --- a/packages/elf-service/tests/acceptance/rebuild_qdrant.rs +++ b/packages/elf-service/tests/acceptance/rebuild_qdrant.rs @@ -1,35 +1,161 @@ -// std use std::sync::{ Arc, atomic::{AtomicUsize, Ordering}, }; -// crates.io -use qdrant_client::qdrant::{ - CreateCollectionBuilder, Distance, Modifier, SparseVectorParamsBuilder, - SparseVectorsConfigBuilder, VectorParamsBuilder, VectorsConfigBuilder, -}; +use sqlx::PgPool; use time::OffsetDateTime; use uuid::Uuid; -// self -use super::{ - SpyEmbedding, SpyExtractor, StubRerank, build_service, test_config, test_db, test_qdrant_url, -}; +use crate::acceptance::{self, SpyEmbedding, SpyExtractor, StubRerank}; use elf_service::Providers; -use elf_storage::qdrant::{BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; + +fn build_zero_vector_text(dim: usize) -> String { + let mut buf = String::with_capacity(2 + (dim * 2)); + + buf.push('['); + + for i in 0..dim { + if i > 0 { + buf.push(','); + } + + buf.push('0'); + } + + buf.push(']'); + + buf +} + +async fn insert_note(pool: &PgPool, note_id: Uuid, now: OffsetDateTime, embedding_version: &str) { + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind("fact") + .bind(Option::<String>::None) + .bind("Fact: Rebuild works.") + .bind(0.5_f32) + .bind(0.9_f32) + .bind("active") + .bind(now) + .bind(now) + .bind(Option::<OffsetDateTime>::None) + .bind(embedding_version) + .bind(serde_json::json!({})) + .bind(0_i64) + .bind(Option::<OffsetDateTime>::None) + .execute(pool) + .await + .expect("Failed to insert memory note."); +} + +async fn insert_chunk(pool: &PgPool, chunk_id: Uuid, note_id: Uuid, embedding_version: &str) { + let text = "Fact: Rebuild works."; + + sqlx::query( + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(chunk_id) + .bind(note_id) + .bind(0_i32) + .bind(0_i32) + .bind(text.len() as i32) + .bind(text) + .bind(embedding_version) + .execute(pool) + .await + .expect("Failed to insert chunk metadata."); +} + +async fn insert_chunk_embedding( + pool: &PgPool, + chunk_id: Uuid, + embedding_version: &str, + vec_text: &str, +) { + sqlx::query( + "\ +INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(chunk_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text) + .execute(pool) + .await + .expect("Failed to insert chunk embedding."); +} #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn rebuild_uses_postgres_vectors_only() { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping rebuild_uses_postgres_vectors_only; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!( "Skipping rebuild_uses_postgres_vectors_only; set ELF_QDRANT_URL to run this test." ); + return; }; let embed_calls = Arc::new(AtomicUsize::new(0)); @@ -38,35 +164,30 @@ async fn rebuild_uses_postgres_vectors_only() { payload: serde_json::json!({ "notes": [] }), }; let providers = Providers::new( - Arc::new(SpyEmbedding { vector_dim: 3, calls: embed_calls.clone() }), + Arc::new(SpyEmbedding { vector_dim: 4_096, calls: embed_calls.clone() }), Arc::new(StubRerank), Arc::new(extractor), ); - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - super::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - - let _ = service.qdrant.client.delete_collection(service.qdrant.collection.clone()).await; - let mut vectors_config = VectorsConfigBuilder::default(); - vectors_config - .add_named_vector_params(DENSE_VECTOR_NAME, VectorParamsBuilder::new(3, Distance::Cosine)); - let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); - sparse_vectors_config.add_named_vector_params( - BM25_VECTOR_NAME, - SparseVectorParamsBuilder::default().modifier(Modifier::Idf as i32), + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, ); - service - .qdrant - .client - .create_collection( - CreateCollectionBuilder::new(service.qdrant.collection.clone()) - .vectors_config(vectors_config) - .sparse_vectors_config(sparse_vectors_config), - ) - .await - .expect("Failed to create Qdrant collection."); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); let note_id = Uuid::new_v4(); let now = OffsetDateTime::now_utc(); @@ -76,67 +197,24 @@ async fn rebuild_uses_postgres_vectors_only() { service.cfg.providers.embedding.model, service.cfg.storage.qdrant.vector_dim ); - - sqlx::query( - "INSERT INTO memory_notes \ - (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", - ) - .bind(note_id) - .bind("t") - .bind("p") - .bind("a") - .bind("agent_private") - .bind("fact") - .bind::<Option<String>>(None) - .bind("Fact: Rebuild works.") - .bind(0.5_f32) - .bind(0.9_f32) - .bind("active") - .bind(now) - .bind(now) - .bind::<Option<OffsetDateTime>>(None) - .bind(&embedding_version) - .bind(serde_json::json!({})) - .bind(0_i64) - .bind::<Option<OffsetDateTime>>(None) - .execute(&service.db.pool) - .await - .expect("Failed to insert memory note."); - let chunk_id = Uuid::new_v4(); - let text = "Fact: Rebuild works."; - sqlx::query( - "INSERT INTO memory_note_chunks \ - (chunk_id, note_id, chunk_index, start_offset, end_offset, text, embedding_version) \ - VALUES ($1,$2,$3,$4,$5,$6,$7)", - ) - .bind(chunk_id) - .bind(note_id) - .bind(0_i32) - .bind(0_i32) - .bind(text.len() as i32) - .bind(text) - .bind(&embedding_version) - .execute(&service.db.pool) - .await - .expect("Failed to insert chunk metadata."); + let vec_text = build_zero_vector_text(4_096); - sqlx::query( - "INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) \ - VALUES ($1,$2,$3,$4::vector)", + insert_note(&service.db.pool, note_id, now, embedding_version.as_str()).await; + insert_chunk(&service.db.pool, chunk_id, note_id, embedding_version.as_str()).await; + insert_chunk_embedding( + &service.db.pool, + chunk_id, + embedding_version.as_str(), + vec_text.as_str(), ) - .bind(chunk_id) - .bind(&embedding_version) - .bind(3_i32) - .bind("[0,0,0]") - .execute(&service.db.pool) - .await - .expect("Failed to insert chunk embedding."); + .await; let report = service.rebuild_qdrant().await.expect("Rebuild failed."); + assert_eq!(report.missing_vector_count, 0); assert!(report.rebuilt_count >= 1); assert_eq!(embed_calls.load(Ordering::SeqCst), 0); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/sot_vectors.rs b/packages/elf-service/tests/acceptance/sot_vectors.rs index a9760e58..f07f31c2 100644 --- a/packages/elf-service/tests/acceptance/sot_vectors.rs +++ b/packages/elf-service/tests/acceptance/sot_vectors.rs @@ -1,41 +1,157 @@ -// std use std::sync::{Arc, atomic::AtomicUsize}; -// crates.io +use sqlx::PgPool; use time::OffsetDateTime; use uuid::Uuid; -// self -use super::{ - SpyExtractor, StubEmbedding, StubRerank, build_service, reset_db, test_config, test_db, - test_qdrant_url, -}; +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; use elf_service::Providers; +fn build_zero_vector_text(dim: usize) -> String { + let mut buf = String::with_capacity(2 + (dim * 2)); + + buf.push('['); + + for i in 0..dim { + if i > 0 { + buf.push(','); + } + + buf.push('0'); + } + + buf.push(']'); + + buf +} + +async fn insert_note( + pool: &PgPool, + note_id: Uuid, + now: OffsetDateTime, + embedding_version: &str, + text: &str, +) { + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind("fact") + .bind(Option::<String>::None) + .bind(text) + .bind(0.4_f32) + .bind(0.9_f32) + .bind("active") + .bind(now) + .bind(now) + .bind(Option::<OffsetDateTime>::None) + .bind(embedding_version) + .bind(serde_json::json!({})) + .bind(0_i64) + .bind(Option::<OffsetDateTime>::None) + .execute(pool) + .await + .expect("Failed to insert memory note."); +} + +async fn insert_embedding(pool: &PgPool, note_id: Uuid, embedding_version: &str, vec_text: &str) { + sqlx::query( + "\ +INSERT INTO note_embeddings ( + note_id, + embedding_version, + embedding_dim, + vec +) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(note_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text) + .execute(pool) + .await + .expect("Failed to insert embedding."); +} + #[tokio::test] #[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] async fn active_notes_have_vectors() { - let Some(test_db) = test_db().await else { + let Some(test_db) = acceptance::test_db().await else { eprintln!("Skipping active_notes_have_vectors; set ELF_PG_DSN to run this test."); + return; }; - let Some(qdrant_url) = test_qdrant_url() else { + let Some(qdrant_url) = acceptance::test_qdrant_url() else { eprintln!("Skipping active_notes_have_vectors; set ELF_QDRANT_URL to run this test."); + return; }; - let collection = test_db.collection_name("elf_acceptance"); - let cfg = test_config(test_db.dsn().to_string(), qdrant_url, 3, collection); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 3 }), + Arc::new(StubEmbedding { vector_dim: 4_096 }), Arc::new(StubRerank), Arc::new(SpyExtractor { calls: Arc::new(AtomicUsize::new(0)), payload: serde_json::json!({ "notes": [] }), }), ); - let service = build_service(cfg, providers).await.expect("Failed to build service."); - reset_db(&service.db.pool).await.expect("Failed to reset test database."); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); let note_id = Uuid::new_v4(); let now = OffsetDateTime::now_utc(); @@ -45,65 +161,46 @@ async fn active_notes_have_vectors() { service.cfg.providers.embedding.model, service.cfg.storage.qdrant.vector_dim ); + let vec_text = build_zero_vector_text(4_096); - sqlx::query( - "INSERT INTO memory_notes \ - (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at) \ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", - ) - .bind(note_id) - .bind("t") - .bind("p") - .bind("a") - .bind("agent_private") - .bind("fact") - .bind::<Option<String>>(None) - .bind("Fact: Vector row exists.") - .bind(0.4_f32) - .bind(0.9_f32) - .bind("active") - .bind(now) - .bind(now) - .bind::<Option<OffsetDateTime>>(None) - .bind(&embedding_version) - .bind(serde_json::json!({})) - .bind(0_i64) - .bind::<Option<OffsetDateTime>>(None) - .execute(&service.db.pool) - .await - .expect("Failed to insert memory note."); - - sqlx::query( - "INSERT INTO note_embeddings (note_id, embedding_version, embedding_dim, vec) \ - VALUES ($1,$2,$3,$4::vector)", + insert_note( + &service.db.pool, + note_id, + now, + embedding_version.as_str(), + "Fact: Vector row exists.", ) - .bind(note_id) - .bind(&embedding_version) - .bind(3_i32) - .bind("[0,0,0]") - .execute(&service.db.pool) - .await - .expect("Failed to insert embedding."); + .await; + insert_embedding(&service.db.pool, note_id, embedding_version.as_str(), vec_text.as_str()) + .await; let missing: i64 = sqlx::query_scalar( - "SELECT COUNT(*) FROM memory_notes n \ - LEFT JOIN note_embeddings e ON n.note_id = e.note_id AND n.embedding_version = e.embedding_version \ - WHERE n.note_id = $1 AND e.note_id IS NULL", - ) + "\ +SELECT COUNT(*) AS \"missing!\" +FROM memory_notes n +LEFT JOIN note_embeddings e +ON n.note_id = e.note_id +AND n.embedding_version = e.embedding_version +WHERE n.note_id = $1 + AND e.note_id IS NULL", + ) .bind(note_id) .fetch_one(&service.db.pool) .await .expect("Failed to query missing embeddings."); + assert_eq!(missing, 0); let dim: i32 = sqlx::query_scalar( "SELECT embedding_dim FROM note_embeddings WHERE note_id = $1 AND embedding_version = $2", ) .bind(note_id) - .bind(&embedding_version) + .bind(embedding_version.as_str()) .fetch_one(&service.db.pool) .await .expect("Failed to query embedding dim."); - assert_eq!(dim, 3); + + assert_eq!(dim, 4_096); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-service/tests/acceptance/structured_field_retrieval.rs b/packages/elf-service/tests/acceptance/structured_field_retrieval.rs new file mode 100644 index 00000000..eb218cef --- /dev/null +++ b/packages/elf-service/tests/acceptance/structured_field_retrieval.rs @@ -0,0 +1,466 @@ +use std::{ + collections::HashMap, + sync::{Arc, atomic::AtomicUsize}, +}; + +use qdrant_client::{ + Payload, + qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, +}; +use serde_json::Value; +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding}; +use elf_config::ProviderConfig; +use elf_service::{BoxFuture, ElfService, Providers, RerankProvider, Result, SearchRequest}; +use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; +use elf_testkit::TestDatabase; + +struct TestContext { + service: ElfService, + test_db: TestDatabase, + embedding_version: String, +} + +struct UpsertPointArgs<'a> { + chunk_id: Uuid, + note_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + text: &'a str, + dense: Vec<f32>, +} + +struct KeywordRerank { + keyword: &'static str, +} +impl RerankProvider for KeywordRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + _query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, Result<Vec<f32>>> { + let keyword = self.keyword; + + Box::pin(async move { + Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) + }) + } +} + +fn vec_text_zeros() -> String { + let mut buf = String::with_capacity(2 + (4_096 * 2)); + + buf.push('['); + + for i in 0..4_096 { + if i > 0 { + buf.push(','); + } + + buf.push('0'); + } + + buf.push(']'); + + buf +} + +fn build_payload( + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, +) -> Payload { + let mut payload = Payload::new(); + + payload.insert("note_id", note_id.to_string()); + payload.insert("chunk_id", chunk_id.to_string()); + payload.insert("chunk_index", Value::from(chunk_index)); + payload.insert("start_offset", Value::from(start_offset)); + payload.insert("end_offset", Value::from(end_offset)); + payload.insert("tenant_id", "t"); + payload.insert("project_id", "p"); + payload.insert("agent_id", "a"); + payload.insert("scope", "agent_private"); + payload.insert("status", "active"); + + payload +} + +fn build_vectors(text: &str, dense: Vec<f32>) -> HashMap<String, Vector> { + let mut vectors = HashMap::new(); + + vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(dense)); + vectors.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(text.to_string(), BM25_MODEL)), + ); + + vectors +} + +async fn setup_context(test_name: &str) -> Option<TestContext> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(KeywordRerank { keyword: "ZEBRA" }), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); + + let embedding_version = format!( + "{}:{}:{}", + service.cfg.providers.embedding.provider_id, + service.cfg.providers.embedding.model, + service.cfg.storage.qdrant.vector_dim + ); + + Some(TestContext { service, test_db, embedding_version }) +} + +async fn insert_note<'e, E>(executor: E, note_id: Uuid, note_text: &str, embedding_version: &str) +where + E: PgExecutor<'e>, +{ + let now = OffsetDateTime::now_utc(); + + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind("fact") + .bind(Option::<String>::None) + .bind(note_text) + .bind(0.4_f32) + .bind(0.9_f32) + .bind("active") + .bind(now) + .bind(now) + .bind(Option::<OffsetDateTime>::None) + .bind(embedding_version) + .bind(serde_json::json!({})) + .bind(0_i64) + .bind(Option::<OffsetDateTime>::None) + .execute(executor) + .await + .expect("Failed to insert memory note."); +} + +#[allow(clippy::too_many_arguments)] +async fn insert_chunk<'e, E>( + executor: E, + chunk_id: Uuid, + note_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + text: &str, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(chunk_id) + .bind(note_id) + .bind(chunk_index) + .bind(start_offset) + .bind(end_offset) + .bind(text) + .bind(embedding_version) + .execute(executor) + .await + .expect("Failed to insert chunk metadata."); +} + +async fn insert_chunk_embedding<'e, E>(executor: E, chunk_id: Uuid, embedding_version: &str) +where + E: PgExecutor<'e>, +{ + let vec_text = vec_text_zeros(); + + sqlx::query( + "\ +INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(chunk_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text.as_str()) + .execute(executor) + .await + .expect("Failed to insert chunk embedding."); +} + +async fn insert_fact_field_row<'e, E>(executor: E, field_id: Uuid, note_id: Uuid, fact_text: &str) +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) +VALUES ($1, $2, $3, $4, $5)", + ) + .bind(field_id) + .bind(note_id) + .bind("fact") + .bind(0_i32) + .bind(fact_text) + .execute(executor) + .await + .expect("Failed to insert note field."); +} + +async fn insert_fact_field_embedding<'e, E>(executor: E, field_id: Uuid, embedding_version: &str) +where + E: PgExecutor<'e>, +{ + let vec_text = vec_text_zeros(); + + sqlx::query( + "\ +INSERT INTO note_field_embeddings (field_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(field_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text.as_str()) + .execute(executor) + .await + .expect("Failed to insert field embedding."); +} + +async fn upsert_point(service: &ElfService, args: UpsertPointArgs<'_>) { + let payload = build_payload( + args.note_id, + args.chunk_id, + args.chunk_index, + args.start_offset, + args.end_offset, + ); + let vectors = build_vectors(args.text, args.dense); + let point = PointStruct::new(args.chunk_id.to_string(), vectors, payload); + + service + .qdrant + .client + .upsert_points( + UpsertPointsBuilder::new(service.qdrant.collection.clone(), vec![point]).wait(true), + ) + .await + .expect("Failed to upsert Qdrant point."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn structured_fact_field_can_surface_note_and_marks_matched_fields() { + let Some(context) = + setup_context("structured_fact_field_can_surface_note_and_marks_matched_fields").await + else { + return; + }; + let query = "alpha unique"; + + for i in 0..20 { + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let text = format!("Confuser {i}: {query}."); + + insert_note(&context.service.db.pool, note_id, &text, &context.embedding_version).await; + insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + text.len() as i32, + &text, + &context.embedding_version, + ) + .await; + upsert_point( + &context.service, + UpsertPointArgs { + chunk_id, + note_id, + chunk_index: 0, + start_offset: 0, + end_offset: text.len() as i32, + text: &text, + dense: vec![0.0_f32; 4_096], + }, + ) + .await; + } + + let structured_note_id = Uuid::new_v4(); + let structured_chunk_id = Uuid::new_v4(); + let structured_chunk_text = "ZEBRA chunk text does not include the query."; + + insert_note( + &context.service.db.pool, + structured_note_id, + "This note is generic.", + &context.embedding_version, + ) + .await; + insert_chunk( + &context.service.db.pool, + structured_chunk_id, + structured_note_id, + 0, + 0, + structured_chunk_text.len() as i32, + structured_chunk_text, + &context.embedding_version, + ) + .await; + insert_chunk_embedding( + &context.service.db.pool, + structured_chunk_id, + &context.embedding_version, + ) + .await; + upsert_point( + &context.service, + UpsertPointArgs { + chunk_id: structured_chunk_id, + note_id: structured_note_id, + chunk_index: 0, + start_offset: 0, + end_offset: structured_chunk_text.len() as i32, + text: structured_chunk_text, + dense: vec![1.0_f32; 4_096], + }, + ) + .await; + + let field_id = Uuid::new_v4(); + + insert_fact_field_row(&context.service.db.pool, field_id, structured_note_id, query).await; + insert_fact_field_embedding(&context.service.db.pool, field_id, &context.embedding_version) + .await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: query.to_string(), + top_k: Some(1), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + + assert_eq!(item.note_id, structured_note_id); + assert!( + item.explain.r#match.matched_fields.iter().any(|field| field == "facts"), + "Expected matched_fields to include facts; got {:?}", + item.explain.r#match.matched_fields + ); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/suite.rs b/packages/elf-service/tests/acceptance/suite.rs new file mode 100644 index 00000000..7db8daac --- /dev/null +++ b/packages/elf-service/tests/acceptance/suite.rs @@ -0,0 +1,508 @@ +mod add_note_no_llm; +mod chunk_search; +mod chunking; +mod consolidation; +mod docs_extension_v1; +mod english_only_boundary; +mod evidence_binding; +mod graph_ingestion; +mod idempotency; +mod knowledge_pages; +mod memory_history; +mod outbox_eventual_consistency; +mod rebuild_qdrant; +mod sot_vectors; +mod structured_field_retrieval; +mod trace_admin_observability; + +use std::{ + env, fs, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, + time::Duration, +}; + +use ahash::AHashMap; +use qdrant_client::{ + QdrantError, + qdrant::{ + CreateCollectionBuilder, Distance, Modifier, SparseVectorParamsBuilder, + SparseVectorsConfigBuilder, VectorParamsBuilder, VectorsConfigBuilder, + }, +}; +use serde_json::{Map, Value}; +use sqlx::PgExecutor; +use tokenizers::{Tokenizer, models::wordlevel::WordLevel}; +use tokio::time; + +use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, MemoryPolicy, + Postgres, ProviderConfig, Ranking, RankingBlend, RankingBlendSegment, RankingDeterministic, + RankingDeterministicDecay, RankingDeterministicHits, RankingDeterministicLexical, + RankingDiversity, RankingRetrievalSources, ReadProfiles, ScopePrecedence, ScopeWriteAllowed, + Scopes, Search, SearchCache, SearchDynamic, SearchExpansion, SearchExplain, SearchGraphContext, + SearchPrefilter, SearchRecursive, Security, Service, Storage, TtlDays, +}; +use elf_service::{ + BoxFuture, ElfService, EmbeddingProvider, ExtractorProvider, RerankProvider, Result, +}; +use elf_storage::{ + db::Db, + qdrant::{BM25_VECTOR_NAME, DENSE_VECTOR_NAME, QdrantStore}, +}; +use elf_testkit::TestDatabase; + +type AcceptanceResult<T> = Result<T, AcceptanceFailure>; + +#[derive(Debug, thiserror::Error)] +enum AcceptanceFailure { + #[error(transparent)] + Storage(#[from] elf_storage::Error), + #[error(transparent)] + Sqlx(#[from] sqlx::Error), + #[error(transparent)] + Qdrant(#[from] QdrantError), + #[error("{0}")] + Message(String), +} + +pub struct StubEmbedding { + pub vector_dim: u32, +} +impl EmbeddingProvider for StubEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { + let dim = self.vector_dim as usize; + let vectors = texts.iter().map(|_| vec![0.0; dim]).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +pub struct SpyEmbedding { + pub vector_dim: u32, + pub calls: Arc<AtomicUsize>, +} +impl EmbeddingProvider for SpyEmbedding { + fn embed<'a>( + &'a self, + _cfg: &'a EmbeddingProviderConfig, + texts: &'a [String], + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { + self.calls.fetch_add(1, Ordering::SeqCst); + + let dim = self.vector_dim as usize; + let vectors = texts.iter().map(|_| vec![0.0; dim]).collect(); + + Box::pin(async move { Ok(vectors) }) + } +} + +pub struct StubRerank; +impl RerankProvider for StubRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + _query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, Result<Vec<f32>>> { + let scores = vec![0.5; docs.len()]; + + Box::pin(async move { Ok(scores) }) + } +} + +pub struct SpyExtractor { + pub calls: Arc<AtomicUsize>, + pub payload: Value, +} +impl ExtractorProvider for SpyExtractor { + fn extract<'a>( + &'a self, + _cfg: &'a LlmProviderConfig, + _messages: &'a [Value], + ) -> BoxFuture<'a, Result<Value>> { + let payload = self.payload.clone(); + + self.calls.fetch_add(1, Ordering::SeqCst); + + Box::pin(async move { Ok(payload) }) + } +} + +pub fn test_qdrant_url() -> Option<String> { + env::var("ELF_QDRANT_GRPC_URL").ok().or_else(|| env::var("ELF_QDRANT_URL").ok()) +} + +pub fn test_config( + dsn: String, + qdrant_url: String, + vector_dim: u32, + collection: String, + docs_collection: String, +) -> Config { + let mut embedding = dummy_embedding_provider(); + + embedding.dimensions = vector_dim; + + Config { + service: Service { + http_bind: "127.0.0.1:0".to_string(), + mcp_bind: "127.0.0.1:0".to_string(), + admin_bind: "127.0.0.1:0".to_string(), + log_level: "info".to_string(), + }, + storage: Storage { + postgres: Postgres { dsn, pool_max_conns: 2 }, + qdrant: elf_config::Qdrant { + url: qdrant_url, + collection: collection.clone(), + docs_collection, + vector_dim, + }, + }, + providers: elf_config::Providers { + embedding, + rerank: dummy_provider(), + llm_extractor: dummy_llm_provider(), + }, + scopes: Scopes { + allowed: vec![ + "agent_private".to_string(), + "project_shared".to_string(), + "org_shared".to_string(), + ], + read_profiles: ReadProfiles { + private_only: vec!["agent_private".to_string()], + private_plus_project: vec![ + "agent_private".to_string(), + "project_shared".to_string(), + ], + all_scopes: vec![ + "agent_private".to_string(), + "project_shared".to_string(), + "org_shared".to_string(), + ], + }, + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { + agent_private: true, + project_shared: true, + org_shared: true, + }, + }, + memory: Memory { + max_notes_per_add_event: 3, + max_note_chars: 240, + dup_sim_threshold: 0.92, + update_sim_threshold: 0.85, + candidate_k: 60, + top_k: 12, + policy: MemoryPolicy { rules: vec![] }, + }, + search: test_search(), + ranking: test_ranking(), + lifecycle: Lifecycle { + ttl_days: TtlDays { + plan: 14, + fact: 180, + preference: 0, + constraint: 0, + decision: 0, + profile: 0, + }, + purge_deleted_after_days: 30, + purge_deprecated_after_days: 180, + }, + chunking: Chunking { + enabled: true, + max_tokens: 512, + overlap_tokens: 128, + tokenizer_repo: test_tokenizer_repo(&collection), + }, + security: Security { + bind_localhost_only: true, + reject_non_english: true, + redact_secrets_on_write: true, + evidence_min_quotes: 1, + evidence_max_quotes: 2, + evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], + }, + context: None, + mcp: None, + } +} + +pub fn dummy_embedding_provider() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { + provider_id: "test".to_string(), + api_base: "http://127.0.0.1:1".to_string(), + api_key: "test-key".to_string(), + path: "/".to_string(), + model: "test".to_string(), + dimensions: 4_096, + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +pub fn dummy_provider() -> ProviderConfig { + ProviderConfig { + provider_id: "test".to_string(), + api_base: "http://127.0.0.1:1".to_string(), + api_key: "test-key".to_string(), + path: "/".to_string(), + model: "test".to_string(), + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +pub fn dummy_llm_provider() -> LlmProviderConfig { + LlmProviderConfig { + provider_id: "test".to_string(), + api_base: "http://127.0.0.1:1".to_string(), + api_key: "test-key".to_string(), + path: "/".to_string(), + model: "test".to_string(), + temperature: 0.1, + timeout_ms: 1_000, + default_headers: Map::new(), + } +} + +pub async fn test_db() -> Option<TestDatabase> { + let base_dsn = elf_testkit::env_dsn()?; + let db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + + Some(db) +} + +fn test_tokenizer_repo(collection: &str) -> String { + let tokenizer_path = env::temp_dir().join(format!("{collection}-tokenizer.json")); + + if tokenizer_path.exists() { + return tokenizer_path.to_string_lossy().into_owned(); + } + + let mut vocab = AHashMap::new(); + + vocab.insert("<unk>".to_string(), 0_u32); + + let model = WordLevel::builder() + .vocab(vocab) + .unk_token("<unk>".to_string()) + .build() + .expect("Failed to build acceptance tokenizer."); + let tokenizer = Tokenizer::new(model); + let parent = tokenizer_path.parent().expect("Temporary tokenizer path has a parent directory."); + + fs::create_dir_all(parent).expect("Failed to create acceptance tokenizer directory."); + + tokenizer.save(&tokenizer_path, false).expect("Failed to save acceptance tokenizer."); + + tokenizer_path.to_string_lossy().into_owned() +} + +fn test_search() -> Search { + Search { + expansion: SearchExpansion { + mode: "off".to_string(), + max_queries: 4, + include_original: true, + }, + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { + enabled: true, + expansion_ttl_days: 7, + rerank_ttl_days: 7, + max_payload_bytes: Some(262_144), + }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, + } +} + +fn test_ranking() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + }, + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } +} + +async fn reset_qdrant_collection( + client: &qdrant_client::Qdrant, + collection: &str, + vector_dim: u32, +) -> AcceptanceResult<()> { + let max_attempts = 8; + let mut backoff = Duration::from_millis(100); + let mut last_err = None; + + for attempt in 1..=max_attempts { + let _ = client.delete_collection(collection.to_string()).await; + let mut vectors_config = VectorsConfigBuilder::default(); + + vectors_config.add_named_vector_params( + DENSE_VECTOR_NAME, + VectorParamsBuilder::new(vector_dim.into(), Distance::Cosine), + ); + + let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); + + sparse_vectors_config.add_named_vector_params( + BM25_VECTOR_NAME, + SparseVectorParamsBuilder::default().modifier(Modifier::Idf as i32), + ); + + let builder = CreateCollectionBuilder::new(collection.to_string()) + .vectors_config(vectors_config) + .sparse_vectors_config(sparse_vectors_config); + + match client.create_collection(builder).await { + Ok(_) => return Ok(()), + Err(err) => { + last_err = Some(err); + + if attempt == max_attempts { + break; + } + + time::sleep(backoff).await; + + backoff = backoff.saturating_mul(2).min(Duration::from_secs(2)); + }, + } + } + + Err(AcceptanceFailure::Message(format!( + "Failed to create Qdrant collection {collection:?} after {max_attempts} attempts: {last_err:?}." + ))) +} + +async fn build_service( + cfg: Config, + providers: elf_service::Providers, +) -> AcceptanceResult<ElfService> { + let db = Db::connect(&cfg.storage.postgres).await?; + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await?; + + let qdrant = QdrantStore::new(&cfg.storage.qdrant)?; + + Ok(ElfService::with_providers(cfg, db, qdrant, providers)) +} + +async fn reset_db<'e, E>(executor: E) -> AcceptanceResult<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +TRUNCATE + graph_entities, + graph_entity_aliases, + graph_predicates, + graph_predicate_aliases, + graph_facts, + graph_fact_evidence, + graph_fact_supersessions, + memory_hits, + memory_ingest_decisions, + memory_note_versions, + memory_space_grants, + note_field_embeddings, + memory_note_fields, + note_chunk_embeddings, + memory_note_chunks, + note_embeddings, + search_trace_items, + search_trace_stage_items, + search_trace_stages, + search_traces, + search_trace_outbox, + search_sessions, + search_trace_candidates, + indexing_outbox, + doc_indexing_outbox, + doc_chunk_embeddings, + doc_chunks, + doc_documents, + knowledge_page_lint_findings, + knowledge_page_source_refs, + knowledge_page_sections, + knowledge_pages, + consolidation_run_jobs, + consolidation_proposal_reviews, + consolidation_proposals, + consolidation_runs, + memory_notes", + ) + .execute(executor) + .await?; + + Ok(()) +} diff --git a/packages/elf-service/tests/acceptance/trace_admin_observability.rs b/packages/elf-service/tests/acceptance/trace_admin_observability.rs new file mode 100644 index 00000000..30453fe9 --- /dev/null +++ b/packages/elf-service/tests/acceptance/trace_admin_observability.rs @@ -0,0 +1,553 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use serde_json::Value; +use sqlx::PgPool; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_service::{ + ElfService, Providers, SearchExplainRequest, TraceBundleGetRequest, TraceGetRequest, + TraceRecentListRequest, TraceRecentListResponse, TraceTrajectoryGetRequest, + search::{TraceBundleMode, TraceReplayCandidate}, +}; +use elf_testkit::TestDatabase; + +const TENANT_ID: &str = "tenant_admin_scope"; +const PROJECT_ID: &str = "project_admin_scope"; +const TRACE_VERSION: i32 = 3; + +struct TraceAdminObservabilityFixture { + service: ElfService, + test_db: TestDatabase, +} + +struct VisibilityTraceFixtureIds { + trace_one: Uuid, + trace_two: Uuid, + trace_three: Uuid, + item_two: Uuid, +} + +async fn setup_service(test_name: &str) -> Option<TraceAdminObservabilityFixture> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let extractor = SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(extractor), + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + Some(TraceAdminObservabilityFixture { service, test_db }) +} + +async fn insert_trace( + executor: &PgPool, + trace_id: Uuid, + agent_id: &str, + read_profile: &str, + query: &str, + created_at: OffsetDateTime, +) { + sqlx::query( + "\ +INSERT INTO search_traces ( + trace_id, + tenant_id, + project_id, + agent_id, + read_profile, + query, + expansion_mode, + expanded_queries, + allowed_scopes, + candidate_count, + top_k, + config_snapshot, + trace_version, + created_at, + expires_at +) + VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15 + )", + ) + .bind(trace_id) + .bind(TENANT_ID) + .bind(PROJECT_ID) + .bind(agent_id) + .bind(read_profile) + .bind(query) + .bind("full") + .bind(serde_json::json!([query])) + .bind(serde_json::json!(["agent_private", "project_shared", "org_shared"])) + .bind(10_i32) + .bind(5_i32) + .bind(serde_json::json!({ "test": true })) + .bind(TRACE_VERSION) + .bind(created_at) + .bind(created_at + Duration::minutes(60)) + .execute(executor) + .await + .expect("Failed to insert trace."); +} + +async fn insert_trace_item( + executor: &PgPool, + item_id: Uuid, + trace_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + rank: i32, +) { + sqlx::query( + "\ +INSERT INTO search_trace_items ( + item_id, + trace_id, + note_id, + chunk_id, + rank, + final_score, + explain +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(item_id) + .bind(trace_id) + .bind(note_id) + .bind(chunk_id) + .bind(rank) + .bind(1.0_f32) + .bind(serde_json::json!({ + "match": { "matched_terms": [], "matched_fields": [] }, + "ranking": { + "schema": "search_ranking_explain/v2", + "policy_id": "ranking_v2:test", + "final_score": 1.0, + "terms": [] + } + })) + .execute(executor) + .await + .expect("Failed to insert trace item."); +} + +async fn insert_trace_stage( + executor: &PgPool, + stage_id: Uuid, + trace_id: Uuid, + stage_order: i32, + stage_name: &str, + created_at: OffsetDateTime, +) { + sqlx::query( + "\ +INSERT INTO search_trace_stages ( + stage_id, + trace_id, + stage_order, + stage_name, + stage_payload, + created_at +) +VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(stage_id) + .bind(trace_id) + .bind(stage_order) + .bind(stage_name) + .bind(serde_json::json!({ + "stage_name": stage_name, + "metrics": { "items": 0 } + })) + .bind(created_at) + .execute(executor) + .await + .expect("Failed to insert trace stage."); +} + +async fn insert_trace_stage_item( + executor: &PgPool, + item_id: Uuid, + stage_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + metrics: Value, +) { + sqlx::query( + "\ +INSERT INTO search_trace_stage_items ( + id, + stage_id, + item_id, + note_id, + chunk_id, + metrics +) +VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(Uuid::new_v4()) + .bind(stage_id) + .bind(item_id) + .bind(note_id) + .bind(chunk_id) + .bind(metrics) + .execute(executor) + .await + .expect("Failed to insert trace stage item."); +} + +#[allow(clippy::too_many_arguments)] +async fn insert_trace_candidate( + executor: &PgPool, + candidate_id: Uuid, + trace_id: Uuid, + note_id: Uuid, + chunk_id: Uuid, + rank: i32, + retrieval_rank: i32, + retrieval_score: f32, + created_at: OffsetDateTime, +) { + sqlx::query( + "\ +INSERT INTO search_trace_candidates ( + candidate_id, + trace_id, + note_id, + chunk_id, + chunk_index, + snippet, + candidate_snapshot, + retrieval_rank, + rerank_score, + note_scope, + note_importance, + note_updated_at, + note_hit_count, + note_last_hit_at, + created_at, + expires_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)", + ) + .bind(candidate_id) + .bind(trace_id) + .bind(note_id) + .bind(chunk_id) + .bind(rank) + .bind("trace candidate snippet") + .bind({ + let candidate_snapshot = TraceReplayCandidate { + note_id, + chunk_id, + chunk_index: rank, + snippet: "trace candidate snippet".to_string(), + retrieval_rank: retrieval_rank as u32, + retrieval_score: Some(retrieval_score), + rerank_score: retrieval_score, + note_scope: "agent_private".to_string(), + note_importance: 0.6, + note_updated_at: created_at, + note_hit_count: 12, + note_last_hit_at: None, + diversity_selected: None, + diversity_selected_rank: None, + diversity_selected_reason: None, + diversity_skipped_reason: None, + diversity_nearest_selected_note_id: None, + diversity_similarity: None, + diversity_mmr_score: None, + diversity_missing_embedding: None, + }; + + serde_json::to_value(candidate_snapshot) + .expect("Failed to serialize trace replay candidate.") + }) + .bind(retrieval_rank) + .bind(retrieval_score) + .bind("agent_private") + .bind(0.6_f32) + .bind(created_at) + .bind(12_i64) + .bind(Option::<OffsetDateTime>::None) + .bind(created_at) + .bind(created_at + Duration::minutes(90)) + .execute(executor) + .await + .expect("Failed to insert trace candidate."); +} + +async fn seed_visibility_and_recent_list_traces( + service: &ElfService, + now: OffsetDateTime, +) -> VisibilityTraceFixtureIds { + let trace_one = Uuid::new_v4(); + let trace_two = Uuid::new_v4(); + let trace_three = Uuid::new_v4(); + let item_one = Uuid::new_v4(); + let item_two = Uuid::new_v4(); + let item_three = Uuid::new_v4(); + let note_one = Uuid::new_v4(); + let note_two = Uuid::new_v4(); + let note_three = Uuid::new_v4(); + let chunk_one = Uuid::new_v4(); + let chunk_two = Uuid::new_v4(); + let chunk_three = Uuid::new_v4(); + + insert_trace(&service.db.pool, trace_one, "agent_one", "private_only", "one", now).await; + insert_trace( + &service.db.pool, + trace_two, + "agent_two", + "private_only", + "two", + now - Duration::seconds(10), + ) + .await; + insert_trace( + &service.db.pool, + trace_three, + "agent_three", + "private_only", + "three", + now - Duration::seconds(20), + ) + .await; + insert_trace_item(&service.db.pool, item_one, trace_one, note_one, chunk_one, 1).await; + insert_trace_item(&service.db.pool, item_two, trace_two, note_two, chunk_two, 1).await; + insert_trace_item(&service.db.pool, item_three, trace_three, note_three, chunk_three, 1).await; + + VisibilityTraceFixtureIds { trace_one, trace_two, trace_three, item_two } +} + +async fn trace_recent_list_page( + service: &ElfService, + cursor_created_at: Option<OffsetDateTime>, + cursor_trace_id: Option<Uuid>, +) -> TraceRecentListResponse { + service + .trace_recent_list(TraceRecentListRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "admin_agent".to_string(), + limit: Some(2), + cursor_created_at, + cursor_trace_id, + agent_id_filter: None, + read_profile: None, + created_after: None, + created_before: None, + }) + .await + .expect("Failed to list recent traces.") +} + +async fn assert_trace_admin_visibility_cross_scope( + service: &ElfService, + trace_id: Uuid, + item_id: Uuid, +) { + let cross_agent_trace_get = service + .trace_get(TraceGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "different_agent".to_string(), + trace_id, + }) + .await + .expect("Expected cross-agent trace lookup to bypass agent ownership filtering."); + + assert_eq!(cross_agent_trace_get.trace.trace_id, trace_id); + assert_eq!(cross_agent_trace_get.trace.agent_id, "agent_two"); + + let cross_agent_trajectory = service + .trace_trajectory_get(TraceTrajectoryGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "different_agent".to_string(), + trace_id, + }) + .await + .expect("Expected cross-agent trajectory lookup to bypass agent ownership filtering."); + + assert_eq!(cross_agent_trajectory.trace.trace_id, trace_id); + + let cross_agent_item = service + .search_explain(SearchExplainRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "different_agent".to_string(), + result_handle: item_id, + }) + .await + .expect("Expected cross-agent trace-item lookup to bypass agent ownership filtering."); + + assert_eq!(cross_agent_item.item.result_handle, item_id); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn trace_admin_visibility_and_recent_list_cursor() { + let Some(fixture) = setup_service("trace_admin_visibility_and_recent_list_cursor").await else { + return; + }; + let TraceAdminObservabilityFixture { service, test_db } = fixture; + let now = OffsetDateTime::now_utc(); + let VisibilityTraceFixtureIds { trace_one, trace_two, trace_three, item_two } = + seed_visibility_and_recent_list_traces(&service, now).await; + let first = trace_recent_list_page(&service, None, None).await; + + assert_eq!(first.schema, "elf.recent_traces/v1"); + assert_eq!(first.traces.len(), 2); + assert_eq!(first.traces[0].trace_id, trace_one); + assert_eq!(first.traces[1].trace_id, trace_two); + assert!(first.traces[0].created_at > first.traces[1].created_at); + + let Some(cursor) = first.next_cursor else { + panic!("Expected next_cursor to exist for second page."); + }; + let second = + trace_recent_list_page(&service, Some(cursor.created_at), Some(cursor.trace_id)).await; + + assert_eq!(second.traces.len(), 1); + assert_eq!(second.traces[0].trace_id, trace_three); + assert!(second.next_cursor.is_none()); + + assert_trace_admin_visibility_cross_scope(&service, trace_two, item_two).await; + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn trace_bundle_truncation_and_candidate_limits() { + let Some(fixture) = setup_service("trace_bundle_truncation_and_candidate_limits").await else { + return; + }; + let TraceAdminObservabilityFixture { service, test_db } = fixture; + let now = OffsetDateTime::now_utc(); + let trace_id = Uuid::new_v4(); + let stage_id = Uuid::new_v4(); + + insert_trace(&service.db.pool, trace_id, "agent_one", "private_only", "bundle", now).await; + insert_trace_stage(&service.db.pool, stage_id, trace_id, 0, "selection.final", now).await; + + for index in 0..3 { + let item_id = Uuid::new_v4(); + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + + insert_trace_item(&service.db.pool, item_id, trace_id, note_id, chunk_id, index + 1).await; + insert_trace_stage_item( + &service.db.pool, + item_id, + stage_id, + note_id, + chunk_id, + serde_json::json!({ "candidate_index": index }), + ) + .await; + } + for (idx, rank) in [(2_i32, 2_i32), (1_i32, 1_i32), (3_i32, 3_i32)] { + insert_trace_candidate( + &service.db.pool, + Uuid::new_v4(), + trace_id, + Uuid::new_v4(), + Uuid::new_v4(), + idx, + rank, + 0.9_f32 - (idx as f32 * 0.1), + now, + ) + .await; + } + + let bounded = service + .trace_bundle_get(TraceBundleGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "admin_agent".to_string(), + trace_id, + mode: TraceBundleMode::Bounded, + stage_items_limit: Some(1), + candidates_limit: None, + }) + .await + .expect("Failed to fetch bounded bundle."); + + assert_eq!(bounded.schema, "elf.trace_bundle/v1"); + assert_eq!(bounded.stages.len(), 1); + assert_eq!(bounded.stages[0].items.len(), 1); + assert!(bounded.candidates.is_none()); + + let full = service + .trace_bundle_get(TraceBundleGetRequest { + tenant_id: TENANT_ID.to_string(), + project_id: PROJECT_ID.to_string(), + agent_id: "admin_agent".to_string(), + trace_id, + mode: TraceBundleMode::Full, + stage_items_limit: Some(1), + candidates_limit: Some(2), + }) + .await + .expect("Failed to fetch full bundle."); + + assert_eq!(full.stages[0].items.len(), 1); + assert!(full.candidates.as_ref().is_some_and(|candidates| candidates.len() == 2)); + + let candidates = full.candidates.unwrap(); + + assert_eq!(candidates[0].retrieval_rank, 1); + assert_eq!(candidates[1].retrieval_rank, 2); + assert!( + candidates[0].retrieval_score.is_some_and(|score| (score - 0.8_f32).abs() < 1e-6), + "Unexpected retrieval_score: {:?}", + candidates[0].retrieval_score + ); + assert!(candidates[0].rerank_score >= candidates[1].rerank_score); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/qdrant_init.rs b/packages/elf-service/tests/qdrant_init.rs new file mode 100644 index 00000000..f7e3af1c --- /dev/null +++ b/packages/elf-service/tests/qdrant_init.rs @@ -0,0 +1,38 @@ +#![allow(unused_crate_dependencies)] + +//! Regression tests for Qdrant init-script payload indexes. + +use std::{fs, path::PathBuf}; + +#[test] +fn qdrant_init_script_creates_docs_payload_indexes() { + let script_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".."); + let script_path = script_path.join("..").join("qdrant").join("init.sh"); + let script = fs::read_to_string(&script_path) + .unwrap_or_else(|err| panic!("Failed to read {}: {err}", script_path.display())); + let script = script.chars().filter(|ch| !ch.is_whitespace()).collect::<String>(); + + for (field, field_schema) in [ + ("scope", "keyword"), + ("status", "keyword"), + ("doc_type", "keyword"), + ("agent_id", "keyword"), + ("updated_at", "datetime"), + ("doc_ts", "datetime"), + ("thread_id", "keyword"), + ("domain", "keyword"), + ("repo", "keyword"), + ] { + let needle = format!("\"field_name\":\"{field}\",\"field_schema\":\"{field_schema}\""); + + assert!( + script.contains(&needle), + "Missing payload index for docs field {field} with schema {field_schema} in qdrant/init.sh" + ); + } + + assert!( + script.contains("\"${collection}\"==\"${ELF_QDRANT_DOCS_COLLECTION"), + "Docs payload indexing is not gated to ELF_QDRANT_DOCS_COLLECTION." + ); +} diff --git a/packages/elf-service/tests/service.rs b/packages/elf-service/tests/service.rs index ac83d32c..7443e882 100644 --- a/packages/elf-service/tests/service.rs +++ b/packages/elf-service/tests/service.rs @@ -1,45 +1,54 @@ -// std +#![allow(unused_crate_dependencies)] + +//! Integration tests for service-layer note ingestion and policy behavior. + use std::sync::{ Arc, atomic::{AtomicUsize, Ordering}, }; -// crates.io use serde_json::{Map, Value}; use sqlx::PgPool; -// self -use elf_config::{Config, EmbeddingProviderConfig, LlmProviderConfig, ProviderConfig}; +use elf_config::{ + Chunking, Config, EmbeddingProviderConfig, Lifecycle, LlmProviderConfig, Memory, MemoryPolicy, + Postgres, ProviderConfig, Qdrant, Ranking, RankingBlend, RankingBlendSegment, + RankingDeterministic, RankingDeterministicDecay, RankingDeterministicHits, + RankingDeterministicLexical, RankingDiversity, RankingRetrievalSources, ReadProfiles, + ScopePrecedence, ScopeWriteAllowed, Scopes, Search, SearchCache, SearchDynamic, + SearchExpansion, SearchExplain, SearchGraphContext, SearchPrefilter, SearchRecursive, Security, + Service, Storage, TtlDays, +}; use elf_service::{ - AddNoteInput, AddNoteRequest, ElfService, EmbeddingProvider, ExtractorProvider, Providers, - RerankProvider, ServiceError, + AddNoteInput, AddNoteRequest, BoxFuture, ElfService, EmbeddingProvider, Error, + ExtractorProvider, RerankProvider, Result, }; use elf_storage::{db::Db, qdrant::QdrantStore}; struct DummyEmbedding; - impl EmbeddingProvider for DummyEmbedding { fn embed<'a>( &'a self, cfg: &'a EmbeddingProviderConfig, texts: &'a [String], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Vec<Vec<f32>>>> { + ) -> BoxFuture<'a, Result<Vec<Vec<f32>>>> { let dim = (cfg.dimensions as usize).max(1); let vec = vec![0.0; dim]; + Box::pin(async move { Ok(vec![vec; texts.len()]) }) } } struct DummyRerank; - impl RerankProvider for DummyRerank { fn rerank<'a>( &'a self, _cfg: &'a ProviderConfig, _query: &'a str, docs: &'a [String], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Vec<f32>>> { + ) -> BoxFuture<'a, Result<Vec<f32>>> { let scores = vec![0.0; docs.len()]; + Box::pin(async move { Ok(scores) }) } } @@ -47,7 +56,6 @@ impl RerankProvider for DummyRerank { struct SpyExtractor { calls: Arc<AtomicUsize>, } - impl SpyExtractor { fn new() -> Self { Self { calls: Arc::new(AtomicUsize::new(0)) } @@ -57,35 +65,82 @@ impl SpyExtractor { self.calls.load(Ordering::SeqCst) } } - impl ExtractorProvider for SpyExtractor { fn extract<'a>( &'a self, _cfg: &'a LlmProviderConfig, _messages: &'a [Value], - ) -> elf_service::BoxFuture<'a, color_eyre::Result<Value>> { + ) -> BoxFuture<'a, Result<Value>> { self.calls.fetch_add(1, Ordering::SeqCst); + Box::pin(async move { Ok(serde_json::json!({ "notes": [] })) }) } } +fn test_ranking() -> Ranking { + Ranking { + recency_tau_days: 60.0, + tie_breaker_weight: 0.1, + deterministic: RankingDeterministic { + enabled: false, + lexical: RankingDeterministicLexical { + enabled: false, + weight: 0.05, + min_ratio: 0.3, + max_query_terms: 16, + max_text_terms: 1_024, + }, + hits: RankingDeterministicHits { + enabled: false, + weight: 0.05, + half_saturation: 8.0, + last_hit_tau_days: 14.0, + }, + decay: RankingDeterministicDecay { enabled: false, weight: 0.05, tau_days: 30.0 }, + }, + blend: RankingBlend { + enabled: true, + rerank_normalization: "rank".to_string(), + retrieval_normalization: "rank".to_string(), + segments: vec![ + RankingBlendSegment { max_retrieval_rank: 3, retrieval_weight: 0.8 }, + RankingBlendSegment { max_retrieval_rank: 10, retrieval_weight: 0.5 }, + RankingBlendSegment { max_retrieval_rank: 1_000_000, retrieval_weight: 0.2 }, + ], + }, + diversity: RankingDiversity { + enabled: true, + sim_threshold: 0.88, + mmr_lambda: 0.7, + max_skips: 64, + }, + retrieval_sources: RankingRetrievalSources { + fusion_weight: 1.0, + structured_field_weight: 1.0, + fusion_priority: 1, + structured_field_priority: 0, + }, + } +} + fn test_config() -> Config { Config { - service: elf_config::Service { + service: Service { http_bind: "127.0.0.1:8080".to_string(), mcp_bind: "127.0.0.1:8082".to_string(), admin_bind: "127.0.0.1:8081".to_string(), log_level: "info".to_string(), }, - storage: elf_config::Storage { - postgres: elf_config::Postgres { + storage: Storage { + postgres: Postgres { dsn: "postgres://user:pass@localhost/db".to_string(), pool_max_conns: 1, }, - qdrant: elf_config::Qdrant { + qdrant: Qdrant { url: "http://localhost:6334".to_string(), - collection: "mem_notes_v1".to_string(), - vector_dim: 3, + collection: "mem_notes_v2".to_string(), + docs_collection: "doc_chunks_v1".to_string(), + vector_dim: 4_096, }, }, providers: elf_config::Providers { @@ -93,53 +148,65 @@ fn test_config() -> Config { rerank: dummy_provider(), llm_extractor: dummy_llm_provider(), }, - scopes: elf_config::Scopes { + scopes: Scopes { allowed: vec!["agent_private".to_string()], - read_profiles: elf_config::ReadProfiles { + read_profiles: ReadProfiles { private_only: vec!["agent_private".to_string()], private_plus_project: vec!["agent_private".to_string()], all_scopes: vec!["agent_private".to_string()], }, - precedence: elf_config::ScopePrecedence { - agent_private: 30, - project_shared: 20, - org_shared: 10, - }, - write_allowed: elf_config::ScopeWriteAllowed { + precedence: ScopePrecedence { agent_private: 30, project_shared: 20, org_shared: 10 }, + write_allowed: ScopeWriteAllowed { agent_private: true, project_shared: false, org_shared: false, }, }, - memory: elf_config::Memory { + memory: Memory { max_notes_per_add_event: 3, max_note_chars: 500, dup_sim_threshold: 0.9, update_sim_threshold: 0.8, candidate_k: 10, top_k: 5, + policy: MemoryPolicy { rules: vec![] }, }, - search: elf_config::Search { - expansion: elf_config::SearchExpansion { + search: Search { + expansion: SearchExpansion { mode: "off".to_string(), max_queries: 4, include_original: true, }, - dynamic: elf_config::SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, - prefilter: elf_config::SearchPrefilter { max_candidates: 0 }, - cache: elf_config::SearchCache { + dynamic: SearchDynamic { min_candidates: 10, min_top_score: 0.12 }, + prefilter: SearchPrefilter { max_candidates: 0 }, + cache: SearchCache { enabled: true, expansion_ttl_days: 7, rerank_ttl_days: 7, max_payload_bytes: Some(262_144), - expansion_version: "v1".to_string(), - rerank_version: "v1".to_string(), }, - explain: elf_config::SearchExplain { retention_days: 7 }, + explain: SearchExplain { + retention_days: 7, + capture_candidates: false, + candidate_retention_days: 2, + write_mode: "outbox".to_string(), + }, + recursive: SearchRecursive { + enabled: false, + max_depth: 2, + max_children_per_node: 4, + max_nodes_per_scope: 32, + max_total_nodes: 256, + }, + graph_context: SearchGraphContext { + enabled: false, + max_facts_per_item: 16, + max_evidence_notes_per_fact: 16, + }, }, - ranking: elf_config::Ranking { recency_tau_days: 60.0, tie_breaker_weight: 0.1 }, - lifecycle: elf_config::Lifecycle { - ttl_days: elf_config::TtlDays { + ranking: test_ranking(), + lifecycle: Lifecycle { + ttl_days: TtlDays { plan: 1, fact: 2, preference: 0, @@ -150,57 +217,61 @@ fn test_config() -> Config { purge_deleted_after_days: 30, purge_deprecated_after_days: 180, }, - chunking: elf_config::Chunking { + chunking: Chunking { enabled: true, max_tokens: 512, overlap_tokens: 128, - tokenizer_repo: None, + tokenizer_repo: "gpt2".to_string(), }, - security: elf_config::Security { + security: Security { bind_localhost_only: true, - reject_cjk: true, + reject_non_english: true, redact_secrets_on_write: true, evidence_min_quotes: 1, evidence_max_quotes: 2, evidence_max_quote_chars: 320, + auth_mode: "off".to_string(), + auth_keys: vec![], }, + context: None, + mcp: None, } } -fn dummy_embedding_provider() -> elf_config::EmbeddingProviderConfig { - elf_config::EmbeddingProviderConfig { +fn dummy_embedding_provider() -> EmbeddingProviderConfig { + EmbeddingProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "3".to_string(), - dimensions: 3, - timeout_ms: 1000, + dimensions: 4_096, + timeout_ms: 1_000, default_headers: Map::new(), } } -fn dummy_provider() -> elf_config::ProviderConfig { - elf_config::ProviderConfig { +fn dummy_provider() -> ProviderConfig { + ProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "3".to_string(), - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: Map::new(), } } -fn dummy_llm_provider() -> elf_config::LlmProviderConfig { - elf_config::LlmProviderConfig { +fn dummy_llm_provider() -> LlmProviderConfig { + LlmProviderConfig { provider_id: "p".to_string(), api_base: "http://localhost".to_string(), api_key: "key".to_string(), path: "/".to_string(), model: "m".to_string(), temperature: 0.1, - timeout_ms: 1000, + timeout_ms: 1_000, default_headers: Map::new(), } } @@ -212,10 +283,9 @@ async fn add_note_does_not_call_llm() { PgPool::connect_lazy(&cfg.storage.postgres.dsn).expect("Failed to create lazy pool."); let db = Db { pool }; let qdrant = QdrantStore::new(&cfg.storage.qdrant).expect("Failed to create Qdrant store."); - let spy = Arc::new(SpyExtractor::new()); - let providers = Providers::new(Arc::new(DummyEmbedding), Arc::new(DummyRerank), spy.clone()); - + let providers = + elf_service::Providers::new(Arc::new(DummyEmbedding), Arc::new(DummyRerank), spy.clone()); let service = ElfService::with_providers(cfg, db, qdrant, providers); let req = AddNoteRequest { tenant_id: "t1".to_string(), @@ -223,18 +293,20 @@ async fn add_note_does_not_call_llm() { agent_id: "a1".to_string(), scope: "agent_private".to_string(), notes: vec![AddNoteInput { - note_type: "fact".to_string(), + r#type: "fact".to_string(), key: None, text: "こんにちは".to_string(), + structured: None, importance: 0.5, confidence: 0.5, ttl_days: None, source_ref: serde_json::json!({}), + write_policy: None, }], }; - let result = service.add_note(req).await; - assert!(matches!(result, Err(ServiceError::NonEnglishInput { .. }))); + + assert!(matches!(result, Err(Error::NonEnglishInput { .. }))); assert_eq!(spy.count(), 0); } @@ -245,10 +317,9 @@ async fn add_note_rejects_empty_notes() { PgPool::connect_lazy(&cfg.storage.postgres.dsn).expect("Failed to create lazy pool."); let db = Db { pool }; let qdrant = QdrantStore::new(&cfg.storage.qdrant).expect("Failed to create Qdrant store."); - let spy = Arc::new(SpyExtractor::new()); - let providers = Providers::new(Arc::new(DummyEmbedding), Arc::new(DummyRerank), spy.clone()); - + let providers = + elf_service::Providers::new(Arc::new(DummyEmbedding), Arc::new(DummyRerank), spy.clone()); let service = ElfService::with_providers(cfg, db, qdrant, providers); let req = AddNoteRequest { tenant_id: "t1".to_string(), @@ -257,8 +328,8 @@ async fn add_note_rejects_empty_notes() { scope: "agent_private".to_string(), notes: vec![], }; - let result = service.add_note(req).await; - assert!(matches!(result, Err(ServiceError::InvalidRequest { .. }))); + + assert!(matches!(result, Err(Error::InvalidRequest { .. }))); assert_eq!(spy.count(), 0); } diff --git a/packages/elf-storage/Cargo.toml b/packages/elf-storage/Cargo.toml index e911fb88..518e1fce 100644 --- a/packages/elf-storage/Cargo.toml +++ b/packages/elf-storage/Cargo.toml @@ -1,17 +1,19 @@ [package] edition = "2024" name = "elf-storage" -version = "0.1.0" +version = "0.2.0" [dependencies] -color-eyre = { workspace = true } -elf-config = { path = "../elf-config" } qdrant-client = { workspace = true } serde_json = { workspace = true } sqlx = { workspace = true } +thiserror = { workspace = true } time = { workspace = true } uuid = { workspace = true } +elf-config = { workspace = true } + [dev-dependencies] -elf-testkit = { path = "../elf-testkit" } -tokio = { workspace = true } +tokio = { workspace = true } + +elf-testkit = { workspace = true } diff --git a/packages/elf-storage/src/consolidation.rs b/packages/elf-storage/src/consolidation.rs new file mode 100644 index 00000000..d6699e36 --- /dev/null +++ b/packages/elf-storage/src/consolidation.rs @@ -0,0 +1,733 @@ +//! Consolidation run and proposal persistence queries. + +use serde_json::Value; +use sqlx::PgExecutor; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::{ + Result, + db::Db, + models::{ + ConsolidationProposal, ConsolidationProposalReviewEvent, ConsolidationRun, + ConsolidationRunJob, + }, +}; + +const CONSOLIDATION_RUN_SELECT: &str = "\ +SELECT + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + COALESCE(error, '{}'::jsonb) AS error, + created_at, + updated_at, + completed_at +FROM consolidation_runs +WHERE tenant_id = $1 AND project_id = $2 AND run_id = $3 +LIMIT 1"; +const CONSOLIDATION_PROPOSAL_SELECT: &str = "\ +SELECT + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + COALESCE(unsupported_claim_flags, '[]'::jsonb) AS unsupported_claim_flags, + COALESCE(contradiction_markers, '[]'::jsonb) AS contradiction_markers, + COALESCE(staleness_markers, '[]'::jsonb) AS staleness_markers, + COALESCE(target_ref, '{}'::jsonb) AS target_ref, + COALESCE(proposed_payload, '{}'::jsonb) AS proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at +FROM consolidation_proposals +WHERE tenant_id = $1 AND project_id = $2 AND proposal_id = $3 +LIMIT 1"; + +/// Arguments for updating a consolidation run state. +pub struct ConsolidationRunStateUpdate<'a> { + /// Tenant that owns the run. + pub tenant_id: &'a str, + /// Project that owns the run. + pub project_id: &'a str, + /// Run identifier. + pub run_id: Uuid, + /// New run status. + pub status: &'a str, + /// Structured error payload for terminal failure states. + pub error: &'a Value, + /// Update timestamp. + pub now: OffsetDateTime, +} + +/// Arguments for updating a consolidation proposal review state. +pub struct ConsolidationProposalReviewUpdate<'a> { + /// Tenant that owns the proposal. + pub tenant_id: &'a str, + /// Project that owns the proposal. + pub project_id: &'a str, + /// Proposal identifier. + pub proposal_id: Uuid, + /// New review state. + pub review_state: &'a str, + /// Reviewing agent identifier. + pub reviewer_agent_id: &'a str, + /// Optional reviewer comment. + pub review_comment: Option<&'a str>, + /// Update timestamp. + pub now: OffsetDateTime, +} + +/// Arguments for inserting a consolidation proposal review event. +pub struct ConsolidationProposalReviewEventInsert<'a> { + /// Review event identifier. + pub review_id: Uuid, + /// Reviewed proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the proposal. + pub tenant_id: &'a str, + /// Project that owns the proposal. + pub project_id: &'a str, + /// Reviewing agent identifier. + pub reviewer_agent_id: &'a str, + /// Review action requested by the reviewer. + pub action: &'a str, + /// Review state before the transition. + pub from_review_state: &'a str, + /// Review state after the transition. + pub to_review_state: &'a str, + /// Optional reviewer comment. + pub review_comment: Option<&'a str>, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Arguments for inserting a consolidation worker job. +pub struct ConsolidationRunJobInsert<'a> { + /// Worker job identifier. + pub job_id: Uuid, + /// Consolidation run to materialize. + pub run_id: Uuid, + /// Tenant that owns the run. + pub tenant_id: &'a str, + /// Project that owns the run. + pub project_id: &'a str, + /// Agent that registered the run. + pub agent_id: &'a str, + /// Job kind, such as fixture or manual. + pub job_kind: &'a str, + /// Queued proposal payload. + pub payload: &'a Value, + /// Creation timestamp. + pub now: OffsetDateTime, +} + +/// Inserts one consolidation run. +pub async fn insert_consolidation_run<'e, E>(executor: E, run: &ConsolidationRun) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO consolidation_runs ( + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + error, + created_at, + updated_at, + completed_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14)", + ) + .bind(run.run_id) + .bind(run.tenant_id.as_str()) + .bind(run.project_id.as_str()) + .bind(run.agent_id.as_str()) + .bind(run.contract_schema.as_str()) + .bind(run.job_kind.as_str()) + .bind(run.status.as_str()) + .bind(&run.input_refs) + .bind(&run.source_snapshot) + .bind(&run.lineage) + .bind(&run.error) + .bind(run.created_at) + .bind(run.updated_at) + .bind(run.completed_at) + .execute(executor) + .await?; + + Ok(()) +} + +/// Enqueues one consolidation worker job. +pub async fn insert_consolidation_run_job<'e, E>( + executor: E, + args: ConsolidationRunJobInsert<'_>, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO consolidation_run_jobs ( + job_id, + run_id, + tenant_id, + project_id, + agent_id, + job_kind, + status, + payload, + available_at, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,$5,$6,'PENDING',$7,$8,$8,$8)", + ) + .bind(args.job_id) + .bind(args.run_id) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.agent_id) + .bind(args.job_kind) + .bind(args.payload) + .bind(args.now) + .execute(executor) + .await?; + + Ok(()) +} + +/// Fetches one consolidation run by tenant and run identifier. +pub async fn get_consolidation_run<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + run_id: Uuid, +) -> Result<Option<ConsolidationRun>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, ConsolidationRun>(CONSOLIDATION_RUN_SELECT) + .bind(tenant_id) + .bind(project_id) + .bind(run_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Lists consolidation runs for one tenant and project. +pub async fn list_consolidation_runs<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + limit: i64, +) -> Result<Vec<ConsolidationRun>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, ConsolidationRun>( + "\ +SELECT + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + COALESCE(error, '{}'::jsonb) AS error, + created_at, + updated_at, + completed_at +FROM consolidation_runs +WHERE tenant_id = $1 AND project_id = $2 +ORDER BY created_at DESC, run_id DESC +LIMIT $3", + ) + .bind(tenant_id) + .bind(project_id) + .bind(limit) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Claims the next due consolidation worker job and leases it until `lease_seconds`. +pub async fn claim_next_consolidation_run_job( + db: &Db, + now: OffsetDateTime, + lease_seconds: i64, +) -> Result<Option<ConsolidationRunJob>> { + let mut tx = db.pool.begin().await?; + let row = sqlx::query_as::<_, ConsolidationRunJob>( + "\ +SELECT + job_id, + run_id, + tenant_id, + project_id, + agent_id, + job_kind, + status, + payload, + attempts, + last_error, + available_at, + created_at, + updated_at +FROM consolidation_run_jobs +WHERE status IN ('PENDING','FAILED','CLAIMED') AND available_at <= $1 +ORDER BY available_at ASC +LIMIT 1 +FOR UPDATE SKIP LOCKED", + ) + .bind(now) + .fetch_optional(&mut *tx) + .await?; + let job = if let Some(mut job) = row { + let lease_until = now + Duration::seconds(lease_seconds); + + sqlx::query( + "\ +UPDATE consolidation_run_jobs +SET status = 'CLAIMED', available_at = $1, updated_at = $2 +WHERE job_id = $3", + ) + .bind(lease_until) + .bind(now) + .bind(job.job_id) + .execute(&mut *tx) + .await?; + + job.status = "CLAIMED".to_string(); + job.available_at = lease_until; + job.updated_at = now; + + Some(job) + } else { + None + }; + + tx.commit().await?; + + Ok(job) +} + +/// Marks a consolidation worker job as completed. +pub async fn mark_consolidation_run_job_done<'e, E>( + executor: E, + job_id: Uuid, + now: OffsetDateTime, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +UPDATE consolidation_run_jobs +SET status = 'DONE', updated_at = $1 +WHERE job_id = $2", + ) + .bind(now) + .bind(job_id) + .execute(executor) + .await?; + + Ok(()) +} + +/// Marks a consolidation worker job as failed and schedules its retry. +pub async fn mark_consolidation_run_job_failed( + db: &Db, + job_id: Uuid, + attempts: i32, + error_text: &str, + available_at: OffsetDateTime, + now: OffsetDateTime, +) -> Result<()> { + sqlx::query( + "\ +UPDATE consolidation_run_jobs +SET status = 'FAILED', + attempts = $1, + last_error = $2, + available_at = $3, + updated_at = $4 +WHERE job_id = $5", + ) + .bind(attempts) + .bind(error_text) + .bind(available_at) + .bind(now) + .bind(job_id) + .execute(&db.pool) + .await?; + + Ok(()) +} + +/// Updates one consolidation run state. +pub async fn update_consolidation_run_state<'e, E>( + executor: E, + args: ConsolidationRunStateUpdate<'_>, +) -> Result<Option<ConsolidationRun>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, ConsolidationRun>( + "\ +UPDATE consolidation_runs +SET + status = $1, + error = $2, + updated_at = $3, + completed_at = CASE + WHEN $1 IN ('completed', 'failed', 'cancelled') THEN $3 + ELSE completed_at + END +WHERE tenant_id = $4 AND project_id = $5 AND run_id = $6 +RETURNING + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + COALESCE(error, '{}'::jsonb) AS error, + created_at, + updated_at, + completed_at", + ) + .bind(args.status) + .bind(args.error) + .bind(args.now) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.run_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Inserts one consolidation proposal. +pub async fn insert_consolidation_proposal<'e, E>( + executor: E, + proposal: &ConsolidationProposal, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO consolidation_proposals ( + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + unsupported_claim_flags, + contradiction_markers, + staleness_markers, + target_ref, + proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24)", + ) + .bind(proposal.proposal_id) + .bind(proposal.run_id) + .bind(proposal.tenant_id.as_str()) + .bind(proposal.project_id.as_str()) + .bind(proposal.agent_id.as_str()) + .bind(proposal.contract_schema.as_str()) + .bind(proposal.proposal_kind.as_str()) + .bind(proposal.apply_intent.as_str()) + .bind(proposal.review_state.as_str()) + .bind(&proposal.source_refs) + .bind(&proposal.source_snapshot) + .bind(&proposal.lineage) + .bind(&proposal.diff) + .bind(proposal.confidence) + .bind(&proposal.unsupported_claim_flags) + .bind(&proposal.contradiction_markers) + .bind(&proposal.staleness_markers) + .bind(&proposal.target_ref) + .bind(&proposal.proposed_payload) + .bind(proposal.reviewer_agent_id.as_deref()) + .bind(proposal.review_comment.as_deref()) + .bind(proposal.reviewed_at) + .bind(proposal.created_at) + .bind(proposal.updated_at) + .execute(executor) + .await?; + + Ok(()) +} + +/// Fetches one consolidation proposal by tenant and proposal identifier. +pub async fn get_consolidation_proposal<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + proposal_id: Uuid, +) -> Result<Option<ConsolidationProposal>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, ConsolidationProposal>(CONSOLIDATION_PROPOSAL_SELECT) + .bind(tenant_id) + .bind(project_id) + .bind(proposal_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Lists consolidation proposals for one tenant and project. +pub async fn list_consolidation_proposals<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + run_id: Option<Uuid>, + review_state: Option<&str>, + limit: i64, +) -> Result<Vec<ConsolidationProposal>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, ConsolidationProposal>( + "\ +SELECT + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + COALESCE(unsupported_claim_flags, '[]'::jsonb) AS unsupported_claim_flags, + COALESCE(contradiction_markers, '[]'::jsonb) AS contradiction_markers, + COALESCE(staleness_markers, '[]'::jsonb) AS staleness_markers, + COALESCE(target_ref, '{}'::jsonb) AS target_ref, + COALESCE(proposed_payload, '{}'::jsonb) AS proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at +FROM consolidation_proposals +WHERE tenant_id = $1 + AND project_id = $2 + AND ($3::uuid IS NULL OR run_id = $3) + AND ($4::text IS NULL OR review_state = $4) +ORDER BY created_at DESC, proposal_id DESC +LIMIT $5", + ) + .bind(tenant_id) + .bind(project_id) + .bind(run_id) + .bind(review_state) + .bind(limit) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Updates one proposal review state. +pub async fn update_consolidation_proposal_review<'e, E>( + executor: E, + args: ConsolidationProposalReviewUpdate<'_>, +) -> Result<Option<ConsolidationProposal>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, ConsolidationProposal>( + "\ +UPDATE consolidation_proposals +SET + review_state = $1, + reviewer_agent_id = $2, + review_comment = $3, + reviewed_at = $4, + updated_at = $4 +WHERE tenant_id = $5 AND project_id = $6 AND proposal_id = $7 +RETURNING + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + COALESCE(unsupported_claim_flags, '[]'::jsonb) AS unsupported_claim_flags, + COALESCE(contradiction_markers, '[]'::jsonb) AS contradiction_markers, + COALESCE(staleness_markers, '[]'::jsonb) AS staleness_markers, + COALESCE(target_ref, '{}'::jsonb) AS target_ref, + COALESCE(proposed_payload, '{}'::jsonb) AS proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at", + ) + .bind(args.review_state) + .bind(args.reviewer_agent_id) + .bind(args.review_comment) + .bind(args.now) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.proposal_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Inserts one proposal review audit event. +pub async fn insert_consolidation_proposal_review_event<'e, E>( + executor: E, + args: ConsolidationProposalReviewEventInsert<'_>, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO consolidation_proposal_reviews ( + review_id, + proposal_id, + run_id, + tenant_id, + project_id, + reviewer_agent_id, + action, + from_review_state, + to_review_state, + review_comment, + created_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)", + ) + .bind(args.review_id) + .bind(args.proposal_id) + .bind(args.run_id) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.reviewer_agent_id) + .bind(args.action) + .bind(args.from_review_state) + .bind(args.to_review_state) + .bind(args.review_comment) + .bind(args.created_at) + .execute(executor) + .await?; + + Ok(()) +} + +/// Lists review events for one consolidation proposal. +pub async fn list_consolidation_proposal_review_events<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + proposal_id: Uuid, +) -> Result<Vec<ConsolidationProposalReviewEvent>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, ConsolidationProposalReviewEvent>( + "\ +SELECT + review_id, + proposal_id, + run_id, + tenant_id, + project_id, + reviewer_agent_id, + action, + from_review_state, + to_review_state, + review_comment, + created_at +FROM consolidation_proposal_reviews +WHERE tenant_id = $1 AND project_id = $2 AND proposal_id = $3 +ORDER BY created_at ASC, review_id ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(proposal_id) + .fetch_all(executor) + .await?; + + Ok(rows) +} diff --git a/packages/elf-storage/src/db.rs b/packages/elf-storage/src/db.rs index 63e2bb6b..7f10ff95 100644 --- a/packages/elf-storage/src/db.rs +++ b/packages/elf-storage/src/db.rs @@ -1,42 +1,95 @@ -// crates.io -use color_eyre::Result; -use sqlx::postgres::PgPoolOptions; +//! Postgres connection helpers and schema bootstrap logic. -// self -use crate::schema; +use sqlx::{AssertSqlSafe, PgConnection, PgPool, Transaction, postgres::PgPoolOptions}; +use crate::{Result, graph, schema}; + +/// Shared Postgres handle for ELF storage operations. pub struct Db { - pub pool: sqlx::PgPool, + /// Connection pool used by storage queries. + pub pool: PgPool, } - impl Db { + /// Connects to Postgres using the configured pool settings. pub async fn connect(cfg: &elf_config::Postgres) -> Result<Self> { let pool = PgPoolOptions::new().max_connections(cfg.pool_max_conns).connect(&cfg.dsn).await?; + Ok(Self { pool }) } + /// Ensures the storage schema exists and applies required backfills. pub async fn ensure_schema(&self, vector_dim: u32) -> Result<()> { let sql = schema::render_schema(vector_dim); let lock_id: i64 = 7_120_114; - sqlx::query("SELECT pg_advisory_lock($1)").bind(lock_id).execute(&self.pool).await?; + // Advisory locks are held per connection. Use a single transaction so the lock is scoped to + // one connection and automatically released when the transaction ends. + let mut tx = self.pool.begin().await?; + + sqlx::query("SELECT pg_advisory_xact_lock($1)").bind(lock_id).execute(&mut *tx).await?; - let mut failure: Option<color_eyre::Report> = None; for statement in sql.split(';') { let trimmed = statement.trim(); + if trimmed.is_empty() { continue; } - if let Err(err) = sqlx::query(trimmed).execute(&self.pool).await { - failure = Some(err.into()); - break; - } - } - let _ = - sqlx::query("SELECT pg_advisory_unlock($1)").bind(lock_id).execute(&self.pool).await; - if let Some(err) = failure { - return Err(err); + + sqlx::raw_sql(AssertSqlSafe(trimmed)).execute(&mut *tx).await?; } + + backfill_graph_fact_predicate_ids(&mut tx).await?; + + tx.commit().await?; + Ok(()) } } + +async fn backfill_graph_fact_predicate_ids(tx: &mut Transaction<'_, sqlx::Postgres>) -> Result<()> { + loop { + let conn: &mut PgConnection = &mut *tx; + let rows: Vec<(String, String, String)> = sqlx::query_as( + "\ +SELECT DISTINCT tenant_id, project_id, predicate +FROM graph_facts +WHERE predicate_id IS NULL +LIMIT 200", + ) + .fetch_all(conn) + .await?; + + if rows.is_empty() { + break; + } + + for (tenant_id, project_id, predicate_surface) in rows { + let conn: &mut PgConnection = &mut *tx; + let predicate = graph::resolve_or_register_predicate( + conn, + tenant_id.as_str(), + project_id.as_str(), + predicate_surface.as_str(), + ) + .await?; + + sqlx::query( + "\ +UPDATE graph_facts +SET predicate_id = $1 +WHERE tenant_id = $2 + AND project_id = $3 + AND predicate = $4 + AND predicate_id IS NULL", + ) + .bind(predicate.predicate_id) + .bind(tenant_id.as_str()) + .bind(project_id.as_str()) + .bind(predicate_surface.as_str()) + .execute(conn) + .await?; + } + } + + Ok(()) +} diff --git a/packages/elf-storage/src/doc_outbox.rs b/packages/elf-storage/src/doc_outbox.rs new file mode 100644 index 00000000..884dbea0 --- /dev/null +++ b/packages/elf-storage/src/doc_outbox.rs @@ -0,0 +1,136 @@ +//! Document indexing outbox helpers. + +use sqlx::PgExecutor; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::{Result, db::Db, models::DocIndexingOutboxEntry}; + +/// Enqueues one document chunk for downstream indexing work. +pub async fn enqueue_doc_outbox<'e, E>( + executor: E, + doc_id: Uuid, + chunk_id: Uuid, + op: &str, + embedding_version: &str, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO doc_indexing_outbox (outbox_id, doc_id, chunk_id, op, embedding_version, status) +VALUES ($1,$2,$3,$4,$5,'PENDING')", + ) + .bind(Uuid::new_v4()) + .bind(doc_id) + .bind(chunk_id) + .bind(op) + .bind(embedding_version) + .execute(executor) + .await?; + + Ok(()) +} + +/// Claims the next due document-indexing outbox job and leases it until `lease_seconds`. +pub async fn claim_next_doc_indexing_outbox_job( + db: &Db, + now: OffsetDateTime, + lease_seconds: i64, +) -> Result<Option<DocIndexingOutboxEntry>> { + let mut tx = db.pool.begin().await?; + let row = sqlx::query_as::<_, DocIndexingOutboxEntry>( + "\ +SELECT + outbox_id, + doc_id, + chunk_id, + op, + embedding_version, + status, + attempts, + last_error, + available_at, + created_at, + updated_at +FROM doc_indexing_outbox +WHERE status IN ('PENDING','FAILED','CLAIMED') AND available_at <= $1 +ORDER BY available_at ASC +LIMIT 1 +FOR UPDATE SKIP LOCKED", + ) + .bind(now) + .fetch_optional(&mut *tx) + .await?; + let job = if let Some(mut job) = row { + let lease_until = now + Duration::seconds(lease_seconds); + + sqlx::query( + "UPDATE doc_indexing_outbox SET status = 'CLAIMED', available_at = $1, updated_at = $2 WHERE outbox_id = $3", + ) + .bind(lease_until) + .bind(now) + .bind(job.outbox_id) + .execute(&mut *tx) + .await?; + + job.available_at = lease_until; + job.updated_at = now; + + Some(job) + } else { + None + }; + + tx.commit().await?; + + Ok(job) +} + +/// Marks a document-indexing outbox job as completed. +pub async fn mark_doc_indexing_outbox_done( + db: &Db, + outbox_id: Uuid, + now: OffsetDateTime, +) -> Result<()> { + sqlx::query( + "UPDATE doc_indexing_outbox SET status = 'DONE', updated_at = $1 WHERE outbox_id = $2", + ) + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + + Ok(()) +} + +/// Marks a document-indexing outbox job as failed and schedules its retry. +pub async fn mark_doc_indexing_outbox_failed( + db: &Db, + outbox_id: Uuid, + attempts: i32, + error_text: &str, + available_at: OffsetDateTime, + now: OffsetDateTime, +) -> Result<()> { + sqlx::query( + "\ +UPDATE doc_indexing_outbox +SET status = 'FAILED', + attempts = $1, + last_error = $2, + available_at = $3, + updated_at = $4 +WHERE outbox_id = $5", + ) + .bind(attempts) + .bind(error_text) + .bind(available_at) + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + + Ok(()) +} diff --git a/packages/elf-storage/src/docs.rs b/packages/elf-storage/src/docs.rs new file mode 100644 index 00000000..5672966f --- /dev/null +++ b/packages/elf-storage/src/docs.rs @@ -0,0 +1,242 @@ +//! Document persistence queries. + +use serde_json::Value; +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + Result, + models::{DocChunk, DocDocument}, +}; + +/// Normalizes absent document source metadata to an empty JSON object. +pub fn normalize_source_ref(source_ref: Option<Value>) -> Value { + source_ref.unwrap_or(Value::Object(Default::default())) +} + +/// Inserts one document record into storage. +pub async fn insert_doc_document<'e, E>(executor: E, doc: &DocDocument) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ + INSERT INTO doc_documents ( + doc_id, + tenant_id, + project_id, + agent_id, + scope, + doc_type, + status, + title, + source_ref, + content, + content_bytes, + content_hash, + created_at, + updated_at + ) + VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14)", + ) + .bind(doc.doc_id) + .bind(doc.tenant_id.as_str()) + .bind(doc.project_id.as_str()) + .bind(doc.agent_id.as_str()) + .bind(doc.scope.as_str()) + .bind(doc.doc_type.as_str()) + .bind(doc.status.as_str()) + .bind(doc.title.as_deref()) + .bind(&doc.source_ref) + .bind(doc.content.as_str()) + .bind(doc.content_bytes) + .bind(doc.content_hash.as_str()) + .bind(doc.created_at) + .bind(doc.updated_at) + .execute(executor) + .await?; + + Ok(()) +} + +/// Fetches one document record by tenant and document identifier. +pub async fn get_doc_document<'e, E>( + executor: E, + tenant_id: &str, + doc_id: Uuid, +) -> Result<Option<DocDocument>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, DocDocument>( + "\ + SELECT + doc_id, + tenant_id, + project_id, + agent_id, + scope, + doc_type, + status, + title, + COALESCE(source_ref, '{}'::jsonb) AS source_ref, + content, + content_bytes, + content_hash, + created_at, + updated_at +FROM doc_documents +WHERE tenant_id = $1 AND doc_id = $2 +LIMIT 1", + ) + .bind(tenant_id) + .bind(doc_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Inserts one document chunk row. +pub async fn insert_doc_chunk<'e, E>(executor: E, chunk: &DocChunk) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO doc_chunks ( + chunk_id, + doc_id, + chunk_index, + start_offset, + end_offset, + chunk_text, + chunk_hash, + created_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8)", + ) + .bind(chunk.chunk_id) + .bind(chunk.doc_id) + .bind(chunk.chunk_index) + .bind(chunk.start_offset) + .bind(chunk.end_offset) + .bind(chunk.chunk_text.as_str()) + .bind(chunk.chunk_hash.as_str()) + .bind(chunk.created_at) + .execute(executor) + .await?; + + Ok(()) +} + +/// Lists all chunks for one document in chunk order. +pub async fn list_doc_chunks<'e, E>(executor: E, doc_id: Uuid) -> Result<Vec<DocChunk>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, DocChunk>( + "\ +SELECT + chunk_id, + doc_id, + chunk_index, + start_offset, + end_offset, + chunk_text, + chunk_hash, + created_at +FROM doc_chunks +WHERE doc_id = $1 +ORDER BY chunk_index ASC", + ) + .bind(doc_id) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Fetches one document chunk by chunk identifier. +pub async fn get_doc_chunk<'e, E>(executor: E, chunk_id: Uuid) -> Result<Option<DocChunk>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, DocChunk>( + "\ +SELECT + chunk_id, + doc_id, + chunk_index, + start_offset, + end_offset, + chunk_text, + chunk_hash, + created_at +FROM doc_chunks +WHERE chunk_id = $1 +LIMIT 1", + ) + .bind(chunk_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Upserts one dense or sparse embedding vector for a document chunk. +pub async fn insert_doc_chunk_embedding<'e, E>( + executor: E, + chunk_id: Uuid, + embedding_version: &str, + embedding_dim: i32, + vec: &str, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO doc_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector) +ON CONFLICT (chunk_id, embedding_version) DO UPDATE +SET + embedding_dim = EXCLUDED.embedding_dim, + vec = EXCLUDED.vec, + created_at = now()", + ) + .bind(chunk_id) + .bind(embedding_version) + .bind(embedding_dim) + .bind(vec) + .execute(executor) + .await?; + + Ok(()) +} + +/// Marks one document record as deleted. +pub async fn mark_doc_deleted<'e, E>( + executor: E, + tenant_id: &str, + doc_id: Uuid, + now: OffsetDateTime, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +UPDATE doc_documents +SET status = 'deleted', updated_at = $1 +WHERE tenant_id = $2 AND doc_id = $3", + ) + .bind(now) + .bind(tenant_id) + .bind(doc_id) + .execute(executor) + .await?; + + Ok(()) +} diff --git a/packages/elf-storage/src/error.rs b/packages/elf-storage/src/error.rs new file mode 100644 index 00000000..fc3a0b0a --- /dev/null +++ b/packages/elf-storage/src/error.rs @@ -0,0 +1,24 @@ +/// Storage-layer errors returned by Postgres and Qdrant helpers. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// A SQLx query or connection operation failed. + #[error(transparent)] + Sqlx(#[from] sqlx::Error), + /// The caller supplied an invalid storage argument. + #[error("Invalid argument: {0}")] + InvalidArgument(String), + /// The requested storage record does not exist. + #[error("Not found: {0}")] + NotFound(String), + /// The requested storage mutation conflicts with existing state. + #[error("Conflict: {0}")] + Conflict(String), + /// A Qdrant client operation failed. + #[error(transparent)] + Qdrant(#[from] Box<qdrant_client::QdrantError>), +} +impl From<qdrant_client::QdrantError> for Error { + fn from(err: qdrant_client::QdrantError) -> Self { + Self::Qdrant(Box::new(err)) + } +} diff --git a/packages/elf-storage/src/graph.rs b/packages/elf-storage/src/graph.rs new file mode 100644 index 00000000..4bed5e36 --- /dev/null +++ b/packages/elf-storage/src/graph.rs @@ -0,0 +1,983 @@ +//! Graph entity, predicate, and fact storage helpers. + +use sqlx::PgConnection; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + Error, Result, + models::{GraphEntity, GraphFact, GraphPredicate, GraphPredicateAlias}, +}; + +const GRAPH_PREDICATE_SCOPE_GLOBAL: &str = "__global__"; +const GRAPH_PREDICATE_SCOPE_PROJECT_PREFIX: &str = "__project__:"; + +/// Normalizes graph entity surfaces for uniqueness and lookup. +pub fn normalize_entity_name(input: &str) -> String { + input.split_whitespace().collect::<Vec<_>>().join(" ").to_lowercase() +} + +/// Normalizes graph predicate surfaces for uniqueness and lookup. +pub fn normalize_predicate_name(input: &str) -> String { + normalize_entity_name(input) +} + +/// Lists predicates visible within the provided scope keys. +pub async fn list_predicates_by_scope_keys( + executor: &mut PgConnection, + scope_keys: &[String], +) -> Result<Vec<GraphPredicate>> { + if scope_keys.is_empty() { + return Ok(vec![]); + } + + let scope_keys = scope_keys.to_vec(); + let rows = sqlx::query_as::<_, GraphPredicate>( + "\ +SELECT + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at +FROM graph_predicates +WHERE scope_key = ANY($1::text[]) +ORDER BY scope_key, canonical_norm", + ) + .bind(&scope_keys) + .fetch_all(&mut *executor) + .await?; + + Ok(rows) +} + +/// Fetches one predicate by identifier. +pub async fn get_predicate_by_id( + executor: &mut PgConnection, + predicate_id: Uuid, +) -> Result<Option<GraphPredicate>> { + let row = sqlx::query_as::<_, GraphPredicate>( + "\ +SELECT + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at +FROM graph_predicates +WHERE predicate_id = $1", + ) + .bind(predicate_id) + .fetch_optional(&mut *executor) + .await?; + + Ok(row) +} + +/// Updates a predicate's mutable status and cardinality fields. +pub async fn update_predicate( + executor: &mut PgConnection, + predicate_id: Uuid, + status: Option<&str>, + cardinality: Option<&str>, +) -> Result<GraphPredicate> { + let status = status.map(str::trim); + + if status.is_some_and(str::is_empty) { + return Err(Error::InvalidArgument("graph predicate status must not be empty".to_string())); + } + + let cardinality = cardinality.map(str::trim); + + if cardinality.is_some_and(str::is_empty) { + return Err(Error::InvalidArgument( + "graph predicate cardinality must not be empty".to_string(), + )); + } + + let row = sqlx::query_as::<_, GraphPredicate>( + "\ +UPDATE graph_predicates +SET + status = COALESCE($2, status), + cardinality = COALESCE($3, cardinality), + updated_at = now() +WHERE predicate_id = $1 +RETURNING + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at", + ) + .bind(predicate_id) + .bind(status) + .bind(cardinality) + .fetch_optional(&mut *executor) + .await?; + + row.ok_or_else(|| { + Error::NotFound(format!("graph predicate not found; predicate_id={predicate_id}")) + }) +} + +/// Updates a predicate only when its current state matches the expected guard values. +pub async fn update_predicate_guarded( + executor: &mut PgConnection, + predicate_id: Uuid, + expected_status: &str, + expected_cardinality: &str, + status: Option<&str>, + cardinality: Option<&str>, +) -> Result<GraphPredicate> { + let expected_status = expected_status.trim(); + let expected_cardinality = expected_cardinality.trim(); + + if expected_status.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate expected_status must not be empty".to_string(), + )); + } + if expected_cardinality.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate expected_cardinality must not be empty".to_string(), + )); + } + if expected_status == "deprecated" { + return Err(Error::Conflict(format!( + "graph predicate is deprecated and cannot be modified; predicate_id={predicate_id}" + ))); + } + + let status = status.map(str::trim); + + if status.is_some_and(str::is_empty) { + return Err(Error::InvalidArgument("graph predicate status must not be empty".to_string())); + } + + let cardinality = cardinality.map(str::trim); + + if cardinality.is_some_and(str::is_empty) { + return Err(Error::InvalidArgument( + "graph predicate cardinality must not be empty".to_string(), + )); + } + + let row = sqlx::query_as::<_, GraphPredicate>( + "\ + UPDATE graph_predicates + SET + status = COALESCE($4, status), + cardinality = COALESCE($5, cardinality), + updated_at = now() + WHERE predicate_id = $1 + AND status = $2 + AND cardinality = $3 + RETURNING + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at", + ) + .bind(predicate_id) + .bind(expected_status) + .bind(expected_cardinality) + .bind(status) + .bind(cardinality) + .fetch_optional(&mut *executor) + .await?; + + if let Some(row) = row { + return Ok(row); + } + + let existing = get_predicate_by_id(executor, predicate_id).await?; + let Some(_) = existing else { + return Err(Error::NotFound(format!( + "graph predicate not found; predicate_id={predicate_id}" + ))); + }; + + Err(Error::Conflict(format!( + "graph predicate update conflict; predicate_id={predicate_id} expected_status={expected_status} expected_cardinality={expected_cardinality}" + ))) +} + +/// Registers an additional alias for an existing predicate. +pub async fn add_predicate_alias( + executor: &mut PgConnection, + predicate_id: Uuid, + alias: &str, +) -> Result<()> { + let alias = alias.trim(); + + if alias.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate alias is required; alias must not be empty".to_string(), + )); + } + + let alias_norm = normalize_predicate_name(alias); + + if alias_norm.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate alias is required; alias_norm must not be empty".to_string(), + )); + } + + let predicate_scope_key: Option<(String,)> = sqlx::query_as( + "\ +SELECT scope_key +FROM graph_predicates +WHERE predicate_id = $1", + ) + .bind(predicate_id) + .fetch_optional(&mut *executor) + .await?; + let Some((scope_key,)) = predicate_scope_key else { + return Err(Error::NotFound(format!( + "graph predicate not found; predicate_id={predicate_id}" + ))); + }; + let res = sqlx::query( + "\ +INSERT INTO graph_predicate_aliases ( + alias_id, + predicate_id, + scope_key, + alias, + alias_norm, + created_at +) +VALUES ($1, $2, $3, $4, $5, now()) +ON CONFLICT (scope_key, alias_norm) DO UPDATE +SET alias = EXCLUDED.alias +WHERE graph_predicate_aliases.predicate_id = EXCLUDED.predicate_id", + ) + .bind(Uuid::new_v4()) + .bind(predicate_id) + .bind(&scope_key) + .bind(alias) + .bind(&alias_norm) + .execute(&mut *executor) + .await?; + + if res.rows_affected() == 0 { + return Err(Error::Conflict(format!( + "graph predicate alias already bound; scope_key={scope_key} alias_norm={alias_norm}" + ))); + } + + Ok(()) +} + +/// Lists aliases bound to one predicate. +pub async fn list_predicate_aliases( + executor: &mut PgConnection, + predicate_id: Uuid, +) -> Result<Vec<GraphPredicateAlias>> { + let rows = sqlx::query_as::<_, GraphPredicateAlias>( + "\ +SELECT + alias_id, + predicate_id, + scope_key, + alias, + alias_norm, + created_at +FROM graph_predicate_aliases +WHERE predicate_id = $1 +ORDER BY created_at ASC, alias_norm ASC", + ) + .bind(predicate_id) + .fetch_all(&mut *executor) + .await?; + + Ok(rows) +} + +/// Resolves a predicate surface across visible scopes or registers a project-scoped predicate. +pub async fn resolve_or_register_predicate( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + predicate_surface: &str, +) -> Result<GraphPredicate> { + let predicate_surface = predicate_surface.trim(); + + if predicate_surface.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate is required; predicate_surface must not be empty".to_string(), + )); + } + + let alias_norm = normalize_predicate_name(predicate_surface); + let tenant_project_scope = predicate_scope_key_tenant_project(tenant_id, project_id); + let project_scope = predicate_scope_key_project(project_id); + let global_scope = GRAPH_PREDICATE_SCOPE_GLOBAL.to_string(); + + for scope_key in [&tenant_project_scope, &project_scope, &global_scope] { + if let Some(row) = sqlx::query_as::<_, GraphPredicate>( + "\ +SELECT + gp.predicate_id, + gp.scope_key, + gp.tenant_id, + gp.project_id, + gp.canonical, + gp.canonical_norm, + gp.cardinality, + gp.status, + gp.created_at, + gp.updated_at +FROM graph_predicate_aliases gpa +JOIN graph_predicates gp ON gp.predicate_id = gpa.predicate_id +WHERE gpa.scope_key = $1 + AND gpa.alias_norm = $2 +LIMIT 1", + ) + .bind(scope_key) + .bind(&alias_norm) + .fetch_optional(&mut *executor) + .await? + { + return Ok(row); + } + } + + let predicate_id = Uuid::new_v4(); + let predicate_row = sqlx::query_as::<_, GraphPredicate>( + "\ +INSERT INTO graph_predicates ( + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at +) +VALUES ($1, $2, $3, $4, $5, $6, 'multi', 'pending', now(), now()) +ON CONFLICT (scope_key, canonical_norm) +DO UPDATE +SET canonical = graph_predicates.canonical +RETURNING + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status, + created_at, + updated_at", + ) + .bind(predicate_id) + .bind(&tenant_project_scope) + .bind(tenant_id) + .bind(project_id) + .bind(predicate_surface) + .bind(&alias_norm) + .fetch_one(&mut *executor) + .await?; + + sqlx::query( + "\ +INSERT INTO graph_predicate_aliases ( + alias_id, + predicate_id, + scope_key, + alias, + alias_norm, + created_at +) +VALUES ($1, $2, $3, $4, $5, now()) +ON CONFLICT (scope_key, alias_norm) DO NOTHING", + ) + .bind(Uuid::new_v4()) + .bind(predicate_row.predicate_id) + .bind(&tenant_project_scope) + .bind(predicate_surface) + .bind(&alias_norm) + .execute(&mut *executor) + .await?; + + Ok(predicate_row) +} + +/// Resolves a predicate surface across visible scopes without creating a new predicate. +pub async fn resolve_predicate_no_register( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + predicate_surface: &str, +) -> Result<Option<GraphPredicate>> { + let predicate_surface = predicate_surface.trim(); + + if predicate_surface.is_empty() { + return Err(Error::InvalidArgument( + "graph predicate is required; predicate_surface must not be empty".to_string(), + )); + } + + let alias_norm = normalize_predicate_name(predicate_surface); + let tenant_project_scope = predicate_scope_key_tenant_project(tenant_id, project_id); + let project_scope = predicate_scope_key_project(project_id); + let global_scope = GRAPH_PREDICATE_SCOPE_GLOBAL.to_string(); + + for scope_key in [&tenant_project_scope, &project_scope, &global_scope] { + if let Some(row) = sqlx::query_as::<_, GraphPredicate>( + "\ +SELECT + gp.predicate_id, + gp.scope_key, + gp.tenant_id, + gp.project_id, + gp.canonical, + gp.canonical_norm, + gp.cardinality, + gp.status, + gp.created_at, + gp.updated_at +FROM graph_predicate_aliases gpa +JOIN graph_predicates gp ON gp.predicate_id = gpa.predicate_id +WHERE gpa.scope_key = $1 + AND gpa.alias_norm = $2 +LIMIT 1", + ) + .bind(scope_key) + .bind(&alias_norm) + .fetch_optional(&mut *executor) + .await? + { + return Ok(Some(row)); + } + } + + Ok(None) +} + +/// Resolves an entity surface against canonical names and aliases within one tenant/project. +pub async fn resolve_entity_by_surface( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + entity_surface: &str, +) -> Result<Option<GraphEntity>> { + let entity_surface = entity_surface.trim(); + + if entity_surface.is_empty() { + return Err(Error::InvalidArgument( + "graph entity is required; entity_surface must not be empty".to_string(), + )); + } + + let canonical_norm = normalize_entity_name(entity_surface); + let canonical = sqlx::query_as::<_, GraphEntity>( + "\ +SELECT + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind, + created_at, + updated_at +FROM graph_entities +WHERE tenant_id = $1 + AND project_id = $2 + AND canonical_norm = $3", + ) + .bind(tenant_id) + .bind(project_id) + .bind(&canonical_norm) + .fetch_optional(&mut *executor) + .await?; + + if let Some(entity) = canonical { + return Ok(Some(entity)); + } + + let alias_matches = sqlx::query_as::<_, GraphEntity>( + "\ +SELECT + ge.entity_id, + ge.tenant_id, + ge.project_id, + ge.canonical, + ge.canonical_norm, + ge.kind, + ge.created_at, + ge.updated_at +FROM graph_entity_aliases gea +JOIN graph_entities ge ON ge.entity_id = gea.entity_id +WHERE ge.tenant_id = $1 + AND ge.project_id = $2 + AND gea.alias_norm = $3", + ) + .bind(tenant_id) + .bind(project_id) + .bind(&canonical_norm) + .fetch_all(&mut *executor) + .await?; + + if alias_matches.len() == 1 { + return Ok(alias_matches.into_iter().next()); + } + if alias_matches.len() > 1 { + let candidates = alias_matches + .iter() + .map(|entity| entity.entity_id.to_string()) + .collect::<Vec<_>>() + .join(", "); + + return Err(Error::Conflict(format!( + "graph entity surface is ambiguous; entity_surface={entity_surface} alias_norm={canonical_norm} candidates=[{candidates}]" + ))); + } + + Ok(None) +} + +#[allow(clippy::too_many_arguments)] +/// Inserts a new graph fact row and attaches its evidence note identifiers. +pub async fn insert_fact_with_evidence( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + agent_id: &str, + scope: &str, + subject_entity_id: Uuid, + predicate: &str, + predicate_id: Uuid, + object_entity_id: Option<Uuid>, + object_value: Option<&str>, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + evidence_note_ids: &[Uuid], +) -> Result<Uuid> { + if evidence_note_ids.is_empty() { + return Err(Error::InvalidArgument( + "graph fact evidence is required; evidence_note_ids must not be empty".to_string(), + )); + } + + match (object_entity_id, object_value) { + (Some(_), None) | (None, Some(_)) => (), + _ => { + return Err(Error::InvalidArgument( + "graph fact must provide exactly one of object_entity_id and object_value" + .to_string(), + )); + }, + } + + let row: (Uuid,) = sqlx::query_as( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + created_at, + updated_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, now(), now()) +RETURNING fact_id", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(scope) + .bind(subject_entity_id) + .bind(predicate) + .bind(predicate_id) + .bind(object_entity_id) + .bind(object_value) + .bind(valid_from) + .bind(valid_to) + .fetch_one(&mut *executor) + .await?; + let fact_id = row.0; + + for note_id in evidence_note_ids { + sqlx::query( + "\ +INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) +VALUES ($1, $2, $3, now()) +ON CONFLICT (fact_id, note_id) DO NOTHING", + ) + .bind(Uuid::new_v4()) + .bind(fact_id) + .bind(*note_id) + .execute(&mut *executor) + .await?; + } + + Ok(fact_id) +} + +#[allow(clippy::too_many_arguments)] +/// Upserts an active graph fact row and ensures the provided evidence links exist. +pub async fn upsert_fact_with_evidence( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + agent_id: &str, + scope: &str, + subject_entity_id: Uuid, + predicate: &str, + predicate_id: Uuid, + object_entity_id: Option<Uuid>, + object_value: Option<&str>, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, + evidence_note_ids: &[Uuid], +) -> Result<Uuid> { + if evidence_note_ids.is_empty() { + return Err(Error::InvalidArgument( + "graph fact evidence is required; evidence_note_ids must not be empty".to_string(), + )); + } + + let fact_id = match (object_entity_id, object_value) { + (Some(object_entity_id), None) => { + let row: (Uuid,) = sqlx::query_as::<_, (Uuid,)>( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + created_at, + updated_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, now(), now()) +ON CONFLICT (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_entity_id) +WHERE valid_to IS NULL AND object_entity_id IS NOT NULL +DO UPDATE +SET updated_at = graph_facts.updated_at +RETURNING fact_id", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(scope) + .bind(subject_entity_id) + .bind(predicate) + .bind(predicate_id) + .bind(Some(object_entity_id)) + .bind(None::<String>) + .bind(valid_from) + .bind(valid_to) + .fetch_one(&mut *executor) + .await?; + + row.0 + }, + (None, Some(object_value)) => { + let row: (Uuid,) = sqlx::query_as::<_, (Uuid,)>( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + created_at, + updated_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, now(), now()) +ON CONFLICT (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_value) +WHERE valid_to IS NULL AND object_value IS NOT NULL +DO UPDATE +SET updated_at = graph_facts.updated_at +RETURNING fact_id", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(agent_id) + .bind(scope) + .bind(subject_entity_id) + .bind(predicate) + .bind(predicate_id) + .bind(None::<Uuid>) + .bind(Some(object_value)) + .bind(valid_from) + .bind(valid_to) + .fetch_one(&mut *executor) + .await?; + + row.0 + }, + _ => { + return Err(Error::InvalidArgument( + "graph fact must provide exactly one of object_entity_id and object_value" + .to_string(), + )); + }, + }; + + for note_id in evidence_note_ids { + sqlx::query( + "\ +INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) +VALUES ($1, $2, $3, now()) +ON CONFLICT (fact_id, note_id) DO NOTHING", + ) + .bind(Uuid::new_v4()) + .bind(fact_id) + .bind(*note_id) + .execute(&mut *executor) + .await?; + } + + Ok(fact_id) +} + +/// Upserts an entity by normalized canonical surface and returns its identifier. +pub async fn upsert_entity( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + canonical: &str, + kind: Option<&str>, +) -> Result<Uuid> { + let canonical_norm = normalize_entity_name(canonical); + let row: (Uuid,) = sqlx::query_as( + "\ +INSERT INTO graph_entities ( + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind, + created_at, + updated_at +) +VALUES ( + $1, $2, $3, $4, $5, $6, now(), now() +) +ON CONFLICT (tenant_id, project_id, canonical_norm) +DO UPDATE +SET + canonical = EXCLUDED.canonical, + kind = COALESCE(EXCLUDED.kind, graph_entities.kind), + updated_at = now() +RETURNING entity_id", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(canonical) + .bind(&canonical_norm) + .bind(kind) + .fetch_one(executor) + .await?; + + Ok(row.0) +} + +/// Upserts an alias for an existing entity. +pub async fn upsert_entity_alias( + executor: &mut PgConnection, + entity_id: Uuid, + alias: &str, +) -> Result<()> { + let alias_norm = normalize_entity_name(alias); + + sqlx::query( + "\ +INSERT INTO graph_entity_aliases ( + alias_id, + entity_id, + alias, + alias_norm, + created_at +) +VALUES ($1, $2, $3, $4, now()) +ON CONFLICT (entity_id, alias_norm) +DO UPDATE SET alias = EXCLUDED.alias", + ) + .bind(Uuid::new_v4()) + .bind(entity_id) + .bind(alias) + .bind(&alias_norm) + .execute(executor) + .await?; + + Ok(()) +} + +/// Fetches active facts for one subject entity at the provided point in time. +pub async fn fetch_active_facts_for_subject( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + scope: &str, + subject_entity_id: Uuid, + now: OffsetDateTime, +) -> Result<Vec<GraphFact>> { + let rows = sqlx::query_as::<_, GraphFact>( + "\ +SELECT + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to, + created_at, + updated_at +FROM graph_facts +WHERE tenant_id = $1 + AND project_id = $2 + AND scope = $3 + AND subject_entity_id = $4 + AND valid_from <= $5 + AND (valid_to IS NULL OR valid_to > $5)", + ) + .bind(tenant_id) + .bind(project_id) + .bind(scope) + .bind(subject_entity_id) + .bind(now) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +#[allow(clippy::too_many_arguments)] +/// Supersedes active facts that conflict with the replacement fact and records supersession rows. +pub async fn supersede_conflicting_active_facts( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, + scope: &str, + subject_entity_id: Uuid, + predicate_id: Uuid, + to_fact_id: Uuid, + note_id: Uuid, + effective_at: OffsetDateTime, +) -> Result<Vec<Uuid>> { + let superseded: Vec<(Uuid,)> = sqlx::query_as( + "\ +UPDATE graph_facts +SET valid_to = $1, updated_at = now() +WHERE tenant_id = $2 + AND project_id = $3 + AND scope = $4 + AND subject_entity_id = $5 + AND predicate_id = $6 + AND valid_to IS NULL + AND valid_from <= $1 + AND fact_id <> $7 +RETURNING fact_id", + ) + .bind(effective_at) + .bind(tenant_id) + .bind(project_id) + .bind(scope) + .bind(subject_entity_id) + .bind(predicate_id) + .bind(to_fact_id) + .fetch_all(&mut *executor) + .await?; + + for (from_fact_id,) in &superseded { + sqlx::query( + "\ +INSERT INTO graph_fact_supersessions ( + supersession_id, + tenant_id, + project_id, + from_fact_id, + to_fact_id, + note_id, + effective_at, + created_at +) +VALUES ($1, $2, $3, $4, $5, $6, $7, now()) +ON CONFLICT (from_fact_id, to_fact_id, note_id) DO NOTHING", + ) + .bind(Uuid::new_v4()) + .bind(tenant_id) + .bind(project_id) + .bind(*from_fact_id) + .bind(to_fact_id) + .bind(note_id) + .bind(effective_at) + .execute(&mut *executor) + .await?; + } + + Ok(superseded.into_iter().map(|(fact_id,)| fact_id).collect()) +} + +fn predicate_scope_key_tenant_project(tenant_id: &str, project_id: &str) -> String { + format!("{tenant_id}:{project_id}") +} + +fn predicate_scope_key_project(project_id: &str) -> String { + format!("{GRAPH_PREDICATE_SCOPE_PROJECT_PREFIX}{project_id}") +} diff --git a/packages/elf-storage/src/knowledge.rs b/packages/elf-storage/src/knowledge.rs new file mode 100644 index 00000000..1e37cf7e --- /dev/null +++ b/packages/elf-storage/src/knowledge.rs @@ -0,0 +1,1067 @@ +//! Derived knowledge page persistence and source-snapshot queries. + +use serde_json::Value; +use sqlx::{FromRow, PgExecutor}; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::{ + Result, + models::{ + KnowledgePage, KnowledgePageLintFinding, KnowledgePageSection, KnowledgePageSourceRef, + }, +}; + +/// Arguments for upserting one derived knowledge page. +pub struct KnowledgePageUpsert<'a> { + /// Page identifier to use for a newly created page. + pub page_id: Uuid, + /// Tenant that owns the page. + pub tenant_id: &'a str, + /// Project that owns the page. + pub project_id: &'a str, + /// Page kind. + pub page_kind: &'a str, + /// Stable page key. + pub page_key: &'a str, + /// Page title. + pub title: &'a str, + /// Versioned page contract schema. + pub contract_schema: &'a str, + /// Page lifecycle status. + pub status: &'a str, + /// Canonical source snapshot hash. + pub rebuild_source_hash: &'a str, + /// Canonical page content hash. + pub content_hash: &'a str, + /// Source coverage metadata. + pub source_coverage: &'a Value, + /// Aggregate source snapshot metadata. + pub source_snapshot: &'a Value, + /// Rebuild metadata. + pub rebuild_metadata: &'a Value, + /// Rebuild timestamp. + pub now: OffsetDateTime, +} + +/// Arguments for inserting one knowledge page section. +pub struct KnowledgePageSectionInsert<'a> { + /// Section identifier. + pub section_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Stable section key. + pub section_key: &'a str, + /// Section heading. + pub heading: &'a str, + /// Section role. + pub role: &'a str, + /// Section content. + pub content: &'a str, + /// Section display order. + pub ordinal: i32, + /// Section citations. + pub citations: &'a Value, + /// Reason the section has no citations, when intentionally unsupported. + pub unsupported_reason: Option<&'a str>, + /// Section content hash. + pub content_hash: &'a str, + /// Creation/update timestamp. + pub now: OffsetDateTime, +} + +/// Arguments for inserting one normalized knowledge page citation. +pub struct KnowledgePageSourceRefInsert<'a> { + /// Source-reference row identifier. + pub ref_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Section that cites the source, if section-scoped. + pub section_id: Option<Uuid>, + /// Source kind. + pub source_kind: &'a str, + /// Authoritative source identifier. + pub source_id: Uuid, + /// Captured source status. + pub source_status: Option<&'a str>, + /// Captured source updated timestamp. + pub source_updated_at: Option<OffsetDateTime>, + /// Captured source content hash. + pub source_content_hash: Option<&'a str>, + /// Captured source snapshot. + pub source_snapshot: &'a Value, + /// Citation-local metadata. + pub citation_metadata: &'a Value, + /// Creation timestamp. + pub now: OffsetDateTime, +} + +/// Arguments for inserting one knowledge page lint finding. +pub struct KnowledgePageLintFindingInsert<'a> { + /// Lint finding identifier. + pub finding_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Section associated with the finding, when available. + pub section_id: Option<Uuid>, + /// Finding type. + pub finding_type: &'a str, + /// Finding severity. + pub severity: &'a str, + /// Source kind associated with the finding, when available. + pub source_kind: Option<&'a str>, + /// Source identifier associated with the finding, when available. + pub source_id: Option<Uuid>, + /// Human-readable finding message. + pub message: &'a str, + /// Structured finding details. + pub details: &'a Value, + /// Creation timestamp. + pub now: OffsetDateTime, +} + +/// Authoritative note source row used by the knowledge page rebuilder. +#[derive(Debug, FromRow)] +pub struct KnowledgeNoteSource { + /// Note identifier. + pub note_id: Uuid, + /// Agent that owns the note. + pub agent_id: String, + /// Note scope. + pub scope: String, + /// Note type. + pub note_type: String, + /// Optional note key. + pub key: Option<String>, + /// Note text. + pub text: String, + /// Note importance. + pub importance: f32, + /// Note confidence. + pub confidence: f32, + /// Note status. + pub status: String, + /// Note creation timestamp. + pub created_at: OffsetDateTime, + /// Note update timestamp. + pub updated_at: OffsetDateTime, + /// Optional note expiry timestamp. + pub expires_at: Option<OffsetDateTime>, + /// Note embedding version. + pub embedding_version: String, + /// Opaque note source reference. + pub source_ref: Value, +} + +/// Durable add_event audit source row used by the knowledge page rebuilder. +#[derive(Debug, FromRow)] +pub struct KnowledgeEventSource { + /// Ingest decision identifier. + pub decision_id: Uuid, + /// Agent that wrote the audited event-derived note decision. + pub agent_id: String, + /// Scope associated with the audited decision. + pub scope: String, + /// Ingestion pipeline name. + pub pipeline: String, + /// Event-derived note type. + pub note_type: String, + /// Optional note key. + pub note_key: Option<String>, + /// Note identifier affected by the decision, when persisted. + pub note_id: Option<Uuid>, + /// Policy decision. + pub policy_decision: String, + /// Note operation. + pub note_op: String, + /// Optional reason code. + pub reason_code: Option<String>, + /// Structured audit details. + pub details: Value, + /// Audit timestamp. + pub ts: OffsetDateTime, +} + +/// Authoritative graph relation source row used by the knowledge page rebuilder. +#[derive(Debug, FromRow)] +pub struct KnowledgeRelationSource { + /// Graph fact identifier. + pub fact_id: Uuid, + /// Agent that wrote the fact. + pub agent_id: String, + /// Fact scope. + pub scope: String, + /// Subject canonical text. + pub subject: String, + /// Optional subject kind. + pub subject_kind: Option<String>, + /// Predicate text. + pub predicate: String, + /// Optional object entity canonical text. + pub object_entity: Option<String>, + /// Optional object entity kind. + pub object_kind: Option<String>, + /// Optional scalar object value. + pub object_value: Option<String>, + /// Fact validity window start. + pub valid_from: OffsetDateTime, + /// Fact validity window end, when historical. + pub valid_to: Option<OffsetDateTime>, + /// Fact update timestamp. + pub updated_at: OffsetDateTime, + /// Evidence notes linked to this fact. + pub evidence_notes: Value, +} + +/// Reviewed consolidation proposal source row used by the knowledge page rebuilder. +#[derive(Debug, FromRow)] +pub struct KnowledgeProposalSource { + /// Consolidation proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Agent that registered the proposal. + pub agent_id: String, + /// Proposal kind. + pub proposal_kind: String, + /// Proposal apply intent. + pub apply_intent: String, + /// Proposal review state. + pub review_state: String, + /// Serialized proposal source references. + pub source_refs: Value, + /// Serialized proposal source snapshot. + pub source_snapshot: Value, + /// Serialized proposal lineage. + pub lineage: Value, + /// Serialized proposal diff. + pub diff: Value, + /// Proposal confidence. + pub confidence: f32, + /// Unsupported claim flags. + pub unsupported_claim_flags: Value, + /// Contradiction markers. + pub contradiction_markers: Value, + /// Staleness markers. + pub staleness_markers: Value, + /// Derived target reference. + pub target_ref: Value, + /// Proposed derived payload. + pub proposed_payload: Value, + /// Proposal update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Searchable knowledge page section row with page and lint metadata. +#[derive(Debug, FromRow)] +pub struct KnowledgePageSearchRow { + /// Derived page identifier. + pub page_id: Uuid, + /// Page kind. + pub page_kind: String, + /// Stable page key. + pub page_key: String, + /// Page title. + pub title: String, + /// Page lifecycle status. + pub status: String, + /// Source coverage metadata. + pub source_coverage: Value, + /// Rebuild metadata. + pub rebuild_metadata: Value, + /// Page update timestamp. + pub page_updated_at: OffsetDateTime, + /// Page rebuild timestamp. + pub rebuilt_at: OffsetDateTime, + /// Section identifier. + pub section_id: Uuid, + /// Stable section key. + pub section_key: String, + /// Section heading. + pub heading: String, + /// Section role. + pub role: String, + /// Section content. + pub content: String, + /// Section display order. + pub ordinal: i32, + /// Section citations. + pub citations: Value, + /// Reason the section is unsupported, when present. + pub unsupported_reason: Option<String>, + /// Number of error lint findings for the page. + pub lint_error_count: i64, + /// Number of warning lint findings for the page. + pub lint_warning_count: i64, + /// Number of info lint findings for the page. + pub lint_info_count: i64, + /// Number of normalized source refs for this section. + pub section_source_ref_count: i64, +} + +/// Upserts one derived knowledge page and returns the persisted row. +pub async fn upsert_knowledge_page<'e, E>( + executor: E, + args: KnowledgePageUpsert<'_>, +) -> Result<KnowledgePage> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, KnowledgePage>( + "\ +INSERT INTO knowledge_pages ( + page_id, + tenant_id, + project_id, + page_kind, + page_key, + title, + contract_schema, + status, + rebuild_source_hash, + content_hash, + source_coverage, + source_snapshot, + rebuild_metadata, + created_at, + updated_at, + rebuilt_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$14,$14) +ON CONFLICT (tenant_id, project_id, page_kind, page_key) DO UPDATE +SET + title = EXCLUDED.title, + contract_schema = EXCLUDED.contract_schema, + status = EXCLUDED.status, + rebuild_source_hash = EXCLUDED.rebuild_source_hash, + content_hash = EXCLUDED.content_hash, + source_coverage = EXCLUDED.source_coverage, + source_snapshot = EXCLUDED.source_snapshot, + rebuild_metadata = EXCLUDED.rebuild_metadata, + updated_at = EXCLUDED.updated_at, + rebuilt_at = EXCLUDED.rebuilt_at +RETURNING + page_id, + tenant_id, + project_id, + page_kind, + page_key, + title, + contract_schema, + status, + rebuild_source_hash, + content_hash, + source_coverage, + source_snapshot, + rebuild_metadata, + created_at, + updated_at, + rebuilt_at", + ) + .bind(args.page_id) + .bind(args.tenant_id) + .bind(args.project_id) + .bind(args.page_kind) + .bind(args.page_key) + .bind(args.title) + .bind(args.contract_schema) + .bind(args.status) + .bind(args.rebuild_source_hash) + .bind(args.content_hash) + .bind(args.source_coverage) + .bind(args.source_snapshot) + .bind(args.rebuild_metadata) + .bind(args.now) + .fetch_one(executor) + .await?; + + Ok(row) +} + +/// Deletes all section, citation, and lint child rows for a page before rebuild. +pub async fn delete_knowledge_page_children<'e, E>(executor: E, page_id: Uuid) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ + WITH deleted_lint AS ( + DELETE FROM knowledge_page_lint_findings + WHERE page_id = $1 + ), + deleted_source_refs AS ( + DELETE FROM knowledge_page_source_refs + WHERE page_id = $1 + ) + DELETE FROM knowledge_page_sections + WHERE page_id = $1", + ) + .bind(page_id) + .execute(executor) + .await?; + + Ok(()) +} + +/// Inserts one derived knowledge page section. +pub async fn insert_knowledge_page_section<'e, E>( + executor: E, + args: KnowledgePageSectionInsert<'_>, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO knowledge_page_sections ( + section_id, + page_id, + section_key, + heading, + role, + content, + ordinal, + citations, + unsupported_reason, + content_hash, + created_at, + updated_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$11)", + ) + .bind(args.section_id) + .bind(args.page_id) + .bind(args.section_key) + .bind(args.heading) + .bind(args.role) + .bind(args.content) + .bind(args.ordinal) + .bind(args.citations) + .bind(args.unsupported_reason) + .bind(args.content_hash) + .bind(args.now) + .execute(executor) + .await?; + + Ok(()) +} + +/// Inserts one normalized knowledge page citation/source reference. +pub async fn insert_knowledge_page_source_ref<'e, E>( + executor: E, + args: KnowledgePageSourceRefInsert<'_>, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO knowledge_page_source_refs ( + ref_id, + page_id, + section_id, + source_kind, + source_id, + source_status, + source_updated_at, + source_content_hash, + source_snapshot, + citation_metadata, + created_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)", + ) + .bind(args.ref_id) + .bind(args.page_id) + .bind(args.section_id) + .bind(args.source_kind) + .bind(args.source_id) + .bind(args.source_status) + .bind(args.source_updated_at) + .bind(args.source_content_hash) + .bind(args.source_snapshot) + .bind(args.citation_metadata) + .bind(args.now) + .execute(executor) + .await?; + + Ok(()) +} + +/// Inserts one knowledge page lint finding. +pub async fn insert_knowledge_page_lint_finding<'e, E>( + executor: E, + args: KnowledgePageLintFindingInsert<'_>, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO knowledge_page_lint_findings ( + finding_id, + page_id, + section_id, + finding_type, + severity, + source_kind, + source_id, + message, + details, + created_at +) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)", + ) + .bind(args.finding_id) + .bind(args.page_id) + .bind(args.section_id) + .bind(args.finding_type) + .bind(args.severity) + .bind(args.source_kind) + .bind(args.source_id) + .bind(args.message) + .bind(args.details) + .bind(args.now) + .execute(executor) + .await?; + + Ok(()) +} + +/// Deletes persisted lint findings for one page. +pub async fn delete_knowledge_page_lint_findings<'e, E>(executor: E, page_id: Uuid) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query("DELETE FROM knowledge_page_lint_findings WHERE page_id = $1") + .bind(page_id) + .execute(executor) + .await?; + + Ok(()) +} + +/// Fetches one knowledge page by identifier. +pub async fn get_knowledge_page<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + page_id: Uuid, +) -> Result<Option<KnowledgePage>> +where + E: PgExecutor<'e>, +{ + let row = sqlx::query_as::<_, KnowledgePage>( + "\ +SELECT + page_id, + tenant_id, + project_id, + page_kind, + page_key, + title, + contract_schema, + status, + rebuild_source_hash, + content_hash, + source_coverage, + source_snapshot, + rebuild_metadata, + created_at, + updated_at, + rebuilt_at +FROM knowledge_pages +WHERE tenant_id = $1 AND project_id = $2 AND page_id = $3 +LIMIT 1", + ) + .bind(tenant_id) + .bind(project_id) + .bind(page_id) + .fetch_optional(executor) + .await?; + + Ok(row) +} + +/// Lists knowledge pages for a tenant and project. +pub async fn list_knowledge_pages<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + page_kind: Option<&str>, + limit: i64, +) -> Result<Vec<KnowledgePage>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, KnowledgePage>( + "\ +SELECT + page_id, + tenant_id, + project_id, + page_kind, + page_key, + title, + contract_schema, + status, + rebuild_source_hash, + content_hash, + source_coverage, + source_snapshot, + rebuild_metadata, + created_at, + updated_at, + rebuilt_at +FROM knowledge_pages +WHERE tenant_id = $1 + AND project_id = $2 + AND ($3::text IS NULL OR page_kind = $3) +ORDER BY updated_at DESC, page_id DESC +LIMIT $4", + ) + .bind(tenant_id) + .bind(project_id) + .bind(page_kind) + .bind(limit) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Lists sections for one knowledge page. +pub async fn list_knowledge_page_sections<'e, E>( + executor: E, + page_id: Uuid, +) -> Result<Vec<KnowledgePageSection>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, KnowledgePageSection>( + "\ +SELECT + section_id, + page_id, + section_key, + heading, + role, + content, + ordinal, + citations, + unsupported_reason, + content_hash, + created_at, + updated_at +FROM knowledge_page_sections +WHERE page_id = $1 +ORDER BY ordinal ASC, section_key ASC", + ) + .bind(page_id) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Lists normalized source refs for one knowledge page. +pub async fn list_knowledge_page_source_refs<'e, E>( + executor: E, + page_id: Uuid, +) -> Result<Vec<KnowledgePageSourceRef>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, KnowledgePageSourceRef>( + "\ +SELECT + ref_id, + page_id, + section_id, + source_kind, + source_id, + source_status, + source_updated_at, + source_content_hash, + source_snapshot, + citation_metadata, + created_at +FROM knowledge_page_source_refs +WHERE page_id = $1 +ORDER BY source_kind ASC, source_id ASC, ref_id ASC", + ) + .bind(page_id) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Lists normalized source refs for a set of knowledge pages. +pub async fn list_knowledge_page_source_refs_for_pages<'e, E>( + executor: E, + page_ids: &[Uuid], +) -> Result<Vec<KnowledgePageSourceRef>> +where + E: PgExecutor<'e>, +{ + if page_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, KnowledgePageSourceRef>( + "\ +SELECT + ref_id, + page_id, + section_id, + source_kind, + source_id, + source_status, + source_updated_at, + source_content_hash, + source_snapshot, + citation_metadata, + created_at +FROM knowledge_page_source_refs +WHERE page_id = ANY($1::uuid[]) +ORDER BY page_id ASC, source_kind ASC, source_id ASC, ref_id ASC", + ) + .bind(page_ids) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Lists lint findings for one knowledge page. +pub async fn list_knowledge_page_lint_findings<'e, E>( + executor: E, + page_id: Uuid, +) -> Result<Vec<KnowledgePageLintFinding>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, KnowledgePageLintFinding>( + "\ +SELECT + finding_id, + page_id, + section_id, + finding_type, + severity, + source_kind, + source_id, + message, + details, + created_at +FROM knowledge_page_lint_findings +WHERE page_id = $1 +ORDER BY severity DESC, created_at ASC, finding_id ASC", + ) + .bind(page_id) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Searches derived knowledge page sections by page and section text. +pub async fn search_knowledge_page_sections<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + page_kind: Option<&str>, + query_pattern: &str, + limit: i64, +) -> Result<Vec<KnowledgePageSearchRow>> +where + E: PgExecutor<'e>, +{ + let rows = sqlx::query_as::<_, KnowledgePageSearchRow>( + "\ +WITH page_lint AS ( + SELECT + page_id, + count(*) FILTER (WHERE severity = 'error') AS error_count, + count(*) FILTER (WHERE severity = 'warning') AS warning_count, + count(*) FILTER (WHERE severity = 'info') AS info_count + FROM knowledge_page_lint_findings + GROUP BY page_id +), +section_refs AS ( + SELECT section_id, count(*) AS source_ref_count + FROM knowledge_page_source_refs + GROUP BY section_id +) +SELECT + p.page_id, + p.page_kind, + p.page_key, + p.title, + p.status, + p.source_coverage, + p.rebuild_metadata, + p.updated_at AS page_updated_at, + p.rebuilt_at, + s.section_id, + s.section_key, + s.heading, + s.role, + s.content, + s.ordinal, + s.citations, + s.unsupported_reason, + COALESCE(page_lint.error_count, 0)::bigint AS lint_error_count, + COALESCE(page_lint.warning_count, 0)::bigint AS lint_warning_count, + COALESCE(page_lint.info_count, 0)::bigint AS lint_info_count, + COALESCE(section_refs.source_ref_count, 0)::bigint AS section_source_ref_count +FROM knowledge_pages p +JOIN knowledge_page_sections s ON s.page_id = p.page_id +LEFT JOIN page_lint ON page_lint.page_id = p.page_id +LEFT JOIN section_refs ON section_refs.section_id = s.section_id +WHERE p.tenant_id = $1 + AND p.project_id = $2 + AND p.status IN ('active', 'stale') + AND ($3::text IS NULL OR p.page_kind = $3) + AND ( + lower(p.title) LIKE $4 + OR lower(p.page_key) LIKE $4 + OR lower(s.heading) LIKE $4 + OR lower(s.content) LIKE $4 + ) +ORDER BY + CASE + WHEN lower(p.title) LIKE $4 THEN 4 + WHEN lower(s.heading) LIKE $4 THEN 3 + WHEN lower(p.page_key) LIKE $4 THEN 2 + ELSE 1 + END DESC, + p.updated_at DESC, + s.ordinal ASC, + p.page_id DESC +LIMIT $5", + ) + .bind(tenant_id) + .bind(project_id) + .bind(page_kind) + .bind(query_pattern) + .bind(limit) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Fetches note sources by identifier for a knowledge page rebuild. +pub async fn fetch_knowledge_note_sources<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + note_ids: &[Uuid], +) -> Result<Vec<KnowledgeNoteSource>> +where + E: PgExecutor<'e>, +{ + if note_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, KnowledgeNoteSource>( + "\ +SELECT + note_id, + agent_id, + scope, + type AS note_type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref +FROM memory_notes +WHERE tenant_id = $1 + AND project_id = $2 + AND note_id = ANY($3::uuid[]) +ORDER BY updated_at ASC, note_id ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(note_ids) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Fetches durable add_event audit sources by decision identifier. +pub async fn fetch_knowledge_event_sources<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + decision_ids: &[Uuid], +) -> Result<Vec<KnowledgeEventSource>> +where + E: PgExecutor<'e>, +{ + if decision_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, KnowledgeEventSource>( + "\ +SELECT + decision_id, + agent_id, + scope, + pipeline, + note_type, + note_key, + note_id, + policy_decision, + note_op, + reason_code, + details, + ts +FROM memory_ingest_decisions +WHERE tenant_id = $1 + AND project_id = $2 + AND decision_id = ANY($3::uuid[]) + AND pipeline = 'add_event' +ORDER BY ts ASC, decision_id ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(decision_ids) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Fetches relation sources by graph fact identifier for a knowledge page rebuild. +pub async fn fetch_knowledge_relation_sources<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + fact_ids: &[Uuid], +) -> Result<Vec<KnowledgeRelationSource>> +where + E: PgExecutor<'e>, +{ + if fact_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, KnowledgeRelationSource>( + "\ +SELECT + gf.fact_id, + gf.agent_id, + gf.scope, + subject.canonical AS subject, + subject.kind AS subject_kind, + gf.predicate, + object_entity.canonical AS object_entity, + object_entity.kind AS object_kind, + gf.object_value, + gf.valid_from, + gf.valid_to, + gf.updated_at, + COALESCE( + jsonb_agg( + jsonb_build_object( + 'note_id', evidence.note_id, + 'status', note.status, + 'updated_at', note.updated_at + ) + ORDER BY evidence.created_at ASC, evidence.note_id ASC + ) FILTER (WHERE evidence.note_id IS NOT NULL), + '[]'::jsonb + ) AS evidence_notes +FROM graph_facts gf +JOIN graph_entities subject ON subject.entity_id = gf.subject_entity_id +LEFT JOIN graph_entities object_entity ON object_entity.entity_id = gf.object_entity_id +LEFT JOIN graph_fact_evidence evidence ON evidence.fact_id = gf.fact_id +LEFT JOIN memory_notes note ON note.note_id = evidence.note_id +WHERE gf.tenant_id = $1 + AND gf.project_id = $2 + AND gf.fact_id = ANY($3::uuid[]) +GROUP BY + gf.fact_id, + gf.agent_id, + gf.scope, + subject.canonical, + subject.kind, + gf.predicate, + object_entity.canonical, + object_entity.kind, + gf.object_value, + gf.valid_from, + gf.valid_to, + gf.updated_at +ORDER BY gf.updated_at ASC, gf.fact_id ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(fact_ids) + .fetch_all(executor) + .await?; + + Ok(rows) +} + +/// Fetches applied proposal sources by identifier for a knowledge page rebuild. +pub async fn fetch_knowledge_proposal_sources<'e, E>( + executor: E, + tenant_id: &str, + project_id: &str, + proposal_ids: &[Uuid], +) -> Result<Vec<KnowledgeProposalSource>> +where + E: PgExecutor<'e>, +{ + if proposal_ids.is_empty() { + return Ok(Vec::new()); + } + + let rows = sqlx::query_as::<_, KnowledgeProposalSource>( + "\ +SELECT + proposal_id, + run_id, + agent_id, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + COALESCE(unsupported_claim_flags, '[]'::jsonb) AS unsupported_claim_flags, + COALESCE(contradiction_markers, '[]'::jsonb) AS contradiction_markers, + COALESCE(staleness_markers, '[]'::jsonb) AS staleness_markers, + COALESCE(target_ref, '{}'::jsonb) AS target_ref, + COALESCE(proposed_payload, '{}'::jsonb) AS proposed_payload, + updated_at +FROM consolidation_proposals +WHERE tenant_id = $1 + AND project_id = $2 + AND proposal_id = ANY($3::uuid[]) + AND review_state = 'applied' +ORDER BY updated_at ASC, proposal_id ASC", + ) + .bind(tenant_id) + .bind(project_id) + .bind(proposal_ids) + .fetch_all(executor) + .await?; + + Ok(rows) +} diff --git a/packages/elf-storage/src/lib.rs b/packages/elf-storage/src/lib.rs index 595ec8f6..0631dabc 100644 --- a/packages/elf-storage/src/lib.rs +++ b/packages/elf-storage/src/lib.rs @@ -1,6 +1,22 @@ +#![cfg_attr(test, allow(unused_crate_dependencies))] + +//! Storage adapters and row models for ELF persistence backends. + +pub mod consolidation; pub mod db; +pub mod doc_outbox; +pub mod docs; +pub mod graph; +pub mod knowledge; pub mod models; pub mod outbox; pub mod qdrant; pub mod queries; pub mod schema; + +mod error; + +pub use error::Error; + +/// Storage-layer result type. +pub type Result<T, E = Error> = std::result::Result<T, E>; diff --git a/packages/elf-storage/src/models.rs b/packages/elf-storage/src/models.rs index d9fea7d5..2276d977 100644 --- a/packages/elf-storage/src/models.rs +++ b/packages/elf-storage/src/models.rs @@ -1,65 +1,639 @@ -#[derive(Debug, sqlx::FromRow)] +//! Database row models shared across storage modules. + +use serde_json::Value; +use sqlx::FromRow; +use time::OffsetDateTime; +use uuid::Uuid; + +/// Persisted memory note row. +#[derive(Debug, FromRow)] pub struct MemoryNote { - pub note_id: uuid::Uuid, + /// Note identifier. + pub note_id: Uuid, + /// Tenant that owns the note. pub tenant_id: String, + /// Project that owns the note. pub project_id: String, + /// Agent that wrote the note. pub agent_id: String, + /// Scope key for the note. pub scope: String, + /// Note type discriminator. pub r#type: String, + /// Optional application-defined key for deduplication or lookup. pub key: Option<String>, + /// Note body text. pub text: String, + /// Importance score persisted for ranking. pub importance: f32, + /// Confidence score persisted for ranking. pub confidence: f32, + /// Lifecycle status for the note. pub status: String, - pub created_at: time::OffsetDateTime, - pub updated_at: time::OffsetDateTime, - pub expires_at: Option<time::OffsetDateTime>, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Optional expiry timestamp. + pub expires_at: Option<OffsetDateTime>, + /// Embedding version associated with the stored note. pub embedding_version: String, - pub source_ref: serde_json::Value, + /// Structured source reference metadata. + pub source_ref: Value, + /// Search hit counter. pub hit_count: i64, - pub last_hit_at: Option<time::OffsetDateTime>, + /// Timestamp of the most recent search hit. + pub last_hit_at: Option<OffsetDateTime>, } -#[derive(Debug, sqlx::FromRow)] +/// Persisted chunk row for one memory note. +#[derive(Debug, FromRow)] pub struct MemoryNoteChunk { - pub chunk_id: uuid::Uuid, - pub note_id: uuid::Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, + /// Parent note identifier. + pub note_id: Uuid, + /// Zero-based chunk position within the note. pub chunk_index: i32, + /// Inclusive start byte offset within the original note text. pub start_offset: i32, + /// Exclusive end byte offset within the original note text. pub end_offset: i32, + /// Chunk text. pub text: String, + /// Embedding version associated with the chunk. pub embedding_version: String, - pub created_at: time::OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, } -#[derive(Debug, sqlx::FromRow)] +/// Persisted embedding row for one note chunk. +#[derive(Debug, FromRow)] pub struct NoteChunkEmbedding { - pub chunk_id: uuid::Uuid, + /// Chunk identifier. + pub chunk_id: Uuid, + /// Embedding version associated with the vector. pub embedding_version: String, + /// Embedding dimensionality. pub embedding_dim: i32, + /// Embedding vector payload. pub vec: Vec<f32>, - pub created_at: time::OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, } +/// In-memory embedding payload for a full note. #[derive(Debug)] pub struct NoteEmbedding { - pub note_id: uuid::Uuid, + /// Note identifier. + pub note_id: Uuid, + /// Embedding version associated with the vector. pub embedding_version: String, + /// Embedding dimensionality. pub embedding_dim: i32, + /// Embedding vector payload. pub vec: Vec<f32>, - pub created_at: time::OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, } -#[derive(Debug)] +/// Persisted note-indexing outbox row. +#[derive(Debug, FromRow)] pub struct IndexingOutboxEntry { - pub outbox_id: uuid::Uuid, - pub note_id: uuid::Uuid, + /// Outbox identifier. + pub outbox_id: Uuid, + /// Note identifier queued for indexing. + pub note_id: Uuid, + /// Requested indexing operation. + pub op: String, + /// Embedding version the worker should use. + pub embedding_version: String, + /// Current outbox status. + pub status: String, + /// Number of attempts already made. + pub attempts: i32, + /// Most recent failure text, if any. + pub last_error: Option<String>, + /// Earliest time the job may be claimed again. + pub available_at: OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted search-trace outbox job. +#[derive(Debug, FromRow)] +pub struct TraceOutboxJob { + /// Outbox identifier. + pub outbox_id: Uuid, + /// Trace identifier to export. + pub trace_id: Uuid, + /// Serialized trace payload. + pub payload: Value, + /// Number of attempts already made. + pub attempts: i32, +} + +/// Persisted graph entity row. +#[derive(Debug, FromRow)] +pub struct GraphEntity { + /// Entity identifier. + pub entity_id: Uuid, + /// Tenant that owns the entity. + pub tenant_id: String, + /// Project that owns the entity. + pub project_id: String, + /// Canonical entity surface. + pub canonical: String, + /// Normalized canonical entity surface. + pub canonical_norm: String, + /// Optional entity kind. + pub kind: Option<String>, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted alias row for a graph entity. +#[derive(Debug, FromRow)] +pub struct GraphEntityAlias { + /// Alias identifier. + pub alias_id: Uuid, + /// Entity identifier that owns the alias. + pub entity_id: Uuid, + /// Alias surface. + pub alias: String, + /// Normalized alias surface. + pub alias_norm: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted graph fact row. +#[derive(Debug, FromRow)] +pub struct GraphFact { + /// Fact identifier. + pub fact_id: Uuid, + /// Tenant that owns the fact. + pub tenant_id: String, + /// Project that owns the fact. + pub project_id: String, + /// Agent that emitted the fact. + pub agent_id: String, + /// Scope key for the fact. + pub scope: String, + /// Subject entity identifier. + pub subject_entity_id: Uuid, + /// Predicate surface captured with the fact. + pub predicate: String, + /// Resolved predicate identifier, when available. + pub predicate_id: Option<Uuid>, + /// Object entity identifier for entity-to-entity facts. + pub object_entity_id: Option<Uuid>, + /// Scalar object value for entity-to-value facts. + pub object_value: Option<String>, + /// Start of the fact validity window. + pub valid_from: OffsetDateTime, + /// End of the fact validity window, if superseded. + pub valid_to: Option<OffsetDateTime>, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Evidence link between one graph fact and one memory note. +#[derive(Debug, FromRow)] +pub struct GraphFactEvidence { + /// Evidence row identifier. + pub evidence_id: Uuid, + /// Fact identifier. + pub fact_id: Uuid, + /// Note identifier that supports the fact. + pub note_id: Uuid, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted graph predicate row. +#[derive(Debug, FromRow)] +pub struct GraphPredicate { + /// Predicate identifier. + pub predicate_id: Uuid, + /// Scope key where the predicate is visible. + pub scope_key: String, + /// Tenant scope, when tenant-specific. + pub tenant_id: Option<String>, + /// Project scope, when project-specific. + pub project_id: Option<String>, + /// Canonical predicate surface. + pub canonical: String, + /// Normalized canonical predicate surface. + pub canonical_norm: String, + /// Cardinality policy for the predicate. + pub cardinality: String, + /// Lifecycle status for the predicate. + pub status: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted alias row for a graph predicate. +#[derive(Debug, FromRow)] +pub struct GraphPredicateAlias { + /// Alias identifier. + pub alias_id: Uuid, + /// Predicate identifier that owns the alias. + pub predicate_id: Uuid, + /// Scope key where the alias resolves. + pub scope_key: String, + /// Alias surface. + pub alias: String, + /// Normalized alias surface. + pub alias_norm: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted supersession row linking two facts. +#[derive(Debug, FromRow)] +pub struct GraphFactSupersession { + /// Supersession identifier. + pub supersession_id: Uuid, + /// Tenant that owns the supersession record. + pub tenant_id: String, + /// Project that owns the supersession record. + pub project_id: String, + /// Fact identifier that was superseded. + pub from_fact_id: Uuid, + /// Fact identifier that replaced the prior fact. + pub to_fact_id: Uuid, + /// Note identifier that justified the supersession. + pub note_id: Uuid, + /// Time the supersession took effect. + pub effective_at: OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted consolidation run row. +#[derive(Debug, FromRow)] +pub struct ConsolidationRun { + /// Consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the run. + pub tenant_id: String, + /// Project that owns the run. + pub project_id: String, + /// Agent that registered the run. + pub agent_id: String, + /// Versioned consolidation contract schema. + pub contract_schema: String, + /// Job kind, such as fixture, manual, or scheduled. + pub job_kind: String, + /// Current run status. + pub status: String, + /// Serialized input references. + pub input_refs: Value, + /// Aggregate source snapshot metadata. + pub source_snapshot: Value, + /// Serialized run lineage. + pub lineage: Value, + /// Structured error payload for failed runs. + pub error: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Completion timestamp for terminal runs. + pub completed_at: Option<OffsetDateTime>, +} + +/// Persisted consolidation proposal row. +#[derive(Debug, FromRow)] +pub struct ConsolidationProposal { + /// Consolidation proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Agent that registered the proposal. + pub agent_id: String, + /// Versioned consolidation contract schema. + pub contract_schema: String, + /// Proposal kind, such as derived_note or knowledge_page. + pub proposal_kind: String, + /// Derived-output apply intent. + pub apply_intent: String, + /// Current review state. + pub review_state: String, + /// Serialized source references. + pub source_refs: Value, + /// Aggregate source snapshot metadata. + pub source_snapshot: Value, + /// Serialized proposal lineage. + pub lineage: Value, + /// Serialized reviewable diff. + pub diff: Value, + /// Proposal confidence score. + pub confidence: f32, + /// Serialized unsupported-claim flags. + pub unsupported_claim_flags: Value, + /// Serialized contradiction markers. + pub contradiction_markers: Value, + /// Serialized staleness markers. + pub staleness_markers: Value, + /// Serialized derived target reference. + pub target_ref: Value, + /// Serialized proposed derived output payload. + pub proposed_payload: Value, + /// Agent that last reviewed the proposal. + pub reviewer_agent_id: Option<String>, + /// Optional reviewer comment. + pub review_comment: Option<String>, + /// Timestamp of the last review transition. + pub reviewed_at: Option<OffsetDateTime>, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted consolidation proposal review event row. +#[derive(Debug, FromRow)] +pub struct ConsolidationProposalReviewEvent { + /// Review event identifier. + pub review_id: Uuid, + /// Reviewed proposal identifier. + pub proposal_id: Uuid, + /// Parent consolidation run identifier. + pub run_id: Uuid, + /// Tenant that owns the proposal. + pub tenant_id: String, + /// Project that owns the proposal. + pub project_id: String, + /// Agent that performed the review action. + pub reviewer_agent_id: String, + /// Review action requested by the reviewer. + pub action: String, + /// Review state before the transition. + pub from_review_state: String, + /// Review state after the transition. + pub to_review_state: String, + /// Optional reviewer comment. + pub review_comment: Option<String>, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted consolidation worker job row. +#[derive(Debug, FromRow)] +pub struct ConsolidationRunJob { + /// Worker job identifier. + pub job_id: Uuid, + /// Consolidation run to materialize. + pub run_id: Uuid, + /// Tenant that owns the run. + pub tenant_id: String, + /// Project that owns the run. + pub project_id: String, + /// Agent that registered the run. + pub agent_id: String, + /// Job kind, such as fixture or manual. + pub job_kind: String, + /// Current job status. + pub status: String, + /// Queued proposal payload. + pub payload: Value, + /// Number of attempts already made. + pub attempts: i32, + /// Most recent failure text, if any. + pub last_error: Option<String>, + /// Earliest time the job may be claimed again. + pub available_at: OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted derived knowledge page row. +#[derive(Debug, FromRow)] +pub struct KnowledgePage { + /// Derived page identifier. + pub page_id: Uuid, + /// Tenant that owns the page. + pub tenant_id: String, + /// Project that owns the page. + pub project_id: String, + /// Page kind, such as project, entity, concept, issue, or decision. + pub page_kind: String, + /// Stable page key within the tenant/project/kind namespace. + pub page_key: String, + /// Human-readable page title. + pub title: String, + /// Versioned knowledge page contract schema. + pub contract_schema: String, + /// Derived page lifecycle status. + pub status: String, + /// BLAKE3 hash of the canonical source snapshot. + pub rebuild_source_hash: String, + /// BLAKE3 hash of the canonical page payload. + pub content_hash: String, + /// Source coverage metadata. + pub source_coverage: Value, + /// Aggregate source snapshot metadata captured during rebuild. + pub source_snapshot: Value, + /// Rebuild metadata, including deterministic/provider information. + pub rebuild_metadata: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, + /// Last rebuild timestamp. + pub rebuilt_at: OffsetDateTime, +} + +/// Persisted derived knowledge page section row. +#[derive(Debug, FromRow)] +pub struct KnowledgePageSection { + /// Section identifier. + pub section_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Stable section key within one page. + pub section_key: String, + /// Section heading. + pub heading: String, + /// Section role, such as current_truth, history, relations, or proposals. + pub role: String, + /// Section content. + pub content: String, + /// Display order within the page. + pub ordinal: i32, + /// Serialized citation array for this section. + pub citations: Value, + /// Reason a section lacks citations, when intentionally unsupported. + pub unsupported_reason: Option<String>, + /// BLAKE3 hash of the section content and citations. + pub content_hash: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted normalized citation/source reference for a knowledge page. +#[derive(Debug, FromRow)] +pub struct KnowledgePageSourceRef { + /// Source-reference row identifier. + pub ref_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Section that cites the source, if section-scoped. + pub section_id: Option<Uuid>, + /// Source kind, such as note, relation, proposal, or event. + pub source_kind: String, + /// Authoritative source identifier. + pub source_id: Uuid, + /// Source lifecycle status captured during rebuild. + pub source_status: Option<String>, + /// Source last-update timestamp captured during rebuild. + pub source_updated_at: Option<OffsetDateTime>, + /// Source content hash captured during rebuild. + pub source_content_hash: Option<String>, + /// Full source snapshot captured during rebuild. + pub source_snapshot: Value, + /// Citation-local metadata. + pub citation_metadata: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted lint finding for one derived knowledge page. +#[derive(Debug, FromRow)] +pub struct KnowledgePageLintFinding { + /// Lint finding identifier. + pub finding_id: Uuid, + /// Parent page identifier. + pub page_id: Uuid, + /// Section associated with the finding, when available. + pub section_id: Option<Uuid>, + /// Finding type, such as stale_source_ref or unsupported_claim. + pub finding_type: String, + /// Finding severity. + pub severity: String, + /// Source kind associated with the finding, when available. + pub source_kind: Option<String>, + /// Source identifier associated with the finding, when available. + pub source_id: Option<Uuid>, + /// Human-readable finding message. + pub message: String, + /// Structured finding details. + pub details: Value, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted document row. +#[derive(Debug, FromRow)] +pub struct DocDocument { + /// Document identifier. + pub doc_id: Uuid, + /// Tenant that owns the document. + pub tenant_id: String, + /// Project that owns the document. + pub project_id: String, + /// Agent that ingested the document. + pub agent_id: String, + /// Scope key for the document. + pub scope: String, + /// Document type discriminator. + pub doc_type: String, + /// Lifecycle status for the document. + pub status: String, + /// Optional document title. + pub title: Option<String>, + /// Structured source reference metadata. + pub source_ref: Value, + /// Full document content. + pub content: String, + /// Byte length of the document content. + pub content_bytes: i32, + /// Content hash for deduplication and change detection. + pub content_hash: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, +} + +/// Persisted chunk row for one document. +#[derive(Debug, FromRow)] +pub struct DocChunk { + /// Chunk identifier. + pub chunk_id: Uuid, + /// Parent document identifier. + pub doc_id: Uuid, + /// Zero-based chunk position within the document. + pub chunk_index: i32, + /// Inclusive start byte offset within the original document content. + pub start_offset: i32, + /// Exclusive end byte offset within the original document content. + pub end_offset: i32, + /// Chunk text. + pub chunk_text: String, + /// Chunk content hash. + pub chunk_hash: String, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted embedding row for one document chunk. +#[derive(Debug, FromRow)] +pub struct DocChunkEmbedding { + /// Chunk identifier. + pub chunk_id: Uuid, + /// Embedding version associated with the vector. + pub embedding_version: String, + /// Embedding dimensionality. + pub embedding_dim: i32, + /// Embedding vector payload. + pub vec: Vec<f32>, + /// Creation timestamp. + pub created_at: OffsetDateTime, +} + +/// Persisted document-indexing outbox row. +#[derive(Debug, FromRow)] +pub struct DocIndexingOutboxEntry { + /// Outbox identifier. + pub outbox_id: Uuid, + /// Document identifier queued for indexing. + pub doc_id: Uuid, + /// Chunk identifier queued for indexing. + pub chunk_id: Uuid, + /// Requested indexing operation. pub op: String, + /// Embedding version the worker should use. pub embedding_version: String, + /// Current outbox status. pub status: String, + /// Number of attempts already made. pub attempts: i32, + /// Most recent failure text, if any. pub last_error: Option<String>, - pub available_at: time::OffsetDateTime, - pub created_at: time::OffsetDateTime, - pub updated_at: time::OffsetDateTime, + /// Earliest time the job may be claimed again. + pub available_at: OffsetDateTime, + /// Creation timestamp. + pub created_at: OffsetDateTime, + /// Last update timestamp. + pub updated_at: OffsetDateTime, } diff --git a/packages/elf-storage/src/outbox.rs b/packages/elf-storage/src/outbox.rs index 810d1af8..db46e85d 100644 --- a/packages/elf-storage/src/outbox.rs +++ b/packages/elf-storage/src/outbox.rs @@ -1,24 +1,222 @@ -// crates.io -use color_eyre::Result; +//! Note indexing and trace outbox helpers. + +use sqlx::PgExecutor; +use time::{Duration, OffsetDateTime}; use uuid::Uuid; -// self -use crate::db::Db; +use crate::{ + Result, + db::Db, + models::{IndexingOutboxEntry, TraceOutboxJob}, +}; -pub async fn enqueue_outbox( - db: &Db, +/// Enqueues one note for downstream indexing work. +pub async fn enqueue_outbox<'e, E>( + executor: E, note_id: Uuid, op: &str, embedding_version: &str, +) -> Result<()> +where + E: PgExecutor<'e>, +{ + sqlx::query( + "INSERT INTO indexing_outbox (outbox_id, note_id, op, embedding_version, status) \ +VALUES ($1,$2,$3,$4,'PENDING')", + ) + .bind(Uuid::new_v4()) + .bind(note_id) + .bind(op) + .bind(embedding_version) + .execute(executor) + .await?; + + Ok(()) +} + +/// Claims the next due note-indexing outbox job and leases it until `lease_seconds`. +pub async fn claim_next_indexing_outbox_job( + db: &Db, + now: OffsetDateTime, + lease_seconds: i64, +) -> Result<Option<IndexingOutboxEntry>> { + let mut tx = db.pool.begin().await?; + let row = sqlx::query_as::<_, IndexingOutboxEntry>( + "\ +SELECT + outbox_id, + note_id, + op, + embedding_version, + status, + attempts, + last_error, + available_at, + created_at, + updated_at +FROM indexing_outbox +WHERE status IN ('PENDING','FAILED') AND available_at <= $1 +ORDER BY available_at ASC +LIMIT 1 +FOR UPDATE SKIP LOCKED", + ) + .bind(now) + .fetch_optional(&mut *tx) + .await?; + let job = if let Some(mut job) = row { + let lease_until = now + Duration::seconds(lease_seconds); + + sqlx::query( + "UPDATE indexing_outbox SET available_at = $1, updated_at = $2 WHERE outbox_id = $3", + ) + .bind(lease_until) + .bind(now) + .bind(job.outbox_id) + .execute(&mut *tx) + .await?; + + job.available_at = lease_until; + job.updated_at = now; + + Some(job) + } else { + None + }; + + tx.commit().await?; + + Ok(job) +} + +/// Marks a note-indexing outbox job as completed. +pub async fn mark_indexing_outbox_done( + db: &Db, + outbox_id: Uuid, + now: OffsetDateTime, +) -> Result<()> { + sqlx::query("UPDATE indexing_outbox SET status = 'DONE', updated_at = $1 WHERE outbox_id = $2") + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + + Ok(()) +} + +/// Marks a note-indexing outbox job as failed and schedules its retry. +pub async fn mark_indexing_outbox_failed( + db: &Db, + outbox_id: Uuid, + attempts: i32, + error_text: &str, + available_at: OffsetDateTime, + now: OffsetDateTime, ) -> Result<()> { sqlx::query( - "INSERT INTO indexing_outbox (outbox_id, note_id, op, embedding_version, status) VALUES ($1,$2,$3,$4,'PENDING')", - ) - .bind(Uuid::new_v4()) - .bind(note_id) - .bind(op) - .bind(embedding_version) - .execute(&db.pool) - .await?; + "\ +UPDATE indexing_outbox +SET status = 'FAILED', + attempts = $1, + last_error = $2, + available_at = $3, + updated_at = $4 +WHERE outbox_id = $5", + ) + .bind(attempts) + .bind(error_text) + .bind(available_at) + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + + Ok(()) +} + +/// Claims the next due trace outbox job and leases it until `lease_seconds`. +pub async fn claim_next_trace_outbox_job( + db: &Db, + now: OffsetDateTime, + lease_seconds: i64, +) -> Result<Option<TraceOutboxJob>> { + let mut tx = db.pool.begin().await?; + let row = sqlx::query_as::<_, TraceOutboxJob>( + "\ +SELECT + outbox_id, + trace_id, + payload, + attempts +FROM search_trace_outbox +WHERE status IN ('PENDING','FAILED') AND available_at <= $1 +ORDER BY available_at ASC +LIMIT 1 +FOR UPDATE SKIP LOCKED", + ) + .bind(now) + .fetch_optional(&mut *tx) + .await?; + let job = if let Some(job) = row { + let lease_until = now + Duration::seconds(lease_seconds); + + sqlx::query( + "UPDATE search_trace_outbox SET available_at = $1, updated_at = $2 WHERE outbox_id = $3", + ) + .bind(lease_until) + .bind(now) + .bind(job.outbox_id) + .execute(&mut *tx) + .await?; + + Some(job) + } else { + None + }; + + tx.commit().await?; + + Ok(job) +} + +/// Marks a trace outbox job as completed. +pub async fn mark_trace_outbox_done(db: &Db, outbox_id: Uuid, now: OffsetDateTime) -> Result<()> { + sqlx::query( + "UPDATE search_trace_outbox SET status = 'DONE', updated_at = $1 WHERE outbox_id = $2", + ) + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + + Ok(()) +} + +/// Marks a trace outbox job as failed and schedules its retry. +pub async fn mark_trace_outbox_failed( + db: &Db, + outbox_id: Uuid, + attempts: i32, + error_text: &str, + available_at: OffsetDateTime, + now: OffsetDateTime, +) -> Result<()> { + sqlx::query( + "\ +UPDATE search_trace_outbox +SET status = 'FAILED', + attempts = $1, + last_error = $2, + available_at = $3, + updated_at = $4 +WHERE outbox_id = $5", + ) + .bind(attempts) + .bind(error_text) + .bind(available_at) + .bind(now) + .bind(outbox_id) + .execute(&db.pool) + .await?; + Ok(()) } diff --git a/packages/elf-storage/src/qdrant.rs b/packages/elf-storage/src/qdrant.rs index 1414a1f6..c8ad5fa1 100644 --- a/packages/elf-storage/src/qdrant.rs +++ b/packages/elf-storage/src/qdrant.rs @@ -1,20 +1,160 @@ -// crates.io -use color_eyre::Result; -use qdrant_client::Qdrant; +//! Qdrant collection bootstrap helpers. +use std::time::Duration; + +use qdrant_client::{ + QdrantError, + qdrant::{ + CreateCollectionBuilder, CreateFieldIndexCollection, Distance, FieldType, Modifier, + PayloadSchemaType, SparseVectorParamsBuilder, SparseVectorsConfigBuilder, + VectorParamsBuilder, VectorsConfigBuilder, + }, +}; + +use crate::{Error, Result}; + +/// Name of the dense vector stored in each Qdrant point. pub const DENSE_VECTOR_NAME: &str = "dense"; +/// Name of the sparse BM25 vector stored in each Qdrant point. pub const BM25_VECTOR_NAME: &str = "bm25"; +/// Sparse model identifier used for BM25 search. pub const BM25_MODEL: &str = "qdrant/bm25"; +/// Required payload indexes for the document-search collection. +pub const DOCS_SEARCH_FILTER_INDEXES: [(&str, PayloadSchemaType, FieldType); 9] = [ + ("scope", PayloadSchemaType::Keyword, FieldType::Keyword), + ("status", PayloadSchemaType::Keyword, FieldType::Keyword), + ("doc_type", PayloadSchemaType::Keyword, FieldType::Keyword), + ("agent_id", PayloadSchemaType::Keyword, FieldType::Keyword), + ("updated_at", PayloadSchemaType::Datetime, FieldType::Datetime), + ("doc_ts", PayloadSchemaType::Datetime, FieldType::Datetime), + ("thread_id", PayloadSchemaType::Keyword, FieldType::Keyword), + ("domain", PayloadSchemaType::Keyword, FieldType::Keyword), + ("repo", PayloadSchemaType::Keyword, FieldType::Keyword), +]; + +const DEFAULT_QDRANT_CLIENT_TIMEOUT_SECS: u64 = 60; +const DEFAULT_QDRANT_OPERATION_TIMEOUT_SECS: u64 = 60; +/// Qdrant collection handle plus the configured vector dimension. pub struct QdrantStore { - pub client: Qdrant, + /// Qdrant client used for collection and payload-index operations. + pub client: qdrant_client::Qdrant, + /// Collection name managed by this store. pub collection: String, + /// Dense vector dimension expected by the collection schema. pub vector_dim: u32, } - impl QdrantStore { + /// Builds a store from the configured default collection. pub fn new(cfg: &elf_config::Qdrant) -> Result<Self> { - let client = Qdrant::from_url(&cfg.url).build()?; - Ok(Self { client, collection: cfg.collection.clone(), vector_dim: cfg.vector_dim }) + Self::new_with_collection(cfg, cfg.collection.as_str()) + } + + /// Builds a store for the provided collection name. + pub fn new_with_collection(cfg: &elf_config::Qdrant, collection: &str) -> Result<Self> { + let client = qdrant_client::Qdrant::from_url(&cfg.url) + .timeout(Duration::from_secs(DEFAULT_QDRANT_CLIENT_TIMEOUT_SECS)) + .build()?; + + Ok(Self { client, collection: collection.to_string(), vector_dim: cfg.vector_dim }) + } + + /// Ensures the configured Qdrant collection exists with the required vector layout. + pub async fn ensure_collection(&self) -> Result<()> { + match self.client.collection_info(&self.collection).await { + Ok(_) => return Ok(()), + Err(err) if is_qdrant_not_found(&err) => {}, + Err(err) => return Err(err.into()), + } + + let mut vectors_config = VectorsConfigBuilder::default(); + + vectors_config.add_named_vector_params( + DENSE_VECTOR_NAME, + VectorParamsBuilder::new(self.vector_dim.into(), Distance::Cosine), + ); + + let mut sparse_vectors_config = SparseVectorsConfigBuilder::default(); + + sparse_vectors_config.add_named_vector_params( + BM25_VECTOR_NAME, + SparseVectorParamsBuilder::default().modifier(Modifier::Idf as i32), + ); + + let builder = CreateCollectionBuilder::new(self.collection.clone()) + .vectors_config(vectors_config) + .sparse_vectors_config(sparse_vectors_config) + .timeout(DEFAULT_QDRANT_OPERATION_TIMEOUT_SECS); + + match self.client.create_collection(builder).await { + Ok(_) => Ok(()), + Err(err) if is_qdrant_already_exists(&err) => Ok(()), + Err(err) => Err(err.into()), + } } + + /// Ensures the required payload indexes exist for the collection. + pub async fn ensure_payload_indexes( + &self, + required_indexes: &[(&str, PayloadSchemaType, FieldType)], + ) -> Result<()> { + let payload_schema = self + .client + .collection_info(&self.collection) + .await? + .result + .map(|info| info.payload_schema) + .unwrap_or_default(); + + for (field_name, payload_type, field_type) in required_indexes.iter() { + let existing = payload_schema.get(*field_name); + + if let Some(existing) = existing + && existing.data_type != *payload_type as i32 + { + return Err(Error::Conflict(format!( + "Qdrant collection {:?} has payload field {:?} with unexpected type (expected {:?}).", + self.collection, field_name, payload_type + ))); + } + + if existing.is_some() { + continue; + } + + let request = CreateFieldIndexCollection { + collection_name: self.collection.clone(), + wait: Some(true), + field_name: (*field_name).to_string(), + field_type: Some(*field_type as i32), + field_index_params: None, + ordering: None, + timeout: None, + }; + + match self.client.create_field_index(request).await { + Ok(_) => {}, + Err(err) if is_qdrant_already_exists(&err) => {}, + Err(err) => return Err(err.into()), + } + } + + Ok(()) + } +} + +fn qdrant_error_code(err: &QdrantError) -> Option<String> { + match err { + QdrantError::ResponseError { status } => Some(format!("{:?}", status.code())), + QdrantError::ResourceExhaustedError { status, .. } => Some(format!("{:?}", status.code())), + _ => None, + } +} + +fn is_qdrant_not_found(err: &QdrantError) -> bool { + qdrant_error_code(err).as_deref() == Some("NotFound") +} + +fn is_qdrant_already_exists(err: &QdrantError) -> bool { + qdrant_error_code(err).as_deref() == Some("AlreadyExists") } diff --git a/packages/elf-storage/src/queries.rs b/packages/elf-storage/src/queries.rs index 86fef773..71980cab 100644 --- a/packages/elf-storage/src/queries.rs +++ b/packages/elf-storage/src/queries.rs @@ -1,65 +1,129 @@ -// crates.io -use color_eyre::Result; +//! Memory note persistence queries. + +use sqlx::PgExecutor; use uuid::Uuid; -// self -use crate::{db::Db, models::MemoryNote}; +use crate::{Result, models::MemoryNote}; -pub async fn insert_note(db: &Db, note: &MemoryNote) -> Result<()> { +/// Inserts one memory note row. +pub async fn insert_note<'e, E>(executor: E, note: &MemoryNote) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query( - "INSERT INTO memory_notes (note_id, tenant_id, project_id, agent_id, scope, type, key, text, importance, confidence, status, created_at, updated_at, expires_at, embedding_version, source_ref, hit_count, last_hit_at)\ - VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18)", - ) - .bind(note.note_id) - .bind(¬e.tenant_id) - .bind(¬e.project_id) - .bind(¬e.agent_id) - .bind(¬e.scope) - .bind(¬e.r#type) - .bind(¬e.key) - .bind(¬e.text) - .bind(note.importance) - .bind(note.confidence) - .bind(¬e.status) - .bind(note.created_at) - .bind(note.updated_at) - .bind(note.expires_at) - .bind(¬e.embedding_version) - .bind(¬e.source_ref) - .bind(note.hit_count) - .bind(note.last_hit_at) - .execute(&db.pool) - .await?; + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note.note_id) + .bind(note.tenant_id.as_str()) + .bind(note.project_id.as_str()) + .bind(note.agent_id.as_str()) + .bind(note.scope.as_str()) + .bind(note.r#type.as_str()) + .bind(note.key.as_deref()) + .bind(note.text.as_str()) + .bind(note.importance) + .bind(note.confidence) + .bind(note.status.as_str()) + .bind(note.created_at) + .bind(note.updated_at) + .bind(note.expires_at) + .bind(note.embedding_version.as_str()) + .bind(¬e.source_ref) + .bind(note.hit_count) + .bind(note.last_hit_at) + .execute(executor) + .await?; + Ok(()) } -pub async fn update_note(db: &Db, note: &MemoryNote) -> Result<()> { +/// Updates mutable fields for one memory note row. +pub async fn update_note<'e, E>(executor: E, note: &MemoryNote) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query( - "UPDATE memory_notes SET text = $1, importance = $2, confidence = $3, updated_at = $4, expires_at = $5, source_ref = $6 WHERE note_id = $7", - ) - .bind(¬e.text) - .bind(note.importance) - .bind(note.confidence) - .bind(note.updated_at) - .bind(note.expires_at) - .bind(¬e.source_ref) - .bind(note.note_id) - .execute(&db.pool) - .await?; + "\ +UPDATE memory_notes +SET + text = $1, + importance = $2, + confidence = $3, + updated_at = $4, + expires_at = $5, + source_ref = $6 +WHERE note_id = $7", + ) + .bind(note.text.as_str()) + .bind(note.importance) + .bind(note.confidence) + .bind(note.updated_at) + .bind(note.expires_at) + .bind(¬e.source_ref) + .bind(note.note_id) + .execute(executor) + .await?; + Ok(()) } -pub async fn delete_note_chunks(db: &Db, note_id: Uuid) -> Result<()> { +/// Deletes all chunk rows for one memory note. +pub async fn delete_note_chunks<'e, E>(executor: E, note_id: Uuid) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query("DELETE FROM memory_note_chunks WHERE note_id = $1") .bind(note_id) - .execute(&db.pool) + .execute(executor) .await?; + Ok(()) } #[allow(clippy::too_many_arguments)] -pub async fn insert_note_chunk( - db: &Db, +/// Upserts one chunk row for a memory note. +pub async fn insert_note_chunk<'e, E>( + executor: E, chunk_id: Uuid, note_id: Uuid, chunk_index: i32, @@ -67,11 +131,27 @@ pub async fn insert_note_chunk( end_offset: i32, text: &str, embedding_version: &str, -) -> Result<()> { +) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query( - "INSERT INTO memory_note_chunks (chunk_id, note_id, chunk_index, start_offset, end_offset, text, embedding_version) \ - VALUES ($1,$2,$3,$4,$5,$6,$7) \ - ON CONFLICT (chunk_id) DO UPDATE SET text = EXCLUDED.text, start_offset = EXCLUDED.start_offset, end_offset = EXCLUDED.end_offset", + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7) +ON CONFLICT (chunk_id) DO UPDATE +SET + text = EXCLUDED.text, + start_offset = EXCLUDED.start_offset, + end_offset = EXCLUDED.end_offset", ) .bind(chunk_id) .bind(note_id) @@ -80,29 +160,39 @@ pub async fn insert_note_chunk( .bind(end_offset) .bind(text) .bind(embedding_version) - .execute(&db.pool) + .execute(executor) .await?; + Ok(()) } -pub async fn insert_note_chunk_embedding( - db: &Db, +/// Upserts one embedding vector for a note chunk. +pub async fn insert_note_chunk_embedding<'e, E>( + executor: E, chunk_id: Uuid, embedding_version: &str, embedding_dim: i32, vec: &str, -) -> Result<()> { +) -> Result<()> +where + E: PgExecutor<'e>, +{ sqlx::query( - "INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) \ - VALUES ($1,$2,$3,$4::vector) \ - ON CONFLICT (chunk_id, embedding_version) DO UPDATE \ - SET embedding_dim = EXCLUDED.embedding_dim, vec = EXCLUDED.vec, created_at = now()", + "\ +INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector) +ON CONFLICT (chunk_id, embedding_version) DO UPDATE +SET + embedding_dim = EXCLUDED.embedding_dim, + vec = EXCLUDED.vec, +created_at = now()", ) .bind(chunk_id) .bind(embedding_version) .bind(embedding_dim) .bind(vec) - .execute(&db.pool) + .execute(executor) .await?; + Ok(()) } diff --git a/packages/elf-storage/src/schema.rs b/packages/elf-storage/src/schema.rs index a10a530f..9bbafc56 100644 --- a/packages/elf-storage/src/schema.rs +++ b/packages/elf-storage/src/schema.rs @@ -1,22 +1,46 @@ +//! SQL schema rendering utilities. + +/// Renders the full storage bootstrap SQL with the configured vector dimension. pub fn render_schema(vector_dim: u32) -> String { let init = include_str!("../../../sql/init.sql"); let expanded = expand_includes(init); + expanded.replace("<VECTOR_DIM>", &vector_dim.to_string()) } fn expand_includes(sql: &str) -> String { let mut out = String::new(); + for line in sql.lines() { let trimmed = line.trim(); + if let Some(path) = trimmed.strip_prefix("\\ir ") { match path.trim() { "00_extensions.sql" => out.push_str(include_str!("../../../sql/00_extensions.sql")), "tables/001_memory_notes.sql" => out.push_str(include_str!("../../../sql/tables/001_memory_notes.sql")), + "tables/016_graph_entities.sql" => + out.push_str(include_str!("../../../sql/tables/016_graph_entities.sql")), + "tables/017_graph_entity_aliases.sql" => + out.push_str(include_str!("../../../sql/tables/017_graph_entity_aliases.sql")), + "tables/020_graph_predicates.sql" => + out.push_str(include_str!("../../../sql/tables/020_graph_predicates.sql")), + "tables/021_graph_predicate_aliases.sql" => out + .push_str(include_str!("../../../sql/tables/021_graph_predicate_aliases.sql")), + "tables/018_graph_facts.sql" => + out.push_str(include_str!("../../../sql/tables/018_graph_facts.sql")), + "tables/019_graph_fact_evidence.sql" => + out.push_str(include_str!("../../../sql/tables/019_graph_fact_evidence.sql")), + "tables/022_graph_fact_supersessions.sql" => out + .push_str(include_str!("../../../sql/tables/022_graph_fact_supersessions.sql")), + "tables/013_memory_note_fields.sql" => + out.push_str(include_str!("../../../sql/tables/013_memory_note_fields.sql")), "tables/009_memory_note_chunks.sql" => out.push_str(include_str!("../../../sql/tables/009_memory_note_chunks.sql")), "tables/010_note_chunk_embeddings.sql" => out.push_str(include_str!("../../../sql/tables/010_note_chunk_embeddings.sql")), + "tables/014_note_field_embeddings.sql" => + out.push_str(include_str!("../../../sql/tables/014_note_field_embeddings.sql")), "tables/002_note_embeddings.sql" => out.push_str(include_str!("../../../sql/tables/002_note_embeddings.sql")), "tables/003_memory_note_versions.sql" => @@ -27,16 +51,90 @@ fn expand_includes(sql: &str) -> String { out.push_str(include_str!("../../../sql/tables/005_indexing_outbox.sql")), "tables/006_search_traces.sql" => out.push_str(include_str!("../../../sql/tables/006_search_traces.sql")), + "tables/012_search_trace_candidates.sql" => out + .push_str(include_str!("../../../sql/tables/012_search_trace_candidates.sql")), + "tables/015_search_trace_stages.sql" => + out.push_str(include_str!("../../../sql/tables/015_search_trace_stages.sql")), "tables/007_search_trace_outbox.sql" => out.push_str(include_str!("../../../sql/tables/007_search_trace_outbox.sql")), "tables/008_llm_cache.sql" => out.push_str(include_str!("../../../sql/tables/008_llm_cache.sql")), + "tables/011_search_sessions.sql" => + out.push_str(include_str!("../../../sql/tables/011_search_sessions.sql")), + "tables/025_doc_documents.sql" => + out.push_str(include_str!("../../../sql/tables/025_doc_documents.sql")), + "tables/026_doc_chunks.sql" => + out.push_str(include_str!("../../../sql/tables/026_doc_chunks.sql")), + "tables/027_doc_chunk_embeddings.sql" => + out.push_str(include_str!("../../../sql/tables/027_doc_chunk_embeddings.sql")), + "tables/028_doc_indexing_outbox.sql" => + out.push_str(include_str!("../../../sql/tables/028_doc_indexing_outbox.sql")), + "tables/029_memory_ingestion_profiles.sql" => out.push_str(include_str!( + "../../../sql/tables/029_memory_ingestion_profiles.sql" + )), + "tables/030_memory_ingestion_profile_defaults.sql" => out.push_str(include_str!( + "../../../sql/tables/030_memory_ingestion_profile_defaults.sql" + )), + "tables/031_consolidation_runs.sql" => + out.push_str(include_str!("../../../sql/tables/031_consolidation_runs.sql")), + "tables/032_consolidation_proposals.sql" => out + .push_str(include_str!("../../../sql/tables/032_consolidation_proposals.sql")), + "tables/033_consolidation_proposal_reviews.sql" => out.push_str(include_str!( + "../../../sql/tables/033_consolidation_proposal_reviews.sql" + )), + "tables/034_consolidation_run_jobs.sql" => + out.push_str(include_str!("../../../sql/tables/034_consolidation_run_jobs.sql")), + "tables/035_knowledge_pages.sql" => + out.push_str(include_str!("../../../sql/tables/035_knowledge_pages.sql")), + "tables/036_knowledge_page_sections.sql" => out + .push_str(include_str!("../../../sql/tables/036_knowledge_page_sections.sql")), + "tables/037_knowledge_page_source_refs.sql" => out.push_str(include_str!( + "../../../sql/tables/037_knowledge_page_source_refs.sql" + )), + "tables/038_knowledge_page_lint_findings.sql" => out.push_str(include_str!( + "../../../sql/tables/038_knowledge_page_lint_findings.sql" + )), + "tables/039_core_memory_blocks.sql" => + out.push_str(include_str!("../../../sql/tables/039_core_memory_blocks.sql")), + "tables/040_core_memory_block_attachments.sql" => out.push_str(include_str!( + "../../../sql/tables/040_core_memory_block_attachments.sql" + )), + "tables/041_core_memory_block_events.sql" => out + .push_str(include_str!("../../../sql/tables/041_core_memory_block_events.sql")), + "tables/023_memory_ingest_decisions.sql" => out + .push_str(include_str!("../../../sql/tables/023_memory_ingest_decisions.sql")), + "tables/024_memory_space_grants.sql" => + out.push_str(include_str!("../../../sql/tables/024_memory_space_grants.sql")), _ => out.push_str(line), } } else { out.push_str(line); } + out.push('\n'); } + out } + +#[cfg(test)] +mod tests { + use crate::schema; + + #[test] + fn render_schema_expands_all_includes() { + let schema = schema::render_schema(4_096); + + assert!( + !schema.contains("\\ir "), + "rendered schema must not leave psql include directives" + ); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS knowledge_pages")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS knowledge_page_sections")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS knowledge_page_source_refs")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS knowledge_page_lint_findings")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS core_memory_blocks")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS core_memory_block_attachments")); + assert!(schema.contains("CREATE TABLE IF NOT EXISTS core_memory_block_events")); + } +} diff --git a/packages/elf-storage/tests/db_smoke.rs b/packages/elf-storage/tests/db_smoke.rs index c48a5503..7807c199 100644 --- a/packages/elf-storage/tests/db_smoke.rs +++ b/packages/elf-storage/tests/db_smoke.rs @@ -1,42 +1,193 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Integration tests for storage schema bootstrap. + use tokio::runtime::Runtime; +use uuid::Uuid; -// self +use elf_config::Postgres; use elf_storage::db::Db; use elf_testkit::TestDatabase; -#[tokio::test] -#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] -async fn db_connects_and_bootstraps() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!("Skipping db_connects_and_bootstraps; set ELF_PG_DSN to run this test."); - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = elf_config::Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - db.ensure_schema(3).await.expect("Failed to ensure schema."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - #[test] #[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] fn chunk_tables_exist_after_bootstrap() { let Some(dsn) = elf_testkit::env_dsn() else { eprintln!("Skipping chunk_tables_exist_after_bootstrap; set ELF_PG_DSN to run this test."); + return; }; let rt = Runtime::new().expect("Failed to build runtime."); + rt.block_on(async { - let cfg = elf_config::Postgres { dsn: dsn.clone(), pool_max_conns: 1 }; + let cfg = Postgres { dsn: dsn.clone(), pool_max_conns: 1 }; let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - db.ensure_schema(3).await.expect("Failed to ensure schema."); - let rows: (i64,) = sqlx::query_as( + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let count: i64 = sqlx::query_scalar( "SELECT count(*) FROM information_schema.tables WHERE table_name = 'memory_note_chunks'", ) .fetch_one(&db.pool) .await .expect("Failed to query schema tables."); - assert_eq!(rows.0, 1); + + assert_eq!(count, 1); + + let count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM information_schema.tables WHERE table_name = 'memory_ingest_decisions'", + ) + .fetch_one(&db.pool) + .await + .expect("Failed to query schema tables."); + + assert_eq!(count, 1); + + let count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM information_schema.tables WHERE table_name = 'consolidation_runs'", + ) + .fetch_one(&db.pool) + .await + .expect("Failed to query schema tables."); + + assert_eq!(count, 1); + + let count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM information_schema.tables WHERE table_name = 'consolidation_proposals'", + ) + .fetch_one(&db.pool) + .await + .expect("Failed to query schema tables."); + + assert_eq!(count, 1); + + let count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM information_schema.tables WHERE table_name = 'consolidation_proposal_reviews'", + ) + .fetch_one(&db.pool) + .await + .expect("Failed to query schema tables."); + + assert_eq!(count, 1); + + let count: i64 = sqlx::query_scalar( + "SELECT count(*) FROM information_schema.tables WHERE table_name = 'memory_space_grants'", + ) + .fetch_one(&db.pool) + .await + .expect("Failed to query schema tables."); + + assert_eq!(count, 1); }); } + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn db_connects_and_bootstraps() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping db_connects_and_bootstraps; set ELF_PG_DSN to run this test."); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn memory_space_grants_active_uniqueness_enforced() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping memory_space_grants_active_uniqueness_enforced; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let project_grant = r#" + INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + "#; + let first_project = sqlx::query(project_grant) + .bind(Uuid::parse_str("11111111-1111-1111-1111-111111111111").expect("uuid")) + .bind("tenant_alpha") + .bind("project_alpha") + .bind("project_shared") + .bind("owner_alpha") + .bind("project") + .bind(None::<String>) + .bind("granter_alpha"); + let first_project_result = first_project.execute(&db.pool).await; + + assert!( + first_project_result.is_ok(), + "Expected first project grant to insert cleanly: {first_project_result:?}" + ); + + let duplicate_project = sqlx::query(project_grant) + .bind(Uuid::parse_str("11111111-1111-1111-1111-111111111112").expect("uuid")) + .bind("tenant_alpha") + .bind("project_alpha") + .bind("project_shared") + .bind("owner_alpha") + .bind("project") + .bind(None::<String>) + .bind("granter_alpha"); + + assert!(duplicate_project.execute(&db.pool).await.is_err()); + + let agent_grant = r#" + INSERT INTO memory_space_grants ( + grant_id, + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_kind, + grantee_agent_id, + granted_by_agent_id + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + "#; + let first_agent = sqlx::query(agent_grant) + .bind(Uuid::parse_str("22222222-2222-2222-2222-222222222221").expect("uuid")) + .bind("tenant_alpha") + .bind("project_alpha") + .bind("project_shared") + .bind("owner_alpha") + .bind("agent") + .bind("grantee_alpha") + .bind("granter_alpha"); + + assert!(first_agent.execute(&db.pool).await.is_ok()); + + let duplicate_agent = sqlx::query(agent_grant) + .bind(Uuid::parse_str("22222222-2222-2222-2222-222222222222").expect("uuid")) + .bind("tenant_alpha") + .bind("project_alpha") + .bind("project_shared") + .bind("owner_alpha") + .bind("agent") + .bind("grantee_alpha") + .bind("granter_alpha"); + + assert!(duplicate_agent.execute(&db.pool).await.is_err()); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-storage/tests/graph_memory.rs b/packages/elf-storage/tests/graph_memory.rs new file mode 100644 index 00000000..c9e9fe57 --- /dev/null +++ b/packages/elf-storage/tests/graph_memory.rs @@ -0,0 +1,427 @@ +#![allow(unused_crate_dependencies)] + +//! Integration tests for graph and memory storage helpers. + +use sqlx::PgConnection; +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use elf_config::Postgres; +use elf_storage::{ + db::Db, + graph, + models::{GraphFact, MemoryNote}, + queries, +}; +use elf_testkit::TestDatabase; + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_entity_upsert_is_idempotent_by_normalized_canonical() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_entity_upsert_is_idempotent_by_normalized_canonical; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let tenant_id = "tenant-a"; + let project_id = "project-a"; + let entity_id = + graph::upsert_entity(&mut tx, tenant_id, project_id, " Alice Doe ", Some("person")) + .await + .expect("Failed to upsert canonical entity."); + let canonical_norm = graph::normalize_entity_name("Alice doe"); + + assert_eq!(canonical_norm, "alice doe"); + + let entity_again = + graph::upsert_entity(&mut tx, tenant_id, project_id, "Alice\tDoe", Some("person")) + .await + .expect("Failed to upsert canonical alias."); + + assert_eq!(entity_id, entity_again); + + tx.commit().await.expect("Failed to commit transaction."); + + assert!(test_db.cleanup().await.is_ok(), "Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fact_with_empty_evidence_is_rejected() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping graph_fact_with_empty_evidence_is_rejected; set ELF_PG_DSN to run."); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity A", None) + .await + .expect("Failed to upsert subject."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") + .await + .expect("Failed to resolve predicate."); + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + None, + Some("value"), + OffsetDateTime::now_utc(), + None, + &[], + ) + .await + .expect_err("Expected empty evidence to be rejected."); + + assert!(matches!(err, elf_storage::Error::InvalidArgument(_))); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fact_duplicates_with_active_window_fail_unique_constraint() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_fact_duplicates_with_active_window_fail_unique_constraint; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let object = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Object", None) + .await + .expect("Failed to upsert object."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + Some(object), + None, + now, + None, + &[note_id], + ) + .await + .expect("Failed to insert graph fact."); + + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + Some(object), + None, + now, + None, + &[note_id], + ) + .await; + + assert!(err.is_err()); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fact_rejects_invalid_valid_window() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping graph_fact_rejects_invalid_valid_window; set ELF_PG_DSN to run."); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expires") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "expires", + predicate.predicate_id, + None, + Some("value"), + now, + Some(now), + &[note_id], + ) + .await; + + assert!(err.is_err()); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fetch_active_facts_returns_active_window_only() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_fetch_active_facts_returns_active_window_only; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let active_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "active_fact") + .await + .expect("Failed to resolve predicate."); + let expired_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expired_fact") + .await + .expect("Failed to resolve predicate."); + let future_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "future_fact") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + let active = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "active_fact", + active_predicate.predicate_id, + None, + Some("alpha"), + now - Duration::hours(1), + None, + &[note_id], + ) + .await + .expect("Failed to insert active graph fact."); + + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "expired_fact", + expired_predicate.predicate_id, + None, + Some("beta"), + now - Duration::hours(2), + Some(now - Duration::minutes(1)), + &[note_id], + ) + .await + .expect("Failed to insert expired graph fact."); + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "future_fact", + future_predicate.predicate_id, + None, + Some("gamma"), + now + Duration::hours(1), + None, + &[note_id], + ) + .await + .expect("Failed to insert future graph fact."); + + let facts: Vec<GraphFact> = graph::fetch_active_facts_for_subject( + &mut tx, + "tenant-a", + "project-a", + "scope-a", + subject, + now, + ) + .await + .expect("Failed to fetch active graph facts."); + + assert_eq!(facts.len(), 1); + assert_eq!(facts[0].fact_id, active); + assert_eq!(facts[0].predicate, "active_fact"); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_predicate_guarded_update_conflicts_after_deprecate() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_predicate_guarded_update_conflicts_after_deprecate; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "mentors") + .await + .expect("Failed to resolve predicate."); + let updated_active = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + predicate.status.as_str(), + predicate.cardinality.as_str(), + Some("active"), + None, + ) + .await + .expect("Failed to activate predicate."); + let stale_expected_status = updated_active.status.clone(); + let stale_expected_cardinality = updated_active.cardinality.clone(); + let updated_deprecated = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + updated_active.status.as_str(), + updated_active.cardinality.as_str(), + Some("deprecated"), + None, + ) + .await + .expect("Failed to deprecate predicate."); + + assert_eq!(updated_deprecated.status, "deprecated"); + + let err = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + stale_expected_status.as_str(), + stale_expected_cardinality.as_str(), + None, + Some("single"), + ) + .await + .expect_err("Expected guarded update to conflict after deprecate."); + + assert!(matches!(err, elf_storage::Error::Conflict(_))); + + let predicate_now = graph::get_predicate_by_id(&mut tx, predicate.predicate_id) + .await + .expect("Failed to load predicate.") + .expect("Expected predicate row."); + + assert_eq!(predicate_now.status, "deprecated"); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +async fn insert_memory_note( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, +) -> Uuid { + let note_id = Uuid::new_v4(); + let note = MemoryNote { + note_id, + tenant_id: tenant_id.to_string(), + project_id: project_id.to_string(), + agent_id: "agent-a".to_string(), + scope: "scope-a".to_string(), + r#type: "fact".to_string(), + key: None, + text: "graph note evidence".to_string(), + importance: 1.0, + confidence: 1.0, + status: "active".to_string(), + created_at: OffsetDateTime::now_utc(), + updated_at: OffsetDateTime::now_utc(), + expires_at: None, + embedding_version: "test:vec:1".to_string(), + source_ref: serde_json::json!({}), + hit_count: 0, + last_hit_at: None, + }; + + queries::insert_note(executor, ¬e).await.expect("Failed to insert evidence note."); + + note_id +} diff --git a/packages/elf-storage/tests/outbox.rs b/packages/elf-storage/tests/outbox.rs index bc4acc95..36ddca49 100644 --- a/packages/elf-storage/tests/outbox.rs +++ b/packages/elf-storage/tests/outbox.rs @@ -1,7 +1,10 @@ -// crates.io +#![allow(unused_crate_dependencies)] + +//! Integration tests for storage outbox helpers. + use uuid::Uuid; -// self +use elf_config::Postgres; use elf_storage::{db::Db, outbox}; use elf_testkit::TestDatabase; @@ -10,15 +13,18 @@ use elf_testkit::TestDatabase; async fn enqueues_outbox_job() { let Some(base_dsn) = elf_testkit::env_dsn() else { eprintln!("Skipping enqueues_outbox_job; set ELF_PG_DSN to run this test."); + return; }; let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = elf_config::Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - db.ensure_schema(3).await.expect("Failed to ensure schema."); - outbox::enqueue_outbox(&db, Uuid::new_v4(), "UPSERT", "test:vector:1") + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + outbox::enqueue_outbox(&db.pool, Uuid::new_v4(), "UPSERT", "test:vector:1") .await .expect("Failed to enqueue outbox."); + test_db.cleanup().await.expect("Failed to cleanup test database."); } diff --git a/packages/elf-testkit/Cargo.toml b/packages/elf-testkit/Cargo.toml index d765b2a4..5d53bdb1 100644 --- a/packages/elf-testkit/Cargo.toml +++ b/packages/elf-testkit/Cargo.toml @@ -1,10 +1,11 @@ [package] edition = "2024" name = "elf-testkit" -version = "0.1.0" +version = "0.2.0" [dependencies] -color-eyre = { workspace = true } -sqlx = { workspace = true } -tokio = { workspace = true } -uuid = { workspace = true } +qdrant-client = { workspace = true } +sqlx = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +uuid = { workspace = true } diff --git a/packages/elf-testkit/src/error.rs b/packages/elf-testkit/src/error.rs new file mode 100644 index 00000000..2ec59531 --- /dev/null +++ b/packages/elf-testkit/src/error.rs @@ -0,0 +1,23 @@ +/// Result alias for ELF testkit helpers. +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Errors returned by ELF integration-test helpers. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// A helper-specific failure message. + #[error("{0}")] + Message(String), + + /// SQLx returned an error while creating or cleaning test databases. + #[error(transparent)] + Sqlx(#[from] sqlx::Error), + + /// Qdrant returned an error while managing test collections. + #[error(transparent)] + Qdrant(#[from] Box<qdrant_client::QdrantError>), +} +impl From<qdrant_client::QdrantError> for Error { + fn from(err: qdrant_client::QdrantError) -> Self { + Self::Qdrant(Box::new(err)) + } +} diff --git a/packages/elf-testkit/src/lib.rs b/packages/elf-testkit/src/lib.rs index 01a9f4e2..591e3d29 100644 --- a/packages/elf-testkit/src/lib.rs +++ b/packages/elf-testkit/src/lib.rs @@ -1,114 +1,179 @@ -// std -use std::{env, future::Future, str::FromStr, thread}; +//! Test helpers for ephemeral Postgres databases and Qdrant collections. -// crates.io -use color_eyre::eyre::{self, WrapErr}; +mod error; + +pub use error::{Error, Result}; + +use std::{ + collections::HashSet, env, future::Future, str::FromStr, sync::Mutex, thread, time::Duration, +}; + +use qdrant_client::Qdrant; use sqlx::{ - ConnectOptions, Connection, Executor, + AssertSqlSafe, ConnectOptions, Connection, postgres::{PgConnectOptions, PgConnection}, }; -use tokio::runtime::Builder; +use tokio::{runtime::Builder, time}; use uuid::Uuid; const ADMIN_DATABASES: [&str; 2] = ["postgres", "template1"]; -pub fn env_dsn() -> Option<String> { - env::var("ELF_PG_DSN").ok() -} - +/// Ephemeral test database handle with tracked Qdrant collections for cleanup. pub struct TestDatabase { name: String, dsn: String, admin_options: PgConnectOptions, cleaned: bool, + collections: Mutex<HashSet<String>>, } - impl TestDatabase { - pub async fn new(base_dsn: &str) -> color_eyre::Result<Self> { - let base_options: PgConnectOptions = - PgConnectOptions::from_str(base_dsn).wrap_err("Failed to parse ELF_PG_DSN.")?; + /// Creates a fresh temporary Postgres database from a base admin DSN. + pub async fn new(base_dsn: &str) -> Result<Self> { + let base_options: PgConnectOptions = PgConnectOptions::from_str(base_dsn) + .map_err(|err| Error::Message(format!("Failed to parse ELF_PG_DSN: {err}.")))?; let (admin_options, mut admin_conn) = connect_admin(&base_options).await?; let name = format!("elf_test_{}", Uuid::new_v4().simple()); let create_sql = format!(r#"CREATE DATABASE "{}""#, name); - admin_conn - .execute(create_sql.as_str()) + + sqlx::raw_sql(AssertSqlSafe(create_sql)) + .execute(&mut admin_conn) .await - .wrap_err("Failed to create test database.")?; + .map_err(|err| Error::Message(format!("Failed to create test database: {err}.")))?; + let dsn = base_options.clone().database(&name).to_url_lossy().to_string(); - Ok(Self { name, dsn, admin_options, cleaned: false }) + Ok(Self { + name, + dsn, + admin_options, + cleaned: false, + collections: Mutex::new(HashSet::new()), + }) } + /// Returns the DSN for the temporary test database. pub fn dsn(&self) -> &str { &self.dsn } + /// Returns the generated database name. pub fn name(&self) -> &str { &self.name } + /// Returns a unique collection prefix and tracks the related Qdrant collections. pub fn collection_name(&self, prefix: &str) -> String { - format!("{prefix}_{}", self.name) + let collection = format!("{prefix}_{}", self.name); + let docs_collection = format!("{collection}_docs"); + let mut tracked = self.collections.lock().unwrap_or_else(|err| err.into_inner()); + + tracked.insert(collection.clone()); + tracked.insert(docs_collection); + + collection } - pub async fn cleanup(mut self) -> color_eyre::Result<()> { + /// Drops the temporary database and any tracked Qdrant collections. + pub async fn cleanup(mut self) -> Result<()> { self.cleanup_inner().await } - async fn cleanup_inner(&mut self) -> color_eyre::Result<()> { + async fn cleanup_inner(&mut self) -> Result<()> { if self.cleaned { return Ok(()); } - cleanup_database(&self.name, &self.admin_options).await?; + + let collections = { + let tracked = self.collections.lock().unwrap_or_else(|err| err.into_inner()); + + tracked.iter().cloned().collect::<Vec<_>>() + }; + let db_result = cleanup_database(&self.name, &self.admin_options).await; + let qdrant_result = cleanup_qdrant_collections(&collections).await; + + db_result?; + qdrant_result?; + self.cleaned = true; + Ok(()) } } - impl Drop for TestDatabase { fn drop(&mut self) { if self.cleaned { return; } + let name = self.name.clone(); let admin_options = self.admin_options.clone(); - let _ = thread::spawn(move || { + let collections = self + .collections + .lock() + .unwrap_or_else(|err| err.into_inner()) + .iter() + .cloned() + .collect::<Vec<_>>(); + let cleanup_thread = thread::spawn(move || { let runtime = match Builder::new_current_thread().enable_all().build() { Ok(runtime) => runtime, Err(err) => { eprintln!("Test database cleanup failed: {err}."); + return; }, }; + + if let Err(err) = runtime.block_on(cleanup_qdrant_collections(&collections)) { + eprintln!("Test Qdrant cleanup failed: {err}."); + } if let Err(err) = runtime.block_on(cleanup_database(&name, &admin_options)) { eprintln!("Test database cleanup failed: {err}."); } }); + let _ = cleanup_thread.join(); } } -pub async fn with_test_db<F, Fut, T>(base_dsn: &str, f: F) -> color_eyre::Result<T> +/// Returns `ELF_PG_DSN` when it is available for integration tests. +pub fn env_dsn() -> Option<String> { + env::var("ELF_PG_DSN").ok() +} + +/// Returns the configured Qdrant URL for integration tests. +pub fn env_qdrant_url() -> Option<String> { + env::var("ELF_QDRANT_GRPC_URL").or_else(|_| env::var("ELF_QDRANT_URL")).ok() +} + +/// Runs an async test closure with a temporary database and guaranteed cleanup. +pub async fn with_test_db<F, Fut, T>(base_dsn: &str, f: F) -> Result<T> where F: FnOnce(&TestDatabase) -> Fut, - Fut: Future<Output = color_eyre::Result<T>>, + Fut: Future<Output = Result<T>>, { - let mut db = TestDatabase::new(base_dsn).await?; + let db = TestDatabase::new(base_dsn).await?; let result = f(&db).await; + let mut db = db; + if let Err(err) = db.cleanup_inner().await { eprintln!("Test database cleanup warning: {err}."); + if result.is_ok() { return Err(err); } } + result } async fn connect_admin( base_options: &PgConnectOptions, -) -> color_eyre::Result<(PgConnectOptions, PgConnection)> { +) -> Result<(PgConnectOptions, PgConnection)> { let mut last_err = None; + for database in ADMIN_DATABASES { let options = base_options.clone().database(database); + match PgConnection::connect_with(&options).await { Ok(conn) => return Ok((options, conn)), Err(err) => { @@ -116,24 +181,94 @@ async fn connect_admin( }, } } - Err(eyre::eyre!("Failed to connect to an admin database: {:?}", last_err)) + + Err(Error::Message(format!("Failed to connect to an admin database: {last_err:?}."))) } -async fn cleanup_database(name: &str, admin_options: &PgConnectOptions) -> color_eyre::Result<()> { - let mut conn = PgConnection::connect_with(admin_options) - .await - .wrap_err("Failed to connect to admin database for cleanup.")?; +async fn cleanup_database(name: &str, admin_options: &PgConnectOptions) -> Result<()> { + let conn = PgConnection::connect_with(admin_options).await.map_err(|err| { + Error::Message(format!("Failed to connect to admin database for cleanup: {err}.")) + })?; + let drop_sql = format!(r#"DROP DATABASE IF EXISTS "{}""#, name); + let mut conn = conn; let _ = sqlx::query( - "SELECT pg_terminate_backend(pid) FROM pg_stat_activity \ - WHERE datname = $1 AND pid <> pg_backend_pid()", + "\ +SELECT pg_terminate_backend(pid) +FROM pg_stat_activity +WHERE datname = $1 AND pid <> pg_backend_pid()", ) .bind(name) - .execute(&mut conn) + .fetch_all(&mut conn) .await; - let drop_sql = format!(r#"DROP DATABASE IF EXISTS "{}""#, name); - sqlx::query(drop_sql.as_str()) + + sqlx::raw_sql(AssertSqlSafe(drop_sql)) .execute(&mut conn) .await - .wrap_err("Failed to drop test database.")?; + .map_err(|err| Error::Message(format!("Failed to drop test database: {err}.")))?; + + Ok(()) +} + +async fn cleanup_qdrant_collections(collections: &[String]) -> Result<()> { + if collections.is_empty() { + return Ok(()); + } + + let Some(qdrant_url) = env_qdrant_url() else { + eprintln!( + "Skipping Qdrant cleanup; set ELF_QDRANT_GRPC_URL (or ELF_QDRANT_URL) to delete test collections." + ); + + return Ok(()); + }; + let client = Qdrant::from_url(&qdrant_url) + .build() + .map_err(|err| Error::Message(format!("Failed to build Qdrant client: {err}.")))?; + let max_attempts = 6; + let mut remaining = collections.iter().cloned().collect::<HashSet<_>>(); + let mut backoff = Duration::from_millis(100); + + for attempt in 1..=max_attempts { + let existing = time::timeout(Duration::from_secs(10), client.list_collections()) + .await + .map_err(|_| Error::Message("Qdrant list_collections timed out.".to_string()))? + .map_err(|err| Error::Message(format!("Failed to list Qdrant collections: {err}.")))?; + let existing = existing.collections.into_iter().map(|c| c.name).collect::<HashSet<_>>(); + + remaining.retain(|collection| existing.contains(collection)); + + if remaining.is_empty() { + return Ok(()); + } + + for collection in remaining.iter().cloned().collect::<Vec<_>>() { + let result = time::timeout( + Duration::from_secs(10), + client.delete_collection(collection.clone()), + ) + .await; + + match result { + Ok(Ok(_)) => {}, + Ok(Err(err)) => + if attempt == max_attempts { + return Err(Error::Message(format!( + "Failed to delete Qdrant collection {collection:?} after {attempt} attempts: {err}." + ))); + }, + Err(_) => + if attempt == max_attempts { + return Err(Error::Message(format!( + "Timed out deleting Qdrant collection {collection:?} after {attempt} attempts." + ))); + }, + } + } + + time::sleep(backoff).await; + + backoff = backoff.saturating_mul(2).min(Duration::from_secs(2)); + } + Ok(()) } diff --git a/qdrant/init.sh b/qdrant/init.sh index 27b1c17a..4449da28 100755 --- a/qdrant/init.sh +++ b/qdrant/init.sh @@ -5,9 +5,58 @@ set -euo pipefail : "${ELF_QDRANT_COLLECTION:?Set ELF_QDRANT_COLLECTION to the collection name.}" : "${ELF_QDRANT_VECTOR_DIM:?Set ELF_QDRANT_VECTOR_DIM to the dense vector dimension.}" -curl -sS -X PUT "${ELF_QDRANT_HTTP_URL}/collections/${ELF_QDRANT_COLLECTION}?wait=true" \ - -H 'Content-Type: application/json' \ - -d @- <<JSON +collections=("${ELF_QDRANT_COLLECTION}") + +if [[ -n "${ELF_QDRANT_DOCS_COLLECTION:-}" ]]; then + collections+=("${ELF_QDRANT_DOCS_COLLECTION}") +fi + +create_payload_index() { + local collection=$1 + local payload=$2 + local field_name + local response + local status + response="$(mktemp)" + field_name="${payload#*\"field_name\":\"}" + field_name="${field_name%%\"*}" + + status=$(curl -sS -w '%{http_code}' -o "$response" -X PUT \ + "${ELF_QDRANT_HTTP_URL}/collections/${collection}/index?wait=true" \ + -H 'Content-Type: application/json' \ + -d "$payload" + ) + + if [[ "$status" == 2* ]]; then + rm -f "$response" + return + fi + + if grep -qi "already.*exists" "$response"; then + rm -f "$response" + return + fi + + echo "Failed to create payload index for field '${field_name}' in ${collection}. HTTP ${status}." >&2 + echo "Response body: $(cat "$response")" >&2 + rm -f "$response" + exit 1 +} + +for collection in "${collections[@]}"; do + collection_exists=false + + if curl -fsS "${ELF_QDRANT_HTTP_URL}/collections/${collection}" >/dev/null 2>&1; then + echo "Qdrant collection ${collection} already exists. Skipping create." + collection_exists=true + fi + + if [[ "$collection_exists" == "false" ]]; then + echo "Creating Qdrant collection ${collection}." + + curl -sS -X PUT "${ELF_QDRANT_HTTP_URL}/collections/${collection}?wait=true" \ + -H 'Content-Type: application/json' \ + -d @- <<JSON { "vectors": { "dense": { @@ -22,3 +71,17 @@ curl -sS -X PUT "${ELF_QDRANT_HTTP_URL}/collections/${ELF_QDRANT_COLLECTION}?wai } } JSON + fi + + if [[ -n "${ELF_QDRANT_DOCS_COLLECTION:-}" && "${collection}" == "${ELF_QDRANT_DOCS_COLLECTION}" ]]; then + create_payload_index "$collection" '{"field_name":"scope","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"status","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"doc_type","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"agent_id","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"updated_at","field_schema":"datetime"}' + create_payload_index "$collection" '{"field_name":"doc_ts","field_schema":"datetime"}' + create_payload_index "$collection" '{"field_name":"thread_id","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"domain","field_schema":"keyword"}' + create_payload_index "$collection" '{"field_name":"repo","field_schema":"keyword"}' + fi +done diff --git a/rust-toolchain.toml b/rust-toolchain.toml index e5da0e75..b36e8bee 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] channel = "stable" -components = ["cargo", "clippy", "rust-src", "rustc", "rustfmt"] +components = ["cargo", "clippy", "rust-analyzer", "rust-src", "rustc", "rustfmt"] profile = "minimal" diff --git a/scripts/baseline-docker.sh b/scripts/baseline-docker.sh new file mode 100755 index 00000000..a6e38d82 --- /dev/null +++ b/scripts/baseline-docker.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +set -euo pipefail + +profile="${1:-}" +if [ -z "$profile" ]; then + echo "usage: scripts/baseline-docker.sh <profile>" >&2 + exit 2 +fi + +head="$(git rev-parse HEAD)" +if [ -n "$(git status --porcelain)" ]; then + head="$head+dirty" +fi + +run_baseline() { + docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner +} + +selected_projects_or_default() { + local selected_projects + selected_projects="$(printenv ELF_BASELINE_PROJECTS || true)" + if [ -z "$selected_projects" ]; then + selected_projects="ELF" + fi + printf '%s' "$selected_projects" +} + +case "$profile" in +live) + export ELF_BASELINE_ELF_HEAD="$head" + run_baseline + ;; +backfill) + selected_projects="$(selected_projects_or_default)" + selected_profile="$(printenv ELF_BASELINE_PROFILE || true)" + if [ -z "$selected_profile" ]; then + selected_profile="backfill" + fi + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="2000" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="3600" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="3600" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE="$selected_profile" + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +openmemory-ui-export-readback) + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=mem0 + run_baseline + ;; +production-synthetic) + selected_projects="$(selected_projects_or_default)" + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-synthetic + run_baseline + ;; +production-private) + manifest="$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)" + if [ -z "$manifest" ]; then + echo "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private" >&2 + exit 1 + fi + selected_projects="$(selected_projects_or_default)" + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-private + run_baseline + ;; +production-private-addendum) + manifest="$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)" + if [ -z "$manifest" ]; then + echo "ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private-addendum" >&2 + exit 1 + fi + selected_projects="$(selected_projects_or_default)" + addendum="$(printenv ELF_BASELINE_PRIVATE_ADDENDUM || true)" + if [ -z "$addendum" ]; then + addendum="tmp/live-baseline/private-production-addendum.md" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS="$selected_projects" + export ELF_BASELINE_PROFILE=production-private + run_baseline + ELF_BASELINE_MARKDOWN_REPORT="$addendum" bash scripts/live-baseline-report-to-md.sh + echo "Private production addendum: $addendum" + ;; +backfill-10k) + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="10000" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="14400" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=backfill + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +backfill-100k) + enabled="$(printenv ELF_BASELINE_ENABLE_EXPENSIVE || true)" + if [ "$enabled" != "1" ]; then + echo "ELF_BASELINE_ENABLE_EXPENSIVE=1 is required for baseline-backfill-100k-docker" >&2 + exit 1 + fi + backfill_docs="$(printenv ELF_BASELINE_BACKFILL_DOCS || true)" + if [ -z "$backfill_docs" ]; then + backfill_docs="100000" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="86400" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=backfill + export ELF_BASELINE_BACKFILL_DOCS="$backfill_docs" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +soak) + soak_seconds="$(printenv ELF_BASELINE_SOAK_SECONDS || true)" + if [ -z "$soak_seconds" ]; then + soak_seconds="3600" + fi + elf_timeout="$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)" + if [ -z "$elf_timeout" ]; then + elf_timeout="$((soak_seconds + 1800))" + fi + max_elf_seconds="$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)" + if [ -z "$max_elf_seconds" ]; then + max_elf_seconds="$elf_timeout" + fi + export ELF_BASELINE_ELF_HEAD="$head" + export ELF_BASELINE_PROJECTS=ELF + export ELF_BASELINE_PROFILE=stress + export ELF_BASELINE_SOAK_SECONDS="$soak_seconds" + export ELF_BASELINE_ELF_TIMEOUT_SECONDS="$elf_timeout" + export ELF_BASELINE_MAX_ELF_SECONDS="$max_elf_seconds" + run_baseline + ;; +*) + echo "unknown baseline profile: $profile" >&2 + exit 2 + ;; +esac diff --git a/scripts/check-docs.py b/scripts/check-docs.py new file mode 100755 index 00000000..9f64d34e --- /dev/null +++ b/scripts/check-docs.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import re +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +TASK_RE = re.compile(r"^\[tasks\.([^\]]+)\]", re.MULTILINE) +CARGO_MAKE_RE = re.compile(r"\bcargo\s+make\s+([A-Za-z0-9][A-Za-z0-9_:-]*)") +MARKDOWN_LINK_RE = re.compile(r"!?\[[^\]\n]*\]\(([^)\n]+)\)") + + +def read_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def cargo_make_tasks() -> set[str]: + return set(TASK_RE.findall(read_text(ROOT / "Makefile.toml"))) + + +def iter_reference_files() -> list[Path]: + roots = [ + ROOT / "README.md", + ROOT / "AGENTS.md", + ROOT / "docs", + ROOT / ".github" / "workflows", + ] + files: list[Path] = [] + for root in roots: + if root.is_file(): + files.append(root) + continue + if root.is_dir(): + files.extend( + path + for path in root.rglob("*") + if path.suffix in {".md", ".yml", ".yaml"} + ) + return sorted(files) + + +def iter_markdown_files() -> list[Path]: + return [ + path + for path in iter_reference_files() + if path.suffix == ".md" + ] + + +def normalize_link_target(raw_target: str) -> str: + target = raw_target.strip() + if target.startswith("<") and ">" in target: + target = target[1:target.index(">")] + elif " " in target: + target = target.split(maxsplit=1)[0] + return target + + +def is_external_or_anchor(target: str) -> bool: + return ( + not target + or target.startswith("#") + or target.startswith("/") + or bool(re.match(r"^[A-Za-z][A-Za-z0-9+.-]*:", target)) + ) + + +def check_cargo_make_references(tasks: set[str]) -> list[str]: + errors: list[str] = [] + for path in iter_reference_files(): + for line_number, line in enumerate(read_text(path).splitlines(), start=1): + for match in CARGO_MAKE_RE.finditer(line): + task = match.group(1) + if task not in tasks: + rel_path = path.relative_to(ROOT) + errors.append(f"{rel_path}:{line_number}: unknown cargo make task `{task}`") + return errors + + +def check_markdown_links() -> list[str]: + errors: list[str] = [] + for path in iter_markdown_files(): + for line_number, line in enumerate(read_text(path).splitlines(), start=1): + for match in MARKDOWN_LINK_RE.finditer(line): + target = normalize_link_target(match.group(1)) + if is_external_or_anchor(target): + continue + path_part = target.split("#", maxsplit=1)[0] + if not path_part: + continue + candidate = ( + ROOT / path_part.removeprefix("/") + if path_part.startswith("/") + else path.parent / path_part + ) + if not candidate.exists(): + rel_path = path.relative_to(ROOT) + errors.append(f"{rel_path}:{line_number}: broken local link `{target}`") + return errors + + +def main() -> int: + errors = check_cargo_make_references(cargo_make_tasks()) + errors.extend(check_markdown_links()) + if errors: + for error in errors: + print(error, file=sys.stderr) + return 1 + print("check-docs passed") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/consolidation-harness.sh b/scripts/consolidation-harness.sh new file mode 100755 index 00000000..8816fa82 --- /dev/null +++ b/scripts/consolidation-harness.sh @@ -0,0 +1,570 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +if [[ -f "${ROOT_DIR}/.env" ]]; then + set -a + # shellcheck disable=SC1090 + source "${ROOT_DIR}/.env" + set +a +fi + +: "${ELF_PG_DSN:?Set ELF_PG_DSN to a Postgres DSN (usually .../postgres).}" +: "${ELF_QDRANT_HTTP_URL:?Set ELF_QDRANT_HTTP_URL to the Qdrant REST base URL, for example http://127.0.0.1:51889 (default: http://127.0.0.1:6333).}" + +QDRANT_GRPC_URL="${ELF_QDRANT_GRPC_URL:-${ELF_QDRANT_URL:-}}" +if [[ -z "${QDRANT_GRPC_URL}" ]]; then + echo "Set ELF_QDRANT_GRPC_URL to the Qdrant gRPC base URL, for example http://127.0.0.1:51890 (default: http://127.0.0.1:6334). Legacy alias ELF_QDRANT_URL is deprecated but still supported." + exit 1 +fi + +if command -v jaq >/dev/null 2>&1; then + JSON_TOOL="jaq" +elif command -v jq >/dev/null 2>&1; then + JSON_TOOL="jq" +else + echo "Missing jaq/jq. Install jaq (recommended) or jq." >&2 + exit 1 +fi + +for cmd in curl psql; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd}." >&2 + exit 1 + fi +done + +RUN_ID="${ELF_HARNESS_RUN_ID:-"$(date +%s)-$$"}" + +DB_NAME="${ELF_HARNESS_DB_NAME:-elf_consolidation}" +QDRANT_COLLECTION="${ELF_HARNESS_COLLECTION:-elf_harness_consolidation_${RUN_ID}}" +VECTOR_DIM="${ELF_HARNESS_VECTOR_DIM:-4096}" +TOP_K="${ELF_HARNESS_TOP_K:-3}" +CANDIDATE_K="${ELF_HARNESS_CANDIDATE_K:-30}" +TARGET_KEY="incident_merge_protocol" + +if [[ ! "${DB_NAME}" =~ ^elf_ ]]; then + echo "ELF_HARNESS_DB_NAME must start with elf_ to avoid deleting real data." >&2 + exit 1 +fi +if [[ ! "${QDRANT_COLLECTION}" =~ ^elf_ ]]; then + echo "ELF_HARNESS_COLLECTION must start with elf_ to avoid deleting real data." >&2 + exit 1 +fi +if [[ ! "${VECTOR_DIM}" =~ ^[0-9]+$ ]] || [[ "${VECTOR_DIM}" -le 0 ]]; then + echo "ELF_HARNESS_VECTOR_DIM must be a positive integer." >&2 + exit 1 +fi + +HTTP_BIND="${ELF_HARNESS_HTTP_BIND:-127.0.0.1:18389}" +ADMIN_BIND="${ELF_HARNESS_ADMIN_BIND:-127.0.0.1:18390}" +MCP_BIND="${ELF_HARNESS_MCP_BIND:-127.0.0.1:18391}" +HTTP_BASE="http://${HTTP_BIND}" + +PG_DSN_BASE="${ELF_PG_DSN%/*}" +PG_DSN="${PG_DSN_BASE}/${DB_NAME}" + +VECTOR_DIM_TOML="$(echo "${VECTOR_DIM}" | perl -pe '1 while s/^([0-9]+)([0-9]{3})/$1_$2/')" + +CFG_BASE="${ROOT_DIR}/tmp/elf.consolidation.base.toml" +DATASET="${ROOT_DIR}/tmp/elf.consolidation.dataset.json" +OUT_BASE="${ROOT_DIR}/tmp/elf.consolidation.out.base.json" +OUT_AFTER="${ROOT_DIR}/tmp/elf.consolidation.out.after.json" +WORKER_LOG="${ROOT_DIR}/tmp/elf.consolidation.worker.log" +API_LOG="${ROOT_DIR}/tmp/elf.consolidation.api.log" + +WORKER_PID="" +API_PID="" + +cleanup() { + set +e + + if [[ -n "${API_PID}" ]] && kill -0 "${API_PID}" >/dev/null 2>&1; then + kill "${API_PID}" >/dev/null 2>&1 || true + fi + if [[ -n "${WORKER_PID}" ]] && kill -0 "${WORKER_PID}" >/dev/null 2>&1; then + kill "${WORKER_PID}" >/dev/null 2>&1 || true + fi + wait >/dev/null 2>&1 || true + + if [[ "${ELF_HARNESS_KEEP_COLLECTION:-0}" != "1" ]]; then + curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true + fi + + if [[ "${ELF_HARNESS_KEEP_DB:-0}" != "1" ]]; then + psql "${ELF_PG_DSN}" -tAc \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${DB_NAME}' AND pid <> pg_backend_pid();" \ + >/dev/null 2>&1 || true + psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null 2>&1 || true + fi +} + +trap cleanup EXIT + +wait_for_outbox_done() { + local note_id="$1" + for _ in $(seq 1 120); do + status="$( + psql "${PG_DSN}" -tAc \ + "SELECT status FROM indexing_outbox WHERE note_id = '${note_id}' ORDER BY created_at DESC LIMIT 1;" \ + | tr -d '[:space:]' + )" + if [[ -z "${status}" ]] || [[ "${status}" == "DONE" ]]; then + return 0 + fi + sleep 0.5 + done + return 1 +} + +run_eval() { + local out_path="$1" + (cd "${ROOT_DIR}" && cargo run -q -p elf-eval -- --config "${CFG_BASE}" --dataset "${DATASET}") \ + | awk 'BEGIN { started = 0 } /^\{/ { started = 1 } { if (started) print }' \ + >"${out_path}" +} + +echo "Recreating database ${DB_NAME}." +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "CREATE DATABASE ${DB_NAME};" >/dev/null + +echo "Recreating Qdrant collection ${QDRANT_COLLECTION}." +curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true +(cd "${ROOT_DIR}" && ELF_QDRANT_COLLECTION="${QDRANT_COLLECTION}" ELF_QDRANT_VECTOR_DIM="${VECTOR_DIM}" ./qdrant/init.sh >/dev/null) + +cat >"${CFG_BASE}" <<TOML +[service] +admin_bind = "${ADMIN_BIND}" +http_bind = "${HTTP_BIND}" +log_level = "info" +mcp_bind = "${MCP_BIND}" + +[storage.postgres] +dsn = "${PG_DSN}" +pool_max_conns = 10 + +[storage.qdrant] +collection = "${QDRANT_COLLECTION}" +docs_collection = "${QDRANT_COLLECTION}_docs" +url = "${QDRANT_GRPC_URL}" +vector_dim = ${VECTOR_DIM_TOML} + +[providers.embedding] +api_base = "http://127.0.0.1" +api_key = "local" +dimensions = ${VECTOR_DIM_TOML} +model = "local-hash" +path = "/embeddings" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.rerank] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-token-overlap" +path = "/rerank" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.llm_extractor] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-disabled" +path = "/chat/completions" +provider_id = "local" +temperature = 0.0 +timeout_ms = 1_000 + +default_headers = {} + +[scopes] +allowed = ["agent_private", "org_shared", "project_shared"] + +[scopes.read_profiles] +all_scopes = ["agent_private", "org_shared", "project_shared"] +private_only = ["agent_private"] +private_plus_project = ["agent_private", "project_shared"] + +[scopes.precedence] +agent_private = 30 +org_shared = 10 +project_shared = 20 + +[scopes.write_allowed] +agent_private = true +org_shared = true +project_shared = true + +[memory] +candidate_k = ${CANDIDATE_K} +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = ${TOP_K} +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.0 +min_importance = 0.0 + +[chunking] +enabled = true +max_tokens = 512 +overlap_tokens = 128 +tokenizer_repo = "gpt2" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "off" + +[search.dynamic] +min_candidates = 10 +min_top_score = 0.12 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = false +expansion_ttl_days = 7 +rerank_ttl_days = 7 + +[search.explain] +retention_days = 7 +capture_candidates = false +candidate_retention_days = 2 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 60 +tie_breaker_weight = 0.1 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.05 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.05 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.05 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.2 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_mode = "off" +auth_keys = [] +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true +TOML + +if command -v taplo >/dev/null 2>&1; then + taplo fmt "${CFG_BASE}" >/dev/null 2>&1 +else + echo "taplo not found; continuing with unformatted generated harness config." +fi + +echo "Building harness binaries." +(cd "${ROOT_DIR}" && cargo build -p elf-worker -p elf-api -p elf-eval >/dev/null) + +echo "Starting worker and API (logs: ${WORKER_LOG}, ${API_LOG})." +(cd "${ROOT_DIR}" && "${ROOT_DIR}/target/debug/elf-worker" --config "${CFG_BASE}" >"${WORKER_LOG}" 2>&1) & +WORKER_PID="$!" +(cd "${ROOT_DIR}" && "${ROOT_DIR}/target/debug/elf-api" --config "${CFG_BASE}" >"${API_LOG}" 2>&1) & +API_PID="$!" + +echo "Waiting for API health check at ${HTTP_BASE}/health." +for _ in $(seq 1 120); do + status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" + if [[ "${status}" == "200" ]]; then + break + fi + sleep 0.5 +done + +status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" +if [[ "${status}" != "200" ]]; then + echo "API did not become healthy in time. Check logs: ${API_LOG}." >&2 + exit 1 +fi + +if [[ "${ELF_HARNESS_CHECK_VIEWER:-0}" == "1" ]]; then + VIEWER_BASE="http://${ADMIN_BIND}" + viewer_status="$(curl -s -o /dev/null -w '%{http_code}' "${VIEWER_BASE}/viewer" 2>/dev/null || true)" + if [[ "${viewer_status}" != "200" ]]; then + echo "Admin viewer did not return 200 at ${VIEWER_BASE}/viewer. Check logs: ${API_LOG}." >&2 + exit 1 + fi + echo "Admin viewer check passed at ${VIEWER_BASE}/viewer." +fi + +TENANT_ID="consolidation-tenant-${RUN_ID}" +PROJECT_ID="consolidation-project-${RUN_ID}" +AGENT_ID="consolidation-agent-${RUN_ID}" + +echo "Ingesting duplicate policy notes (legacy/noisy) before consolidation." +DUP_NOTE_IDS_RAW="$( + "${JSON_TOOL}" -n \ + --arg run "${RUN_ID}" \ + --arg key "${TARGET_KEY}" \ + --arg tenant "${TENANT_ID}" \ + --arg project "${PROJECT_ID}" \ + --arg agent "${AGENT_ID}" \ + '{ + tenant_id: $tenant, + project_id: $project, + agent_id: $agent, + scope: "agent_private", + notes: [ + { + type: "fact", + key: $key, + text: "Incident merge protocol draft A: for every incident merge, consolidate duplicate notes with the same policy key and carry forward the newest canonical decision evidence.", + importance: 0.95, + confidence: 0.4, + ttl_days: 180, + source_ref: {run: $run, stage: "legacy-a"} + }, + { + type: "fact", + key: $key, + text: "Incident merge protocol draft B: consolidate duplicate incident notes, retain one canonical policy note, and remove stale duplicates after the merge checkpoint.", + importance: 0.95, + confidence: 0.4, + ttl_days: 180, + source_ref: {run: $run, stage: "legacy-b"} + }, + { + type: "fact", + key: $key, + text: "Incident merge protocol draft C: when duplicate memory notes exist for the same key, de-duplicate to one canonical incident policy and archive obsolete variants.", + importance: 0.95, + confidence: 0.4, + ttl_days: 180, + source_ref: {run: $run, stage: "legacy-c"} + } + ] + }' \ + | curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d @- \ + | "${JSON_TOOL}" -r '.results[].note_id' +)" + +mapfile -t DUP_NOTE_IDS <<<"${DUP_NOTE_IDS_RAW}" + +echo "Ingesting distractor notes." +DISTRACTOR_IDS_RAW="$( + "${JSON_TOOL}" -n \ + --arg run "${RUN_ID}" \ + '{ + scope: "agent_private", + notes: [range(1; 13) as $i | { + type: "fact", + key: ("distraction_" + ($i|tostring)), + text: ("Unrelated backlog signal " + ($i|tostring) + "."), + importance: 0.01, + confidence: 0.5, + ttl_days: 180, + source_ref: {run: $run} + }] + }' \ + | curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d @- \ + | "${JSON_TOOL}" -r '.results[].note_id' +)" + +mapfile -t DISTRACTOR_IDS <<<"${DISTRACTOR_IDS_RAW}" + +if [[ "${#DUP_NOTE_IDS[@]}" -lt 3 || "${#DISTRACTOR_IDS[@]}" -lt 8 ]]; then + echo "Add-note failed. Check logs: ${API_LOG}." >&2 + exit 1 +fi + +echo "Waiting for indexing jobs to finish." +for id in "${DUP_NOTE_IDS[@]}" "${DISTRACTOR_IDS[@]}"; do + if ! wait_for_outbox_done "${id}"; then + echo "Timed out waiting for indexing. Check logs: ${WORKER_LOG}." >&2 + exit 1 + fi +done + +cat >"${DATASET}" <<JSON +{ + "name": "incident-consolidation-harness", + "defaults": { + "tenant_id": "${TENANT_ID}", + "project_id": "${PROJECT_ID}", + "agent_id": "${AGENT_ID}", + "read_profile": "all_scopes", + "top_k": ${TOP_K}, + "candidate_k": ${CANDIDATE_K} + }, + "queries": [ + { + "id": "q-1", + "query": "How do we consolidate duplicate incident notes into one canonical policy?", + "expected_keys": ["${TARGET_KEY}"] + } + ] +} +JSON + +run_eval "${OUT_BASE}" + +BASE_RECALL="$("${JSON_TOOL}" -r '.summary.avg_recall_at_k' "${OUT_BASE}")" +BASE_CONTEXT="$("${JSON_TOOL}" -r '.summary.avg_retrieved_summary_chars' "${OUT_BASE}")" +BASE_KEYS="$("${JSON_TOOL}" -r '.queries[0].retrieved_keys | map(. // "") | join(",")' "${OUT_BASE}")" + +echo "Consolidation step: deleting duplicate legacy notes and adding a canonical entry." +for id in "${DUP_NOTE_IDS[@]}"; do + curl -sS -X DELETE "${HTTP_BASE}/v2/notes/${id}" \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + >/dev/null + if ! wait_for_outbox_done "${id}"; then + echo "Timed out waiting for duplicate note to de-index. Check logs: ${WORKER_LOG}." >&2 + exit 1 + fi +done + +STABLE_NOTE_ID="$( + curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d "{ + \"scope\": \"agent_private\", + \"notes\": [ + { + \"type\": \"fact\", + \"key\": \"${TARGET_KEY}\", + \"text\": \"Canonical incident merge protocol: keep one note per policy key and remove duplicates after merge.\", + \"importance\": 0.9, + \"confidence\": 0.98, + \"ttl_days\": 180, + \"source_ref\": {\"run\": \"${RUN_ID}\", \"stage\": \"consolidated\"} + } + ] + }" | "${JSON_TOOL}" -r '.results[0].note_id' +)" + +if [[ -z "${STABLE_NOTE_ID}" || "${STABLE_NOTE_ID}" == "null" ]]; then + echo "Failed to ingest consolidated note." >&2 + exit 1 +fi + +if ! wait_for_outbox_done "${STABLE_NOTE_ID}"; then + echo "Timed out waiting for consolidated note to index. Check logs: ${WORKER_LOG}." >&2 + exit 1 +fi + +run_eval "${OUT_AFTER}" + +AFTER_RECALL="$("${JSON_TOOL}" -r '.summary.avg_recall_at_k' "${OUT_AFTER}")" +AFTER_CONTEXT="$("${JSON_TOOL}" -r '.summary.avg_retrieved_summary_chars' "${OUT_AFTER}")" +AFTER_KEYS="$("${JSON_TOOL}" -r '.queries[0].retrieved_keys | map(. // "") | join(",")' "${OUT_AFTER}")" + +echo "Consolidation results:" +echo "baseline recall@${TOP_K}=${BASE_RECALL} avg_retrieved_summary_chars=${BASE_CONTEXT}" +echo "baseline top_keys=${BASE_KEYS}" +echo "after recall@${TOP_K}=${AFTER_RECALL} avg_retrieved_summary_chars=${AFTER_CONTEXT}" +echo "after top_keys=${AFTER_KEYS}" + +if [[ "${AFTER_KEYS}" != *"${TARGET_KEY}"* ]]; then + echo "Expected consolidated key ${TARGET_KEY} to remain retrievable after consolidation." >&2 + exit 1 +fi + +awk -v after="${AFTER_RECALL}" -v base="${BASE_RECALL}" 'BEGIN { exit !(after + 1e-9 >= base) }' || { + echo "Expected recall to be preserved or improved after consolidation." >&2 + exit 1 +} + +awk -v after="${AFTER_CONTEXT}" -v base="${BASE_CONTEXT}" 'BEGIN { exit !(after <= base + 1e-9) }' || { + echo "Expected avg_retrieved_summary_chars to decrease or stay flat after consolidation." >&2 + exit 1 +} diff --git a/scripts/context-misranking-harness.sh b/scripts/context-misranking-harness.sh new file mode 100755 index 00000000..578f09a5 --- /dev/null +++ b/scripts/context-misranking-harness.sh @@ -0,0 +1,533 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +if [[ -f "${ROOT_DIR}/.env" ]]; then + set -a + # shellcheck disable=SC1090 + source "${ROOT_DIR}/.env" + set +a +fi + +: "${ELF_PG_DSN:?Set ELF_PG_DSN to a Postgres DSN (usually .../postgres).}" +: "${ELF_QDRANT_HTTP_URL:?Set ELF_QDRANT_HTTP_URL to the Qdrant REST base URL, for example http://127.0.0.1:51889 (default: http://127.0.0.1:6333).}" + +QDRANT_GRPC_URL="${ELF_QDRANT_GRPC_URL:-${ELF_QDRANT_URL:-}}" +if [[ -z "${QDRANT_GRPC_URL}" ]]; then + echo "Set ELF_QDRANT_GRPC_URL to the Qdrant gRPC base URL, for example http://127.0.0.1:51890 (default: http://127.0.0.1:6334). Legacy alias ELF_QDRANT_URL is deprecated but still supported." + exit 1 +fi + +if command -v jaq >/dev/null 2>&1; then + JSON_TOOL="jaq" +elif command -v jq >/dev/null 2>&1; then + JSON_TOOL="jq" +else + echo "Missing jaq/jq. Install jaq (recommended) or jq." >&2 + exit 1 +fi + +if ! command -v curl >/dev/null 2>&1; then + echo "Missing curl." >&2 + exit 1 +fi + +if ! command -v psql >/dev/null 2>&1; then + echo "Missing psql." >&2 + exit 1 +fi + +if ! command -v taplo >/dev/null 2>&1; then + echo "Missing taplo." >&2 + exit 1 +fi + +RUN_ID="${ELF_HARNESS_RUN_ID:-"$(date +%s)-$$"}" + +DB_NAME="${ELF_HARNESS_DB_NAME:-elf_e2e}" +QDRANT_COLLECTION="${ELF_HARNESS_COLLECTION:-elf_harness_${RUN_ID}}" +VECTOR_DIM="${ELF_HARNESS_VECTOR_DIM:-4096}" + +if [[ ! "${VECTOR_DIM}" =~ ^[0-9]+$ ]]; then + echo "ELF_HARNESS_VECTOR_DIM must be an integer." >&2 + exit 1 +fi + +# Keep VECTOR_DIM numeric for JSON and SQL usage; use an underscore-formatted variant for TOML. +VECTOR_DIM_TOML="$(echo "${VECTOR_DIM}" | perl -pe '1 while s/^([0-9]+)([0-9]{3})/$1_$2/')" + +if [[ "${DB_NAME}" != elf_* ]]; then + echo "ELF_HARNESS_DB_NAME must start with elf_ to avoid deleting real data." >&2 + exit 1 +fi + +HTTP_BIND="${ELF_HARNESS_HTTP_BIND:-127.0.0.1:18089}" +ADMIN_BIND="${ELF_HARNESS_ADMIN_BIND:-127.0.0.1:18090}" +MCP_BIND="${ELF_HARNESS_MCP_BIND:-127.0.0.1:18091}" + +HTTP_BASE="http://${HTTP_BIND}" + +PG_DSN_BASE="${ELF_PG_DSN%/*}" +PG_DSN="${PG_DSN_BASE}/${DB_NAME}" + +CFG_BASE="${ROOT_DIR}/tmp/elf.harness.base.toml" +CFG_CONTEXT="${ROOT_DIR}/tmp/elf.harness.context.toml" +DATASET="${ROOT_DIR}/tmp/elf.harness.dataset.json" +OUT_BASE="${ROOT_DIR}/tmp/elf.harness.out.base.json" +OUT_CONTEXT="${ROOT_DIR}/tmp/elf.harness.out.context.json" +WORKER_LOG="${ROOT_DIR}/tmp/elf.harness.worker.log" +API_LOG="${ROOT_DIR}/tmp/elf.harness.api.log" + +if [[ "${QDRANT_COLLECTION}" != elf_harness_* ]]; then + echo "ELF_HARNESS_COLLECTION must start with elf_harness_ to avoid deleting real data." >&2 + exit 1 +fi + +WORKER_PID="" +API_PID="" + +cleanup() { + set +e + + if [[ -n "${API_PID}" ]] && kill -0 "${API_PID}" >/dev/null 2>&1; then + kill "${API_PID}" >/dev/null 2>&1 || true + fi + if [[ -n "${WORKER_PID}" ]] && kill -0 "${WORKER_PID}" >/dev/null 2>&1; then + kill "${WORKER_PID}" >/dev/null 2>&1 || true + fi + wait >/dev/null 2>&1 || true + + if [[ "${ELF_HARNESS_KEEP_COLLECTION:-0}" != "1" ]]; then + curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true + fi + + if [[ "${ELF_HARNESS_KEEP_DB:-0}" != "1" ]]; then + psql "${ELF_PG_DSN}" -tAc \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${DB_NAME}' AND pid <> pg_backend_pid();" \ + >/dev/null 2>&1 || true + psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null 2>&1 || true + fi +} + +trap cleanup EXIT + +echo "Recreating database ${DB_NAME}." +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "CREATE DATABASE ${DB_NAME};" >/dev/null + +echo "Recreating Qdrant collection ${QDRANT_COLLECTION}." +curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true +(cd "${ROOT_DIR}" && ELF_QDRANT_COLLECTION="${QDRANT_COLLECTION}" ELF_QDRANT_VECTOR_DIM="${VECTOR_DIM}" ./qdrant/init.sh >/dev/null) + +cat >"${CFG_BASE}" <<TOML +[service] +admin_bind = "${ADMIN_BIND}" +http_bind = "${HTTP_BIND}" +log_level = "info" +mcp_bind = "${MCP_BIND}" + +[storage.postgres] +dsn = "${PG_DSN}" +pool_max_conns = 10 + +[storage.qdrant] +collection = "${QDRANT_COLLECTION}" +docs_collection = "${QDRANT_COLLECTION}_docs" +url = "${QDRANT_GRPC_URL}" +vector_dim = ${VECTOR_DIM_TOML} + +[providers.embedding] +api_base = "http://127.0.0.1" +api_key = "local" +dimensions = ${VECTOR_DIM_TOML} +model = "local-hash" +path = "/embeddings" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.rerank] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-token-overlap" +path = "/rerank" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.llm_extractor] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-disabled" +path = "/chat/completions" +provider_id = "local" +temperature = 0.0 +timeout_ms = 1_000 + +default_headers = {} + +[scopes] +allowed = ["agent_private", "org_shared", "project_shared"] + +[scopes.read_profiles] +all_scopes = ["agent_private", "org_shared", "project_shared"] +private_only = ["agent_private"] +private_plus_project = ["agent_private", "project_shared"] + +[scopes.precedence] +agent_private = 30 +org_shared = 10 +project_shared = 20 + +[scopes.write_allowed] +agent_private = true +org_shared = true +project_shared = true + +[memory] +candidate_k = 60 +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = 12 +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.0 +min_importance = 0.0 + +[chunking] +enabled = true +max_tokens = 512 +overlap_tokens = 128 +tokenizer_repo = "config/local/tokenizer.wordlevel.json" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "off" + +[search.dynamic] +min_candidates = 10 +min_top_score = 0.12 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = false +expansion_ttl_days = 7 +rerank_ttl_days = 7 + +[search.explain] +retention_days = 7 +capture_candidates = false +candidate_retention_days = 2 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 60 +tie_breaker_weight = 0.1 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.05 + +[ranking.deterministic.hits] +enabled = false +half_saturation = 8.0 +last_hit_tau_days = 14.0 +weight = 0.05 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.05 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.5 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_mode = "off" +auth_keys = [] +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true +TOML + +cp "${CFG_BASE}" "${CFG_CONTEXT}" +cat >>"${CFG_CONTEXT}" <<'TOML' + +[context] +scope_boost_weight = 0.1 + +[context.scope_descriptions] +org_shared = "Org-wide policies and shared operating context." +project_shared = "Project-specific deployment steps and runbooks." +TOML + +taplo fmt "${CFG_BASE}" "${CFG_CONTEXT}" >/dev/null 2>&1 + +echo "Building harness binaries." +(cd "${ROOT_DIR}" && cargo build -p elf-worker -p elf-api -p elf-eval >/dev/null) + +echo "Starting worker and API (logs: ${WORKER_LOG}, ${API_LOG})." +(cd "${ROOT_DIR}" && "${ROOT_DIR}/target/debug/elf-worker" --config "${CFG_BASE}" >"${WORKER_LOG}" 2>&1) & +WORKER_PID="$!" +(cd "${ROOT_DIR}" && "${ROOT_DIR}/target/debug/elf-api" --config "${CFG_BASE}" >"${API_LOG}" 2>&1) & +API_PID="$!" + +echo "Waiting for API health check at ${HTTP_BASE}/health." +for _ in $(seq 1 120); do + status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" + if [[ "${status}" == "200" ]]; then + break + fi + sleep 0.5 +done + +status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" +if [[ "${status}" != "200" ]]; then + echo "API did not become healthy in time. Check logs: ${API_LOG}." >&2 + exit 1 +fi +TENANT_ID="harness-tenant-${RUN_ID}" +PROJECT_ID="harness-project-${RUN_ID}" +AGENT_ID="harness-agent-${RUN_ID}" + +echo "Adding confuser notes in org_shared and project_shared." +NOTE_ORG="$( + curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d "{ + \"scope\": \"org_shared\", + \"notes\": [ + { + \"type\": \"fact\", + \"key\": \"deployment_steps\", + \"text\": \"Deployment steps.\", + \"importance\": 0.9, + \"confidence\": 0.9, + \"ttl_days\": 180, + \"source_ref\": {\"run\": \"context-harness\"} + } + ] + }" | "${JSON_TOOL}" -r '.results[0].note_id' +)" + +NOTE_PROJECT="$( + curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d "{ + \"scope\": \"project_shared\", + \"notes\": [ + { + \"type\": \"fact\", + \"key\": \"deployment_steps\", + \"text\": \"Deployment steps for service.\", + \"importance\": 0.6, + \"confidence\": 0.9, + \"ttl_days\": 180, + \"source_ref\": {\"run\": \"context-harness\"} + } + ] + }" | "${JSON_TOOL}" -r '.results[0].note_id' +)" + +echo "Adding filler notes to increase candidate set size." +FILLER_PAYLOAD="$( + "${JSON_TOOL}" -n --arg run "context-harness" '{ + scope: "agent_private", + notes: [range(1; 26) as $i | { + type: "fact", + key: ("filler_" + ($i|tostring)), + text: ("Filler note " + ($i|tostring) + ": alpha beta gamma delta epsilon."), + importance: 0.1, + confidence: 0.5, + ttl_days: 180, + source_ref: {run: $run} + }] + }' +)" + +FILLER_IDS_RAW="$( + curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d "${FILLER_PAYLOAD}" | "${JSON_TOOL}" -r '.results[].note_id' +)" + +mapfile -t FILLER_IDS <<<"${FILLER_IDS_RAW}" + +if [[ "${NOTE_ORG}" == "null" ]] || [[ "${NOTE_PROJECT}" == "null" ]] || [[ "${#FILLER_IDS[@]}" -lt 10 ]]; then + echo "Add-note failed. Check logs: ${API_LOG}." >&2 + exit 1 +fi + +wait_for_outbox_done() { + local note_id="$1" + for _ in $(seq 1 120); do + status="$( + psql "${PG_DSN}" -tAc \ + "SELECT status FROM indexing_outbox WHERE note_id = '${note_id}' ORDER BY created_at DESC LIMIT 1;" \ + | tr -d '[:space:]' + )" + if [[ "${status}" == "DONE" ]]; then + return 0 + fi + sleep 0.5 + done + return 1 +} + +echo "Waiting for indexing jobs to finish." +if ! wait_for_outbox_done "${NOTE_ORG}"; then + echo "Timed out waiting for org_shared note to index. Check logs: ${WORKER_LOG}." >&2 + exit 1 +fi +if ! wait_for_outbox_done "${NOTE_PROJECT}"; then + echo "Timed out waiting for project_shared note to index. Check logs: ${WORKER_LOG}." >&2 + exit 1 +fi +for id in "${FILLER_IDS[@]}"; do + if ! wait_for_outbox_done "${id}"; then + echo "Timed out waiting for filler note to index. Check logs: ${WORKER_LOG}." >&2 + exit 1 + fi +done + +cat >"${DATASET}" <<JSON +{ + "name": "context-misranking", + "defaults": { + "tenant_id": "${TENANT_ID}", + "project_id": "${PROJECT_ID}", + "agent_id": "${AGENT_ID}", + "read_profile": "all_scopes", + "top_k": 1, + "candidate_k": 60 + }, + "queries": [ + { + "id": "q-1", + "query": "deployment steps", + "expected_note_ids": ["${NOTE_PROJECT}"] + } + ] +} +JSON + +run_eval() { + local cfg_path="$1" + local out_path="$2" + (cd "${ROOT_DIR}" && cargo run -q -p elf-eval -- --config "${cfg_path}" --dataset "${DATASET}") \ + | awk 'BEGIN { started = 0 } /^\{/ { started = 1 } { if (started) print }' \ + >"${out_path}" +} + +echo "Running baseline eval (no context)." +run_eval "${CFG_BASE}" "${OUT_BASE}" + +echo "Running context eval (scope boost enabled)." +run_eval "${CFG_CONTEXT}" "${OUT_CONTEXT}" + +RECALL_BASE="$("${JSON_TOOL}" -r '.summary.avg_recall_at_k' "${OUT_BASE}")" +TOP_BASE="$("${JSON_TOOL}" -r '.queries[0].retrieved_note_ids[0]' "${OUT_BASE}")" +RECALL_CONTEXT="$("${JSON_TOOL}" -r '.summary.avg_recall_at_k' "${OUT_CONTEXT}")" +TOP_CONTEXT="$("${JSON_TOOL}" -r '.queries[0].retrieved_note_ids[0]' "${OUT_CONTEXT}")" + +echo "Results:" +echo "baseline recall@1=${RECALL_BASE} top_note_id=${TOP_BASE}" +echo "context recall@1=${RECALL_CONTEXT} top_note_id=${TOP_CONTEXT}" +echo "expected note_id=${NOTE_PROJECT}" + +if [[ "${TOP_BASE}" == "${NOTE_PROJECT}" ]]; then + echo "Expected baseline to misrank (top_note_id != expected), but it matched expected." >&2 + exit 1 +fi + +if [[ "${TOP_CONTEXT}" != "${NOTE_PROJECT}" ]]; then + echo "Expected context to correct the misranking (top_note_id == expected), but it did not." >&2 + exit 1 +fi + +echo "Cleaning up notes." +for id in "${NOTE_ORG}" "${NOTE_PROJECT}" "${FILLER_IDS[@]}"; do + curl -sS -X DELETE "${HTTP_BASE}/v2/notes/${id}" \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + >/dev/null +done diff --git a/scripts/graphify-docker-graph-report-smoke.py b/scripts/graphify-docker-graph-report-smoke.py new file mode 100755 index 00000000..c5ac0cfc --- /dev/null +++ b/scripts/graphify-docker-graph-report-smoke.py @@ -0,0 +1,1485 @@ +#!/usr/bin/env python3 +"""Docker-contained graphify graph/report smoke for real-world adapters.""" + +from __future__ import annotations + +import csv +import json +import os +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass, replace +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +SCRIPT_DIR = Path(__file__).resolve().parent +ROOT_DIR = SCRIPT_DIR.parent +REPORT_DIR = Path( + os.environ.get( + "ELF_GRAPHIFY_SMOKE_REPORT_DIR", + ROOT_DIR / "tmp" / "real-world-memory" / "graphify-smoke", + ) +) +WORK_DIR = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_WORK_DIR", REPORT_DIR / "work")) +OUT = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_OUT", REPORT_DIR / "graphify-smoke.json")) +MANIFEST_OUT = Path( + os.environ.get( + "ELF_GRAPHIFY_SMOKE_MANIFEST_OUT", + REPORT_DIR / "memory_projects_manifest.graphify-smoke.json", + ) +) +SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_REPORT_JSON", REPORT_DIR / "graphify-report.json")) +REPORT_MD = Path(os.environ.get("ELF_GRAPHIFY_SMOKE_REPORT_MD", REPORT_DIR / "graphify-report.md")) +FIXTURE_DIR = REPORT_DIR / "graphify-fixtures" +CORPUS_DIR = WORK_DIR / "generated-public-corpus" +OUTPUT_CAPTURE_DIR = REPORT_DIR / "graphify-out" +LOG_DIR = REPORT_DIR / "logs" + +RUN_ID = os.environ.get( + "ELF_GRAPHIFY_SMOKE_RUN_ID", + f"graphify-docker-smoke-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", +) +RUN_GRAPHIFY = os.environ.get("ELF_GRAPHIFY_SMOKE_RUN", "1") == "1" +ALLOW_HOST = os.environ.get("ELF_GRAPHIFY_SMOKE_ALLOW_HOST", "0") == "1" +INSTALL_GRAPHIFY = os.environ.get("ELF_GRAPHIFY_SMOKE_INSTALL", "1") == "1" +GRAPHIFY_PACKAGE = os.environ.get("ELF_GRAPHIFY_PACKAGE", "graphifyy") +GRAPHIFY_REF = os.environ.get("ELF_GRAPHIFY_REF", f"pypi:{GRAPHIFY_PACKAGE}") +TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHIFY_TIMEOUT_SECONDS", "600")) +QUERY_BUDGET = int(os.environ.get("ELF_GRAPHIFY_QUERY_BUDGET", "1200")) + + +@dataclass +class CorpusItem: + """Generated public corpus item with source mapping metadata.""" + + evidence_id: str + claim_id: str + title: str + file_name: str + text: str + expected: bool + kind: str = "document" + line: int = 1 + + +@dataclass +class StatusState: + """Typed status for generated graphify smoke artifacts.""" + + setup: str = "blocked" + run: str = "not_encoded" + result: str = "blocked" + overall: str = "blocked" + evidence_class: str = "research_gate" + failure_class: str = "graphify_live_run_disabled" + failure_reason: str = ( + "graphify graph/report execution is disabled; set ELF_GRAPHIFY_SMOKE_RUN=1 " + "to install and run graphify inside Docker." + ) + + +@dataclass +class CommandRecord: + """Captured command result without secret-bearing environment values.""" + + label: str + command: list[str] + status: str + elapsed_ms: float + stdout_artifact: str | None + stderr_artifact: str | None + returncode: int | None + reason: str + + +def utc_now() -> str: + """Return an RFC3339 UTC timestamp.""" + + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def rel(path: Path) -> str: + """Return a repository-relative path when possible.""" + + try: + return str(path.resolve().relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + +def mkdirs() -> None: + """Create and reset output directories owned by this smoke.""" + + for path in (FIXTURE_DIR, OUTPUT_CAPTURE_DIR, LOG_DIR): + if path.exists(): + shutil.rmtree(path) + + for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, OUTPUT_CAPTURE_DIR, LOG_DIR): + path.mkdir(parents=True, exist_ok=True) + + for path in ( + OUT, + MANIFEST_OUT, + SUMMARY_OUT, + REPORT_JSON, + REPORT_MD, + REPORT_DIR / "generated-corpus.csv", + ): + if path.exists(): + path.unlink() + + +def write_json(path: Path, payload: Any) -> None: + """Write stable, pretty JSON.""" + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated graphify fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphify", + "--adapter-id", + "graphify_docker_smoke", + "--adapter-name", + "graphify Docker graph/report smoke adapter", + "--adapter-behavior", + "docker_cli_graph_report_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the graphify Docker graph/report smoke; pass or wrong_result requires graph.json, GRAPH_REPORT.md, and query output mapped to generated evidence ids, while setup/runtime limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + +def status_with_scored_result(status: StatusState, report: dict[str, Any]) -> StatusState: + """Return a manifest status that follows the scored real_world_job outcome.""" + + scored = scored_benchmark(report) + scored_status = scored.get("status") + if scored_status not in { + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + }: + return status + + manifest_status = replace(status) + manifest_status.result = str(scored_status) + manifest_status.overall = str(scored_status) + + if scored_status == "pass": + manifest_status.failure_class = "" + manifest_status.failure_reason = "" + elif scored_status == "wrong_result": + manifest_status.failure_class = "scored_benchmark_wrong_result" + manifest_status.failure_reason = ( + "The graphify smoke materialized graph/report evidence, but the scored " + "real_world_job outcome is wrong_result; inspect graphify-report.json for " + "wrong-result signals." + ) + + return manifest_status + + +def dir_size(path: Path) -> int: + """Return total file size for a directory or file.""" + + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + + return sum(item.stat().st_size for item in path.rglob("*") if item.is_file()) + + +def file_count(path: Path) -> int: + """Return file count for a directory.""" + + if not path.exists(): + return 0 + + return sum(1 for item in path.rglob("*") if item.is_file()) + + +def command_available(command: str) -> bool: + """Return whether a command is on PATH.""" + + return shutil.which(command) is not None + + +def runtime_env() -> dict[str, str]: + """Return an isolated graphify runtime environment.""" + + home = WORK_DIR / "home" + return { + "HOME": str(home), + "XDG_CONFIG_HOME": str(home / ".config"), + "XDG_CACHE_HOME": str(home / ".cache"), + "CODEX_HOME": str(home / ".codex"), + "CLAUDE_CONFIG_DIR": str(home / ".claude"), + "GEMINI_HOME": str(home / ".gemini"), + "PYTHONUNBUFFERED": "1", + "NO_COLOR": "1", + } + + +def run_command( + label: str, + command: list[str], + cwd: Path, + timeout: int = TIMEOUT_SECONDS, + extra_env: dict[str, str] | None = None, +) -> CommandRecord: + """Run a subprocess and capture stdout/stderr artifacts.""" + + cwd.mkdir(parents=True, exist_ok=True) + stdout_path = LOG_DIR / f"{label}.stdout.log" + stderr_path = LOG_DIR / f"{label}.stderr.log" + env = os.environ.copy() + + if extra_env: + env.update(extra_env) + + started = time.monotonic() + try: + proc = subprocess.run( + command, + cwd=cwd, + env=env, + text=True, + capture_output=True, + timeout=timeout, + check=False, + ) + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(proc.stdout, encoding="utf-8") + stderr_path.write_text(proc.stderr, encoding="utf-8") + status = "pass" if proc.returncode == 0 else "incomplete" + reason = "Command completed." if proc.returncode == 0 else f"Command exited {proc.returncode}." + + return CommandRecord( + label=label, + command=command, + status=status, + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=proc.returncode, + reason=reason, + ) + except subprocess.TimeoutExpired as err: + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(err.stdout or "", encoding="utf-8") + stderr_path.write_text(err.stderr or "", encoding="utf-8") + + return CommandRecord( + label=label, + command=command, + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=None, + reason=f"Command timed out after {timeout} seconds.", + ) + + +def command_to_json(record: CommandRecord) -> dict[str, Any]: + """Serialize a command record.""" + + return { + "label": record.label, + "status": record.status, + "command": record.command, + "elapsed_ms": round(record.elapsed_ms, 3), + "stdout_artifact": record.stdout_artifact, + "stderr_artifact": record.stderr_artifact, + "returncode": record.returncode, + "reason": record.reason, + } + + +def generated_corpus() -> list[CorpusItem]: + """Return the bounded generated-public graphify corpus.""" + + return [ + CorpusItem( + evidence_id="graphify-smoke-memory-service", + claim_id="memory_service_graph", + title="ELF Memory Service Graph Note", + file_name="elf_memory_service.py", + text=( + '"""Evidence ID graphify-smoke-memory-service.\n' + "ELF stores evidence-linked facts as notes and keeps Postgres as the " + "source of truth for graph/report validation.\n" + '"""\n\n' + "class ElfMemoryService:\n" + " \"\"\"Evidence ID graphify-smoke-memory-service maps memory notes " + "to source-backed graph nodes.\"\"\"\n\n" + " def attach_evidence(self, note_id: str, source_ref: str) -> tuple[str, str]:\n" + " \"\"\"Attach source_ref evidence to a note before retrieval.\"\"\"\n" + " return note_id, source_ref\n" + ), + expected=True, + ), + CorpusItem( + evidence_id="graphify-smoke-qdrant-rebuild", + claim_id="qdrant_rebuild_graph", + title="Qdrant Rebuild Graph Note", + file_name="qdrant_rebuild.py", + text=( + '"""Evidence ID graphify-smoke-qdrant-rebuild.\n' + "Qdrant is a derived, rebuildable index. The graphify smoke should " + "connect Qdrant rebuild evidence to the ELF memory service node and " + "preserve this source file as evidence for scoring.\n" + '"""\n\n' + "class QdrantRebuildIndex:\n" + " \"\"\"Evidence ID graphify-smoke-qdrant-rebuild maps rebuildable " + "index behavior to source evidence.\"\"\"\n\n" + " def rebuild_from_postgres_vectors(self, collection: str) -> str:\n" + " \"\"\"Rebuild the derived Qdrant collection from Postgres vectors.\"\"\"\n" + " return collection\n" + ), + expected=True, + ), + CorpusItem( + evidence_id="graphify-smoke-report-mapping", + claim_id="graph_report_mapping", + title="Graph Report Mapping Note", + file_name="graph_report_mapping.py", + text=( + '"""Evidence ID graphify-smoke-report-mapping.\n' + "GRAPH_REPORT.md and graph.json must be captured as derived adapter " + "artifacts, then mapped back to real_world_job evidence ids.\n" + '"""\n\n' + "def map_graph_report_to_evidence(graph_json: str, graph_report: str) -> str:\n" + " \"\"\"Return graphify-smoke-report-mapping when graph artifacts cite sources.\"\"\"\n" + " return f\"{graph_json}:{graph_report}\"\n" + ), + expected=True, + ), + CorpusItem( + evidence_id="graphify-smoke-stale-trap", + claim_id="stale_authority_trap", + title="Stale Graph Authority Trap", + file_name="stale_vector_authority.py", + text=( + '"""Evidence ID graphify-smoke-stale-trap.\n' + "Stale trap: graphify output is an authoritative ELF memory store. " + "This is intentionally false; graphify is only a derived graph/report adapter.\n" + '"""\n\n' + "def stale_authority_claim() -> str:\n" + " \"\"\"Return the stale claim that must not drive the answer.\"\"\"\n" + " return \"graphify is authoritative\"\n" + ), + expected=False, + ), + ] + + +def write_corpus(corpus: list[CorpusItem]) -> Path: + """Write graphify input files plus a CSV mapping copy.""" + + if CORPUS_DIR.exists(): + shutil.rmtree(CORPUS_DIR) + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + csv_path = REPORT_DIR / "generated-corpus.csv" + + with csv_path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter( + handle, + fieldnames=("evidence_id", "claim_id", "title", "file_name", "line", "text"), + ) + writer.writeheader() + + for item in corpus: + line = evidence_line(item.text, item.evidence_id) + item.line = line + writer.writerow( + { + "evidence_id": item.evidence_id, + "claim_id": item.claim_id, + "title": item.title, + "file_name": item.file_name, + "line": line, + "text": item.text, + } + ) + (CORPUS_DIR / item.file_name).write_text(item.text, encoding="utf-8") + + (CORPUS_DIR / ".graphifyignore").write_text( + "graphify-out/\n__pycache__/\n*.pyc\n", + encoding="utf-8", + ) + + return csv_path + + +def evidence_line(text: str, evidence_id: str) -> int: + """Return the first line containing an evidence id.""" + + for index, line in enumerate(text.splitlines(), start=1): + if evidence_id in line: + return index + + return 1 + + +def install_graphify(command_records: list[CommandRecord]) -> Path | None: + """Create a venv and install graphify in the container-local work dir.""" + + venv_dir = WORK_DIR / ".venv" + python = venv_dir / "bin" / "python" + graphify = venv_dir / "bin" / "graphify" + + if INSTALL_GRAPHIFY: + venv_record = run_command("python-venv", [sys.executable, "-m", "venv", str(venv_dir)], WORK_DIR) + command_records.append(venv_record) + if venv_record.status != "pass": + return None + + install_record = run_command( + "graphify-install", + [str(python), "-m", "pip", "install", "--disable-pip-version-check", GRAPHIFY_PACKAGE], + WORK_DIR, + extra_env=runtime_env(), + ) + command_records.append(install_record) + if install_record.status != "pass": + return None + elif not graphify.exists(): + command_records.append( + CommandRecord( + label="graphify-install", + command=["graphify"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="graphify install was disabled and no venv graphify executable exists.", + ) + ) + return None + + version_record = run_command("graphify-help", [str(graphify), "--help"], WORK_DIR, extra_env=runtime_env()) + command_records.append(version_record) + + return graphify if version_record.status == "pass" else None + + +def run_graphify(graphify: Path, command_records: list[CommandRecord]) -> Path | None: + """Run graphify build and query commands.""" + + build_record = run_command( + "graphify-build", + [str(graphify), str(CORPUS_DIR), "--no-viz"], + WORK_DIR, + extra_env=runtime_env(), + ) + command_records.append(build_record) + if build_record.status != "pass": + return None + + cluster_record = run_command( + "graphify-cluster-report", + [str(graphify), "cluster-only", str(CORPUS_DIR)], + WORK_DIR, + extra_env=runtime_env(), + ) + command_records.append(cluster_record) + + output_dir = find_graphify_output_dir() + + if output_dir is None: + command_records.append( + CommandRecord( + label="graphify-output-discovery", + command=["find", str(WORK_DIR), "-path", "*/graphify-out/graph.json"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="graphify completed but graphify-out/graph.json was not found.", + ) + ) + return None + + copy_graphify_output(output_dir) + graph_json = OUTPUT_CAPTURE_DIR / "graph.json" + query_record = run_command( + "graphify-query", + [ + str(graphify), + "query", + "what connects the ELF memory service, Qdrant rebuild, and graph report evidence mapping?", + "--graph", + str(graph_json), + "--budget", + str(QUERY_BUDGET), + ], + WORK_DIR, + extra_env=runtime_env(), + ) + command_records.append(query_record) + + return OUTPUT_CAPTURE_DIR + + +def find_graphify_output_dir() -> Path | None: + """Find the graphify output directory generated by the CLI.""" + + candidates: list[Path] = [] + + for base in (WORK_DIR, CORPUS_DIR): + if not base.exists(): + continue + + for graph_path in base.rglob("graph.json"): + if ".venv" in graph_path.parts: + continue + if graph_path.parent.name == "graphify-out": + candidates.append(graph_path.parent) + + if not candidates: + return None + + candidates.sort(key=lambda path: path.stat().st_mtime if path.exists() else 0.0) + + return candidates[-1] + + +def copy_graphify_output(output_dir: Path) -> None: + """Copy graphify output artifacts into the report directory.""" + + if OUTPUT_CAPTURE_DIR.exists(): + shutil.rmtree(OUTPUT_CAPTURE_DIR) + shutil.copytree(output_dir, OUTPUT_CAPTURE_DIR) + + +def map_artifacts(corpus: list[CorpusItem], command_records: list[CommandRecord]) -> dict[str, Any]: + """Map graphify graph/report/query output to real_world_job evidence ids.""" + + graph_json = OUTPUT_CAPTURE_DIR / "graph.json" + graph_report = OUTPUT_CAPTURE_DIR / "GRAPH_REPORT.md" + graph_payload = read_json_or_none(graph_json) + nodes, edges = extract_graph_rows(graph_payload) + node_mappings = [map_graph_row("node", row, corpus) for row in nodes] + edge_mappings = [map_graph_row("edge", row, corpus) for row in edges] + report_mapping = map_text_artifact("graph_report", graph_report, corpus) + query_mapping = map_query_output(command_records, corpus) + mapped_ids: list[str] = [] + + for section in (node_mappings, edge_mappings): + for row in section: + for evidence_id in row["evidence_ids"]: + append_unique(mapped_ids, evidence_id) + + for row in (report_mapping, query_mapping): + for evidence_id in row["evidence_ids"]: + append_unique(mapped_ids, evidence_id) + + return { + "expected_evidence_ids": expected_ids(corpus), + "mapped_evidence_ids": mapped_ids, + "graph_json": { + "artifact": rel(graph_json) if graph_json.exists() else None, + "exists": graph_json.exists(), + "size_bytes": graph_json.stat().st_size if graph_json.exists() else 0, + }, + "graph_report": report_mapping, + "query_output": query_mapping, + "nodes": node_mappings, + "edges": edge_mappings, + } + + +def read_json_or_none(path: Path) -> Any | None: + """Read JSON and return None on missing or invalid payloads.""" + + if not path.exists(): + return None + + try: + return json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return None + + +def extract_graph_rows(payload: Any | None) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Extract node and edge rows from common graph JSON shapes.""" + + if not isinstance(payload, dict): + return [], [] + + nodes = payload.get("nodes") + edges = payload.get("edges") or payload.get("links") or payload.get("relationships") + + if nodes is None and isinstance(payload.get("elements"), dict): + elements = payload["elements"] + nodes = elements.get("nodes") + edges = elements.get("edges") + + return rows_from_value(nodes), rows_from_value(edges) + + +def rows_from_value(value: Any) -> list[dict[str, Any]]: + """Normalize a graph row container into dictionaries.""" + + if not isinstance(value, list): + return [] + + rows: list[dict[str, Any]] = [] + for item in value: + if isinstance(item, dict): + data = item.get("data") + rows.append(data if isinstance(data, dict) else item) + + return rows + + +def map_graph_row(kind: str, row: dict[str, Any], corpus: list[CorpusItem]) -> dict[str, Any]: + """Map one graph node or edge row to evidence ids.""" + + blob = json.dumps(row, sort_keys=True, default=str) + evidence_ids = evidence_from_text(blob, corpus) + return { + "kind": kind, + "row_id": str(row.get("id") or row.get("key") or row.get("source") or ""), + "label": first_text(row, ("label", "name", "title", "type", "kind")), + "edge_type": first_text(row, ("edge_type", "type", "relation", "relationship", "predicate")), + "confidence": first_text( + row, + ("confidence", "confidence_score", "confidence_tag", "extraction_status", "status"), + ), + "source_files": source_values(row), + "source_locations": source_location_values(row), + "evidence_ids": evidence_ids, + } + + +def first_text(row: dict[str, Any], keys: tuple[str, ...]) -> str | None: + """Return the first scalar text value for a set of keys.""" + + for key in keys: + value = row.get(key) + + if isinstance(value, (str, int, float)): + return str(value) + + return None + + +def source_values(value: Any) -> list[str]: + """Collect source file-ish values from a graph row.""" + + values: list[str] = [] + collect_source_values(value, values, ("source", "file", "path")) + + return values[:12] + + +def source_location_values(value: Any) -> list[str]: + """Collect source location-ish values from a graph row.""" + + values: list[str] = [] + collect_source_values(value, values, ("location", "line", "span", "range")) + + return values[:12] + + +def collect_source_values(value: Any, out: list[str], key_fragments: tuple[str, ...]) -> None: + """Recursively collect bounded source-related values.""" + + if isinstance(value, dict): + for key, item in value.items(): + key_lower = key.lower() + + if any(fragment in key_lower for fragment in key_fragments) and isinstance(item, (str, int, float)): + append_unique(out, str(item)) + else: + collect_source_values(item, out, key_fragments) + elif isinstance(value, list): + for item in value: + collect_source_values(item, out, key_fragments) + + +def map_text_artifact(kind: str, path: Path, corpus: list[CorpusItem]) -> dict[str, Any]: + """Map a text artifact to evidence ids.""" + + text = "" + if path.exists(): + try: + text = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + text = "" + + return { + "kind": kind, + "artifact": rel(path) if path.exists() else None, + "exists": path.exists(), + "size_bytes": path.stat().st_size if path.exists() else 0, + "evidence_ids": evidence_from_text(text, corpus), + } + + +def map_query_output(command_records: list[CommandRecord], corpus: list[CorpusItem]) -> dict[str, Any]: + """Map graphify query stdout to evidence ids.""" + + query_record = next((record for record in command_records if record.label == "graphify-query"), None) + text = "" + artifact = query_record.stdout_artifact if query_record else None + + if artifact: + path = ROOT_DIR / artifact + if path.exists(): + text = path.read_text(encoding="utf-8") + + return { + "kind": "query_output", + "artifact": artifact, + "exists": bool(artifact and (ROOT_DIR / artifact).exists()), + "command_status": query_record.status if query_record else "not_encoded", + "evidence_ids": evidence_from_text(text, corpus), + } + + +def evidence_from_text(text: str, corpus: list[CorpusItem]) -> list[str]: + """Return evidence ids whose signatures appear in a text blob.""" + + evidence_ids: list[str] = [] + haystack = text.lower() + + for item in corpus: + signatures = ( + item.evidence_id, + slug(item.evidence_id), + item.file_name, + item.title, + f"{item.file_name}:{item.line}", + ) + + if any(signature.lower() in haystack for signature in signatures): + append_unique(evidence_ids, item.evidence_id) + + return evidence_ids + + +def append_unique(values: list[str], value: str) -> None: + """Append a value if absent.""" + + if value not in values: + values.append(value) + + +def expected_ids(corpus: list[CorpusItem]) -> list[str]: + """Return expected evidence ids for pass scoring.""" + + return [item.evidence_id for item in corpus if item.expected] + + +def mapping_outcome(mappings: dict[str, Any], command_records: list[CommandRecord]) -> tuple[str, str]: + """Return typed result status and explanation for evidence mapping.""" + + graph_build = next((record for record in command_records if record.label == "graphify-build"), None) + graph_query = next((record for record in command_records if record.label == "graphify-query"), None) + + if graph_build is None or graph_build.status != "pass": + return "incomplete", "graphify did not complete graph/report build for the generated corpus." + if not mappings["graph_json"]["exists"]: + return "incomplete", "graphify did not produce graph.json." + if not mappings["graph_report"]["exists"]: + return "incomplete", "graphify did not produce GRAPH_REPORT.md." + if graph_query is None or graph_query.status != "pass": + return "incomplete", "graphify query output was not available for scoring." + + missing = [ + evidence_id + for evidence_id in mappings["expected_evidence_ids"] + if evidence_id not in mappings["mapped_evidence_ids"] + ] + + if missing: + return "wrong_result", f"graphify output mappings missed expected evidence ids: {', '.join(missing)}." + + return "pass", "graphify graph/report/query output mapped to expected generated evidence ids." + + +def write_fixture(corpus: list[CorpusItem], status: StatusState, mapped_ids: list[str]) -> Path: + """Write a generated real_world_job fixture for the graphify smoke.""" + + fixture_path = FIXTURE_DIR / "knowledge" / "graphify_graph_report.json" + used_ids = [evidence_id for evidence_id in mapped_ids if evidence_id in expected_ids(corpus)] + response = { + "adapter_id": "graphify_docker_smoke", + "answer": { + "content": ( + "graphify connected the ELF memory service, Qdrant rebuild, and graph report mapping " + "through graph/report artifacts that cite generated source evidence." + if used_ids + else "" + ), + "claims": [ + { + "claim_id": "graphify_report_evidence_mapping", + "text": ( + "graphify graph/report artifacts map back to the generated ELF memory service, " + "Qdrant rebuild, and report mapping evidence ids." + ), + "evidence_ids": used_ids, + "confidence": "derived_from_graphify_graph_report_mapping", + } + ] + if used_ids + else [], + "evidence_ids": used_ids, + "pages": [ + { + "page_id": "graphify:graph-report", + "page_type": "concept", + "title": "graphify Graph Report", + "path": rel(OUTPUT_CAPTURE_DIR / "GRAPH_REPORT.md"), + "sections": [ + { + "section_id": "derived-graph-report", + "heading": "Derived Graph Report", + "role": "summary", + "content": "GRAPH_REPORT.md is a derived graphify artifact, not authoritative ELF memory.", + "evidence_ids": used_ids, + "timeline_event_ids": ["graphify-smoke-built-graph-report"], + "unsupported_reason": None if used_ids else "graphify output was not mapped.", + } + ], + "backlinks": used_ids, + "lint_findings": [], + } + ] + if (OUTPUT_CAPTURE_DIR / "GRAPH_REPORT.md").exists() + else [], + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0, + }, + }, + } + fixture: dict[str, Any] = { + "schema": "elf.real_world_job/v1", + "job_id": "graphify-graph-report-001", + "suite": "knowledge_compilation", + "title": "Map graphify graph/report output to generated evidence", + "corpus": { + "corpus_id": "graphify-generated-public-smoke", + "profile": "generated_public", + "items": [ + { + "evidence_id": item.evidence_id, + "kind": item.kind, + "text": item.text, + "source_ref": { + "schema": "source_ref/v1", + "resolver": "graphify_smoke/v1", + "ref": { + "run_id": RUN_ID, + "file": item.file_name, + "line": item.line, + "evidence_id": item.evidence_id, + }, + }, + "created_at": "2026-06-10T00:00:00Z", + } + for item in corpus + ], + "adapter_response": response, + }, + "timeline": [ + { + "event_id": "graphify-smoke-corpus-generated", + "ts": "2026-06-10T00:00:00Z", + "actor": "system", + "action": "generated_public_corpus", + "evidence_ids": expected_ids(corpus), + "summary": "The graphify smoke generated a tiny public corpus for source mapping.", + }, + { + "event_id": "graphify-smoke-built-graph-report", + "ts": "2026-06-10T00:01:00Z", + "actor": "system", + "action": "built_derived_graph_report", + "evidence_ids": used_ids, + "summary": "graphify built derived graph/report artifacts when the Docker smoke reached execution.", + }, + ], + "prompt": { + "role": "user", + "content": "What does graphify connect in the generated ELF graph/report smoke?", + "job_mode": "compile", + "constraints": ["cite_evidence", "avoid_stale_facts", "do_not_claim_authoritative_store"], + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "graphify_report_evidence_mapping", + "text": ( + "graphify connects the ELF memory service, Qdrant rebuild, and graph report " + "mapping through derived graph/report artifacts." + ), + } + ], + "must_not_include": ["graphify output is an authoritative ELF memory store."], + "evidence_links": {"graphify_report_evidence_mapping": expected_ids(corpus)}, + "answer_type": "compiled_knowledge", + "accepted_alternates": [], + "requires_caveat": True, + "requires_refusal": False, + }, + "required_evidence": [ + { + "evidence_id": item.evidence_id, + "claim_id": "graphify_report_evidence_mapping", + "requirement": "cite", + "quote": item.evidence_id, + } + for item in corpus + if item.expected + ], + "negative_traps": [ + { + "trap_id": "graphify-authoritative-store", + "type": "unsupported_claim", + "evidence_ids": ["graphify-smoke-stale-trap"], + "failure_if_used": True, + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the graph/report connection without broad quality claims.", + }, + "evidence_grounding": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Maps graphify output back to generated evidence ids.", + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not treat graphify output as an authoritative ELF memory store.", + }, + "latency_resource": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Records build time, artifact sizes, provider boundary, and retry behavior.", + }, + }, + "pass_threshold": 0.75, + "hard_fail_rules": [], + }, + "allowed_uncertainty": { + "can_answer_unknown": False, + "acceptable_phrases": ["tiny generated corpus", "derived graph/report adapter"], + "fallback_action": "state_blocker", + }, + "operator_debug": None, + "encoding": {}, + "memory_evolution": None, + "tags": ["external_adapter", "generated_public", "graphify", "no_live_claim"], + } + + if status.result in {"blocked", "incomplete"}: + fixture["encoding"] = { + "status": status.result, + "reason": status.failure_reason, + } + + write_json(fixture_path, fixture) + + return fixture_path + + +def write_materialization( + status: StatusState, + corpus: list[CorpusItem], + fixture_path: Path, + corpus_csv: Path, + command_records: list[CommandRecord], + mappings: dict[str, Any], + started_at: float, + report: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Write the primary smoke artifact.""" + + elapsed_ms = (time.monotonic() - started_at) * 1000 + graph_json = OUTPUT_CAPTURE_DIR / "graph.json" + graph_report = OUTPUT_CAPTURE_DIR / "GRAPH_REPORT.md" + cache_dir = OUTPUT_CAPTURE_DIR / "cache" + query_record = next((record for record in command_records if record.label == "graphify-query"), None) + payload = { + "schema": "elf.graphify_docker_graph_report_smoke/v1", + "generated_at": utc_now(), + "run_id": RUN_ID, + "adapter_id": "graphify_docker_smoke", + "evidence_class": status.evidence_class, + "status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), + "artifacts": { + "generated_corpus_csv": rel(corpus_csv), + "generated_corpus_dir": rel(CORPUS_DIR), + "generated_fixture": rel(fixture_path), + "graph_output_dir": rel(OUTPUT_CAPTURE_DIR), + "graph_json": rel(graph_json) if graph_json.exists() else None, + "graph_report": rel(graph_report) if graph_report.exists() else None, + "query_output": query_record.stdout_artifact if query_record else None, + "manifest": rel(MANIFEST_OUT), + "summary": rel(SUMMARY_OUT), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "runner_service": "baseline-runner", + "runner": "scripts/graphify-docker-graph-report-smoke.py", + "host_global_installs_required": False, + "docker_only": True, + "assistant_hook_install_used": False, + "isolated_home": True, + }, + "model_provider_boundary": { + "package": GRAPHIFY_REF, + "package_spec": GRAPHIFY_PACKAGE, + "assistant_platform_hooks_used": False, + "host_global_assistant_config_used": False, + "operator_owned_provider_credentials_used": False, + "provider_or_model_name": "graphify CLI default; no model configured by this runner", + "live_run_enabled": RUN_GRAPHIFY, + }, + "resource_bounds": { + "generated_file_count": len(corpus), + "generated_input_chars": sum(len(item.text) for item in corpus), + "timeout_seconds": TIMEOUT_SECONDS, + "elapsed_ms": round(elapsed_ms, 3), + "graph_json_size_bytes": graph_json.stat().st_size if graph_json.exists() else 0, + "graph_report_size_bytes": graph_report.stat().st_size if graph_report.exists() else 0, + "graph_output_size_bytes": dir_size(OUTPUT_CAPTURE_DIR), + "cache_size_bytes": dir_size(cache_dir), + "cache_file_count": file_count(cache_dir), + }, + "retry_behavior": { + "max_attempts": 1, + "retries_performed": 0, + "retry_guidance": "Rerun the same Docker command after setup/runtime fixes; do not use host assistant hooks as proof.", + }, + "commands": [command_to_json(record) for record in command_records], + "evidence_mapping": mappings, + } + write_json(OUT, payload) + + return payload + + +def write_manifest(status: StatusState) -> dict[str, Any]: + """Write a generated external adapter manifest for this smoke.""" + + manifest = { + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": f"graphify-docker-smoke-{RUN_ID}", + "docker_isolation": { + "default": True, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphify-docker-graph-report-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphify-smoke", + "host_global_installs_required": False, + "notes": [ + f"Generated by the graphify Docker graph/report smoke at {utc_now()}.", + "The smoke uses generated public source files and records typed setup/runtime failures.", + ], + }, + "adapters": [ + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_graph_report_smoke", + "evidence_class": status.evidence_class, + "docker_default": True, + "host_global_installs_required": False, + "overall_status": status.overall, + "setup": { + "status": status.setup, + "evidence": "The smoke installs graphify in a container-local Python venv and runs with isolated assistant config paths.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path builds graphify graph/report artifacts from a generated public corpus and runs graphify query over graph.json.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "graphify graph.json, GRAPH_REPORT.md, and query output mapped to generated real_world_job evidence ids.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": status.setup, + "evidence": "The runner uses docker-compose.baseline.yml baseline-runner and does not install graphify or assistant hooks on the host.", + }, + { + "capability": "graph_report_generation", + "status": status.run, + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, and command logs when build succeeds.", + }, + { + "capability": "graph_query_evidence_mapping", + "status": status.result, + "evidence": "Node labels, edge types, confidence tags, source files, source locations, report text, and query output are scanned for generated evidence ids.", + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim multimodal, private corpus, broad codebase-understanding, or large-corpus graph quality.", + }, + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": status.result, + "evidence": "Only the generated graph/report evidence-mapping job is represented.", + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The smoke uses graphify query output only to support source mapping; broad retrieval quality is not scored.", + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume-answer behavior is not encoded by this graph/report smoke.", + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records resource bounds but does not encode backup, restore, provider credential, or private corpus operations.", + }, + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "artifact", "ref": rel(OUTPUT_CAPTURE_DIR), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://github.com/safishamsi/graphify", "status": "real"}, + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow.", + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, confidence, and source-location contract.", + }, + { + "label": "graphify PyPI package", + "url": "https://pypi.org/project/graphifyy/", + "evidence": "Official package referenced by the graphify README.", + }, + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in a container-local venv and build graph/report artifacts over generated public files.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, isolated HOME/config paths, generated corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": f"graphify package {GRAPHIFY_REF}, generated_files=4, timeout_seconds={TIMEOUT_SECONDS}, query_budget={QUERY_BUDGET}.", + "retry_guidance": [ + "Rerun cargo make smoke-graphify-docker-graph-report after dependency or runtime fixes.", + "Do not use graphify install hooks, host-global Codex/Claude/Gemini config, or private corpora as proof.", + "Score only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids.", + ], + "research_depth": "D1 feasibility plus XY-889 Docker graph/report smoke implementation; generated artifact decides live evidence class.", + }, + "notes": [ + "The checked-in manifest carries the current graphify status; generated smoke artifacts carry the run-specific live status.", + "graphify output is treated as a derived graph/report adapter, not an authoritative ELF memory store.", + ], + } + ], + } + write_json(MANIFEST_OUT, manifest) + + return manifest + + +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: + """Write a small summary artifact.""" + + write_json( + SUMMARY_OUT, + { + "schema": "elf.graphify_docker_smoke_summary/v1", + "generated_at": utc_now(), + "adapter_id": "graphify_docker_smoke", + "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], + "materialization": materialization, + "manifest": { + "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_score_aligned", + "summary": manifest["adapters"][0]["overall_status"], + "suites": manifest["adapters"][0]["suites"], + }, + "report": report, + }, + ) + + +def slug(value: str) -> str: + """Return a small ASCII slug.""" + + out: list[str] = [] + last_dash = False + + for char in value.lower(): + if char.isascii() and char.isalnum(): + out.append(char) + last_dash = False + elif not last_dash and out: + out.append("-") + last_dash = True + + while out and out[-1] == "-": + out.pop() + + return "".join(out) or "item" + + +def main() -> int: + """Run the smoke and always emit typed artifacts when possible.""" + + started_at = time.monotonic() + mkdirs() + status = StatusState() + command_records: list[CommandRecord] = [] + corpus = generated_corpus() + corpus_csv = write_corpus(corpus) + mappings = { + "expected_evidence_ids": expected_ids(corpus), + "mapped_evidence_ids": [], + "graph_json": {"artifact": None, "exists": False, "size_bytes": 0}, + "graph_report": { + "kind": "graph_report", + "artifact": None, + "exists": False, + "size_bytes": 0, + "evidence_ids": [], + }, + "query_output": { + "kind": "query_output", + "artifact": None, + "exists": False, + "command_status": "not_encoded", + "evidence_ids": [], + }, + "nodes": [], + "edges": [], + } + + if not Path("/.dockerenv").exists() and not ALLOW_HOST: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "not_running_in_docker" + status.failure_reason = "graphify smoke must run inside Docker; use cargo make smoke-graphify-docker-graph-report." + elif not command_available("python3"): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "python_missing" + status.failure_reason = "python3 is required for the graphify smoke runner." + elif not RUN_GRAPHIFY: + pass + else: + graphify = install_graphify(command_records) + + if graphify is None: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphify_setup_failed" + status.failure_reason = "graphify installation or help command failed inside the Docker runner." + else: + status.setup = "pass" + output_dir = run_graphify(graphify, command_records) + + if output_dir is None: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphify_build_failed" + status.failure_reason = "graphify did not build graph/report artifacts for the generated corpus." + else: + status.run = "pass" + status.evidence_class = "live_real_world" + mappings = map_artifacts(corpus, command_records) + result_status, reason = mapping_outcome(mappings, command_records) + status.result = result_status + status.overall = result_status + + if result_status == "pass": + status.failure_class = "" + status.failure_reason = "" + else: + status.failure_class = "graphify_evidence_mapping_failed" + status.failure_reason = reason + + fixture_path = write_fixture(corpus, status, mappings["mapped_evidence_ids"]) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + manifest_status = status_with_scored_result(status, report) + if manifest_status.overall != status.overall or manifest_status.result != status.result: + manifest = write_manifest(manifest_status) + report = run_scored_report(fixture_path, MANIFEST_OUT, manifest_status) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + started_at, + report, + ) + write_summary(materialization, manifest, report) + print(f"graphify smoke artifact: {OUT}") + print(f"graphify smoke manifest: {MANIFEST_OUT}") + print(f"graphify smoke summary: {SUMMARY_OUT}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/graphiti-zep-docker-temporal-smoke.py b/scripts/graphiti-zep-docker-temporal-smoke.py new file mode 100644 index 00000000..065bb78c --- /dev/null +++ b/scripts/graphiti-zep-docker-temporal-smoke.py @@ -0,0 +1,1289 @@ +#!/usr/bin/env python3 +"""Docker-contained Graphiti/Zep temporal fact smoke for real-world adapters.""" + +from __future__ import annotations + +import json +import os +import shutil +import socket +import subprocess +import sys +import textwrap +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +SCRIPT_DIR = Path(__file__).resolve().parent +ROOT_DIR = SCRIPT_DIR.parent +REPORT_DIR = Path( + os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR", + ROOT_DIR / "tmp" / "real-world-memory" / "graphiti-zep-smoke", + ) +) +WORK_DIR = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR", REPORT_DIR / "work")) +OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_OUT", REPORT_DIR / "graphiti-zep-smoke.json")) +MANIFEST_OUT = Path( + os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_MANIFEST_OUT", + REPORT_DIR / "memory_projects_manifest.graphiti-zep-smoke.json", + ) +) +SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_JSON", REPORT_DIR / "graphiti-zep-report.json") +) +REPORT_MD = Path( + os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_REPORT_MD", REPORT_DIR / "graphiti-zep-report.md") +) +FIXTURE_DIR = REPORT_DIR / "graphiti-zep-fixtures" +LOG_DIR = REPORT_DIR / "logs" + +RUN_ID = os.environ.get( + "ELF_GRAPHITI_ZEP_SMOKE_RUN_ID", + f"graphiti-zep-docker-smoke-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", +) +RUN_LIVE = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_RUN", "0") == "1" +ALLOW_HOST = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_ALLOW_HOST", "0") == "1" +INSTALL_GRAPHITI = os.environ.get("ELF_GRAPHITI_ZEP_SMOKE_INSTALL", "1") == "1" +GRAPHITI_VERSION = os.environ.get("ELF_GRAPHITI_ZEP_VERSION", "0.21.0") +GRAPHITI_PACKAGE = os.environ.get( + "ELF_GRAPHITI_ZEP_PACKAGE", + f"graphiti-core[falkordb]=={GRAPHITI_VERSION}", +) +GRAPHITI_REF = os.environ.get("ELF_GRAPHITI_ZEP_REF", f"pypi:{GRAPHITI_PACKAGE}") +FALKORDB_HOST = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_HOST", "graphiti-falkordb") +FALKORDB_PORT = int(os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PORT", "6379")) +FALKORDB_DATABASE = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_DATABASE", "elf_graphiti_zep_smoke") +FALKORDB_USERNAME = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME", "") +FALKORDB_PASSWORD = os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD", "") +API_KEY = os.environ.get( + "ELF_GRAPHITI_ZEP_API_KEY", + os.environ.get("GRAPHITI_OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY", "")), +) +API_BASE = os.environ.get("ELF_GRAPHITI_ZEP_API_BASE", os.environ.get("OPENAI_BASE_URL", "")) +LLM_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_LLM_MODEL", "gpt-4o-mini") +EMBEDDING_MODEL = os.environ.get("ELF_GRAPHITI_ZEP_EMBEDDING_MODEL", "text-embedding-3-small") +TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS", "900")) +STARTUP_ATTEMPTS = int(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS", "30")) +STARTUP_INTERVAL_SECONDS = float(os.environ.get("ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS", "2")) + + +@dataclass +class StatusState: + """Typed status for generated Graphiti/Zep smoke artifacts.""" + + setup: str = "blocked" + run: str = "not_encoded" + result: str = "blocked" + overall: str = "blocked" + evidence_class: str = "research_gate" + failure_class: str = "graphiti_zep_live_run_disabled" + failure_reason: str = ( + "Graphiti/Zep temporal graph live run is opt-in; set " + "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 and provide explicit " + "provider configuration to attempt the Docker-local FalkorDB smoke." + ) + + +@dataclass +class CommandRecord: + """Captured command result without secret-bearing environment values.""" + + label: str + command: list[str] + status: str + elapsed_ms: float + stdout_artifact: str | None + stderr_artifact: str | None + returncode: int | None + reason: str + + +def utc_now() -> str: + """Return an RFC3339 UTC timestamp.""" + + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def rel(path: Path) -> str: + """Return a repository-relative path when possible.""" + + try: + return str(path.resolve().relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + +def mkdirs() -> None: + """Create output directories.""" + + for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, LOG_DIR): + path.mkdir(parents=True, exist_ok=True) + + +def write_json(path: Path, payload: Any) -> None: + """Write stable, pretty JSON.""" + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated temporal smoke fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphiti-zep", + "--adapter-id", + "graphiti_zep_temporal_smoke", + "--adapter-name", + "Graphiti/Zep Docker temporal smoke adapter", + "--adapter-behavior", + "docker_python_falkordb_temporal_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the Graphiti/Zep Docker temporal smoke; pass or wrong_result requires current and historical validity-window facts mapped to generated evidence ids, while provider/setup limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + +def command_available(command: str) -> bool: + """Return whether a command is on PATH.""" + + return shutil.which(command) is not None + + +def dir_size(path: Path) -> int: + """Return total file size for a directory or file.""" + + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + + return sum(item.stat().st_size for item in path.rglob("*") if item.is_file()) + + +def file_count(path: Path) -> int: + """Return file count for a directory.""" + + if not path.exists(): + return 0 + + return sum(1 for item in path.rglob("*") if item.is_file()) + + +def temporal_facts() -> list[dict[str, Any]]: + """Return the generated-public temporal fact corpus.""" + + return [ + { + "evidence_id": "graphiti-zep-old-owner", + "claim_id": "relation_historical_owner", + "source": "Team Delta", + "edge_name": "OWNED_REVIEW", + "target": "deployment method review", + "fact": "Team Delta owned deployment method review before 2026-06-06.", + "valid_at": "2026-06-05T00:00:00Z", + "invalid_at": "2026-06-08T00:00:00Z", + "created_at": "2026-06-05T00:00:00Z", + "current": False, + }, + { + "evidence_id": "graphiti-zep-current-owner", + "claim_id": "relation_current_owner", + "source": "Team Echo", + "edge_name": "OWNS_REVIEW", + "target": "deployment method review", + "fact": "Team Echo owns deployment method review since 2026-06-08.", + "valid_at": "2026-06-08T00:00:00Z", + "invalid_at": None, + "created_at": "2026-06-08T00:00:00Z", + "current": True, + }, + { + "evidence_id": "graphiti-zep-owner-rationale", + "claim_id": "relation_owner_update_rationale", + "source": "single-user production runbook scope", + "edge_name": "MOVED_OWNERSHIP_TO", + "target": "Team Echo", + "fact": "Ownership moved to Team Echo after single-user production runbook scope changed.", + "valid_at": "2026-06-08T00:05:00Z", + "invalid_at": None, + "created_at": "2026-06-08T00:05:00Z", + "current": True, + }, + ] + + +def command_to_json(record: CommandRecord) -> dict[str, Any]: + """Serialize a command record.""" + + return { + "label": record.label, + "status": record.status, + "command": record.command, + "elapsed_ms": round(record.elapsed_ms, 3), + "stdout_artifact": record.stdout_artifact, + "stderr_artifact": record.stderr_artifact, + "returncode": record.returncode, + "reason": record.reason, + } + + +def run_command( + label: str, + command: list[str], + cwd: Path, + timeout: int = TIMEOUT_SECONDS, + extra_env: dict[str, str] | None = None, +) -> CommandRecord: + """Run a subprocess and capture stdout/stderr artifacts.""" + + cwd.mkdir(parents=True, exist_ok=True) + stdout_path = LOG_DIR / f"{label}.stdout.log" + stderr_path = LOG_DIR / f"{label}.stderr.log" + env = os.environ.copy() + + if extra_env: + env.update(extra_env) + + started = time.monotonic() + try: + proc = subprocess.run( + command, + cwd=cwd, + env=env, + text=True, + capture_output=True, + timeout=timeout, + check=False, + ) + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(proc.stdout, encoding="utf-8") + stderr_path.write_text(proc.stderr, encoding="utf-8") + status = "pass" if proc.returncode == 0 else "incomplete" + reason = "Command completed." if proc.returncode == 0 else f"Command exited {proc.returncode}." + + return CommandRecord( + label=label, + command=command, + status=status, + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=proc.returncode, + reason=reason, + ) + except subprocess.TimeoutExpired as err: + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(err.stdout or "", encoding="utf-8") + stderr_path.write_text(err.stderr or "", encoding="utf-8") + + return CommandRecord( + label=label, + command=command, + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=None, + reason=f"Command timed out after {timeout} seconds.", + ) + + +def wait_for_falkordb(command_records: list[CommandRecord]) -> bool: + """Poll the configured FalkorDB TCP endpoint.""" + + started = time.monotonic() + attempts: list[dict[str, Any]] = [] + + for attempt in range(1, STARTUP_ATTEMPTS + 1): + try: + with socket.create_connection((FALKORDB_HOST, FALKORDB_PORT), timeout=2): + elapsed_ms = (time.monotonic() - started) * 1000 + attempts.append({"attempt": attempt, "status": "pass", "elapsed_ms": round(elapsed_ms, 3)}) + path = LOG_DIR / "falkordb-startup-attempts.json" + write_json(path, attempts) + command_records.append( + CommandRecord( + label="falkordb-startup", + command=["tcp-connect", FALKORDB_HOST, str(FALKORDB_PORT)], + status="pass", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(path), + stderr_artifact=None, + returncode=0, + reason="FalkorDB TCP endpoint accepted a connection.", + ) + ) + return True + except OSError as err: + attempts.append({"attempt": attempt, "status": "incomplete", "reason": str(err)}) + time.sleep(STARTUP_INTERVAL_SECONDS) + + elapsed_ms = (time.monotonic() - started) * 1000 + path = LOG_DIR / "falkordb-startup-attempts.json" + write_json(path, attempts) + command_records.append( + CommandRecord( + label="falkordb-startup", + command=["tcp-connect", FALKORDB_HOST, str(FALKORDB_PORT)], + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(path), + stderr_artifact=None, + returncode=None, + reason="FalkorDB TCP endpoint did not become reachable.", + ) + ) + return False + + +def init_graphiti(command_records: list[CommandRecord]) -> tuple[bool, Path]: + """Create a venv and install Graphiti with FalkorDB support.""" + + venv_dir = WORK_DIR / ".venv" + python = venv_dir / "bin" / "python" + + if INSTALL_GRAPHITI: + venv_record = run_command("python-venv", [sys.executable, "-m", "venv", str(venv_dir)], WORK_DIR) + command_records.append(venv_record) + if venv_record.status != "pass": + return False, python + + install_record = run_command( + "graphiti-install", + [str(python), "-m", "pip", "install", "--disable-pip-version-check", GRAPHITI_PACKAGE], + WORK_DIR, + ) + command_records.append(install_record) + if install_record.status != "pass": + return False, python + elif not python.exists(): + command_records.append( + CommandRecord( + label="graphiti-install", + command=["graphiti-core"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="Graphiti install was disabled and no venv python exists.", + ) + ) + return False, python + + return True, python + + +def write_live_runner(path: Path) -> None: + """Write the isolated Graphiti execution script.""" + + payload = { + "run_id": RUN_ID, + "facts": temporal_facts(), + "query": "Who currently owns deployment method review, and who owned it historically?", + "falkordb": { + "host": FALKORDB_HOST, + "port": FALKORDB_PORT, + "database": FALKORDB_DATABASE, + }, + "models": { + "llm": LLM_MODEL, + "embedding": EMBEDDING_MODEL, + "api_base": API_BASE, + }, + } + input_path = WORK_DIR / "graphiti-live-input.json" + output_path = WORK_DIR / "graphiti-live-output.json" + write_json(input_path, payload) + script = f""" +import asyncio +import json +import os +import uuid +from datetime import datetime +from pathlib import Path + +from graphiti_core import Graphiti +from graphiti_core.driver.falkordb_driver import FalkorDriver +from graphiti_core.edges import EntityEdge +from graphiti_core.nodes import EntityNode + + +INPUT = Path({str(input_path)!r}) +OUTPUT = Path({str(output_path)!r}) + + +def parse_dt(value): + if value is None: + return None + return datetime.fromisoformat(value.replace("Z", "+00:00")) + + +async def main(): + data = json.loads(INPUT.read_text(encoding="utf-8")) + config = data["falkordb"] + driver = FalkorDriver( + host=config["host"], + port=config["port"], + username=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_USERNAME") or None, + password=os.environ.get("ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD") or None, + database=config.get("database") or "default_db", + ) + graphiti = Graphiti(graph_driver=driver) + try: + await graphiti.build_indices_and_constraints() + inserted = [] + for fact in data["facts"]: + group_id = data["run_id"] + source_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":source:" + fact["source"])) + target_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":target:" + fact["target"])) + edge_uuid = str(uuid.uuid5(uuid.NAMESPACE_URL, group_id + ":edge:" + fact["evidence_id"])) + source_node = EntityNode(uuid=source_uuid, name=fact["source"], group_id=group_id) + target_node = EntityNode(uuid=target_uuid, name=fact["target"], group_id=group_id) + edge = EntityEdge( + uuid=edge_uuid, + group_id=group_id, + source_node_uuid=source_uuid, + target_node_uuid=target_uuid, + created_at=parse_dt(fact["created_at"]), + name=fact["edge_name"], + fact=fact["fact"], + valid_at=parse_dt(fact["valid_at"]), + invalid_at=parse_dt(fact.get("invalid_at")), + ) + await graphiti.add_triplet(source_node, edge, target_node) + inserted.append({{"evidence_id": fact["evidence_id"], "uuid": edge_uuid}}) + + results = await graphiti.search(data["query"]) + serialized = [] + for edge in results: + serialized.append({{ + "uuid": getattr(edge, "uuid", None), + "name": getattr(edge, "name", None), + "fact": getattr(edge, "fact", None), + "valid_at": str(getattr(edge, "valid_at", "")) if getattr(edge, "valid_at", None) else None, + "invalid_at": str(getattr(edge, "invalid_at", "")) if getattr(edge, "invalid_at", None) else None, + "source_node_uuid": getattr(edge, "source_node_uuid", None), + "target_node_uuid": getattr(edge, "target_node_uuid", None), + }}) + + OUTPUT.write_text(json.dumps({{"inserted": inserted, "results": serialized}}, indent=2, sort_keys=True) + "\\n", encoding="utf-8") + finally: + await graphiti.close() + + +asyncio.run(main()) +""" + path.write_text(textwrap.dedent(script).lstrip(), encoding="utf-8") + + +def run_graphiti(python: Path, command_records: list[CommandRecord]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Run the Graphiti live worker and return inserted/search result facts.""" + + runner = WORK_DIR / "graphiti_live_runner.py" + write_live_runner(runner) + env = { + "OPENAI_API_KEY": API_KEY, + "MODEL_NAME": LLM_MODEL, + "LLM_MODEL": LLM_MODEL, + "EMBEDDING_MODEL": EMBEDDING_MODEL, + } + + if API_BASE: + env["OPENAI_BASE_URL"] = API_BASE + if FALKORDB_USERNAME: + env["ELF_GRAPHITI_ZEP_FALKORDB_USERNAME"] = FALKORDB_USERNAME + if FALKORDB_PASSWORD: + env["ELF_GRAPHITI_ZEP_FALKORDB_PASSWORD"] = FALKORDB_PASSWORD + + record = run_command("graphiti-live-run", [str(python), str(runner)], WORK_DIR, extra_env=env) + command_records.append(record) + + output_path = WORK_DIR / "graphiti-live-output.json" + if record.status != "pass" or not output_path.exists(): + return [], [] + + payload = json.loads(output_path.read_text(encoding="utf-8")) + return payload.get("inserted", []), payload.get("results", []) + + +def map_observed_facts(results: list[dict[str, Any]], facts: list[dict[str, Any]]) -> dict[str, Any]: + """Map Graphiti search results back to expected evidence ids.""" + + expected_by_id = {fact["evidence_id"]: fact for fact in facts} + mappings: list[dict[str, Any]] = [] + mapped_ids: list[str] = [] + + for fact in facts: + matched = [ + result + for result in results + if isinstance(result.get("fact"), str) and fact["fact"].lower() in result["fact"].lower() + ] + if matched: + result = matched[0] + mapped_ids.append(fact["evidence_id"]) + mappings.append( + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "pass", + "uuid": result.get("uuid"), + "fact": result.get("fact"), + "valid_at": result.get("valid_at"), + "invalid_at": result.get("invalid_at"), + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + ) + else: + mappings.append( + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "blocked", + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + ) + + current_ok = any( + item["evidence_id"] == "graphiti-zep-current-owner" + and item["status"] == "pass" + and not item.get("invalid_at") + for item in mappings + ) + historical_ok = any( + item["evidence_id"] == "graphiti-zep-old-owner" + and item["status"] == "pass" + and item.get("invalid_at") + for item in mappings + ) + rationale_ok = "graphiti-zep-owner-rationale" in mapped_ids + required_ids = list(expected_by_id) + missing_ids = [evidence_id for evidence_id in required_ids if evidence_id not in mapped_ids] + + if current_ok and historical_ok and rationale_ok: + status = "pass" + reason = "Graphiti/Zep search results mapped current, historical, and rationale facts with validity windows." + else: + status = "wrong_result" + reason = ( + "Graphiti/Zep search results did not map all required temporal facts with expected validity " + f"windows; missing={', '.join(missing_ids) or 'none'}." + ) + + return { + "status": status, + "reason": reason, + "expected_evidence_ids": required_ids, + "mapped_evidence_ids": mapped_ids, + "facts": mappings, + } + + +def write_fixture(facts: list[dict[str, Any]], status: StatusState, mapping: dict[str, Any]) -> Path: + """Write a generated memory_evolution fixture for the smoke.""" + + fixture_path = FIXTURE_DIR / "memory_evolution" / "graphiti_zep_temporal_validity.json" + mapped_ids = mapping.get("mapped_evidence_ids", []) + claims = [] + + if status.result == "pass": + claims = [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + "evidence_ids": [ + "graphiti-zep-current-owner", + "graphiti-zep-old-owner", + "graphiti-zep-owner-rationale", + ], + "confidence": "derived_from_graphiti_temporal_search", + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + "evidence_ids": ["graphiti-zep-old-owner"], + "confidence": "derived_from_graphiti_temporal_search", + }, + { + "claim_id": "relation_owner_update_rationale", + "text": "Ownership moved after single-user production runbook scope changed.", + "evidence_ids": ["graphiti-zep-owner-rationale"], + "confidence": "derived_from_graphiti_temporal_search", + }, + ] + + fixture: dict[str, Any] = { + "schema": "elf.real_world_job/v1", + "job_id": "graphiti-zep-temporal-validity-001", + "suite": "memory_evolution", + "title": "Map Graphiti/Zep temporal validity windows to current and historical relation facts", + "corpus": { + "corpus_id": "graphiti-zep-generated-public-smoke", + "profile": "generated_public", + "items": [ + { + "evidence_id": fact["evidence_id"], + "kind": "temporal_fact", + "text": fact["fact"], + "source_ref": { + "schema": "source_ref/v1", + "resolver": "graphiti_zep_smoke/v1", + "ref": { + "run_id": RUN_ID, + "evidence_id": fact["evidence_id"], + "valid_at": fact["valid_at"], + "invalid_at": fact["invalid_at"], + }, + }, + "created_at": fact["created_at"], + } + for fact in facts + ], + "adapter_response": { + "adapter_id": "graphiti_zep_temporal_smoke", + "answer": { + "content": ( + "Team Echo currently owns deployment method review. Team Delta owned it " + "historically, and the move followed the single-user production runbook scope change." + if claims + else "" + ), + "claims": claims, + "evidence_ids": mapped_ids, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0, + }, + }, + }, + }, + "timeline": [ + { + "event_id": "graphiti-zep-old-owner", + "ts": "2026-06-05T00:00:00Z", + "actor": "agent", + "action": "recorded_relation", + "evidence_ids": ["graphiti-zep-old-owner"], + "summary": "Team Delta was the historical owner.", + }, + { + "event_id": "graphiti-zep-current-owner", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": ["graphiti-zep-current-owner", "graphiti-zep-owner-rationale"], + "summary": "Team Echo became the current owner after the scope changed.", + }, + ], + "prompt": { + "role": "user", + "content": "Who currently owns deployment method review, and who owned it historically?", + "job_mode": "answer", + "constraints": ["cite_evidence", "distinguish_current_from_historical"], + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "relation_current_owner", + "text": "Team Echo currently owns deployment method review.", + }, + { + "claim_id": "relation_historical_owner", + "text": "Team Delta owned deployment method review historically.", + }, + ], + "must_not_include": ["Team Delta currently owns deployment method review."], + "evidence_links": { + "relation_current_owner": [ + "graphiti-zep-current-owner", + "graphiti-zep-old-owner", + "graphiti-zep-owner-rationale", + ], + "relation_historical_owner": ["graphiti-zep-old-owner"], + "relation_owner_update_rationale": ["graphiti-zep-owner-rationale"], + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": False, + "requires_refusal": False, + }, + "required_evidence": [ + { + "evidence_id": "graphiti-zep-current-owner", + "claim_id": "relation_current_owner", + "requirement": "cite", + "quote": "Team Echo owns deployment method review", + }, + { + "evidence_id": "graphiti-zep-old-owner", + "claim_id": "relation_historical_owner", + "requirement": "cite", + "quote": "Team Delta owned deployment method review", + }, + ], + "negative_traps": [ + { + "trap_id": "old-owner-as-current", + "type": "stale_fact", + "evidence_ids": ["graphiti-zep-old-owner"], + "failure_if_used": False, + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.4, + "max_points": 1.0, + "criteria": "Requires current-only versus historical temporal validity for relation facts.", + }, + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Would identify current and historical owners separately.", + }, + "evidence_grounding": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Would cite both current and historical relation evidence.", + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Would not report the historical owner as current.", + }, + }, + "pass_threshold": 0.8, + "hard_fail_rules": [], + }, + "allowed_uncertainty": { + "can_answer_unknown": False, + "acceptable_phrases": ["Graphiti/Zep smoke did not return temporal facts."], + "fallback_action": "score_temporal_relation_behavior", + }, + "memory_evolution": { + "current_evidence_ids": ["graphiti-zep-current-owner"], + "historical_evidence_ids": ["graphiti-zep-old-owner"], + "stale_trap_ids": ["old-owner-as-current"], + "conflicts": [ + { + "conflict_id": "relation-owner-current-historical", + "claim_id": "relation_current_owner", + "current_evidence_id": "graphiti-zep-current-owner", + "historical_evidence_id": "graphiti-zep-old-owner", + "resolved_by_evidence_id": "graphiti-zep-owner-rationale", + } + ], + "update_rationale": { + "claim_id": "relation_owner_update_rationale", + "evidence_ids": ["graphiti-zep-owner-rationale"], + "available": True, + }, + "temporal_validity": {"required": True, "encoded": True}, + }, + "tags": ["external_adapter", "generated_public", "memory_evolution", "reference_graphiti_zep_temporal"], + } + + if status.result in {"blocked", "incomplete", "not_encoded"}: + fixture["encoding"] = {"status": status.result, "reason": status.failure_reason} + + write_json(fixture_path, fixture) + + return fixture_path + + +def write_materialization( + status: StatusState, + facts: list[dict[str, Any]], + fixture_path: Path, + command_records: list[CommandRecord], + inserted: list[dict[str, Any]], + search_results: list[dict[str, Any]], + mapping: dict[str, Any], + started_at: float, + report: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Write the primary smoke artifact.""" + + elapsed_ms = (time.monotonic() - started_at) * 1000 + payload = { + "schema": "elf.graphiti_zep_temporal_smoke/v1", + "generated_at": utc_now(), + "run_id": RUN_ID, + "adapter_id": "graphiti_zep_temporal_smoke", + "project": "Graphiti/Zep", + "status": status.overall, + "materialization_status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), + "evidence_class": status.evidence_class, + "failure": { + "class": status.failure_class or None, + "reason": status.failure_reason or None, + }, + "artifacts": { + "materialization": rel(OUT), + "manifest": rel(MANIFEST_OUT), + "summary": rel(SUMMARY_OUT), + "fixture": rel(fixture_path), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "service_profile": "graphiti-zep", + "graph_store_service": "graphiti-falkordb", + "runner_service": "baseline-runner", + "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", + "host_global_installs_required": False, + "docker_only": True, + }, + "provider_configuration": { + "package": GRAPHITI_REF, + "package_spec": GRAPHITI_PACKAGE, + "llm_model": LLM_MODEL, + "embedding_model": EMBEDDING_MODEL, + "api_base_configured": bool(API_BASE), + "api_key_provided": bool(API_KEY), + "operator_owned_provider_credentials_used": False, + "live_run_enabled": RUN_LIVE, + "falkordb": { + "host": FALKORDB_HOST, + "port": FALKORDB_PORT, + "database": FALKORDB_DATABASE, + "username_configured": bool(FALKORDB_USERNAME), + "password_configured": bool(FALKORDB_PASSWORD), + }, + }, + "resource_bounds": { + "fact_count": len(facts), + "timeout_seconds": TIMEOUT_SECONDS, + "elapsed_ms": round(elapsed_ms, 3), + "work_dir_size_bytes": dir_size(WORK_DIR), + "work_dir_file_count": file_count(WORK_DIR), + }, + "commands": [command_to_json(record) for record in command_records], + "temporal_facts": facts, + "inserted_facts": inserted, + "search_results": search_results, + "evidence_mapping": mapping, + } + write_json(OUT, payload) + + return payload + + +def write_manifest(status: StatusState) -> dict[str, Any]: + """Write a generated external adapter manifest for this smoke.""" + + manifest = { + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": f"graphiti-zep-temporal-smoke-{RUN_ID}", + "docker_isolation": { + "default": True, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphiti-zep-docker-temporal-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphiti-zep-smoke", + "host_global_installs_required": False, + "notes": [ + f"Generated by the Graphiti/Zep Docker smoke at {utc_now()}.", + "The smoke uses generated public temporal facts and records typed setup/runtime failures.", + ], + }, + "adapters": [ + { + "adapter_id": "graphiti_zep_temporal_smoke", + "project": "Graphiti/Zep", + "adapter_kind": "docker_python_falkordb_temporal_smoke", + "evidence_class": status.evidence_class, + "docker_default": True, + "host_global_installs_required": False, + "overall_status": status.overall, + "setup": { + "status": status.setup, + "evidence": "The smoke runs inside the baseline Docker runner and uses Docker-local FalkorDB plus a container-local Python venv.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path adds generated temporal fact triples and searches Graphiti/Zep for UUID, fact, valid_at, invalid_at, and source node evidence.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "Graphiti/Zep temporal search mapped current and historical facts to validity windows.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_falkordb_setup", + "status": status.setup, + "evidence": "The task starts a Docker Compose FalkorDB profile only when explicitly requested, and uses no host-global graph database.", + }, + { + "capability": "temporal_fact_triple_ingest", + "status": status.run, + "evidence": "The live worker uses Graphiti fact triples for current, historical, and rationale facts with validity windows.", + }, + { + "capability": "validity_window_evidence_mapping", + "status": status.result, + "evidence": "Search output UUID, fact text, valid_at, invalid_at, and node ids are mapped to memory_evolution expected evidence ids.", + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, large-corpus behavior, managed Zep service behavior, or private-corpus performance.", + }, + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": status.result, + "evidence": "Only generated current-versus-historical temporal relation facts are represented.", + }, + { + "suite_id": "retrieval", + "status": status.run if status.run != "pass" else "not_encoded", + "evidence": "Hybrid retrieval reachability is exercised by the live search, but broad retrieval quality scoring is not encoded.", + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations.", + }, + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://github.com/getzep/graphiti", "status": "real"}, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/getting-started/quick-start", + "status": "real", + }, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "status": "real", + }, + { + "kind": "source", + "ref": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official source for the open-source temporal context graph engine.", + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official search output examples include UUID, fact, valid_at, and invalid_at fields.", + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup and Python driver reference.", + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract.", + }, + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": f"Graphiti package {GRAPHITI_REF}, fact_count=3, timeout_seconds={TIMEOUT_SECONDS}, FalkorDB host={FALKORDB_HOST}:{FALKORDB_PORT}.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local FalkorDB and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass.", + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation; generated artifact decides live evidence class.", + }, + "notes": [ + "The checked-in manifest record remains research_gate; generated smoke artifacts carry live status.", + "Failure before Graphiti search output remains typed as blocked or incomplete.", + "The smoke does not use a hosted Zep service, private corpora, or unrecorded provider credentials.", + ], + } + ], + } + write_json(MANIFEST_OUT, manifest) + + return manifest + + +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: + """Write a small summary artifact.""" + + write_json( + SUMMARY_OUT, + { + "schema": "elf.graphiti_zep_temporal_smoke_summary/v1", + "generated_at": utc_now(), + "adapter_id": "graphiti_zep_temporal_smoke", + "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], + "materialization": materialization, + "manifest": { + "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_pre_score", + "summary": manifest["adapters"][0]["overall_status"], + "suites": manifest["adapters"][0]["suites"], + }, + "report": report, + }, + ) + + +def main() -> int: + """Run the smoke and always emit typed artifacts when possible.""" + + started_at = time.monotonic() + mkdirs() + status = StatusState() + command_records: list[CommandRecord] = [] + facts = temporal_facts() + inserted: list[dict[str, Any]] = [] + search_results: list[dict[str, Any]] = [] + mapping: dict[str, Any] = { + "status": "blocked", + "reason": status.failure_reason, + "expected_evidence_ids": [fact["evidence_id"] for fact in facts], + "mapped_evidence_ids": [], + "facts": [ + { + "evidence_id": fact["evidence_id"], + "claim_id": fact["claim_id"], + "status": "blocked", + "expected_valid_at": fact["valid_at"], + "expected_invalid_at": fact["invalid_at"], + "current": fact["current"], + } + for fact in facts + ], + } + + if not Path("/.dockerenv").exists() and not ALLOW_HOST: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "not_running_in_docker" + status.failure_reason = "Graphiti/Zep smoke must run inside Docker; use cargo make smoke-graphiti-zep-docker-temporal." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + elif not command_available("python3"): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "python_missing" + status.failure_reason = "python3 is required for the Graphiti/Zep smoke runner." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + elif not RUN_LIVE: + pass + elif not API_KEY: + status.setup = "blocked" + status.run = "not_encoded" + status.result = "blocked" + status.overall = "blocked" + status.failure_class = "provider_api_key_missing" + status.failure_reason = "Graphiti/Zep live temporal search requires an explicit provider API key; no hosted Zep service or unrecorded provider credentials were used." + mapping["reason"] = status.failure_reason + elif not wait_for_falkordb(command_records): + status.setup = "incomplete" + status.run = "not_encoded" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "falkordb_unreachable" + status.failure_reason = "Docker-local FalkorDB did not become reachable for the Graphiti/Zep smoke." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + installed, python = init_graphiti(command_records) + if not installed: + status.setup = "incomplete" + status.run = "not_encoded" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphiti_setup_failed" + status.failure_reason = "Graphiti installation failed inside the Docker runner." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + status.setup = "pass" + inserted, search_results = run_graphiti(python, command_records) + + if not search_results: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphiti_temporal_search_failed" + status.failure_reason = "Graphiti/Zep did not return temporal search results for the generated fact corpus." + mapping["status"] = status.result + mapping["reason"] = status.failure_reason + else: + status.run = "pass" + status.evidence_class = "live_real_world" + mapping = map_observed_facts(search_results, facts) + if mapping["status"] == "pass": + status.result = "pass" + status.overall = "pass" + status.failure_class = "" + status.failure_reason = "" + else: + status.result = "wrong_result" + status.overall = "wrong_result" + status.failure_class = "graphiti_temporal_mapping_failed" + status.failure_reason = mapping["reason"] + + fixture_path = write_fixture(facts, status, mapping) + materialization = write_materialization( + status, + facts, + fixture_path, + command_records, + inserted, + search_results, + mapping, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + facts, + fixture_path, + command_records, + inserted, + search_results, + mapping, + started_at, + report, + ) + write_summary(materialization, manifest, report) + print(f"Graphiti/Zep smoke artifact: {OUT}") + print(f"Graphiti/Zep smoke manifest: {MANIFEST_OUT}") + print(f"Graphiti/Zep smoke summary: {SUMMARY_OUT}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/graphrag-docker-smoke.py b/scripts/graphrag-docker-smoke.py new file mode 100755 index 00000000..c6b01d45 --- /dev/null +++ b/scripts/graphrag-docker-smoke.py @@ -0,0 +1,1470 @@ +#!/usr/bin/env python3 +"""Cost-bounded GraphRAG Docker smoke for real-world external adapters.""" + +from __future__ import annotations + +import csv +import json +import math +import os +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +SCRIPT_DIR = Path(__file__).resolve().parent +ROOT_DIR = SCRIPT_DIR.parent +REPORT_DIR = Path( + os.environ.get( + "ELF_GRAPHRAG_SMOKE_REPORT_DIR", + ROOT_DIR / "tmp" / "real-world-memory" / "graphrag-smoke", + ) +) +WORK_DIR = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_WORK_DIR", REPORT_DIR / "work")) +OUT = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_OUT", REPORT_DIR / "graphrag-smoke.json")) +MANIFEST_OUT = Path( + os.environ.get( + "ELF_GRAPHRAG_SMOKE_MANIFEST_OUT", + REPORT_DIR / "memory_projects_manifest.graphrag-smoke.json", + ) +) +SUMMARY_OUT = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_SUMMARY_OUT", REPORT_DIR / "summary.json")) +REPORT_JSON = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_REPORT_JSON", REPORT_DIR / "graphrag-report.json")) +REPORT_MD = Path(os.environ.get("ELF_GRAPHRAG_SMOKE_REPORT_MD", REPORT_DIR / "graphrag-report.md")) +FIXTURE_DIR = REPORT_DIR / "graphrag-fixtures" +OUTPUT_CAPTURE_DIR = REPORT_DIR / "graphrag-output" +LOG_DIR = REPORT_DIR / "logs" + +RUN_ID = os.environ.get( + "ELF_GRAPHRAG_SMOKE_RUN_ID", + f"graphrag-docker-smoke-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}", +) +RUN_LIVE = os.environ.get("ELF_GRAPHRAG_SMOKE_RUN", "0") == "1" +ALLOW_HOST = os.environ.get("ELF_GRAPHRAG_SMOKE_ALLOW_HOST", "0") == "1" +INSTALL_GRAPHRAG = os.environ.get("ELF_GRAPHRAG_SMOKE_INSTALL", "1") == "1" +GRAPH_RAG_VERSION = os.environ.get("ELF_GRAPHRAG_VERSION", "3.1.0") +GRAPH_RAG_PACKAGE = os.environ.get("ELF_GRAPHRAG_PACKAGE", f"graphrag=={GRAPH_RAG_VERSION}") +GRAPH_RAG_REF = os.environ.get("ELF_GRAPHRAG_REF", f"pypi:{GRAPH_RAG_PACKAGE}") +CHAT_MODEL = os.environ.get("ELF_GRAPHRAG_CHAT_MODEL", "gpt-4o-mini") +EMBEDDING_MODEL = os.environ.get("ELF_GRAPHRAG_EMBEDDING_MODEL", "text-embedding-3-small") +API_BASE = os.environ.get("ELF_GRAPHRAG_API_BASE", "") +API_KEY = os.environ.get("ELF_GRAPHRAG_API_KEY", os.environ.get("GRAPHRAG_API_KEY", "")) +INDEX_METHOD = os.environ.get("ELF_GRAPHRAG_INDEX_METHOD", "fast") +QUERY_METHOD = os.environ.get("ELF_GRAPHRAG_QUERY_METHOD", "local") +TIMEOUT_SECONDS = int(os.environ.get("ELF_GRAPHRAG_TIMEOUT_SECONDS", "900")) +MAX_DOCS = max(1, min(int(os.environ.get("ELF_GRAPHRAG_MAX_DOCS", "3")), 3)) +MAX_INPUT_CHARS = max(400, min(int(os.environ.get("ELF_GRAPHRAG_MAX_INPUT_CHARS", "2400")), 6000)) + +TABLES = ( + "documents", + "text_units", + "communities", + "community_reports", + "entities", + "relationships", +) + + +@dataclass +class StatusState: + """Typed status for generated GraphRAG smoke artifacts.""" + + setup: str = "blocked" + run: str = "not_encoded" + result: str = "blocked" + overall: str = "blocked" + evidence_class: str = "research_gate" + failure_class: str = "graphrag_live_run_disabled" + failure_reason: str = ( + "GraphRAG indexing is model-call intensive; set ELF_GRAPHRAG_SMOKE_RUN=1 " + "and provide explicit provider configuration to attempt the live Docker smoke." + ) + + +@dataclass +class CommandRecord: + """Captured command result without secret-bearing environment values.""" + + label: str + command: list[str] + status: str + elapsed_ms: float + stdout_artifact: str | None + stderr_artifact: str | None + returncode: int | None + reason: str + + +def utc_now() -> str: + """Return an RFC3339 UTC timestamp.""" + + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def rel(path: Path) -> str: + """Return a repository-relative path when possible.""" + + try: + return str(path.resolve().relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + +def mkdirs() -> None: + """Create output directories.""" + + for path in (REPORT_DIR, WORK_DIR, FIXTURE_DIR, OUTPUT_CAPTURE_DIR, LOG_DIR): + path.mkdir(parents=True, exist_ok=True) + + +def write_json(path: Path, payload: Any) -> None: + """Write stable, pretty JSON.""" + + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +def run_scored_report(fixture_path: Path, manifest_path: Path, status: StatusState) -> dict[str, Any]: + """Score the generated smoke fixture through the real-world job runner.""" + + run_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + str(fixture_path), + "--out", + str(REPORT_JSON), + "--run-id", + "real-world-memory-live-graphrag", + "--adapter-id", + "graphrag_docker_smoke", + "--adapter-name", + "GraphRAG Docker smoke adapter", + "--adapter-behavior", + "docker_python_cli_api_smoke", + "--adapter-storage-status", + status.setup, + "--adapter-runtime-status", + status.overall, + "--adapter-notes", + "Generated by the cost-bounded GraphRAG Docker smoke; pass or wrong_result requires GraphRAG output tables mapped to generated evidence ids, while provider/setup limits remain typed.", + "--external-adapter-manifest", + str(manifest_path), + ] + publish_cmd = [ + "cargo", + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + str(REPORT_JSON), + "--out", + str(REPORT_MD), + ] + + subprocess.run(run_cmd, cwd=ROOT_DIR, check=True) + subprocess.run(publish_cmd, cwd=ROOT_DIR, check=True) + + report = json.loads(REPORT_JSON.read_text(encoding="utf-8")) + + return { + "json": rel(REPORT_JSON), + "markdown": rel(REPORT_MD), + "summary": report.get("summary", {}), + "suites": report.get("suites", []), + } + + +def scored_benchmark(report: dict[str, Any] | None) -> dict[str, Any]: + """Extract the post-score benchmark status from a real_world_job report.""" + + if report is None: + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": "pending", + "reason": "The smoke materialization was written before benchmark scoring completed.", + } + + summary = report.get("summary", {}) + counts = { + status: int(summary.get(status, 0) or 0) + for status in ( + "pass", + "wrong_result", + "lifecycle_fail", + "incomplete", + "blocked", + "not_encoded", + ) + } + status = next((name for name, count in counts.items() if name != "pass" and count > 0), "pass") + + return { + "schema": "elf.scored_benchmark_status/v1", + "source": "real_world_job_benchmark", + "status": status, + "counts": counts, + "job_count": int(summary.get("job_count", 0) or 0), + "mean_score": summary.get("mean_score"), + "evidence_coverage": summary.get("evidence_coverage"), + } + + +def dir_size(path: Path) -> int: + """Return total file size for a directory or file.""" + + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + + return sum(item.stat().st_size for item in path.rglob("*") if item.is_file()) + + +def file_count(path: Path) -> int: + """Return file count for a directory.""" + + if not path.exists(): + return 0 + + return sum(1 for item in path.rglob("*") if item.is_file()) + + +def command_available(command: str) -> bool: + """Return whether a command is on PATH.""" + + return shutil.which(command) is not None + + +def run_command( + label: str, + command: list[str], + cwd: Path, + timeout: int = TIMEOUT_SECONDS, + extra_env: dict[str, str] | None = None, +) -> CommandRecord: + """Run a subprocess and capture stdout/stderr artifacts.""" + + cwd.mkdir(parents=True, exist_ok=True) + stdout_path = LOG_DIR / f"{label}.stdout.log" + stderr_path = LOG_DIR / f"{label}.stderr.log" + env = os.environ.copy() + + if extra_env: + env.update(extra_env) + + started = time.monotonic() + try: + proc = subprocess.run( + command, + cwd=cwd, + env=env, + text=True, + capture_output=True, + timeout=timeout, + check=False, + ) + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(proc.stdout, encoding="utf-8") + stderr_path.write_text(proc.stderr, encoding="utf-8") + status = "pass" if proc.returncode == 0 else "incomplete" + reason = "Command completed." if proc.returncode == 0 else f"Command exited {proc.returncode}." + + return CommandRecord( + label=label, + command=command, + status=status, + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=proc.returncode, + reason=reason, + ) + except subprocess.TimeoutExpired as err: + elapsed_ms = (time.monotonic() - started) * 1000 + stdout_path.write_text(err.stdout or "", encoding="utf-8") + stderr_path.write_text(err.stderr or "", encoding="utf-8") + + return CommandRecord( + label=label, + command=command, + status="incomplete", + elapsed_ms=elapsed_ms, + stdout_artifact=rel(stdout_path), + stderr_artifact=rel(stderr_path), + returncode=None, + reason=f"Command timed out after {timeout} seconds.", + ) + + +def command_to_json(record: CommandRecord) -> dict[str, Any]: + """Serialize a command record.""" + + return { + "label": record.label, + "status": record.status, + "command": record.command, + "elapsed_ms": round(record.elapsed_ms, 3), + "stdout_artifact": record.stdout_artifact, + "stderr_artifact": record.stderr_artifact, + "returncode": record.returncode, + "reason": record.reason, + } + + +def generated_corpus() -> list[dict[str, str]]: + """Return the bounded generated-public corpus.""" + + docs = [ + { + "evidence_id": "graphrag-smoke-nova-observatory", + "title": "Nova Observatory memo", + "text": ( + "Evidence ID graphrag-smoke-nova-observatory. Nova Observatory " + "operates the public Aurora Index review. The Aurora Index links " + "skyglow measurements to open weather station readings for civic " + "science audits. The GraphRAG smoke must map this source document " + "and its text unit back to the Nova Observatory evidence id." + ), + }, + { + "evidence_id": "graphrag-smoke-aurora-index", + "title": "Aurora Index field note", + "text": ( + "Evidence ID graphrag-smoke-aurora-index. The Aurora Index uses " + "Nova Observatory calibration notes when explaining why a public " + "skyglow reading changed. The GraphRAG smoke must keep the Aurora " + "Index source document and text unit evidence id recoverable." + ), + }, + { + "evidence_id": "graphrag-smoke-stale-trap", + "title": "Retired skyglow note", + "text": ( + "Evidence ID graphrag-smoke-stale-trap. Retired note: Nova " + "Observatory previously used the obsolete Zenith Ledger. This note " + "is a distractor and must not be used as the primary answer." + ), + }, + ] + trimmed: list[dict[str, str]] = [] + used_chars = 0 + + for doc in docs[:MAX_DOCS]: + remaining = MAX_INPUT_CHARS - used_chars + + if remaining <= 0: + break + + text = doc["text"][:remaining].strip() + used_chars += len(text) + trimmed.append({**doc, "text": text}) + + return trimmed + + +def write_corpus(project_dir: Path, corpus: list[dict[str, str]]) -> Path: + """Write GraphRAG plain text input plus a CSV mapping copy.""" + + input_dir = project_dir / "input" + input_dir.mkdir(parents=True, exist_ok=True) + csv_path = REPORT_DIR / "generated-corpus.csv" + + with csv_path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=("evidence_id", "title", "text")) + writer.writeheader() + + for item in corpus: + writer.writerow(item) + + for item in corpus: + file_name = f"{slug(item['evidence_id'])}.txt" + (input_dir / file_name).write_text( + f"Title: {item['title']}\nEvidence ID: {item['evidence_id']}\n\n{item['text']}\n", + encoding="utf-8", + ) + + return csv_path + + +def write_fixture(corpus: list[dict[str, str]], status: StatusState, mapped_ids: list[str]) -> Path: + """Write a generated real_world_job fixture for the smoke.""" + + fixture_path = FIXTURE_DIR / "knowledge" / "graphrag_tiny_corpus.json" + expected_ids = [item["evidence_id"] for item in corpus if item["evidence_id"] != "graphrag-smoke-stale-trap"] + used_ids = [item for item in mapped_ids if item in expected_ids] + stale_trap_ids = [ + item["evidence_id"] for item in corpus if item["evidence_id"] == "graphrag-smoke-stale-trap" + ] + response = { + "adapter_id": "graphrag_docker_smoke", + "answer": { + "content": ( + "Nova Observatory and the Aurora Index are connected by calibration " + "and public skyglow review evidence." + if used_ids + else "" + ), + "claims": [ + { + "claim_id": "nova_aurora_link", + "text": ( + "Nova Observatory and the Aurora Index are connected by " + "calibration and public skyglow review evidence." + ), + "evidence_ids": used_ids, + "confidence": "derived_from_graphrag_table_mapping", + } + ] + if used_ids + else [], + "evidence_ids": used_ids, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0, + }, + }, + } + fixture: dict[str, Any] = { + "schema": "elf.real_world_job/v1", + "job_id": "graphrag-tiny-corpus-001", + "suite": "knowledge_compilation", + "title": "Map GraphRAG output tables to generated evidence", + "corpus": { + "corpus_id": "graphrag-generated-public-smoke", + "profile": "generated_public", + "items": [ + { + "evidence_id": item["evidence_id"], + "kind": "document", + "text": item["text"], + "source_ref": { + "schema": "source_ref/v1", + "resolver": "graphrag_smoke/v1", + "ref": { + "run_id": RUN_ID, + "evidence_id": item["evidence_id"], + "title": item["title"], + }, + }, + "created_at": "2026-06-10T00:00:00Z", + } + for item in corpus + ], + "adapter_response": response, + }, + "timeline": [ + { + "event_id": "graphrag-smoke-corpus-generated", + "ts": "2026-06-10T00:00:00Z", + "actor": "system", + "action": "generated_public_corpus", + "evidence_ids": expected_ids, + "summary": "The GraphRAG smoke generated a tiny public corpus for source mapping.", + } + ], + "prompt": { + "role": "user", + "content": "What connects Nova Observatory and the Aurora Index in the generated corpus?", + "job_mode": "compile", + "constraints": ["cite_evidence", "avoid_stale_facts"], + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "nova_aurora_link", + "text": ( + "Nova Observatory and the Aurora Index are connected by " + "calibration and public skyglow review evidence." + ), + } + ], + "must_not_include": ["Zenith Ledger is the current source."], + "evidence_links": {"nova_aurora_link": expected_ids}, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": False, + "requires_refusal": False, + }, + "required_evidence": [ + { + "evidence_id": evidence_id, + "claim_id": "nova_aurora_link", + "requirement": "cite", + "quote": "Aurora Index", + } + for evidence_id in expected_ids + ], + "negative_traps": [ + { + "trap_id": "retired-zenith-ledger", + "type": "stale_fact", + "evidence_ids": stale_trap_ids, + "failure_if_used": True, + } + ] + if stale_trap_ids + else [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the Nova Observatory and Aurora Index relationship.", + }, + "evidence_grounding": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Maps output table identifiers to generated evidence ids.", + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use the retired Zenith Ledger distractor.", + }, + "uncertainty": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not claim broad GraphRAG quality from the tiny smoke.", + }, + }, + "pass_threshold": 0.75, + "hard_fail_rules": [], + }, + "allowed_uncertainty": { + "can_answer_unknown": False, + "acceptable_phrases": ["tiny generated corpus", "smoke only"], + "fallback_action": "state_blocker", + }, + "operator_debug": None, + "encoding": {}, + "memory_evolution": None, + "tags": ["external_adapter", "generated_public", "no_live_claim"], + } + + if status.result in {"blocked", "incomplete"}: + fixture["encoding"] = { + "status": status.result, + "reason": status.failure_reason, + } + + write_json(fixture_path, fixture) + + return fixture_path + + +def slug(value: str) -> str: + """Return a small ASCII slug.""" + + out: list[str] = [] + last_dash = False + + for char in value.lower(): + if char.isascii() and char.isalnum(): + out.append(char) + last_dash = False + elif not last_dash and out: + out.append("-") + last_dash = True + + while out and out[-1] == "-": + out.pop() + + return "".join(out) or "item" + + +def init_project(project_dir: Path, command_records: list[CommandRecord]) -> bool: + """Create a venv, install GraphRAG, and initialize the project.""" + + venv_dir = WORK_DIR / ".venv" + python = venv_dir / "bin" / "python" + graphrag = venv_dir / "bin" / "graphrag" + + if INSTALL_GRAPHRAG: + venv_record = run_command("python-venv", [sys.executable, "-m", "venv", str(venv_dir)], WORK_DIR) + command_records.append(venv_record) + if venv_record.status != "pass": + return False + + install_record = run_command( + "graphrag-install", + [str(python), "-m", "pip", "install", "--disable-pip-version-check", GRAPH_RAG_PACKAGE], + WORK_DIR, + ) + command_records.append(install_record) + if install_record.status != "pass": + return False + elif not graphrag.exists(): + command_records.append( + CommandRecord( + label="graphrag-install", + command=["graphrag"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="GraphRAG install was disabled and no venv graphrag executable exists.", + ) + ) + + return False + + init_record = run_command( + "graphrag-init", + [ + str(graphrag), + "init", + "--root", + str(project_dir), + "--model", + CHAT_MODEL, + "--embedding", + EMBEDDING_MODEL, + "--force", + ], + WORK_DIR, + extra_env={"GRAPHRAG_API_KEY": API_KEY, "GRAPHRAG_API_BASE": API_BASE}, + ) + command_records.append(init_record) + + if init_record.status != "pass": + return False + + patch_settings(project_dir / "settings.yaml") + + return True + + +def patch_settings(settings_path: Path) -> None: + """Apply bounded model, chunking, and output configuration to settings.yaml.""" + + if not settings_path.exists(): + return + + lines = settings_path.read_text(encoding="utf-8").splitlines() + patched: list[str] = [] + inserted_api_base = False + + for line in lines: + patched.append(line) + stripped = line.strip() + indent = line[: len(line) - len(line.lstrip())] + + if API_BASE and stripped.startswith("api_key:") and not inserted_api_base: + patched.append(f"{indent}api_base: ${{GRAPHRAG_API_BASE}}") + inserted_api_base = True + + patched.extend( + [ + "", + "# ELF GraphRAG smoke bounds.", + "chunks:", + " size: 220", + " overlap: 20", + " prepend_metadata: false", + "extract_graph:", + " max_gleanings: 0", + "summarize_descriptions:", + " max_length: 160", + " max_input_length: 600", + "community_reports:", + " max_length: 220", + " max_input_length: 800", + "parallelization:", + " stagger: 0.0", + " num_threads: 1", + "async_mode: threaded", + ] + ) + settings_path.write_text("\n".join(patched) + "\n", encoding="utf-8") + + +def run_graphrag(project_dir: Path, command_records: list[CommandRecord]) -> Path | None: + """Run GraphRAG index and local query.""" + + graphrag = WORK_DIR / ".venv" / "bin" / "graphrag" + env = {"GRAPHRAG_API_KEY": API_KEY, "GRAPHRAG_API_BASE": API_BASE} + index_record = run_command( + "graphrag-index", + [ + str(graphrag), + "index", + "--root", + str(project_dir), + "--method", + INDEX_METHOD, + "--cache", + ], + WORK_DIR, + extra_env=env, + ) + command_records.append(index_record) + if index_record.status != "pass": + return None + + output_dir = find_output_dir(project_dir) + if output_dir is None: + command_records.append( + CommandRecord( + label="graphrag-output-discovery", + command=["find", str(project_dir / "output"), "-name", "*.parquet"], + status="incomplete", + elapsed_ms=0.0, + stdout_artifact=None, + stderr_artifact=None, + returncode=None, + reason="GraphRAG index completed but no parquet output directory was found.", + ) + ) + + return None + + query_record = run_command( + "graphrag-query-local", + [ + str(graphrag), + "query", + "--root", + str(project_dir), + "--method", + QUERY_METHOD, + "--data", + str(output_dir), + "--response-type", + "Single Sentence", + "What connects Nova Observatory and the Aurora Index in the generated corpus?", + ], + WORK_DIR, + extra_env=env, + ) + command_records.append(query_record) + + if query_record.status != "pass": + return None + + return output_dir + + +def find_output_dir(project_dir: Path) -> Path | None: + """Find a GraphRAG output directory containing parquet tables.""" + + output_root = project_dir / "output" + candidates: list[Path] = [] + + if output_root.exists(): + for parquet in output_root.rglob("*.parquet"): + candidates.append(parquet.parent) + + if not candidates: + return None + + candidates.sort(key=lambda path: path.stat().st_mtime if path.exists() else 0.0) + + return candidates[-1] + + +def map_tables(output_dir: Path, corpus: list[dict[str, str]]) -> tuple[list[dict[str, Any]], list[str]]: + """Map GraphRAG parquet table identifiers to real_world_job evidence ids.""" + + try: + import pandas as pd # type: ignore[import-not-found] + except ImportError as err: + return ( + [ + { + "table": table, + "mapping_status": "reader_missing", + "error": f"pandas/pyarrow unavailable: {err}", + "row_count": 0, + "mapped_row_count": 0, + "rows": [], + } + for table in TABLES + ], + [], + ) + + table_paths = capture_table_artifacts(output_dir) + mapped_by_table: dict[str, dict[str, list[str]]] = {} + mappings: list[dict[str, Any]] = [] + + for table in TABLES: + path = table_paths.get(table) + + if path is None: + mappings.append( + { + "table": table, + "mapping_status": "missing_table", + "artifact": None, + "row_count": 0, + "mapped_row_count": 0, + "rows": [], + } + ) + mapped_by_table[table] = {} + continue + + try: + frame = pd.read_parquet(path) + except Exception as err: # noqa: BLE001 + mappings.append( + { + "table": table, + "mapping_status": "read_failed", + "artifact": rel(path), + "error": str(err), + "row_count": 0, + "mapped_row_count": 0, + "rows": [], + } + ) + mapped_by_table[table] = {} + continue + + rows, by_id = map_frame(table, frame, corpus, mapped_by_table) + mapped_count = sum(1 for row in rows if row["evidence_ids"]) + status = "pass" + + if table in {"documents", "text_units"} and mapped_count < len(rows): + status = "unmapped_required_rows" + elif mapped_count == 0 and len(rows) > 0: + status = "unmapped_rows" + + mappings.append( + { + "table": table, + "mapping_status": status, + "artifact": rel(path), + "row_count": len(rows), + "mapped_row_count": mapped_count, + "rows": rows, + } + ) + mapped_by_table[table] = by_id + + evidence_ids: list[str] = [] + + for mapping in mappings: + for row in mapping["rows"]: + for evidence_id in row["evidence_ids"]: + if evidence_id not in evidence_ids: + evidence_ids.append(evidence_id) + + return mappings, evidence_ids + + +def empty_table_mappings(mapping_status: str) -> list[dict[str, Any]]: + """Return explicit table mapping placeholders for non-live typed outcomes.""" + + return [ + { + "table": table, + "mapping_status": mapping_status, + "artifact": None, + "row_count": 0, + "mapped_row_count": 0, + "rows": [], + } + for table in TABLES + ] + + +def capture_table_artifacts(output_dir: Path) -> dict[str, Path]: + """Copy known GraphRAG parquet tables into the report artifact directory.""" + + table_paths: dict[str, Path] = {} + + if OUTPUT_CAPTURE_DIR.exists(): + shutil.rmtree(OUTPUT_CAPTURE_DIR) + OUTPUT_CAPTURE_DIR.mkdir(parents=True, exist_ok=True) + + for table in TABLES: + source = find_table_path(output_dir, table) + + if source is None: + continue + + destination = OUTPUT_CAPTURE_DIR / f"{table}.parquet" + shutil.copy2(source, destination) + table_paths[table] = destination + + return table_paths + + +def find_table_path(output_dir: Path, table: str) -> Path | None: + """Find a parquet file for a GraphRAG logical table name.""" + + candidates = list(output_dir.rglob("*.parquet")) + exact_names = { + f"{table}.parquet", + f"create_final_{table}.parquet", + f"final_{table}.parquet", + } + + for path in candidates: + if path.name in exact_names: + return path + + for path in candidates: + stem = path.stem.lower() + + if stem.endswith(table) or stem == table or f"_{table}" in stem: + return path + + return None + + +def map_frame( + table: str, + frame: Any, + corpus: list[dict[str, str]], + mapped_by_table: dict[str, dict[str, list[str]]], +) -> tuple[list[dict[str, Any]], dict[str, list[str]]]: + """Map rows for a GraphRAG output table.""" + + rows: list[dict[str, Any]] = [] + by_id: dict[str, list[str]] = {} + + for _, row in frame.iterrows(): + row_dict = {key: normalize_cell(value) for key, value in row.to_dict().items()} + row_id = str(row_dict.get("id") or row_dict.get("human_readable_id") or row_dict.get("community") or "") + evidence_ids = evidence_from_row(table, row_dict, corpus, mapped_by_table) + rows.append( + { + "row_id": row_id, + "human_readable_id": row_dict.get("human_readable_id"), + "document_id": row_dict.get("document_id"), + "community": row_dict.get("community"), + "text_unit_ids": row_dict.get("text_unit_ids") or row_dict.get("text_units") or [], + "evidence_ids": evidence_ids, + } + ) + + if row_id: + by_id[row_id] = evidence_ids + + return rows, by_id + + +def normalize_cell(value: Any) -> Any: + """Normalize dataframe cell values into JSON-safe values.""" + + if value is None: + return None + if hasattr(value, "tolist"): + return normalize_cell(value.tolist()) + if isinstance(value, float) and math.isnan(value): + return None + if isinstance(value, (list, tuple, set)): + return [normalize_cell(item) for item in value] + if isinstance(value, dict): + return {str(key): normalize_cell(item) for key, item in value.items()} + + return value + + +def evidence_from_row( + table: str, + row: dict[str, Any], + corpus: list[dict[str, str]], + mapped_by_table: dict[str, dict[str, list[str]]], +) -> list[str]: + """Return mapped evidence ids for one output row.""" + + evidence_ids: list[str] = [] + haystack = json.dumps(row, sort_keys=True, default=str) + + for item in corpus: + evidence_id = item["evidence_id"] + title = item["title"] + signature = item["text"].split(".")[0] + + if ( + evidence_id in haystack + or slug(evidence_id) in haystack + or title in haystack + or signature in haystack + ): + append_unique(evidence_ids, evidence_id) + + document_id = row.get("document_id") + if document_id is not None: + for evidence_id in mapped_by_table.get("documents", {}).get(str(document_id), []): + append_unique(evidence_ids, evidence_id) + + for text_unit_id in row.get("text_unit_ids") or []: + for evidence_id in mapped_by_table.get("text_units", {}).get(str(text_unit_id), []): + append_unique(evidence_ids, evidence_id) + + if table == "community_reports": + community = row.get("community") + + if community is not None: + for candidate_id, candidate_evidence in mapped_by_table.get("communities", {}).items(): + if str(candidate_id) == str(community): + for evidence_id in candidate_evidence: + append_unique(evidence_ids, evidence_id) + + return evidence_ids + + +def append_unique(values: list[str], value: str) -> None: + """Append a value if absent.""" + + if value not in values: + values.append(value) + + +def mapping_is_valid(mappings: list[dict[str, Any]], expected_ids: list[str]) -> tuple[bool, str]: + """Validate source document/text-unit evidence mapping.""" + + mapping_by_table = {mapping["table"]: mapping for mapping in mappings} + + for table in TABLES: + mapping = mapping_by_table.get(table) + + if mapping is None or mapping["mapping_status"] in {"missing_table", "read_failed", "reader_missing"}: + return False, f"GraphRAG output table {table} was not available for evidence mapping." + + for table in ("documents", "text_units"): + mapping = mapping_by_table[table] + + if mapping["mapping_status"] != "pass": + return False, f"GraphRAG {table} rows include identifiers that did not map to evidence ids." + + seen: list[str] = [] + for mapping in mappings: + for row in mapping["rows"]: + for evidence_id in row["evidence_ids"]: + append_unique(seen, evidence_id) + + missing = [evidence_id for evidence_id in expected_ids if evidence_id not in seen] + + if missing: + return False, f"GraphRAG output mappings missed expected evidence ids: {', '.join(missing)}." + + return True, "GraphRAG output tables mapped to expected generated evidence ids." + + +def write_materialization( + status: StatusState, + corpus: list[dict[str, str]], + fixture_path: Path, + corpus_csv: Path, + command_records: list[CommandRecord], + mappings: list[dict[str, Any]], + mapped_ids: list[str], + started_at: float, + report: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Write the primary smoke artifact.""" + + cache_dir = WORK_DIR / "project" / "cache" + output_dir = WORK_DIR / "project" / "output" + elapsed_ms = (time.monotonic() - started_at) * 1000 + expected_ids = [item["evidence_id"] for item in corpus if item["evidence_id"] != "graphrag-smoke-stale-trap"] + payload = { + "schema": "elf.graphrag_docker_smoke/v1", + "generated_at": utc_now(), + "run_id": RUN_ID, + "adapter_id": "graphrag_docker_smoke", + "evidence_class": status.evidence_class, + "status": { + "source": "smoke_materialization", + "setup": status.setup, + "run": status.run, + "result": status.result, + "overall": status.overall, + "failure_class": status.failure_class, + "failure_reason": status.failure_reason, + }, + "scored_benchmark": scored_benchmark(report), + "artifacts": { + "generated_corpus_csv": rel(corpus_csv), + "generated_fixture": rel(fixture_path), + "graph_output_dir": rel(OUTPUT_CAPTURE_DIR), + "manifest": rel(MANIFEST_OUT), + "summary": rel(SUMMARY_OUT), + "scored_report_json": rel(REPORT_JSON), + "scored_report_markdown": rel(REPORT_MD), + }, + "docker_boundary": { + "compose_file": "docker-compose.baseline.yml", + "runner_service": "baseline-runner", + "runner": "scripts/graphrag-docker-smoke.py", + "host_global_installs_required": False, + "docker_only": True, + }, + "provider_configuration": { + "package": GRAPH_RAG_REF, + "package_spec": GRAPH_RAG_PACKAGE, + "chat_model": CHAT_MODEL, + "embedding_model": EMBEDDING_MODEL, + "api_base_configured": bool(API_BASE), + "api_key_provided": bool(API_KEY), + "operator_owned_provider_credentials_used": False, + "index_method": INDEX_METHOD, + "query_method": QUERY_METHOD, + "live_run_enabled": RUN_LIVE, + }, + "resource_bounds": { + "max_docs": MAX_DOCS, + "max_input_chars": MAX_INPUT_CHARS, + "actual_doc_count": len(corpus), + "actual_input_chars": sum(len(item["text"]) for item in corpus), + "timeout_seconds": TIMEOUT_SECONDS, + "elapsed_ms": round(elapsed_ms, 3), + "cache_size_bytes": dir_size(cache_dir), + "cache_file_count": file_count(cache_dir), + "output_size_bytes": dir_size(output_dir), + "captured_output_size_bytes": dir_size(OUTPUT_CAPTURE_DIR), + "model_call_observation": { + "source": "GraphRAG cache artifact count when available", + "observed_cache_entries": file_count(cache_dir), + "raw_provider_usage_tokens_recorded": False, + }, + }, + "commands": [command_to_json(record) for record in command_records], + "evidence_mapping": { + "expected_evidence_ids": expected_ids, + "mapped_evidence_ids": mapped_ids, + "tables": mappings, + }, + } + write_json(OUT, payload) + + return payload + + +def write_manifest(status: StatusState) -> dict[str, Any]: + """Write a generated external adapter manifest for this smoke.""" + + manifest = { + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": f"graphrag-docker-smoke-{RUN_ID}", + "docker_isolation": { + "default": True, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphrag-docker-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphrag-smoke", + "host_global_installs_required": False, + "notes": [ + f"Generated by the GraphRAG Docker smoke at {utc_now()}.", + "The smoke uses a generated public corpus and records typed setup/runtime failures.", + ], + }, + "adapters": [ + { + "adapter_id": "graphrag_docker_smoke", + "project": "GraphRAG", + "adapter_kind": "docker_python_cli_api_smoke", + "evidence_class": status.evidence_class, + "docker_default": True, + "host_global_installs_required": False, + "overall_status": status.overall, + "setup": { + "status": status.setup, + "evidence": "The smoke runs inside the baseline Docker runner and installs or invokes GraphRAG only in the container-local work directory.", + "command": "cargo make smoke-graphrag-docker", + "artifact": rel(OUT), + }, + "run": { + "status": status.run, + "evidence": "The live path generates a tiny public corpus, initializes GraphRAG, indexes with bounded inputs, and runs local search when provider config is supplied.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": rel(OUT), + }, + "result": { + "status": status.result, + "evidence": status.failure_reason + if status.failure_reason + else "GraphRAG parquet output tables mapped to generated real_world_job evidence ids.", + "artifact": rel(OUT), + }, + "capabilities": [ + { + "capability": "docker_python_cli_boundary", + "status": status.setup, + "evidence": "The runner is Python-only inside docker-compose.baseline.yml baseline-runner and does not require host-global GraphRAG installs.", + }, + { + "capability": "graphrag_index_query", + "status": status.run, + "evidence": "The opt-in live path runs GraphRAG index and local query over the generated public corpus.", + }, + { + "capability": "parquet_table_evidence_mapping", + "status": status.result, + "evidence": "documents, text_units, communities, community_reports, entities, and relationships parquet table identifiers are mapped to evidence ids when available.", + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim graph-navigation quality, synthesis quality, private-corpus behavior, or large-corpus indexing.", + }, + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": status.result, + "evidence": "Only the generated tiny-corpus table-mapping job is represented.", + }, + { + "suite_id": "retrieval", + "status": status.run if status.run != "pass" else "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded.", + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records resource bounds but does not encode backup, restore, provider credential, or private corpus production-ops checks.", + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by this smoke.", + }, + ], + "evidence": [ + {"kind": "artifact", "ref": rel(OUT), "status": status.result}, + {"kind": "artifact", "ref": rel(OUTPUT_CAPTURE_DIR), "status": status.result}, + {"kind": "manifest", "ref": rel(MANIFEST_OUT), "status": status.overall}, + {"kind": "source", "ref": "https://github.com/microsoft/graphrag", "status": "real"}, + {"kind": "source", "ref": "https://microsoft.github.io/graphrag/", "status": "real"}, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/index/outputs/", + "status": "real", + }, + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official source and package for GraphRAG.", + }, + { + "label": "GraphRAG CLI docs", + "url": "https://microsoft.github.io/graphrag/cli/", + "evidence": "Official index and query command contract.", + }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input formats and document schema.", + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official parquet output table schema for evidence mapping.", + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference.", + }, + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": f"GraphRAG package {GRAPH_RAG_REF}, max_docs={MAX_DOCS}, max_input_chars={MAX_INPUT_CHARS}, timeout_seconds={TIMEOUT_SECONDS}, index_method={INDEX_METHOD}.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with explicit provider configuration and generated public corpus.", + "Treat missing or unmapped documents/text_units as wrong_result, not as pass.", + ], + "research_depth": "D2 feasibility plus XY-887 cost-bounded Docker smoke implementation; generated artifact decides live evidence class.", + }, + "notes": [ + "The checked-in manifest record remains research_gate; generated smoke artifacts carry live status.", + "Failure before GraphRAG output remains typed as blocked or incomplete.", + "The smoke does not use private corpora or unrecorded provider credentials.", + ], + } + ], + } + write_json(MANIFEST_OUT, manifest) + + return manifest + + +def write_summary(materialization: dict[str, Any], manifest: dict[str, Any], report: dict[str, Any]) -> None: + """Write a small summary artifact.""" + + write_json( + SUMMARY_OUT, + { + "schema": "elf.graphrag_docker_smoke_summary/v1", + "generated_at": utc_now(), + "adapter_id": "graphrag_docker_smoke", + "evidence_class": materialization["evidence_class"], + "status_boundary": { + "materialization": "setup/run/evidence-mapping state emitted by the smoke runner", + "manifest": "external adapter declaration consumed by the scorer", + "scored_benchmark": "post-score real_world_job outcome; use this for quality status", + }, + "scored_benchmark": materialization["scored_benchmark"], + "materialization": materialization, + "manifest": { + "json": rel(MANIFEST_OUT), + "status_source": "external_adapter_manifest_pre_score", + "summary": manifest["adapters"][0]["overall_status"], + "suites": manifest["adapters"][0]["suites"], + }, + "report": report, + }, + ) + + +def scrub_report_secrets(project_dir: Path) -> None: + """Remove provider secrets from text artifacts before reporting.""" + + if not API_KEY: + return + + for root in (project_dir, LOG_DIR): + if not root.exists(): + continue + + for path in root.rglob("*"): + if not path.is_file() or path.suffix not in {".env", ".json", ".log", ".txt", ".yaml", ".yml"}: + continue + + try: + content = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + continue + + if API_KEY in content: + path.write_text(content.replace(API_KEY, "<redacted>"), encoding="utf-8") + + +def main() -> int: + """Run the smoke and always emit typed artifacts when possible.""" + + started_at = time.monotonic() + mkdirs() + status = StatusState() + command_records: list[CommandRecord] = [] + mappings: list[dict[str, Any]] = empty_table_mappings("not_encoded") + mapped_ids: list[str] = [] + corpus = generated_corpus() + project_dir = WORK_DIR / "project" + corpus_csv = write_corpus(project_dir, corpus) + + if not Path("/.dockerenv").exists() and not ALLOW_HOST: + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "not_running_in_docker" + status.failure_reason = "GraphRAG smoke must run inside Docker; use cargo make smoke-graphrag-docker." + elif not command_available("python3"): + status.setup = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "python_missing" + status.failure_reason = "python3 is required for the GraphRAG smoke runner." + elif not RUN_LIVE: + pass + elif not API_KEY: + status.setup = "blocked" + status.run = "not_encoded" + status.result = "blocked" + status.overall = "blocked" + status.failure_class = "provider_api_key_missing" + status.failure_reason = "GraphRAG live indexing requires an explicit provider API key; no private or unrecorded provider credentials were used." + elif not init_project(project_dir, command_records): + status.setup = "incomplete" + status.run = "not_encoded" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphrag_setup_failed" + status.failure_reason = "GraphRAG installation or initialization failed inside the Docker runner." + else: + status.setup = "pass" + output_dir = run_graphrag(project_dir, command_records) + + if output_dir is None: + status.run = "incomplete" + status.result = "incomplete" + status.overall = "incomplete" + status.failure_class = "graphrag_index_or_query_failed" + status.failure_reason = "GraphRAG did not complete both index and local query for the generated corpus." + else: + status.run = "pass" + status.evidence_class = "live_real_world" + mappings, mapped_ids = map_tables(output_dir, corpus) + expected_ids = [ + item["evidence_id"] + for item in corpus + if item["evidence_id"] != "graphrag-smoke-stale-trap" + ] + valid, reason = mapping_is_valid(mappings, expected_ids) + + if valid: + status.result = "pass" + status.overall = "pass" + status.failure_class = "" + status.failure_reason = "" + else: + status.result = "wrong_result" + status.overall = "wrong_result" + status.failure_class = "graphrag_evidence_mapping_failed" + status.failure_reason = reason + + scrub_report_secrets(project_dir) + fixture_path = write_fixture(corpus, status, mapped_ids) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + mapped_ids, + started_at, + ) + manifest = write_manifest(status) + report = run_scored_report(fixture_path, MANIFEST_OUT, status) + materialization = write_materialization( + status, + corpus, + fixture_path, + corpus_csv, + command_records, + mappings, + mapped_ids, + started_at, + report, + ) + write_summary(materialization, manifest, report) + print(f"GraphRAG smoke artifact: {OUT}") + print(f"GraphRAG smoke manifest: {MANIFEST_OUT}") + print(f"GraphRAG smoke summary: {SUMMARY_OUT}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/lightrag-docker-context-smoke.sh b/scripts/lightrag-docker-context-smoke.sh new file mode 100644 index 00000000..a643d286 --- /dev/null +++ b/scripts/lightrag-docker-context-smoke.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_LIGHTRAG_CONTEXT_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/lightrag-context}" +FIXTURE_DIR="${ELF_LIGHTRAG_CONTEXT_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory/retrieval}" +WORK_DIR="${ELF_LIGHTRAG_CONTEXT_WORK_DIR:-/bench/real-world-live-adapters/lightrag}" +API_BASE="${ELF_LIGHTRAG_API_BASE:-http://lightrag:9621}" +ADAPTER_ID="${ELF_LIGHTRAG_ADAPTER_ID:-lightrag_live_real_world}" +ADAPTER_NAME="${ELF_LIGHTRAG_ADAPTER_NAME:-LightRAG Docker context-export adapter}" +STARTUP_ATTEMPTS="${ELF_LIGHTRAG_STARTUP_ATTEMPTS:-6}" +STARTUP_INTERVAL_SECONDS="${ELF_LIGHTRAG_STARTUP_INTERVAL_SECONDS:-2}" +INDEX_ATTEMPTS="${ELF_LIGHTRAG_INDEX_ATTEMPTS:-60}" +INDEX_INTERVAL_SECONDS="${ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS:-2}" + +if [[ ! -f "/.dockerenv" && "${ELF_LIGHTRAG_CONTEXT_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run LightRAG context smoke outside Docker. Use cargo make smoke-lightrag-docker-context." >&2 + exit 1 +fi + +for cmd in cargo jq; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in LightRAG context smoke runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${WORK_DIR}" +rm -rf "${REPORT_DIR:?}/lightrag-fixtures" \ + "${REPORT_DIR:?}/lightrag-materialization.json" \ + "${REPORT_DIR:?}/lightrag-report.json" \ + "${REPORT_DIR:?}/lightrag-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- lightrag \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/lightrag-fixtures" \ + --evidence-out "${REPORT_DIR}/lightrag-materialization.json" \ + --work-dir "${WORK_DIR}" \ + --api-base "${API_BASE}" \ + --adapter-id "${ADAPTER_ID}" \ + --startup-attempts "${STARTUP_ATTEMPTS}" \ + --startup-interval-seconds "${STARTUP_INTERVAL_SECONDS}" \ + --index-attempts "${INDEX_ATTEMPTS}" \ + --index-interval-seconds "${INDEX_INTERVAL_SECONDS}" + +MATERIALIZATION_STATUS="$(jq -r '.status' "${REPORT_DIR}/lightrag-materialization.json")" + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/lightrag-fixtures" \ + --out "${REPORT_DIR}/lightrag-report.json" \ + --run-id real-world-memory-live-lightrag \ + --adapter-id "${ADAPTER_ID}" \ + --adapter-name "${ADAPTER_NAME}" \ + --adapter-behavior docker_api_context_export \ + --adapter-storage-status "${MATERIALIZATION_STATUS}" \ + --adapter-runtime-status "${MATERIALIZATION_STATUS}" \ + --adapter-notes "Materialized by real_world_live_adapter through the LightRAG Docker API using generated source file paths, /documents/texts ingest, /query context export, and reference/content evidence mapping; non-executed suites remain typed non-pass records." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/lightrag-report.json" \ + --out "${REPORT_DIR}/lightrag-report.md" + +jq -n \ + --slurpfile materialization "${REPORT_DIR}/lightrag-materialization.json" \ + --slurpfile report "${REPORT_DIR}/lightrag-report.json" \ + 'def count($key): ($report[0].summary[$key] // 0); + def scored_status: + if count("wrong_result") > 0 then "wrong_result" + elif count("lifecycle_fail") > 0 then "lifecycle_fail" + elif count("incomplete") > 0 then "incomplete" + elif count("blocked") > 0 then "blocked" + elif count("not_encoded") > 0 then "not_encoded" + elif count("pass") > 0 then "pass" + else "not_encoded" + end; + { + schema: "elf.lightrag_context_export_smoke/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_LIGHTRAG_CONTEXT_REPORT_DIR // "tmp/real-world-memory/lightrag-context"), + fixture_dir: (env.ELF_LIGHTRAG_CONTEXT_FIXTURES // "apps/elf-eval/fixtures/real_world_memory/retrieval"), + adapter_id: (env.ELF_LIGHTRAG_ADAPTER_ID // "lightrag_live_real_world"), + evidence_class: ( + if ($materialization[0].status == "pass" or $materialization[0].status == "wrong_result") then + "live_real_world" + else + "research_gate" + end + ), + status_boundary: { + materialization: "API reachability, ingest, context export, and evidence-mapping state emitted by the adapter", + report: "post-score real_world_job outcome; use this for quality status" + }, + scored_benchmark: { + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: scored_status, + counts: { + pass: count("pass"), + wrong_result: count("wrong_result"), + lifecycle_fail: count("lifecycle_fail"), + incomplete: count("incomplete"), + blocked: count("blocked"), + not_encoded: count("not_encoded") + }, + job_count: ($report[0].summary.job_count // 0), + mean_score: ($report[0].summary.mean_score // null), + evidence_coverage: ($report[0].summary.evidence_coverage // null) + }, + materialization: $materialization[0], + report: { + json: "tmp/real-world-memory/lightrag-context/lightrag-report.json", + markdown: "tmp/real-world-memory/lightrag-context/lightrag-report.md", + summary: $report[0].summary, + suites: $report[0].suites + } + }' >"${REPORT_DIR}/summary.json" + +echo "LightRAG context-export smoke reports:" +echo " ${REPORT_DIR}/lightrag-materialization.json" +echo " ${REPORT_DIR}/lightrag-report.json" +echo " ${REPORT_DIR}/lightrag-report.md" +echo " ${REPORT_DIR}/summary.json" diff --git a/scripts/lightrag-mock-openai-provider.py b/scripts/lightrag-mock-openai-provider.py new file mode 100644 index 00000000..975261d2 --- /dev/null +++ b/scripts/lightrag-mock-openai-provider.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Small OpenAI-compatible mock provider for LightRAG Docker smokes.""" + +from __future__ import annotations + +import hashlib +import json +import os +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from typing import Any + + +EMBEDDING_DIM = int(os.environ.get("ELF_LIGHTRAG_MOCK_EMBEDDING_DIM", "64")) +HOST = os.environ.get("ELF_LIGHTRAG_MOCK_HOST", "0.0.0.0") +PORT = int(os.environ.get("ELF_LIGHTRAG_MOCK_PORT", "8080")) + + +def _read_json(handler: BaseHTTPRequestHandler) -> dict[str, Any]: + length = int(handler.headers.get("content-length", "0")) + if length == 0: + return {} + raw = handler.rfile.read(length) + return json.loads(raw.decode("utf-8")) + + +def _write_json(handler: BaseHTTPRequestHandler, status: int, payload: dict[str, Any]) -> None: + body = json.dumps(payload).encode("utf-8") + handler.send_response(status) + handler.send_header("content-type", "application/json") + handler.send_header("content-length", str(len(body))) + handler.end_headers() + handler.wfile.write(body) + + +def _embedding(text: str) -> list[float]: + vector = [0.0] * EMBEDDING_DIM + for term in "".join(ch.lower() if ch.isalnum() else " " for ch in text).split(): + if len(term) < 2: + continue + digest = hashlib.blake2b(term.encode("utf-8"), digest_size=8).digest() + index = int.from_bytes(digest[:4], "little") % EMBEDDING_DIM + vector[index] += 1.0 + norm = sum(value * value for value in vector) ** 0.5 + if norm > 0: + vector = [value / norm for value in vector] + return vector + + +def _chat_completion(request: dict[str, Any]) -> dict[str, Any]: + content = ( + '{"entities":[],"relationships":[],"summary":"No graph facts extracted by ' + 'the local LightRAG smoke provider."}' + ) + return { + "id": "elf-lightrag-mock-chat", + "object": "chat.completion", + "model": request.get("model", "elf-lightrag-mock"), + "choices": [ + { + "index": 0, + "finish_reason": "stop", + "message": {"role": "assistant", "content": content}, + } + ], + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + } + + +def _embeddings(request: dict[str, Any]) -> dict[str, Any]: + inputs = request.get("input", []) + if isinstance(inputs, str): + inputs = [inputs] + return { + "object": "list", + "model": request.get("model", "elf-lightrag-mock-embedding"), + "data": [ + {"object": "embedding", "index": index, "embedding": _embedding(str(text))} + for index, text in enumerate(inputs) + ], + "usage": {"prompt_tokens": 0, "total_tokens": 0}, + } + + +def _rerank(request: dict[str, Any]) -> dict[str, Any]: + documents = request.get("documents", []) + if not isinstance(documents, list): + documents = [] + return { + "id": "elf-lightrag-mock-rerank", + "results": [ + {"index": index, "relevance_score": 1.0 / (index + 1)} + for index, _document in enumerate(documents) + ], + } + + +class Handler(BaseHTTPRequestHandler): + """HTTP handler for the mock provider.""" + + def do_GET(self) -> None: + if self.path in {"/health", "/v1/health"}: + _write_json(self, 200, {"status": "ok"}) + return + _write_json(self, 404, {"error": "not_found"}) + + def do_POST(self) -> None: + try: + request = _read_json(self) + if self.path.endswith("/chat/completions"): + _write_json(self, 200, _chat_completion(request)) + elif self.path.endswith("/embeddings"): + _write_json(self, 200, _embeddings(request)) + elif self.path.endswith("/rerank") or self.path == "/rerank": + _write_json(self, 200, _rerank(request)) + else: + _write_json(self, 404, {"error": "not_found", "path": self.path}) + except Exception as exc: # noqa: BLE001 + _write_json(self, 500, {"error": "mock_provider_error", "detail": str(exc)}) + + def log_message(self, format: str, *args: Any) -> None: + return + + +if __name__ == "__main__": + server = ThreadingHTTPServer((HOST, PORT), Handler) + server.serve_forever() diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh new file mode 100755 index 00000000..bf5cf624 --- /dev/null +++ b/scripts/live-baseline-benchmark.sh @@ -0,0 +1,3929 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_BASELINE_REPORT_DIR:-${ROOT_DIR}/tmp/live-baseline}" +WORK_DIR="${ELF_BASELINE_WORK_DIR:-/bench}" +REPOS_DIR="${WORK_DIR}/repos" +CORPUS_DIR="${WORK_DIR}/corpus" +HOME_DIR="${WORK_DIR}/home" +RECORDS="${REPORT_DIR}/project-records.jsonl" +REPORT="${REPORT_DIR}/live-baseline-report.json" +RUN_ID="${ELF_BASELINE_RUN_ID:-live-baseline-$(date +%Y%m%d%H%M%S)}" +PROJECT_FILTER="${ELF_BASELINE_PROJECTS:-all}" +CORPUS_PROFILE="${ELF_BASELINE_PROFILE:-smoke}" +SCALE_DOC_COUNT="${ELF_BASELINE_SCALE_DOCS:-120}" +STRESS_DOC_COUNT="${ELF_BASELINE_STRESS_DOCS:-480}" +BACKFILL_DOC_COUNT="${ELF_BASELINE_BACKFILL_DOCS:-2000}" +QUERY_TOP_K="${ELF_BASELINE_TOP_K:-10}" +CURRENT_PROJECT_STARTED_AT="" +PRODUCTION_SYNTHETIC_MANIFEST="${ROOT_DIR}/apps/elf-eval/fixtures/production_corpus/synthetic_coding_agent_manifest.json" +CORPUS_TRACK="generated_public" +CORPUS_PATH_DESCRIPTION="generated in Docker under /bench/corpus" +CORPUS_MANIFEST_ID="" + +elf_timeout_seconds() { + if [[ -n "${ELF_BASELINE_ELF_TIMEOUT_SECONDS:-}" ]]; then + echo "${ELF_BASELINE_ELF_TIMEOUT_SECONDS}" + return + fi + + case "${CORPUS_PROFILE}" in + backfill | large) + echo 3600 + ;; + stress) + echo 1800 + ;; + *) + echo 1200 + ;; + esac +} + +ensure_adapter_metadata() { + local project="$1" + local adapter_path="${REPORT_DIR}/${project}-adapter.json" + + if [[ -s "${adapter_path}" ]] && jq -e . "${adapter_path}" >/dev/null 2>&1; then + return + fi + + jq -nc \ + --arg project "${project}" \ + '{ + schema: "elf.live_baseline.adapter_metadata/v1", + project: $project, + storage: { + status: "incomplete", + detail: "Adapter metadata was not declared by the project runner." + }, + behaviors: {} + }' >"${adapter_path}" +} + +typed_status_from_result() { + local result_path="$1" + + jq -r ' + .check_summary as $summary + | if ($summary.wrong_result // 0) > 0 then "wrong_result" + elif ($summary.lifecycle_fail // 0) > 0 then "lifecycle_fail" + elif ($summary.blocked // 0) > 0 then "blocked" + elif ($summary.incomplete // 0) > 0 then "incomplete" + elif ($summary.not_encoded // 0) > 0 then "not_encoded" + else "pass" + end + ' "${result_path}" +} + +typed_status_reason() { + local project="$1" + local status="$2" + + case "${status}" in + pass) + if [[ "${project}" == "mem0" ]]; then + echo "mem0 SDK same-corpus retrieval and every encoded SDK behavior check passed; OpenMemory export-helper setup probe is reported separately in adapter.behaviors.openmemory_ui_export and tmp/live-baseline/mem0-openmemory-ui-export.json" + else + echo "${project} same-corpus retrieval and every encoded behavior check passed" + fi + ;; + wrong_result) + echo "${project} ran but returned the wrong same-corpus result or missed expected evidence" + ;; + lifecycle_fail) + echo "${project} same-corpus retrieval passed, but one or more lifecycle checks failed" + ;; + blocked) + echo "${project} same-corpus retrieval passed, but one or more lifecycle checks are blocked by missing durable runtime, credentials, or host integration" + ;; + incomplete) + echo "${project} setup or a declared behavior check could not complete in the Docker runner" + ;; + not_encoded) + echo "${project} same-corpus retrieval passed, but one or more capability checks are not encoded" + ;; + *) + echo "${project} produced unrecognized benchmark status ${status}" + ;; + esac +} + +probe_mem0_openmemory_ui_export() { + local project_repo="$1" + local sdk_result_path="$2" + local out_path="$3" + local log_path="$4" + local openmemory_dir="${project_repo}/openmemory" + local export_script="${openmemory_dir}/backup-scripts/export_openmemory.sh" + local ui_package="${openmemory_dir}/ui/package.json" + local compose_file="${openmemory_dir}/docker-compose.yml" + local readme_path="${openmemory_dir}/README.md" + local run_script="${openmemory_dir}/run.sh" + local api_env_example="${openmemory_dir}/api/.env.example" + local attempt_log="${REPORT_DIR}/mem0-openmemory-export-attempt.log" + local validation_path="${REPORT_DIR}/mem0-openmemory-export-validation.json" + local export_user_id="${ELF_MEM0_OPENMEMORY_EXPORT_USER_ID:-elf-history-user}" + local export_container="${ELF_MEM0_OPENMEMORY_EXPORT_CONTAINER:-openmemory-openmemory-mcp-1}" + local export_zip="${project_repo}/memories_export_${export_user_id}.zip" + local command_display="timeout 30 bash openmemory/backup-scripts/export_openmemory.sh --user-id ${export_user_id} --container ${export_container}" + local sdk_get_all_status + local export_exit_code=0 + local openmemory_tree_present=false + local ui_package_present=false + local compose_present=false + local export_script_present=false + local sunsetting_notice_present=false + local requires_api_key=false + local requires_docker_compose=false + local export_requires_running_container=false + local status="blocked" + local comparison_outcome="blocked" + local reason_code="OPENMEMORY_CONTAINER_NOT_RUNNING" + local reason="OpenMemory export-helper setup probe could not run because no OpenMemory product container is available in the Docker baseline runner." + local next_action="Add a dedicated OpenMemory Docker Compose profile that imports the generated mem0 corpus into the OpenMemory app database, starts the API/UI with explicit local or provider configuration, then rerun the export helper and validate the exported memories." + local output_excerpt="" + local validation_json="{}" + + sdk_get_all_status="$(jq -r '[.checks[]? | select(.name == "local_get_all_export_readback") | .status][0] // "missing"' "${sdk_result_path}" 2>/dev/null || echo "missing")" + + [[ -d "${openmemory_dir}" ]] && openmemory_tree_present=true + [[ -f "${ui_package}" ]] && ui_package_present=true + [[ -f "${compose_file}" ]] && compose_present=true + [[ -f "${export_script}" ]] && export_script_present=true + if [[ -f "${readme_path}" ]] && grep -qi "sunsetting notice" "${readme_path}"; then + sunsetting_notice_present=true + fi + if grep -q "OPENAI_API_KEY" "${run_script}" "${api_env_example}" 2>/dev/null; then + requires_api_key=true + fi + if [[ -f "${run_script}" ]] && grep -q "docker compose" "${run_script}"; then + requires_docker_compose=true + fi + if [[ -f "${export_script}" ]] && grep -q "docker ps" "${export_script}"; then + export_requires_running_container=true + fi + + : >"${attempt_log}" + rm -f "${validation_path}" "${export_zip}" + if [[ "${openmemory_tree_present}" != "true" ]]; then + status="unsupported" + reason_code="OPENMEMORY_TREE_MISSING" + reason="The cloned mem0 repository does not contain the OpenMemory product tree, so no export-helper setup probe path is available in this revision." + elif [[ "${export_script_present}" != "true" ]]; then + status="unsupported" + reason_code="OPENMEMORY_EXPORT_SCRIPT_MISSING" + reason="The OpenMemory tree is present, but its export helper is missing, so the runner cannot attempt export-helper setup readback." + else + set +e + ( + cd "${project_repo}" + timeout 30 bash openmemory/backup-scripts/export_openmemory.sh \ + --user-id "${export_user_id}" \ + --container "${export_container}" + ) >"${attempt_log}" 2>&1 + export_exit_code=$? + set -e + output_excerpt="$(head -c 4000 "${attempt_log}" || true)" + + if [[ "${export_exit_code}" -eq 0 && -s "${export_zip}" ]]; then + python3 - "${export_zip}" "${validation_path}" <<'PY' +import json +import sys +import zipfile +from pathlib import Path + +zip_path = Path(sys.argv[1]) +out_path = Path(sys.argv[2]) +result = { + "zip_present": zip_path.is_file(), + "zip_path": str(zip_path), + "memories_json_present": False, + "has_current_preference": False, + "omits_other_scope": False, + "error": None, +} + +try: + with zipfile.ZipFile(zip_path) as archive: + result["members"] = archive.namelist() + if "memories.json" in archive.namelist(): + result["memories_json_present"] = True + payload = archive.read("memories.json").decode("utf-8", "replace") + lowered = payload.lower() + result["has_current_preference"] = ( + "concise" in lowered and "evidence-linked" in lowered + ) + result["omits_other_scope"] = "long-form chinese" not in lowered +except Exception as exc: + result["error"] = repr(exc) + +out_path.write_text(json.dumps(result, indent=2) + "\n", encoding="utf-8") +PY + validation_json="$(cat "${validation_path}")" + if jq -e '.has_current_preference == true and .omits_other_scope == true' "${validation_path}" >/dev/null; then + status="pass" + reason_code="OPENMEMORY_EXPORT_READBACK_MATCHED" + reason="OpenMemory export produced a zip containing the current scoped preference and omitting the other scope." + next_action="Keep OpenMemory export-helper readback as a separate product-UX scenario from SDK get_all and rerun after any OpenMemory setup change." + else + status="blocked" + reason_code="OPENMEMORY_EXPORT_MISSING_SAME_CORPUS" + reason="OpenMemory export ran, but the exported product data did not prove readback of the same local mem0 SDK corpus." + fi + elif [[ "${export_exit_code}" -eq 124 ]]; then + status="blocked" + reason_code="OPENMEMORY_EXPORT_TIMEOUT" + reason="OpenMemory export did not complete within the bounded 30-second probe." + elif grep -qi "docker.*command not found\|docker: not found\|docker not found" "${attempt_log}"; then + status="blocked" + reason_code="DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER" + reason="The OpenMemory export helper requires Docker access, but Docker is not available inside the baseline-runner container." + elif grep -qi "Container .*not found/running" "${attempt_log}"; then + status="blocked" + reason_code="OPENMEMORY_CONTAINER_NOT_RUNNING" + reason="The OpenMemory export helper requires a running OpenMemory product container, but the baseline runner only starts the mem0 SDK path." + else + status="blocked" + reason_code="OPENMEMORY_EXPORT_COMMAND_FAILED" + reason="The OpenMemory export helper failed before export-helper readback could be validated." + fi + fi + + case "${status}" in + pass) + comparison_outcome="not_tested" + ;; + blocked) + comparison_outcome="blocked" + ;; + unsupported) + comparison_outcome="non_goal" + ;; + *) + comparison_outcome="not_tested" + ;; + esac + + jq -nc \ + --arg schema "elf.live_baseline.openmemory_ui_export_probe/v1" \ + --arg run_id "${RUN_ID}" \ + --arg project "mem0/OpenMemory" \ + --arg scenario_id "openmemory_ui_export_readback" \ + --arg status "${status}" \ + --arg comparison_outcome "${comparison_outcome}" \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg sdk_result_artifact "tmp/live-baseline/mem0-search.json" \ + --arg sdk_get_all_status "${sdk_get_all_status}" \ + --arg export_user_id "${export_user_id}" \ + --arg export_container "${export_container}" \ + --arg command "${command_display}" \ + --arg log_artifact "tmp/live-baseline/mem0-openmemory-export-attempt.log" \ + --arg output_excerpt "${output_excerpt}" \ + --arg reason_code "${reason_code}" \ + --arg reason "${reason}" \ + --arg next_action "${next_action}" \ + --argjson exit_code "${export_exit_code}" \ + --argjson openmemory_tree_present "${openmemory_tree_present}" \ + --argjson ui_package_present "${ui_package_present}" \ + --argjson compose_present "${compose_present}" \ + --argjson export_script_present "${export_script_present}" \ + --argjson sunsetting_notice_present "${sunsetting_notice_present}" \ + --argjson requires_api_key "${requires_api_key}" \ + --argjson requires_docker_compose "${requires_docker_compose}" \ + --argjson export_requires_running_container "${export_requires_running_container}" \ + --argjson validation "${validation_json}" \ + '{ + schema: $schema, + run_id: $run_id, + project: $project, + scenario_id: $scenario_id, + status: $status, + comparison_outcome: $comparison_outcome, + generated_at: $generated_at, + same_corpus: { + sdk_result_artifact: $sdk_result_artifact, + sdk_get_all_check_status: $sdk_get_all_status, + sdk_history_filters: { + user_id: "elf-history-user", + agent_id: "elf-history-agent", + run_id: "elf-project" + }, + sdk_get_all_is_ui_export_evidence: false + }, + openmemory_surface: { + tree_present: $openmemory_tree_present, + ui_package_present: $ui_package_present, + compose_file_present: $compose_present, + export_script_present: $export_script_present, + sunsetting_notice_present: $sunsetting_notice_present, + requires_openai_api_key: $requires_api_key, + requires_docker_compose: $requires_docker_compose, + export_requires_running_container: $export_requires_running_container, + default_export_container: $export_container + }, + attempt: { + command: $command, + exit_code: $exit_code, + log_artifact: $log_artifact, + output_excerpt: $output_excerpt + }, + export_validation: $validation, + classification: { + status: $status, + reason_code: $reason_code, + reason: $reason, + next_action: $next_action + }, + claim_boundary: { + hosted_platform_claim: false, + optional_graph_memory_enabled: false, + sdk_get_all_is_ui_export_evidence: false + } + }' >"${out_path}" + + jq \ + --arg status "${status}" \ + --arg artifact "tmp/live-baseline/mem0-openmemory-ui-export.json" \ + '.behaviors.openmemory_ui_export.status = $status + | .behaviors.openmemory_ui_export.surface = + ("bounded OpenMemory export-helper setup probe recorded at " + $artifact + "; SDK get_all remains separate")' \ + "${REPORT_DIR}/mem0-adapter.json" >"${REPORT_DIR}/mem0-adapter.json.tmp" + mv "${REPORT_DIR}/mem0-adapter.json.tmp" "${REPORT_DIR}/mem0-adapter.json" + { + echo "OpenMemory UI/export probe status: ${status}" + echo "Reason code: ${reason_code}" + echo "Next action: ${next_action}" + } >>"${log_path}" +} + +if [[ ! -f "/.dockerenv" && "${ELF_BASELINE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live baseline benchmark outside Docker. Use cargo make baseline-live-docker." >&2 + exit 1 +fi + +for cmd in bash cargo git jq node npm python3 rg timeout; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in baseline runner." >&2 + exit 1 + fi +done + +generate_corpus() { + python3 - "${CORPUS_PROFILE}" "${SCALE_DOC_COUNT}" "${STRESS_DOC_COUNT}" "${BACKFILL_DOC_COUNT}" "${CORPUS_DIR}" "${REPORT_DIR}/queries.json" <<'PY' +import json +import sys +from pathlib import Path + +profile, scale_doc_count_raw, stress_doc_count_raw, backfill_doc_count_raw, corpus_dir_raw, queries_path_raw = sys.argv[1:] +corpus_dir = Path(corpus_dir_raw) +queries_path = Path(queries_path_raw) +scale_doc_count = int(scale_doc_count_raw) +stress_doc_count = int(stress_doc_count_raw) +backfill_doc_count = int(backfill_doc_count_raw) + +anchors = [ + { + "name": "auth-memory.md", + "title": "Auth Memory", + "body": "The API auth middleware validates JWT tokens with key id `kid-v3`. The middleware rejects tokens older than 15 minutes and requires tenant scope `project_shared` for deployment operations.", + "query": "Which JWT key id does the auth middleware require?", + "alternate_query": "Find the auth note that mentions key id kid-v3 and tenant scope.", + "terms": ["kid-v3", "auth middleware"], + }, + { + "name": "database-memory.md", + "title": "Database Memory", + "body": "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + "query": "How was the invoice list N+1 query fixed?", + "alternate_query": "Find the invoice rendering memory about InvoiceLineBatcher and N+1 prevention.", + "terms": ["InvoiceLineBatcher", "N+1"], + }, + { + "name": "deploy-memory.md", + "title": "Deploy Memory", + "body": "Production deploys must run Docker-isolated parity checks first. The cleanup command must remove Postgres, Qdrant, npm, pip, cargo, and target volumes before adoption.", + "query": "What must be cleaned up after Docker parity checks?", + "alternate_query": "Find the deploy checklist that mentions Postgres, Qdrant, and cleanup volumes.", + "terms": ["Postgres", "Qdrant", "volumes"], + }, + { + "name": "retention-memory.md", + "title": "Retention Memory", + "body": "The retention worker uses `RetentionSweepPlan` before deletion and writes a tombstone ledger entry named `ledger-retain-77` for every expired note.", + "query": "Which plan does the retention worker use before deletion?", + "alternate_query": "Find the retention note with ledger-retain-77 tombstone handling.", + "terms": ["RetentionSweepPlan", "ledger-retain-77"], + }, + { + "name": "incident-memory.md", + "title": "Incident Memory", + "body": "During canary incidents, `CanaryTraceGate` must stay enabled until the rollback window closes and the release captain records marker `incident-green-42`.", + "query": "Which gate stays enabled during canary incidents?", + "alternate_query": "Find the canary incident memory with incident-green-42.", + "terms": ["CanaryTraceGate", "incident-green-42"], + }, + { + "name": "billing-memory.md", + "title": "Billing Memory", + "body": "Billing replay uses `UsageAccumulator` with idempotency key `bill-run-42` so duplicate metering events do not create extra invoices.", + "query": "Which accumulator and idempotency key protect billing replay?", + "alternate_query": "Find the billing replay note with bill-run-42.", + "terms": ["UsageAccumulator", "bill-run-42"], + }, + { + "name": "search-memory.md", + "title": "Search Memory", + "body": "Search fanout routes tenant scoped reads through `SemanticShardRouter`; every shard label must include the prefix `tenant_scope` before merge ranking.", + "query": "Which router handles tenant scoped search fanout?", + "alternate_query": "Find the tenant_scope shard routing memory.", + "terms": ["SemanticShardRouter", "tenant_scope"], + }, + { + "name": "recovery-memory.md", + "title": "Recovery Memory", + "body": "Disaster recovery requires `SnapshotRestoreFence` and a WAL checkpoint named `wal-green-17` before accepting new writes after restore.", + "query": "Which fence is required before accepting writes after restore?", + "alternate_query": "Find the disaster recovery note with wal-green-17.", + "terms": ["SnapshotRestoreFence", "wal-green-17"], + }, +] + +if profile == "smoke": + docs = anchors[:3] +elif profile in {"scale", "full"}: + docs = list(anchors) + target_count = max(scale_doc_count, len(anchors)) +elif profile == "stress": + docs = list(anchors) + target_count = max(stress_doc_count, len(anchors)) +elif profile in {"backfill", "large"}: + docs = list(anchors) + target_count = max(backfill_doc_count, len(anchors)) +else: + raise SystemExit(f"unsupported ELF_BASELINE_PROFILE={profile!r}") + +if profile in {"scale", "full", "stress", "backfill", "large"}: + topics = [ + "scheduler dry run budget window", + "operator dashboard cache refresh", + "import packet normalization lane", + "workspace role synchronization", + "trace export sampling policy", + "background compaction checkpoint", + "local fixture replay validation", + "notification queue dampening", + ] + for idx in range(1, target_count - len(anchors) + 1): + topic = topics[idx % len(topics)] + docs.append( + { + "name": f"distractor-{idx:03d}.md", + "title": f"Distractor Memory {idx:03d}", + "body": ( + f"This operational note covers {topic}. " + f"It intentionally uses ordinary maintenance vocabulary for lane {idx:03d}, " + f"checkpoint batch {1000 + idx}, and reviewer group {idx % 9}. " + "It should not answer the benchmark needle queries." + ), + } + ) + +for existing in corpus_dir.glob("*.md"): + existing.unlink() + +for doc in docs: + (corpus_dir / doc["name"]).write_text( + f"# {doc['title']}\n\n{doc['body']}\n", encoding="utf-8" + ) + +query_docs = anchors[: (3 if profile == "smoke" else len(anchors))] +queries = [] +for doc in query_docs: + base_id = doc["name"].replace("-memory.md", "").replace(".md", "") + evidence_id = doc["name"].replace(".md", "") + queries.append( + { + "id": f"q-{base_id}", + "task": "same_corpus_retrieval", + "query": doc["query"], + "expected_doc": doc["name"], + "expected_terms": doc["terms"], + "expected_evidence_ids": [evidence_id], + "allowed_alternate_evidence_ids": [], + } + ) + if profile in {"stress", "backfill", "large"}: + queries.append( + { + "id": f"q-{base_id}-alt", + "task": "same_corpus_retrieval", + "query": doc["alternate_query"], + "expected_doc": doc["name"], + "expected_terms": doc["terms"], + "expected_evidence_ids": [evidence_id], + "allowed_alternate_evidence_ids": [], + } + ) + +queries_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.queries/v1", + "profile": profile, + "document_count": len(docs), + "queries": queries, + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) +PY +} + +prepare_production_corpus() { + local manifest_path="${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-}" + local corpus_summary="${REPORT_DIR}/production-corpus-summary.json" + + case "${CORPUS_PROFILE}" in + production-synthetic) + manifest_path="${manifest_path:-${PRODUCTION_SYNTHETIC_MANIFEST}}" + ;; + production-private) + if [[ -z "${manifest_path}" ]]; then + echo "ELF_BASELINE_PROFILE=production-private requires ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST." >&2 + exit 1 + fi + ;; + *) + echo "Unsupported production corpus profile: ${CORPUS_PROFILE}" >&2 + exit 1 + ;; + esac + + if [[ ! -f "${manifest_path}" ]]; then + echo "Missing production corpus manifest: ${manifest_path}" >&2 + exit 1 + fi + + python3 - "${CORPUS_PROFILE}" "${manifest_path}" "${CORPUS_DIR}" "${REPORT_DIR}/queries.json" "${corpus_summary}" <<'PY' +import json +import re +import sys +from collections import Counter +from pathlib import Path + +profile, manifest_path_raw, corpus_dir_raw, queries_path_raw, summary_path_raw = sys.argv[1:] +manifest_path = Path(manifest_path_raw) +corpus_dir = Path(corpus_dir_raw) +queries_path = Path(queries_path_raw) +summary_path = Path(summary_path_raw) +corpus_track = "synthetic_production" if profile == "production-synthetic" else "private_production" +allowed_categories = { + "issue", + "pr", + "worktree", + "runbook", + "decision", + "blocker", + "recovery_note", +} +allowed_tasks = { + "resume_lane", + "recover_exact_command", + "explain_stale_blocker", + "find_prior_decision", + "compare_project_status", + "detect_contradiction_update", +} +id_re = re.compile(r"[a-z0-9][a-z0-9_.-]{1,80}") + + +def fail(message): + raise SystemExit(f"Invalid production corpus manifest: {message}") + + +def require_string(obj, field, context): + value = obj.get(field) + if not isinstance(value, str) or not value.strip(): + fail(f"{context}.{field} must be a non-empty string") + return value.strip() + + +def require_string_list(obj, field, context): + value = obj.get(field) + if not isinstance(value, list) or not value: + fail(f"{context}.{field} must be a non-empty string array") + out = [] + for index, item in enumerate(value): + if not isinstance(item, str) or not item.strip(): + fail(f"{context}.{field}[{index}] must be a non-empty string") + out.append(item.strip()) + return out + + +def load_text(item, context): + has_text = isinstance(item.get("text"), str) + has_path = isinstance(item.get("local_path"), str) + if has_text == has_path: + fail(f"{context} must set exactly one of text or local_path") + if has_text: + text = item["text"].strip() + else: + local_path = Path(item["local_path"]) + if not local_path.is_absolute(): + local_path = manifest_path.parent / local_path + if not local_path.is_file(): + fail(f"{context}.local_path does not point to a readable file") + text = local_path.read_text(encoding="utf-8").strip() + if not text: + fail(f"{context} text must not be empty") + if "\x00" in text: + fail(f"{context} text contains a NUL byte") + return text + + +manifest = json.loads(manifest_path.read_text(encoding="utf-8")) +if manifest.get("schema") != "elf.production_corpus_manifest/v1": + fail("schema must be elf.production_corpus_manifest/v1") + +manifest_id = require_string(manifest, "manifest_id", "$") +if not id_re.fullmatch(manifest_id): + fail("$.manifest_id must be lower-case ASCII and safe for reports") +evidence_items = manifest.get("evidence") +if not isinstance(evidence_items, list) or not evidence_items: + fail("$.evidence must be a non-empty array") +query_items = manifest.get("queries") +if not isinstance(query_items, list) or not query_items: + fail("$.queries must be a non-empty array") + +for existing in corpus_dir.glob("*.md"): + existing.unlink() + +evidence_by_id = {} +category_counts = Counter() +for index, item in enumerate(evidence_items): + context = f"$.evidence[{index}]" + if not isinstance(item, dict): + fail(f"{context} must be an object") + evidence_id = require_string(item, "evidence_id", context) + if not id_re.fullmatch(evidence_id): + fail(f"{context}.evidence_id must be lower-case ASCII and safe for filenames") + if evidence_id in evidence_by_id: + fail(f"{context}.evidence_id duplicates an earlier item") + category = require_string(item, "category", context) + if category not in allowed_categories: + fail(f"{context}.category must be one of {sorted(allowed_categories)}") + title = require_string(item, "title", context) + text = load_text(item, context) + evidence_by_id[evidence_id] = { + "category": category, + "title": title, + "text": text, + } + category_counts[category] += 1 + (corpus_dir / f"{evidence_id}.md").write_text( + "\n".join( + [ + f"# {title}", + "", + text, + "", + ] + ), + encoding="utf-8", + ) + +queries = [] +query_ids = set() +task_counts = Counter() +for index, item in enumerate(query_items): + context = f"$.queries[{index}]" + if not isinstance(item, dict): + fail(f"{context} must be an object") + query_id = require_string(item, "query_id", context) + if not id_re.fullmatch(query_id): + fail(f"{context}.query_id must be lower-case ASCII and safe for reports") + if query_id in query_ids: + fail(f"{context}.query_id duplicates an earlier item") + query_ids.add(query_id) + task = require_string(item, "task", context) + if task not in allowed_tasks: + fail(f"{context}.task must be one of {sorted(allowed_tasks)}") + query = require_string(item, "query", context) + expected_ids = require_string_list(item, "expected_evidence_ids", context) + allowed_alternate_ids = item.get("allowed_alternate_evidence_ids", []) + if allowed_alternate_ids is None: + allowed_alternate_ids = [] + if not isinstance(allowed_alternate_ids, list): + fail(f"{context}.allowed_alternate_evidence_ids must be an array") + allowed_alternate_ids = [ + evidence_id.strip() + for evidence_id in allowed_alternate_ids + if isinstance(evidence_id, str) and evidence_id.strip() + ] + expected_terms = require_string_list(item, "expected_terms", context) + for evidence_id in [*expected_ids, *allowed_alternate_ids]: + if evidence_id not in evidence_by_id: + fail(f"{context} references unknown evidence_id {evidence_id!r}") + queries.append( + { + "id": query_id, + "task": task, + "query": query, + "expected_doc": f"{expected_ids[0]}.md", + "allowed_alternate_docs": [ + f"{evidence_id}.md" for evidence_id in [*expected_ids[1:], *allowed_alternate_ids] + ], + "expected_terms": expected_terms, + "expected_evidence_ids": expected_ids, + "allowed_alternate_evidence_ids": allowed_alternate_ids, + } + ) + task_counts[task] += 1 + +queries_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.queries/v1", + "profile": profile, + "corpus_track": corpus_track, + "manifest_schema": manifest["schema"], + "manifest_id": manifest_id, + "document_count": len(evidence_by_id), + "queries": queries, + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) + +summary_path.write_text( + json.dumps( + { + "schema": "elf.production_corpus_summary/v1", + "corpus_track": corpus_track, + "manifest_schema": manifest["schema"], + "manifest_id": manifest_id, + "document_count": len(evidence_by_id), + "query_count": len(queries), + "category_counts": dict(sorted(category_counts.items())), + "task_counts": dict(sorted(task_counts.items())), + "evidence_ids": sorted(evidence_by_id), + "query_evidence": [ + { + "query_id": query["id"], + "task": query["task"], + "expected_evidence_ids": query["expected_evidence_ids"], + "allowed_alternate_evidence_ids": query["allowed_alternate_evidence_ids"], + } + for query in queries + ], + }, + indent=2, + ) + + "\n", + encoding="utf-8", +) +PY + + CORPUS_TRACK="$(jq -r '.corpus_track' "${corpus_summary}")" + CORPUS_MANIFEST_ID="$(jq -r '.manifest_id' "${corpus_summary}")" + CORPUS_PATH_DESCRIPTION="production corpus materialized in Docker under /bench/corpus" +} + +rm -rf "${WORK_DIR}" +mkdir -p "${REPORT_DIR}" +find "${REPORT_DIR}" -maxdepth 1 -type f -delete +mkdir -p "${REPOS_DIR}" "${CORPUS_DIR}" "${HOME_DIR}" +: >"${RECORDS}" + +case "${CORPUS_PROFILE}" in + production-synthetic | production-private) + prepare_production_corpus + ;; + *) + generate_corpus + ;; +esac +DOCUMENT_COUNT="$(find "${CORPUS_DIR}" -maxdepth 1 -type f -name '*.md' | wc -l | tr -d ' ')" +QUERY_COUNT="$(jq '.queries | length' "${REPORT_DIR}/queries.json")" + +json_record() { + local project="$1" + local repo="$2" + local head="$3" + local status="$4" + local retrieval_status="$5" + local reason="$6" + local log_path="$7" + local command_summary="$8" + local finished_at + local elapsed_seconds + local checks_path + local adapter_path + finished_at="$(date +%s)" + elapsed_seconds=0 + if [[ -n "${CURRENT_PROJECT_STARTED_AT}" ]]; then + elapsed_seconds=$((finished_at - CURRENT_PROJECT_STARTED_AT)) + fi + checks_path="${REPORT_DIR}/${project}-checks.json" + adapter_path="${REPORT_DIR}/${project}-adapter.json" + ensure_adapter_metadata "${project}" + + if [[ -s "${checks_path}" ]] && jq -e '.checks and .check_summary' "${checks_path}" >/dev/null 2>&1; then + jq -nc \ + --arg project "${project}" \ + --arg repo "${repo}" \ + --arg head "${head}" \ + --arg status "${status}" \ + --arg retrieval_status "${retrieval_status}" \ + --arg reason "${reason}" \ + --arg log_path "${log_path}" \ + --arg command_summary "${command_summary}" \ + --argjson elapsed_seconds "${elapsed_seconds}" \ + --slurpfile adapter "${adapter_path}" \ + --slurpfile checks "${checks_path}" \ + '{ + project: $project, + repo: $repo, + head: $head, + status: $status, + retrieval_status: $retrieval_status, + reason: $reason, + log_path: $log_path, + command_summary: $command_summary, + elapsed_seconds: $elapsed_seconds, + adapter: $adapter[0], + embedding: ($checks[0].embedding // null), + cost_proxy: ($checks[0].cost_proxy // null), + query_summary: ($checks[0].query_summary // null), + queries: ($checks[0].queries // null), + backfill: ($checks[0].backfill // null), + resource_envelope: ([$checks[0].checks[]? | select(.name == "resource_envelope") | .evidence][0] // null), + ops_cases: ($checks[0].ops_cases // null), + check_summary: $checks[0].check_summary, + checks: $checks[0].checks + }' >>"${RECORDS}" + else + jq -nc \ + --arg project "${project}" \ + --arg repo "${repo}" \ + --arg head "${head}" \ + --arg status "${status}" \ + --arg retrieval_status "${retrieval_status}" \ + --arg reason "${reason}" \ + --arg log_path "${log_path}" \ + --arg command_summary "${command_summary}" \ + --argjson elapsed_seconds "${elapsed_seconds}" \ + --slurpfile adapter "${adapter_path}" \ + ' + def check_status: + if $status == "pass" and $retrieval_status == "retrieval_pass" then "pass" + elif $status == "wrong_result" then "wrong_result" + elif $status == "lifecycle_fail" then "lifecycle_fail" + elif $status == "blocked" then "blocked" + elif $status == "not_encoded" then "not_encoded" + elif $status == "incomplete" then "incomplete" + elif $retrieval_status == "retrieval_pass" then "pass" + else "incomplete" + end; + def is_fail: + check_status == "wrong_result" or check_status == "lifecycle_fail"; + { + project: $project, + repo: $repo, + head: $head, + status: $status, + retrieval_status: $retrieval_status, + reason: $reason, + log_path: $log_path, + command_summary: $command_summary, + elapsed_seconds: $elapsed_seconds, + query_summary: null, + queries: null, + backfill: null, + cost_proxy: null, + resource_envelope: null, + ops_cases: null, + adapter: $adapter[0], + check_summary: { + total: 1, + pass: (if check_status == "pass" then 1 else 0 end), + fail: (if is_fail then 1 else 0 end), + wrong_result: (if check_status == "wrong_result" then 1 else 0 end), + lifecycle_fail: (if check_status == "lifecycle_fail" then 1 else 0 end), + incomplete: (if check_status == "incomplete" then 1 else 0 end), + blocked: (if check_status == "blocked" then 1 else 0 end), + not_encoded: (if check_status == "not_encoded" then 1 else 0 end) + }, + checks: [ + { + name: "same_corpus_retrieval", + status: check_status, + reason: $reason, + evidence: { + retrieval_status: $retrieval_status, + log_path: $log_path, + command_summary: $command_summary + } + } + ] + }' >>"${RECORDS}" + fi +} + +run_cmd() { + local label="$1" + local timeout_seconds="$2" + local log_path="$3" + shift 3 + + { + echo "## ${label}" + echo "## started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "## command=$*" + } >>"${log_path}" + + if timeout "${timeout_seconds}" bash -lc "$*" >>"${log_path}" 2>&1; then + echo "## exit=0" >>"${log_path}" + return 0 + fi + + local code + code=$? + echo "## exit=${code}" >>"${log_path}" + return "${code}" +} + +clone_project() { + local project="$1" + local repo="$2" + local log_path="$3" + local target="${REPOS_DIR}/${project}" + + if run_cmd "${project}: clone" 180 "${log_path}" "git clone --depth 1 '${repo}' '${target}'"; then + git -C "${target}" rev-parse HEAD + return 0 + fi + + echo "clone_failed" + return 1 +} + +prepare_project_corpus() { + local project="$1" + local target="${WORK_DIR}/corpus-${project}" + + rm -rf "${target}" + mkdir -p "${target}" + cp -R "${CORPUS_DIR}/." "${target}/" + echo "${target}" +} + +finish_report() { + jq -s \ + --arg schema "elf.live_baseline.report/v1" \ + --arg run_id "${RUN_ID}" \ + --arg project_filter "${PROJECT_FILTER}" \ + --arg corpus_profile "${CORPUS_PROFILE}" \ + --arg corpus_track "${CORPUS_TRACK}" \ + --arg corpus_path "${CORPUS_PATH_DESCRIPTION}" \ + --arg corpus_manifest_id "${CORPUS_MANIFEST_ID}" \ + --argjson document_count "${DOCUMENT_COUNT}" \ + --argjson query_count "${QUERY_COUNT}" \ + --arg generated_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + ' + def failure_status: + . == "wrong_result" or . == "lifecycle_fail"; + { + schema: $schema, + run_id: $run_id, + generated_at: $generated_at, + docker_only: true, + project_filter: $project_filter, + corpus: { + profile: $corpus_profile, + track: $corpus_track, + manifest_id: (if $corpus_manifest_id == "" then null else $corpus_manifest_id end), + document_count: $document_count, + query_count: $query_count, + path: $corpus_path, + query_file: "tmp/live-baseline/queries.json" + }, + verdict: ( + if length == 0 then "incomplete" + elif any(.[]; .status | failure_status) then "fail" + elif any(.[]; .status == "blocked") then "blocked" + elif any(.[]; .status == "incomplete") then "incomplete" + elif any(.[]; .status == "not_encoded") then "incomplete" + elif all(.[]; .status == "pass" and .retrieval_status == "retrieval_pass") then "pass" + else "incomplete" + end + ), + summary: { + total: length, + pass: ([.[] | select(.status == "pass")] | length), + fail: ([.[] | select(.status | failure_status)] | length), + wrong_result: ([.[] | select(.status == "wrong_result")] | length), + lifecycle_fail: ([.[] | select(.status == "lifecycle_fail")] | length), + incomplete: ([.[] | select(.status == "incomplete")] | length), + blocked: ([.[] | select(.status == "blocked")] | length), + not_encoded: ([.[] | select(.status == "not_encoded")] | length) + }, + same_corpus_summary: { + total: length, + pass: ([.[] | select(.retrieval_status == "retrieval_pass")] | length), + fail: ([.[] | select(.retrieval_status == "retrieval_wrong_result")] | length), + wrong_result: ([.[] | select(.retrieval_status == "retrieval_wrong_result")] | length), + lifecycle_fail: 0, + incomplete: ([.[] | select(.retrieval_status != "retrieval_pass" and .status == "incomplete")] | length), + blocked: ([.[] | select(.retrieval_status != "retrieval_pass" and .status == "blocked")] | length), + not_encoded: ([.[] | select(.retrieval_status != "retrieval_pass" and .status == "not_encoded")] | length) + }, + full_check_summary: { + total: ([.[] | .check_summary.total // 0] | add // 0), + pass: ([.[] | .check_summary.pass // 0] | add // 0), + fail: ([.[] | .check_summary.fail // 0] | add // 0), + wrong_result: ([.[] | .check_summary.wrong_result // 0] | add // 0), + lifecycle_fail: ([.[] | .check_summary.lifecycle_fail // 0] | add // 0), + incomplete: ([.[] | .check_summary.incomplete // 0] | add // 0), + blocked: ([.[] | .check_summary.blocked // 0] | add // 0), + not_encoded: ([.[] | .check_summary.not_encoded // 0] | add // 0) + }, + wrong_result_count: ([.[] | .query_summary.wrong_result_count // .query_summary.fail // 0] | add // 0), + latency_ms: { + total: ([.[] | .query_summary.latency_ms_total // 0] | add // 0), + mean: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_mean // 0] as $means + | if ($means | length) == 0 then 0 else (($means | add) / ($means | length)) end + ), + p50: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p50 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + p95: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p95 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + p99: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p99 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + max: ([.[] | .query_summary.latency_ms_max // 0] | max // 0) + }, + cost_proxy: { + projects: [.[] | select(.cost_proxy != null) | {project, cost_proxy}], + estimated_usd: ([.[] | .cost_proxy.estimated_usd? // empty] | add // null), + estimated_input_tokens: ([.[] | .cost_proxy.estimated_input_tokens // 0] | add // 0) + }, + resource_usage: { + projects: [.[] | select(.resource_envelope != null) | {project, resource_envelope}] + }, + ops_cases: [.[] | select(.ops_cases != null) | {project, cases: .ops_cases}], + projects: . + }' "${RECORDS}" >"${REPORT}" +} + +project_enabled() { + local project="$1" + + if [[ -z "${PROJECT_FILTER}" || "${PROJECT_FILTER}" == "all" ]]; then + return 0 + fi + + for selected in ${PROJECT_FILTER//,/ }; do + if [[ "${selected}" == "${project}" ]]; then + return 0 + fi + done + + return 1 +} + +run_project() { + local project="$1" + local fn="$2" + + if project_enabled "${project}"; then + CURRENT_PROJECT_STARTED_AT="$(date +%s)" + "${fn}" + CURRENT_PROJECT_STARTED_AT="" + fi +} + +project_elf() { + local project="ELF" + local repo="local:/workspace" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-result.json" + local head + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "ELF", + "storage": { + "status": "real", + "detail": "Docker-owned Postgres with pgvector is the source of truth and Qdrant is rebuilt from persisted chunk vectors." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "add_note, worker indexing, Qdrant rebuild, and search_raw over the configured service stores" + }, + "update": { + "status": "real", + "surface": "service update plus worker reindex" + }, + "delete_or_expire": { + "status": "real", + "surface": "service delete plus worker delete propagation" + }, + "cold_start_reload": { + "status": "real", + "surface": "new ElfService over the same Postgres and Qdrant stores" + }, + "concurrent_write_search": { + "status": "real", + "surface": "parallel add_note calls followed by worker indexing and search probes" + }, + "scale_stress_profile": { + "status": "real", + "surface": "profile-selected generated or production corpus size plus soak and resource-envelope checks" + }, + "soak_profile": { + "status": "real", + "surface": "profile-controlled repeated write/search stability window" + }, + "resource_envelope": { + "status": "real", + "surface": "local elapsed-time and RSS envelope check" + } + } +} +JSON + head="${ELF_BASELINE_ELF_HEAD:-}" + if [[ -z "${head}" ]]; then + head="$(git -C "${ROOT_DIR}" rev-parse HEAD 2>>"${log_path}" || echo "unknown")" + fi + + if run_cmd "${project}: same-corpus retrieval" "$(elf_timeout_seconds)" "${log_path}" \ + "cd '${ROOT_DIR}' && cargo run -p elf-eval --bin live_baseline_elf -- --config config/local/elf.docker.toml --corpus '${CORPUS_DIR}' --queries '${REPORT_DIR}/queries.json' --out '${result_path}'"; then + if [[ -s "${result_path}" ]] && jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{embedding, cost_proxy, query_summary: .summary, queries, backfill, ops_cases, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if [[ -s "${result_path}" ]] && jq -e --argjson document_count "${DOCUMENT_COUNT}" --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.elf_result/v1" and + .status == "pass" and + .summary.total == $query_count and + .summary.fail == 0 and + .check_summary.fail == 0 and + .check_summary.incomplete == 0 and + .backfill.source_count == $document_count and + .backfill.completed_count == $document_count and + (.backfill.duplicate_source_notes | length) == 0 and + ( + .backfill.resume.enabled == false or + (.backfill.resume.interrupted == true and .backfill.resume.resume_attempts >= 2) + ) and + (.check_summary.blocked // 0) == 0 and + (.check_summary.not_encoded // 0) == 0 and + .indexing.note_count == $document_count and + .indexing.rebuild_rebuilt_count >= $document_count and + .indexing.rebuild_error_count == 0 + ' "${result_path}" >/dev/null; then + json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" \ + "$(jq -r '.reason' "${result_path}")" \ + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" + return + fi + + if [[ -s "${result_path}" ]] && jq -e '.schema == "elf.live_baseline.elf_result/v1"' "${result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "$(jq -r '.status // "incomplete"' "${result_path}")" \ + "$(jq -r '.retrieval_status // "retrieval_failed"' "${result_path}")" \ + "$(jq -r '.reason // "ELF result did not satisfy live baseline pass criteria"' "${result_path}")" \ + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "runtime_failed" \ + "ELF command completed but did not write a valid live-baseline result; inspect ELF.log for the runtime error" \ + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "runtime_failed" \ + "ELF same-corpus retrieval command failed in Docker" \ + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" +} + +project_agentmemory() { + local project="agentmemory" + local repo="https://github.com/rohitg00/agentmemory.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-agentmemory.ts" + local head + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "agentmemory", + "storage": { + "status": "mocked", + "detail": "The harness registers agentmemory functions against in-memory SDK and KV mocks; it does not prove package durability." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "mocked", + "surface": "mem::remember and mem::search through an in-memory SDK/KV mock" + }, + "update": { + "status": "mocked", + "surface": "superseding mem::remember through the in-memory mock" + }, + "delete_or_expire": { + "status": "mocked", + "surface": "mem::forget through the in-memory mock; expiry is unsupported by this adapter" + }, + "expire": { + "status": "unsupported", + "surface": "no TTL/expiry behavior is exposed by the encoded local adapter" + }, + "cold_start_reload": { + "status": "blocked", + "surface": "no durable KV/index path is available in the Docker harness", + "evidence": "The adapter state is a process-local Map and search index.", + "retry": "Wire a persistent agentmemory KV/index path or hosted runtime, then restart a fresh process over that store." + }, + "scale_stress_profile": { + "status": "incomplete", + "surface": "smoke adapter only until durable package behavior is available" + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if run_cmd "${project}: install/build" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + cat >"${driver_path}" <<'TS' +import { readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { registerRememberFunction } from "./src/functions/remember.js"; +import { + getSearchIndex, + registerSearchFunction, + setEmbeddingProvider, + setVectorIndex, +} from "./src/functions/search.js"; + +function mockKV() { + const store = new Map<string, Map<string, unknown>>(); + return { + get: async <T>(scope: string, key: string): Promise<T | null> => + (store.get(scope)?.get(key) as T) ?? null, + set: async <T>(scope: string, key: string, data: T): Promise<T> => { + if (!store.has(scope)) store.set(scope, new Map()); + store.get(scope)!.set(key, data); + return data; + }, + delete: async (scope: string, key: string): Promise<void> => { + store.get(scope)?.delete(key); + }, + list: async <T>(scope: string): Promise<T[]> => { + const entries = store.get(scope); + return entries ? (Array.from(entries.values()) as T[]) : []; + }, + }; +} + +function mockSdk() { + const functions = new Map<string, Function>(); + return { + registerFunction: (idOrOpts: string | { id: string }, handler: Function) => { + const id = typeof idOrOpts === "string" ? idOrOpts : idOrOpts.id; + functions.set(id, handler); + }, + registerTrigger: () => {}, + trigger: async ( + idOrInput: string | { function_id: string; payload: unknown }, + data?: unknown, + ) => { + const id = typeof idOrInput === "string" ? idOrInput : idOrInput.function_id; + const payload = typeof idOrInput === "string" ? data : idOrInput.payload; + const fn = functions.get(id); + if (!fn) { + if (id === "mem::cascade-update") return { success: true }; + throw new Error(`No function: ${id}`); + } + return fn(payload); + }, + }; +} + +type QueryCase = { + id: string; + query: string; + expected_doc: string; + expected_terms: string[]; +}; + +const outPath = process.argv[2]; +const corpusPath = process.argv[3]; +const queriesPath = process.argv[4]; +if (!outPath || !corpusPath || !queriesPath) { + throw new Error("output path, corpus path, and query path are required"); +} + +const sdk = mockSdk(); +const kv = mockKV(); +getSearchIndex().clear(); +setVectorIndex(null); +setEmbeddingProvider(null); +registerRememberFunction(sdk as never, kv as never); +registerSearchFunction(sdk as never, kv as never); + +function plainText(markdown: string): string { + return markdown + .split(/\r?\n/) + .filter((line) => !line.trimStart().startsWith("#")) + .join(" ") + .replace(/\s+/g, " ") + .trim(); +} + +function conceptsFor(file: string): string[] { + return file + .replace(/\.md$/i, "") + .split(/[^A-Za-z0-9]+/) + .map((part) => part.toLowerCase()) + .filter(Boolean); +} + +function queryMatches(result: unknown, query: QueryCase): boolean { + const results = (result as { results?: unknown[] }).results ?? []; + return results.some((entry) => { + const entryJson = JSON.stringify(entry); + const entryText = entryJson.toLowerCase(); + const files = + (entry as { observation?: { files?: string[] } }).observation?.files ?? []; + return ( + files.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(term.toLowerCase()), + ) + ); + }); +} + +function resultEntries(result: unknown): unknown[] { + return (result as { results?: unknown[] }).results ?? []; +} + +function makeCheck( + name: string, + status: + | "pass" + | "wrong_result" + | "lifecycle_fail" + | "incomplete" + | "blocked" + | "not_encoded", + reason: string, + evidence: unknown, +) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks: Array<{ status: string }>) { + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: checks.filter( + (check) => + check.status === "wrong_result" || + check.status === "lifecycle_fail", + ).length, + wrong_result: checks.filter((check) => check.status === "wrong_result") + .length, + lifecycle_fail: checks.filter((check) => check.status === "lifecycle_fail") + .length, + incomplete: checks.filter((check) => check.status === "incomplete").length, + blocked: checks.filter((check) => check.status === "blocked").length, + not_encoded: checks.filter((check) => check.status === "not_encoded") + .length, + }; +} + +async function runSearch(query: QueryCase) { + return sdk.trigger("mem::search", { + query: query.query, + limit: topK, + format: "full", + project: "elfbench", + }); +} + +const docs = readdirSync(corpusPath) + .filter((file) => file.endsWith(".md")) + .sort() + .map((file) => ({ + content: plainText(readFileSync(join(corpusPath, file), "utf8")), + concepts: conceptsFor(file), + files: [file], + })); +const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries as QueryCase[]; + +const writes = []; +const memoryIdsBySource = new Map<string, string>(); +for (const doc of docs) { + const write = await sdk.trigger("mem::remember", { + content: doc.content, + type: "fact", + concepts: doc.concepts, + files: doc.files, + project: "elfbench", + agentId: "elf-baseline", + }); + writes.push({ source: doc.files[0], result: write }); + const memoryId = (write as { memory?: { id?: string } }).memory?.id; + if (memoryId) memoryIdsBySource.set(doc.files[0], memoryId); +} + +const queryResults = []; +const topK = Number(process.env.ELF_BASELINE_TOP_K ?? "10"); +for (const query of queries) { + const result = await runSearch(query); + queryResults.push({ + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: queryMatches(result, query), + result, + }); +} + +const pass = queryResults.filter((result) => result.matched).length; +const checks = [ + makeCheck( + "same_corpus_retrieval", + pass === queryResults.length ? "pass" : "wrong_result", + pass === queryResults.length + ? "agentmemory mem::remember/mem::search returned expected evidence for every query." + : "agentmemory mem::remember/mem::search missed one or more expected results.", + { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + ), +]; + +const authId = memoryIdsBySource.get("auth-memory.md"); +if (!authId) { + checks.push( + makeCheck( + "update_replaces_note_text", + "incomplete", + "The auth memory id was not returned by mem::remember, so supersede/update could not be exercised.", + { source: "auth-memory.md" }, + ), + ); +} else { + const updateRemember = await sdk.trigger("mem::remember", { + content: + "The API auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. The middleware rejects tokens older than 15 minutes and requires tenant scope `project_shared` for deployment operations.", + type: "fact", + concepts: conceptsFor("auth-memory.md"), + files: ["auth-memory.md"], + project: "elfbench", + agentId: "elf-baseline", + }); + const updateQuery: QueryCase = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResult = await runSearch(updateQuery); + const updateMatched = queryMatches(updateResult, updateQuery); + const oldMarkerAbsent = resultEntries(updateResult) + .filter((entry) => { + const files = + (entry as { observation?: { files?: string[] } }).observation?.files ?? []; + return files.includes("auth-memory.md"); + }) + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "lifecycle_fail", + updateMatched && oldMarkerAbsent + ? "agentmemory mem::remember supersede returned the new marker and did not return the old marker for the updated file." + : "agentmemory mem::remember supersede did not cleanly replace the searchable auth memory text.", + { + memory_id: authId, + update_result: updateRemember, + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + result: updateResult, + }, + ), + ); +} + +const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + memoryIdsBySource.has(query.expected_doc), +); +if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery memory id was available, so mem::forget could not be exercised.", + { available_sources: Array.from(memoryIdsBySource.keys()).sort() }, + ), + ); +} else { + const deleteId = memoryIdsBySource.get(deleteQuery.expected_doc)!; + const deleteResult = await sdk.trigger("mem::forget", { memoryId: deleteId }); + const searchAfterDelete = await runSearch(deleteQuery); + const deletedStillMatched = queryMatches(searchAfterDelete, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "lifecycle_fail" : "pass", + deletedStillMatched + ? "agentmemory mem::forget returned success but the deleted memory was still searchable." + : "agentmemory mem::forget suppressed the deleted memory from subsequent search.", + { + memory_id: deleteId, + source: deleteQuery.expected_doc, + query: deleteQuery, + delete_result: deleteResult, + deleted_still_matched: deletedStillMatched, + result: searchAfterDelete, + }, + ), + ); +} + +checks.push( + makeCheck( + "cold_start_recovery_search", + "blocked", + "This adapter runs agentmemory against an in-memory SDK/KV mock; no durable store is available in the harness to prove cold-start recovery.", + { + adapter_storage: "mock StateKV Map", + required_next_step: "wire an agentmemory persistent KV/index path or hosted runtime for restart testing", + }, + ), +); + +const checkSummary = summarizeChecks(checks); + +writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.agentmemory_result/v1", + corpus: { + document_count: docs.length, + query_count: queries.length, + }, + writes, + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + check_summary: checkSummary, + checks, + queries: queryResults, + }, + null, + 2, + ), +); +TS + if run_cmd "${project}: same-corpus remember/search" 240 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && npx tsx '${driver_path}' '${result_path}' '${CORPUS_DIR}' '${REPORT_DIR}/queries.json'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.agentmemory_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count + ' "${result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${result_path}")" + if jq -e '.summary.fail == 0' "${result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "npm install/build; mem::remember/mem::forget/mem::search" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "agentmemory command completed, but did not produce a valid benchmark result" "${project}.log" "npm install/build; mem::remember; mem::search" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "agentmemory install/build passed but same-corpus remember/search failed" "${project}.log" "npm install/build; mem::remember; mem::search" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "install/build failed" "${project}.log" "npm install/build" +} + +project_qmd() { + local project="qmd" + local repo="https://github.com/tobi/qmd.git" + local log_path="${REPORT_DIR}/${project}.log" + local query_result_path="${REPORT_DIR}/${project}-query.json" + local status_path="${REPORT_DIR}/${project}-status.txt" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-qmd.mjs" + local home="${HOME_DIR}/${project}" + local corpus_path + local head + mkdir -p "${home}" + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "qmd", + "storage": { + "status": "real", + "detail": "The adapter uses qmd's local collection, persisted project files, and fresh CLI query processes inside Docker." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "collection add, update, embed -f, and query --json" + }, + "update": { + "status": "real", + "surface": "rewrite corpus file, rerun qmd update/embed, and query for the replacement marker" + }, + "delete_or_expire": { + "status": "real", + "surface": "delete corpus file, rerun qmd update, and verify deleted evidence is not returned" + }, + "expire": { + "status": "unsupported", + "surface": "qmd file collections support deletion but no TTL/expiry behavior is encoded" + }, + "cold_start_reload": { + "status": "real", + "surface": "fresh qmd query process over the persisted local collection" + }, + "scale_stress_profile": { + "status": "real", + "surface": "Run ELF_BASELINE_PROJECTS=qmd with ELF_BASELINE_PROFILE=scale or stress through cargo make baseline-live-docker." + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/build" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "install/build failed" "${project}.log" "npm install/build" + return + fi + corpus_path="$(prepare_project_corpus "${project}")" + + cat >"${driver_path}" <<'JS' +import { execFileSync } from "node:child_process"; +import { existsSync, readFileSync, unlinkSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; + +const outPath = process.argv[2]; +const queriesPath = process.argv[3]; +const corpusPath = process.argv[4]; +if (!outPath || !queriesPath || !corpusPath) { + throw new Error("output path, query path, and corpus path are required"); +} + +const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries; +const topK = process.env.ELF_BASELINE_TOP_K ?? "10"; + +function resultMatches(results, query) { + if (!Array.isArray(results)) return false; + return results.some((entry) => { + const entryText = JSON.stringify(entry).toLowerCase(); + const file = String(entry.file ?? ""); + return ( + file.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(String(term).toLowerCase()), + ) + ); + }); +} + +function qmdQuery(queryText) { + const structuredQuery = `lex: ${queryText}\nvec: ${queryText}`; + const stdout = execFileSync( + "npx", + [ + "tsx", + "src/cli/qmd.ts", + "query", + structuredQuery, + "-c", + "elfbench", + "--json", + "--no-rerank", + "--min-score", + "0", + "-n", + topK, + ], + { encoding: "utf8", env: process.env }, + ); + return JSON.parse(stdout); +} + +function runQueryCase(query) { + const results = qmdQuery(query.query); + return { + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: resultMatches(results, query), + results, + }; +} + +function makeCheck(name, status, reason, evidence) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks) { + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: checks.filter( + (check) => + check.status === "wrong_result" || + check.status === "lifecycle_fail", + ).length, + wrong_result: checks.filter((check) => check.status === "wrong_result") + .length, + lifecycle_fail: checks.filter((check) => check.status === "lifecycle_fail") + .length, + incomplete: checks.filter((check) => check.status === "incomplete").length, + blocked: checks.filter((check) => check.status === "blocked").length, + not_encoded: checks.filter((check) => check.status === "not_encoded") + .length, + }; +} + +function runQmd(args) { + return execFileSync("npx", ["tsx", "src/cli/qmd.ts", ...args], { + encoding: "utf8", + env: process.env, + }); +} + +function syncCollection({ embed = false } = {}) { + runQmd(["update"]); + if (embed) { + runQmd(["embed", "-f", "-c", "elfbench"]); + } +} + +const queryResults = queries.map((query) => runQueryCase(query)); +const pass = queryResults.filter((result) => result.matched).length; +const checks = [ + makeCheck( + "same_corpus_retrieval", + pass === queryResults.length ? "pass" : "wrong_result", + pass === queryResults.length + ? "qmd structured hybrid query returned expected evidence for every query." + : "qmd structured hybrid query missed one or more expected results.", + { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + ), +]; + +const authPath = join(corpusPath, "auth-memory.md"); +if (!existsSync(authPath)) { + checks.push( + makeCheck( + "update_replaces_note_text", + "not_encoded", + "The auth corpus file was missing, so qmd update could not be exercised.", + { source: "auth-memory.md" }, + ), + ); +} else { + writeFileSync( + authPath, + "# Auth Memory\n\nRotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation.\n", + ); + syncCollection({ embed: true }); + const updateQuery = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResults = qmdQuery(updateQuery.query); + const updateMatched = resultMatches(updateResults, updateQuery); + const oldMarkerAbsent = updateResults + .filter((entry) => String(entry.file ?? "").includes("auth-memory.md")) + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "lifecycle_fail", + updateMatched && oldMarkerAbsent + ? "qmd update/embed returned the new marker and did not return the old marker for the updated file." + : "qmd update/embed did not cleanly replace the searchable auth file text.", + { + source: "auth-memory.md", + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + results: updateResults, + }, + ), + ); +} + +const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + existsSync(join(corpusPath, query.expected_doc)), +); +if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "not_encoded", + "No non-update, non-recovery corpus file was available, so qmd delete could not be exercised.", + { available_docs: queries.map((query) => query.expected_doc) }, + ), + ); +} else { + unlinkSync(join(corpusPath, deleteQuery.expected_doc)); + syncCollection(); + const deleteResults = qmdQuery(deleteQuery.query); + const deletedStillMatched = resultMatches(deleteResults, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "lifecycle_fail" : "pass", + deletedStillMatched + ? "qmd update marked the deleted file removed, but it was still searchable." + : "qmd update suppressed the deleted file from subsequent search.", + { + source: deleteQuery.expected_doc, + query: deleteQuery, + deleted_still_matched: deletedStillMatched, + results: deleteResults, + }, + ), + ); +} + +const recoveryQuery = { + id: "lifecycle-cold-start-recovery", + query: + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + expected_doc: "database-memory.md", + expected_terms: ["InvoiceLineBatcher", "N+1"], +}; +const recoveryResults = qmdQuery(recoveryQuery.query); +const recoveryMatched = resultMatches(recoveryResults, recoveryQuery); +checks.push( + makeCheck( + "cold_start_recovery_search", + recoveryMatched ? "pass" : "lifecycle_fail", + recoveryMatched + ? "A fresh qmd query process reopened the persisted index and retrieved expected evidence." + : "A fresh qmd query process did not retrieve expected persisted evidence.", + { + expected_doc: recoveryQuery.expected_doc, + matched: recoveryMatched, + results: recoveryResults, + }, + ), +); + +const checkSummary = summarizeChecks(checks); +writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.qmd_result/v1", + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + check_summary: checkSummary, + checks, + queries: queryResults, + }, + null, + 2, + ), +); +JS + + if run_cmd "${project}: embedded retrieval" 900 "${log_path}" \ + "export HOME='${home}'; export XDG_CACHE_HOME='/root/.cache'; export QMD_FORCE_CPU=1; cd '${REPOS_DIR}/${project}' && npx tsx src/cli/qmd.ts collection add '${corpus_path}' --name elfbench && npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts status > '${status_path}' && node '${driver_path}' '${query_result_path}' '${REPORT_DIR}/queries.json' '${corpus_path}'"; then + if jq -e '.checks and .check_summary' "${query_result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${query_result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.qmd_result/v1" and + .summary.total == $query_count + ' "${query_result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${query_result_path}")" + if jq -e '.summary.fail == 0' "${query_result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "collection add; update; embed -f; query --json" + elif ! rg -q "Embedded [1-9][0-9]* chunks" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "embedding_required" "qmd indexed the corpus, but no successful embedding completion was observed" "${project}.log" "collection add; update; embed -f; query --json" + elif ! jq -e '.schema == "elf.live_baseline.qmd_result/v1"' "${query_result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "qmd query command completed, but did not produce parseable JSON results" "${project}.log" "collection add; update; embed -f; search/query --json" + else + json_record "${project}" "${repo}" "${head}" "wrong_result" "retrieval_wrong_result" "qmd embedded retrieval ran but did not return expected evidence" "${project}.log" "collection add; update; embed -f; search/query --json" + fi + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "qmd install passed but embedded retrieval command failed" "${project}.log" "collection add; update; embed -f; search/query --json" +} + +project_memsearch() { + local project="memsearch" + local repo="https://github.com/zilliztech/memsearch.git" + local log_path="${REPORT_DIR}/${project}.log" + local home="${HOME_DIR}/${project}" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-memsearch.py" + local corpus_path + local head + mkdir -p "${home}" + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "memsearch", + "storage": { + "status": "real", + "detail": "The adapter uses memsearch CLI indexing and search with the local ONNX embedder inside Docker." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "memsearch index and memsearch search" + }, + "update": { + "status": "real", + "surface": "rewrite corpus file, rerun memsearch index, and query for the replacement marker" + }, + "delete_or_expire": { + "status": "real", + "surface": "delete corpus file, rerun memsearch index, and verify deleted evidence is not returned" + }, + "expire": { + "status": "unsupported", + "surface": "the encoded CLI path supports reindex/delete but no TTL/expiry behavior" + }, + "cold_start_reload": { + "status": "real", + "surface": "fresh memsearch CLI search process over the local index" + }, + "scale_stress_profile": { + "status": "incomplete", + "surface": "smoke lifecycle path is encoded; scale/stress timing and resource thresholds are not yet calibrated" + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install -e '.[local,onnx]'"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "pip install failed" "${project}.log" "pip install -e .[local,onnx]" + return + fi + corpus_path="$(prepare_project_corpus "${project}")" + + cat >"${driver_path}" <<'PY' +import json +import os +import subprocess +from pathlib import Path + +out_path = Path(os.environ["ELF_MEMSEARCH_RESULT_PATH"]) +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +corpus_path = Path(os.environ["ELF_BASELINE_CORPUS_PATH"]) +top_k = os.environ.get("ELF_BASELINE_TOP_K", "10") +queries = json.loads(queries_path.read_text())["queries"] + + +def run_memsearch(args): + return subprocess.run( + ["memsearch", *args], + check=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ).stdout + + +def index_corpus(): + return run_memsearch(["index", str(corpus_path)]) + + +def search_output(query_text): + return run_memsearch(["search", query_text, "--top-k", top_k]) + + +def output_matches(output, query): + lowered = output.lower() + matched = query["expected_doc"] in output and all( + term.lower() in lowered for term in query["expected_terms"] + ) + if not matched: + matched = all(term.lower() in lowered for term in query["expected_terms"]) + return matched + + +def make_check(name, status, reason, evidence): + return { + "name": name, + "status": status, + "reason": reason, + "evidence": evidence, + } + + +def summarize_checks(checks): + wrong_result = sum(1 for check in checks if check["status"] == "wrong_result") + lifecycle_fail = sum(1 for check in checks if check["status"] == "lifecycle_fail") + return { + "total": len(checks), + "pass": sum(1 for check in checks if check["status"] == "pass"), + "fail": wrong_result + lifecycle_fail, + "wrong_result": wrong_result, + "lifecycle_fail": lifecycle_fail, + "incomplete": sum(1 for check in checks if check["status"] == "incomplete"), + "blocked": sum(1 for check in checks if check["status"] == "blocked"), + "not_encoded": sum(1 for check in checks if check["status"] == "not_encoded"), + } + + +query_results = [] +for query in queries: + output = search_output(query["query"]) + matched = output_matches(output, query) + query_results.append( + { + "id": query["id"], + "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "matched": matched, + "output": output, + } + ) + +pass_count = sum(1 for result in query_results if result["matched"]) +checks = [ + make_check( + "same_corpus_retrieval", + "pass" if pass_count == len(query_results) else "wrong_result", + "memsearch search returned expected evidence for every query." + if pass_count == len(query_results) + else "memsearch search missed one or more expected results.", + { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + ) +] + +auth_path = corpus_path / "auth-memory.md" +if not auth_path.exists(): + checks.append( + make_check( + "update_replaces_note_text", + "not_encoded", + "The auth corpus file was missing, so memsearch update could not be exercised.", + {"source": "auth-memory.md"}, + ) + ) +else: + auth_path.write_text( + "# Auth Memory\n\nRotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation.\n" + ) + update_index_output = index_corpus() + update_query = { + "id": "lifecycle-update-new-marker", + "query": "Which rotated JWT key id does the auth middleware require?", + "expected_doc": "auth-memory.md", + "expected_terms": ["kid-v4", "RotatedJwtKeyPlan"], + } + update_output = search_output(update_query["query"]) + update_matched = output_matches(update_output, update_query) + old_marker_absent = "kid-v3" not in update_output.lower() + checks.append( + make_check( + "update_replaces_note_text", + "pass" if update_matched and old_marker_absent else "lifecycle_fail", + "memsearch re-index returned the new marker and did not return the old marker for the updated file." + if update_matched and old_marker_absent + else "memsearch re-index did not cleanly replace the searchable auth file text.", + { + "source": "auth-memory.md", + "matched_new_marker": update_matched, + "old_marker_absent": old_marker_absent, + "index_output": update_index_output, + "output": update_output, + }, + ) + ) + +delete_query = next( + ( + query + for query in queries + if query["expected_doc"] not in {"auth-memory.md", "database-memory.md"} + and (corpus_path / query["expected_doc"]).exists() + ), + None, +) +if delete_query is None: + checks.append( + make_check( + "delete_suppresses_retrieval", + "not_encoded", + "No non-update, non-recovery corpus file was available, so memsearch delete could not be exercised.", + {"available_docs": [query["expected_doc"] for query in queries]}, + ) + ) +else: + (corpus_path / delete_query["expected_doc"]).unlink() + delete_index_output = index_corpus() + delete_output = search_output(delete_query["query"]) + deleted_still_matched = output_matches(delete_output, delete_query) + checks.append( + make_check( + "delete_suppresses_retrieval", + "lifecycle_fail" if deleted_still_matched else "pass", + "memsearch index removed the deleted file from subsequent search." + if not deleted_still_matched + else "memsearch index returned success but the deleted file was still searchable.", + { + "source": delete_query["expected_doc"], + "query": delete_query, + "deleted_still_matched": deleted_still_matched, + "index_output": delete_index_output, + "output": delete_output, + }, + ) + ) + +recovery_query = { + "id": "lifecycle-cold-start-recovery", + "query": "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + "expected_doc": "database-memory.md", + "expected_terms": ["InvoiceLineBatcher", "N+1"], +} +recovery_output = search_output(recovery_query["query"]) +recovery_matched = output_matches(recovery_output, recovery_query) +checks.append( + make_check( + "cold_start_recovery_search", + "pass" if recovery_matched else "lifecycle_fail", + "A fresh memsearch CLI process reopened the local Milvus index and retrieved persisted evidence." + if recovery_matched + else "A fresh memsearch CLI process did not retrieve expected persisted evidence.", + { + "expected_doc": recovery_query["expected_doc"], + "matched": recovery_matched, + "output": recovery_output, + }, + ) +) + +check_summary = summarize_checks(checks) +out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.memsearch_result/v1", + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "check_summary": check_summary, + "checks": checks, + "queries": query_results, + }, + indent=2, + ) +) +PY + + if run_cmd "${project}: cli retrieval attempt" 240 "${log_path}" \ + "export HOME='${home}'; export ELF_MEMSEARCH_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export ELF_BASELINE_CORPUS_PATH='${corpus_path}'; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && memsearch --help && memsearch config set embedding.provider onnx && memsearch index '${corpus_path}' && python '${driver_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.memsearch_result/v1" and + .summary.total == $query_count + ' "${result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${result_path}")" + if jq -e '.summary.fail == 0' "${result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "config; index; search" + else + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "memsearch command completed, but did not produce a valid benchmark result" "${project}.log" "config; index; search" + fi + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "memsearch installed, but the current CLI retrieval command failed" "${project}.log" "memsearch --help; config; index; search" +} + +project_mem0() { + local project="mem0" + local repo="https://github.com/mem0ai/mem0.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local openmemory_probe_path="${REPORT_DIR}/${project}-openmemory-ui-export.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-mem0.py" + local home="${HOME_DIR}/${project}" + local corpus_path + local head + mkdir -p "${home}" + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "mem0", + "storage": { + "status": "real", + "detail": "The adapter uses Memory.from_config with local FastEmbed, Qdrant path storage, and history DB paths inside Docker." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "Memory.add(infer=false) and Memory.search" + }, + "update": { + "status": "real", + "surface": "Memory.update against the stored memory id" + }, + "delete_or_expire": { + "status": "real", + "surface": "Memory.delete against the stored memory id" + }, + "expire": { + "status": "unsupported", + "surface": "the encoded local Memory path does not expose TTL/expiry behavior" + }, + "cold_start_reload": { + "status": "real", + "surface": "new Memory.from_config over the same local Qdrant/history paths" + }, + "preference_history": { + "status": "real", + "surface": "Memory.history after a local preference correction update" + }, + "entity_scope_personalization": { + "status": "real", + "surface": "Memory.add/search with user_id, agent_id, and run_id filters" + }, + "deletion_audit": { + "status": "real", + "surface": "Memory.history after Memory.delete" + }, + "local_export_readback": { + "status": "real", + "surface": "Memory.get_all over local OSS storage for inspection/export-style readback" + }, + "openmemory_ui_export": { + "status": "blocked", + "surface": "bounded export-helper setup probe writes tmp/live-baseline/mem0-openmemory-ui-export.json; SDK get_all remains separate" + }, + "scale_stress_profile": { + "status": "incomplete", + "surface": "smoke lifecycle path is encoded; scale/stress timing and resource thresholds are not yet calibrated" + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/import" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install -e . fastembed ollama && .venv/bin/python - <<'PY' +from mem0 import Memory +print('mem0 Memory import ok:', Memory) +PY"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "pip install or import failed" "${project}.log" "pip install -e . fastembed ollama; import Memory" + return + fi + corpus_path="$(prepare_project_corpus "${project}")" + + cat >"${driver_path}" <<'PY' +import gc +import json +import os +from pathlib import Path + +os.environ.setdefault("MEM0_TELEMETRY", "false") + +from mem0 import Memory + +out_path = Path(os.environ["ELF_MEM0_RESULT_PATH"]) +base = Path(os.environ["ELF_MEM0_HOME"]) +corpus_path = Path(os.environ["ELF_BASELINE_CORPUS_PATH"]) +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +top_k = int(os.environ.get("ELF_BASELINE_TOP_K", "10")) + +config = { + "vector_store": { + "provider": "qdrant", + "config": { + "collection_name": "elfbench", + "path": str(base / "qdrant"), + "embedding_model_dims": 384, + }, + }, + "embedder": { + "provider": "fastembed", + "config": { + "model": "BAAI/bge-small-en-v1.5", + "embedding_dims": 384, + }, + }, + "llm": { + "provider": "ollama", + "config": { + "model": "llama3.1:8b", + "ollama_base_url": "http://127.0.0.1:11434", + }, + }, + "history_db_path": str(base / "history.db"), + "version": "v1.1", +} + +memory = Memory.from_config(config) + +def plain_text(markdown: str) -> str: + return " ".join( + line.strip() + for line in markdown.splitlines() + if not line.lstrip().startswith("#") + ).strip() + + +docs = [ + (plain_text(path.read_text()), path.name) + for path in sorted(corpus_path.glob("*.md")) +] +queries = json.loads(queries_path.read_text())["queries"] + +adds = [] +memory_ids_by_source = {} +for text, source in docs: + added = memory.add( + text, + user_id="elf-bench", + metadata={"source": source}, + infer=False, + ) + adds.append({"source": source, "result": added}) + results = added.get("results", []) if isinstance(added, dict) else [] + if results and isinstance(results[0], dict) and results[0].get("id"): + memory_ids_by_source[source] = results[0]["id"] + + +def result_entries(search): + if isinstance(search, dict): + for key in ("results", "memories"): + entries = search.get(key) + if isinstance(entries, list): + return entries + if isinstance(search, list): + return search + return [] + + +def search_memory(memory_instance, query_text, filters=None): + return memory_instance.search( + query_text, + filters=filters or {"user_id": "elf-bench"}, + top_k=top_k, + threshold=0.0, + ) + + +def json_lower(value): + return json.dumps(value, default=str).lower() + + +def contains_terms(value, terms): + text = json_lower(value) + return all(term.lower() in text for term in terms) + + +def history_entries(history): + if isinstance(history, dict): + for key in ("results", "history", "memories"): + entries = history.get(key) + if isinstance(entries, list): + return entries + if isinstance(history, list): + return history + return [] + + +def history_has_event(history, expected_event): + expected = expected_event.upper() + return any( + isinstance(entry, dict) and str(entry.get("event", "")).upper() == expected + for entry in history_entries(history) + ) + + +def first_memory_id(add_result): + results = add_result.get("results", []) if isinstance(add_result, dict) else [] + if results and isinstance(results[0], dict): + return results[0].get("id") + return None + + +def memory_history(memory_instance, memory_id): + if not hasattr(memory_instance, "history"): + return { + "available": False, + "history": None, + "error": "Memory.history is unavailable", + } + try: + return { + "available": True, + "history": memory_instance.history(memory_id), + "error": None, + } + except Exception as exc: + return { + "available": False, + "history": None, + "error": repr(exc), + } + + +def get_all_memories(memory_instance, filters): + if not hasattr(memory_instance, "get_all"): + return { + "available": False, + "memories": None, + "error": "Memory.get_all is unavailable", + } + try: + return { + "available": True, + "memories": memory_instance.get_all(filters=filters), + "error": None, + } + except TypeError: + try: + return { + "available": True, + "memories": memory_instance.get_all( + user_id=filters.get("user_id"), + agent_id=filters.get("agent_id"), + run_id=filters.get("run_id"), + ), + "error": None, + } + except Exception as exc: + return { + "available": False, + "memories": None, + "error": repr(exc), + } + except Exception as exc: + return { + "available": False, + "memories": None, + "error": repr(exc), + } + + +def matches_expected(search, expected_doc, expected_terms): + for entry in result_entries(search): + entry_text = json_lower(entry) + source = ((entry.get("metadata") or {}).get("source") or "") + if source == expected_doc and all( + term.lower() in entry_text for term in expected_terms + ): + return True + return False + + +def query_result(query, search): + return { + "id": query["id"], + "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "matched": matches_expected( + search, + query["expected_doc"], + query["expected_terms"], + ), + "search": search, + } + + +def make_check(name, status, reason, evidence): + return { + "name": name, + "status": status, + "reason": reason, + "evidence": evidence, + } + + +def summarize_checks(checks): + wrong_result = sum(1 for check in checks if check["status"] == "wrong_result") + lifecycle_fail = sum(1 for check in checks if check["status"] == "lifecycle_fail") + return { + "total": len(checks), + "pass": sum(1 for check in checks if check["status"] == "pass"), + "fail": wrong_result + lifecycle_fail, + "wrong_result": wrong_result, + "lifecycle_fail": lifecycle_fail, + "incomplete": sum(1 for check in checks if check["status"] == "incomplete"), + "blocked": sum(1 for check in checks if check["status"] == "blocked"), + "not_encoded": sum(1 for check in checks if check["status"] == "not_encoded"), + } + +query_results = [] +for query in queries: + query_results.append(query_result(query, search_memory(memory, query["query"]))) + +pass_count = sum(1 for result in query_results if result["matched"]) +checks = [ + make_check( + "same_corpus_retrieval", + "pass" if pass_count == len(query_results) else "wrong_result", + "mem0 local FastEmbed/Qdrant search returned expected evidence for every query." + if pass_count == len(query_results) + else "mem0 local FastEmbed/Qdrant search missed one or more expected results.", + { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + ) +] + +auth_id = memory_ids_by_source.get("auth-memory.md") +if not auth_id: + checks.append( + make_check( + "update_replaces_note_text", + "not_encoded", + "The auth memory id was not returned by mem0 add(), so update could not be exercised.", + {"source": "auth-memory.md"}, + ) + ) +else: + update_text = ( + "Rotated auth middleware validates JWT tokens with key id `kid-v4` " + "under `RotatedJwtKeyPlan`. It still requires tenant scope " + "`project_shared` for deployment operations after the emergency key rotation." + ) + update_result = memory.update( + auth_id, + update_text, + metadata={"source": "auth-memory.md", "lifecycle": "updated"}, + ) + update_search = search_memory( + memory, + "Which rotated JWT key id does the auth middleware require?", + ) + update_matched = matches_expected( + update_search, + "auth-memory.md", + ["kid-v4", "RotatedJwtKeyPlan"], + ) + old_marker_absent = all( + "kid-v3" not in json.dumps(entry, default=str).lower() + for entry in result_entries(update_search) + if entry.get("id") == auth_id + or ((entry.get("metadata") or {}).get("source") == "auth-memory.md") + ) + checks.append( + make_check( + "update_replaces_note_text", + "pass" if update_matched and old_marker_absent else "lifecycle_fail", + "mem0 update() returned the new marker and did not return the old marker for the updated memory." + if update_matched and old_marker_absent + else "mem0 update() did not cleanly replace the searchable auth memory text.", + { + "memory_id": auth_id, + "update_result": update_result, + "matched_new_marker": update_matched, + "old_marker_absent": old_marker_absent, + "search": update_search, + }, + ) + ) + +history_filters = { + "user_id": "elf-history-user", + "agent_id": "elf-history-agent", + "run_id": "elf-project", +} +old_preference = ( + "Preference v1 for ELF: provide verbose tutorial explanations for every answer." +) +current_preference = ( + "Preference v2 for ELF: answer concisely with evidence-linked bullets." +) +preference_add = memory.add( + old_preference, + user_id=history_filters["user_id"], + agent_id=history_filters["agent_id"], + run_id=history_filters["run_id"], + metadata={"source": "preference-history", "kind": "preference"}, + infer=False, +) +preference_id = first_memory_id(preference_add) +if not preference_id: + checks.append( + make_check( + "preference_correction_history", + "incomplete", + "The preference memory id was not returned, so correction history could not be inspected.", + {"add_result": preference_add}, + ) + ) +else: + preference_update = memory.update( + preference_id, + current_preference, + metadata={"source": "preference-history", "kind": "preference"}, + ) + preference_history = memory_history(memory, preference_id) + preference_search = search_memory( + memory, + "How should answers be written for the ELF project?", + history_filters, + ) + history_has_old = contains_terms(preference_history["history"], ["verbose tutorial"]) + history_has_current = contains_terms( + preference_history["history"], + ["concise", "evidence-linked"], + ) + history_has_add_event = preference_history["available"] and history_has_event( + preference_history["history"], + "ADD", + ) + history_has_update_event = preference_history["available"] and history_has_event( + preference_history["history"], + "UPDATE", + ) + search_has_current = contains_terms( + result_entries(preference_search), + ["concise", "evidence-linked"], + ) + search_omits_old = "verbose tutorial" not in json_lower(result_entries(preference_search)) + if not preference_history["available"]: + preference_status = "blocked" + preference_reason = "Memory.history could not be read for the updated preference memory." + elif ( + history_has_old + and history_has_current + and history_has_add_event + and history_has_update_event + and search_has_current + and search_omits_old + ): + preference_status = "pass" + preference_reason = "mem0 history preserved ADD and UPDATE preference events while search returned only the current correction." + else: + preference_status = "lifecycle_fail" + preference_reason = "mem0 did not expose a clean preference correction chain with current-only search readback." + checks.append( + make_check( + "preference_correction_history", + preference_status, + preference_reason, + { + "memory_id": preference_id, + "add_result": preference_add, + "update_result": preference_update, + "history_available": preference_history["available"], + "history_error": preference_history["error"], + "history_has_old": history_has_old, + "history_has_current": history_has_current, + "history_has_add_event": history_has_add_event, + "history_has_update_event": history_has_update_event, + "search_has_current": search_has_current, + "search_omits_old": search_omits_old, + "history": preference_history["history"], + "search": preference_search, + }, + ) + ) + +other_scope_add = memory.add( + "Preference for PubFi: answer in long-form Chinese prose with no bullets.", + user_id=history_filters["user_id"], + agent_id=history_filters["agent_id"], + run_id="pubfi-project", + metadata={"source": "pubfi-preference", "kind": "preference"}, + infer=False, +) +entity_search = search_memory( + memory, + "What answer style preference applies here?", + history_filters, +) +entity_search_text = json_lower(result_entries(entity_search)) +entity_has_current = "evidence-linked bullets" in entity_search_text +entity_omits_other = "long-form chinese" not in entity_search_text +checks.append( + make_check( + "entity_scoped_personalization", + "pass" if entity_has_current and entity_omits_other else "lifecycle_fail", + "mem0 search respected user_id, agent_id, and run_id filters for the current preference scope." + if entity_has_current and entity_omits_other + else "mem0 entity-scoped search did not isolate the current preference from another run/project scope.", + { + "current_memory_id": preference_id, + "other_scope_add": other_scope_add, + "filters": history_filters, + "has_current": entity_has_current, + "omits_other_scope": entity_omits_other, + "search": entity_search, + }, + ) +) + +export_readback = get_all_memories(memory, history_filters) +export_has_current = contains_terms( + export_readback["memories"], + ["concise", "evidence-linked"], +) +export_omits_other = "long-form chinese" not in json_lower(export_readback["memories"]) +if not export_readback["available"]: + export_status = "blocked" + export_reason = "Memory.get_all could not be read for local OSS inspection/export-style evidence." +elif export_has_current and export_omits_other: + export_status = "pass" + export_reason = "mem0 get_all returned local export-style readback for the current scoped preference without the other scope." +else: + export_status = "lifecycle_fail" + export_reason = "mem0 get_all did not return the current scoped preference cleanly for local export-style readback." +checks.append( + make_check( + "local_get_all_export_readback", + export_status, + export_reason, + { + "available": export_readback["available"], + "error": export_readback["error"], + "filters": history_filters, + "has_current": export_has_current, + "omits_other_scope": export_omits_other, + "memories": export_readback["memories"], + }, + ) +) + +delete_query = next( + ( + query + for query in queries + if query["expected_doc"] in memory_ids_by_source + and query["expected_doc"] not in {"auth-memory.md", "database-memory.md"} + ), + None, +) +if delete_query is None: + checks.append( + make_check( + "delete_suppresses_retrieval", + "not_encoded", + "No non-update, non-recovery memory id was available, so delete could not be exercised.", + {"available_sources": sorted(memory_ids_by_source)}, + ) + ) +else: + delete_source = delete_query["expected_doc"] + delete_id = memory_ids_by_source[delete_source] + delete_result = memory.delete(delete_id) + delete_search = search_memory( + memory, + delete_query["query"], + ) + deleted_still_matched = matches_expected( + delete_search, + delete_source, + delete_query["expected_terms"], + ) + checks.append( + make_check( + "delete_suppresses_retrieval", + "pass" if not deleted_still_matched else "lifecycle_fail", + "mem0 delete() suppressed the deleted memory from subsequent search." + if not deleted_still_matched + else "mem0 delete() returned success but the deleted memory was still searchable.", + { + "memory_id": delete_id, + "source": delete_source, + "query": delete_query, + "delete_result": delete_result, + "deleted_still_matched": deleted_still_matched, + "search": delete_search, + }, + ) + ) + delete_history = memory_history(memory, delete_id) + delete_history_has_event = delete_history["available"] and history_has_event( + delete_history["history"], + "DELETE", + ) + if not delete_history["available"]: + delete_audit_status = "blocked" + delete_audit_reason = "Memory.history could not be read after delete, so deletion audit readback is blocked." + elif delete_history_has_event and not deleted_still_matched: + delete_audit_status = "pass" + delete_audit_reason = "mem0 history exposed a delete event and search suppressed the deleted memory." + else: + delete_audit_status = "lifecycle_fail" + delete_audit_reason = "mem0 did not expose a delete audit event while suppressing the deleted memory." + checks.append( + make_check( + "delete_history_audit_readback", + delete_audit_status, + delete_audit_reason, + { + "memory_id": delete_id, + "source": delete_source, + "history_available": delete_history["available"], + "history_error": delete_history["error"], + "history_has_delete_event": delete_history_has_event, + "deleted_still_matched": deleted_still_matched, + "history": delete_history["history"], + }, + ) + ) + +del memory +gc.collect() +reopened_memory = Memory.from_config(config) +recovery_search = search_memory( + reopened_memory, + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", +) +recovery_matched = matches_expected( + recovery_search, + "database-memory.md", + ["InvoiceLineBatcher", "N+1"], +) +checks.append( + make_check( + "cold_start_recovery_search", + "pass" if recovery_matched else "lifecycle_fail", + "A newly constructed mem0 Memory over the same local Qdrant/history paths retrieved persisted evidence." + if recovery_matched + else "A newly constructed mem0 Memory over the same local Qdrant/history paths did not retrieve persisted evidence.", + { + "expected_doc": "database-memory.md", + "matched": recovery_matched, + "search": recovery_search, + }, + ) +) + +check_summary = summarize_checks(checks) + +out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.mem0_result/v1", + "config": { + "embedder": "fastembed:BAAI/bge-small-en-v1.5", + "vector_store": "qdrant:path", + "infer": False, + }, + "corpus": { + "document_count": len(docs), + "query_count": len(queries), + }, + "adds": adds, + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "check_summary": check_summary, + "checks": checks, + "queries": query_results, + }, + indent=2, + default=str, + ) +) +PY + + if run_cmd "${project}: local fastembed add/search" 900 "${log_path}" \ + "export HOME='${home}'; export ELF_MEM0_HOME='${home}'; export ELF_MEM0_RESULT_PATH='${result_path}'; export ELF_BASELINE_CORPUS_PATH='${corpus_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; export MEM0_TELEMETRY=false; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + probe_mem0_openmemory_ui_export "${REPOS_DIR}/${project}" "${result_path}" "${openmemory_probe_path}" "${log_path}" + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.mem0_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count + ' "${result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${result_path}")" + if jq -e '.summary.fail == 0' "${result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add/update/delete/history/get_all/search; OpenMemory export probe" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "mem0 command completed, but did not produce a valid benchmark result" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add infer=false; search" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "mem0 installed and imported, but local fastembed/Qdrant add/search failed" "${project}.log" "pip install -e . fastembed ollama; Memory.from_config; add infer=false; search" +} + +project_openviking() { + local project="OpenViking" + local repo="https://github.com/volcengine/OpenViking.git" + local log_path="${REPORT_DIR}/${project}.log" + local home="${HOME_DIR}/${project}" + local config_path="${REPORT_DIR}/${project}-ov.conf" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-openviking.py" + local constraints_path="${REPORT_DIR}/${project}-constraints.txt" + local llama_cpp_python_version="${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION:-0.3.28}" + local llama_cpp_python_index="${ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX:-https://abetlen.github.io/llama-cpp-python/whl/cpu}" + local local_embed_failure_pattern="target specific option mismatch|failed-wheel-build-for-install|Failed building wheel for llama-cpp-python|Failed to build llama-cpp-python|Could not build wheels for llama-cpp-python|No module named 'llama_cpp'|Local embedding is enabled but 'llama-cpp-python' is not installed|No matching distribution found|Could not find a version that satisfies|not a supported wheel" + local local_embed_install_reason="OpenViking local-embed install failed in Docker for pinned llama-cpp-python==${llama_cpp_python_version} from the CPU wheel index, so same-corpus local retrieval could not be run" + local local_embed_command_summary="pip install -e .; openviking/ov --help; pip install llama-cpp-python==${llama_cpp_python_version} --extra-index-url ${llama_cpp_python_index} --only-binary llama-cpp-python; pip install -e .[local-embed]; OpenViking.add_resource/find" + local head + mkdir -p "${home}" + cat >"${REPORT_DIR}/${project}-adapter.json" <<JSON +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "OpenViking", + "storage": { + "status": "real", + "detail": "The adapter uses OpenViking local storage after pinning the Docker local embedding dependency path." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "OpenViking.add_resource and OpenViking.find after installing .[local-embed] with llama-cpp-python==${llama_cpp_python_version} from the CPU wheel index", + "evidence": "The Docker dependency boundary is the local llama-cpp-python wheel/import path, not provider-backed ELF embeddings. Once setup reaches add_resource/find, misses are reported as wrong_result.", + "retry": "Retry with ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker; override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the pinned CPU wheel is unavailable for the Docker platform. Treat wheel install/import failures as incomplete, not wrong_result." + }, + "update": { + "status": "not_encoded", + "surface": "no update replacement check is encoded for OpenViking" + }, + "delete_or_expire": { + "status": "not_encoded", + "surface": "no delete or expiry check is encoded for OpenViking" + }, + "expire": { + "status": "unsupported", + "surface": "no TTL/expiry behavior is encoded in the local adapter" + }, + "cold_start_reload": { + "status": "not_encoded", + "surface": "no restart/reopen check is encoded until local same-corpus retrieval completes" + }, + "staged_retrieval_trajectory": { + "status": "blocked", + "surface": "no staged retrieval trajectory check is scored until same-corpus retrieval matches expected evidence ids" + }, + "hierarchy_selection": { + "status": "blocked", + "surface": "no hierarchy selection check is scored until same-corpus retrieval matches expected evidence ids" + }, + "recursive_context_expansion": { + "status": "blocked", + "surface": "no recursive/context expansion check is scored until same-corpus retrieval matches expected evidence ids" + }, + "scale_stress_profile": { + "status": "blocked", + "surface": "scale/stress is blocked until smoke same-corpus retrieval returns evidence-bearing results" + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/help" 600 "${log_path}" \ + "export HOME='${home}'; cd '${REPOS_DIR}/${project}' && python3 -m venv .venv && .venv/bin/pip install --upgrade pip && .venv/bin/pip install maturin && .venv/bin/pip install -e . && (.venv/bin/openviking language en || .venv/bin/ov language en) && (.venv/bin/openviking --help || .venv/bin/ov --help)"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "pip install or CLI help failed" "${project}.log" "pip install -e .; openviking/ov --help" + return + fi + + if rg -q "ERROR: Failed building editable|Failed to build openviking|error: failed-wheel-build-for-install|CMake Error" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "partial_install" "OpenViking install/help returned success but the build log contains native build errors" "${project}.log" "pip install -e .; openviking/ov --help" + return + fi + + cat >"${config_path}" <<EOF +{ + "default_account": "elfbench", + "default_user": "elfbench", + "storage": { + "workspace": "${home}/data", + "skip_process_lock": true, + "vectordb": { + "backend": "local", + "name": "elfbench_context", + "dimension": 512 + } + }, + "embedding": { + "dense": { + "provider": "local", + "model": "bge-small-zh-v1.5-f16", + "cache_dir": "${home}/models" + }, + "text_source": "content_only", + "max_concurrent": 2 + }, + "auto_generate_l0": false, + "auto_generate_l1": false, + "default_search_mode": "fast", + "vlm": {}, + "query_planner": {}, + "rerank": {} +} +EOF + + cat >"${driver_path}" <<'PY' +import json +import os +from pathlib import Path + +from openviking import OpenViking + + +def to_jsonable(value): + if hasattr(value, "to_dict"): + return value.to_dict() + if hasattr(value, "model_dump"): + return value.model_dump() + if isinstance(value, list): + return [to_jsonable(item) for item in value] + if isinstance(value, dict): + return {key: to_jsonable(item) for key, item in value.items()} + return value + + +out_path = Path(os.environ["ELF_OPENVIKING_RESULT_PATH"]) +data_path = os.environ["ELF_OPENVIKING_DATA_PATH"] +corpus_path = os.environ["ELF_OPENVIKING_CORPUS_PATH"] +queries_path = Path(os.environ["ELF_BASELINE_QUERIES_PATH"]) +top_k = int(os.environ.get("ELF_BASELINE_TOP_K", "10")) + + +def expected_evidence_ids(query): + ids = query.get("expected_evidence_ids") or [] + if ids: + return ids + expected_doc = query["expected_doc"] + return [expected_doc[:-3] if expected_doc.endswith(".md") else expected_doc] + + +def allowed_evidence_ids(query): + return query.get("allowed_alternate_evidence_ids") or [] + + +def result_raw(found): + return json.dumps(to_jsonable(found), ensure_ascii=False, default=str).lower() + + +def visible_evidence_ids(found, query): + raw = result_raw(found) + candidate_ids = [*expected_evidence_ids(query), *allowed_evidence_ids(query)] + visible = [] + for evidence_id in candidate_ids: + lowered = evidence_id.lower() + if lowered in raw or f"{lowered}.md" in raw: + visible.append(evidence_id) + return visible + + +def result_matches(found, query): + raw = result_raw(found) + expected_docs = [ + query["expected_doc"], + *query.get("allowed_alternate_docs", []), + ] + has_doc = any(expected_doc.lower() in raw for expected_doc in expected_docs) + has_terms = all(term.lower() in raw for term in query["expected_terms"]) + return has_doc and has_terms + + +client = OpenViking(path=data_path) +client.initialize() +try: + queries = json.loads(queries_path.read_text())["queries"] + added = client.add_resource( + corpus_path, + to="viking://resources/elfbench", + wait=True, + timeout=240, + build_index=True, + summarize=False, + ) + query_results = [] + for query in queries: + found = client.find( + query["query"], + target_uri="viking://resources/elfbench", + limit=top_k, + score_threshold=0.0, + level=[2], + ) + matched_evidence_ids = visible_evidence_ids(found, query) + required_evidence_ids = expected_evidence_ids(query) + query_results.append( + { + "id": query["id"], + "query": query["query"], + "expected_doc": query["expected_doc"], + "expected_terms": query["expected_terms"], + "expected_evidence_ids": required_evidence_ids, + "allowed_alternate_evidence_ids": allowed_evidence_ids(query), + "matched_evidence_ids": matched_evidence_ids, + "missing_evidence_ids": [ + evidence_id + for evidence_id in required_evidence_ids + if evidence_id not in matched_evidence_ids + ], + "matched": result_matches(found, query), + "find": to_jsonable(found), + } + ) + pass_count = sum(1 for result in query_results if result["matched"]) + evidence_total = sum(len(result["expected_evidence_ids"]) for result in query_results) + evidence_matched = sum( + len( + [ + evidence_id + for evidence_id in result["matched_evidence_ids"] + if evidence_id in result["expected_evidence_ids"] + ] + ) + for result in query_results + ) + same_corpus_output_correct = ( + pass_count == len(query_results) + and evidence_total > 0 + and evidence_matched == evidence_total + ) + trajectory_gate_status = "not_encoded" if same_corpus_output_correct else "blocked" + trajectory_gate_reason = ( + "OpenViking same-corpus retrieval matched expected evidence ids, but staged trajectory scoring is not encoded in this Docker adapter." + if trajectory_gate_status == "not_encoded" + else "OpenViking staged trajectory scoring is blocked until same-corpus retrieval matches expected evidence ids." + ) + checks = [ + { + "name": "same_corpus_retrieval", + "status": "pass" if pass_count == len(query_results) else "wrong_result", + "reason": "OpenViking find returned expected evidence for every query." + if pass_count == len(query_results) + else "OpenViking find missed one or more expected results.", + "evidence": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + }, + { + "name": "same_corpus_expected_evidence_ids_visible", + "status": "pass" + if all(result["expected_evidence_ids"] for result in query_results) + else "incomplete", + "reason": "OpenViking query results expose expected, matched, and missing evidence ids for every same-corpus query.", + "evidence": { + "total_queries": len(query_results), + "queries_with_expected_evidence_ids": sum( + 1 for result in query_results if result["expected_evidence_ids"] + ), + "expected_evidence_total": evidence_total, + "expected_evidence_matched": evidence_matched, + }, + }, + { + "name": "update_replaces_note_text", + "status": "not_encoded", + "reason": "OpenViking update replacement is not encoded in this Docker adapter.", + "evidence": {}, + }, + { + "name": "delete_suppresses_retrieval", + "status": "not_encoded", + "reason": "OpenViking delete or expiry behavior is not encoded in this Docker adapter.", + "evidence": {}, + }, + { + "name": "cold_start_recovery_search", + "status": "not_encoded", + "reason": "OpenViking cold-start reload is not encoded until the local retrieval path is stable in Docker.", + "evidence": {}, + }, + { + "name": "staged_retrieval_trajectory", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason, + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, + { + "name": "hierarchy_selection", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason.replace( + "staged trajectory", "hierarchy selection" + ), + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, + { + "name": "recursive_context_expansion", + "status": trajectory_gate_status, + "reason": trajectory_gate_reason.replace( + "staged trajectory", "recursive/context expansion" + ), + "evidence": { + "blocked_by": "same_corpus_expected_evidence_miss" + if trajectory_gate_status == "blocked" + else None + }, + }, + ] + wrong_result_count = sum( + 1 for check in checks if check["status"] == "wrong_result" + ) + lifecycle_fail_count = sum( + 1 for check in checks if check["status"] == "lifecycle_fail" + ) + check_summary = { + "total": len(checks), + "pass": sum(1 for check in checks if check["status"] == "pass"), + "fail": wrong_result_count + lifecycle_fail_count, + "wrong_result": wrong_result_count, + "lifecycle_fail": lifecycle_fail_count, + "incomplete": sum(1 for check in checks if check["status"] == "incomplete"), + "blocked": sum(1 for check in checks if check["status"] == "blocked"), + "not_encoded": sum(1 for check in checks if check["status"] == "not_encoded"), + } + out_path.write_text( + json.dumps( + { + "schema": "elf.live_baseline.openviking_result/v1", + "config": { + "embedder": "local:bge-small-zh-v1.5-f16", + "vector_store": "local", + "mode": "OpenViking.add_resource/find", + }, + "add": to_jsonable(added), + "summary": { + "total": len(query_results), + "pass": pass_count, + "fail": len(query_results) - pass_count, + }, + "check_summary": check_summary, + "checks": checks, + "queries": query_results, + }, + ensure_ascii=False, + indent=2, + default=str, + ) + ) +finally: + client.close() +PY + + if ! run_cmd "${project}: install pinned local embedding extras" 900 "${log_path}" \ + "export HOME='${home}'; cd '${REPOS_DIR}/${project}' && printf 'llama-cpp-python==${llama_cpp_python_version}\n' > '${constraints_path}' && .venv/bin/pip install --extra-index-url '${llama_cpp_python_index}' --only-binary llama-cpp-python -c '${constraints_path}' 'llama-cpp-python==${llama_cpp_python_version}' && .venv/bin/pip install --extra-index-url '${llama_cpp_python_index}' --only-binary llama-cpp-python -c '${constraints_path}' -e '.[local-embed]' && .venv/bin/python - <<'PY' +import llama_cpp + +print('llama_cpp_import_ok', getattr(llama_cpp, '__version__', 'unknown')) +PY"; then + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "${local_embed_install_reason}" "${project}.log" "${local_embed_command_summary}" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "${local_embed_install_reason}" "${project}.log" "${local_embed_command_summary}" + return + fi + + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking pinned local-embed install returned success but the log contains llama-cpp-python wheel/import failure, so same-corpus local retrieval could not be run" "${project}.log" "${local_embed_command_summary}" + return + fi + + if run_cmd "${project}: local add/find" 900 "${log_path}" \ + "export HOME='${home}'; export OPENVIKING_CONFIG_FILE='${config_path}'; export ELF_OPENVIKING_DATA_PATH='${home}/data'; export ELF_OPENVIKING_CORPUS_PATH='${CORPUS_DIR}'; export ELF_OPENVIKING_RESULT_PATH='${result_path}'; export ELF_BASELINE_QUERIES_PATH='${REPORT_DIR}/queries.json'; cd '${REPOS_DIR}/${project}' && source .venv/bin/activate && python '${driver_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find hit pinned llama-cpp-python wheel/import failure, so same-corpus local retrieval could not be run" "${project}.log" "${local_embed_command_summary}" + return + fi + if [[ ! -s "${result_path}" ]] || ! jq -e . "${result_path}" >/dev/null 2>&1; then + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking local add_resource/find returned success but did not write a valid result JSON" "${project}.log" "${local_embed_command_summary}" + return + fi + if jq -e --argjson query_count "${QUERY_COUNT}" ' + .schema == "elf.live_baseline.openviking_result/v1" and + .summary.total == $query_count + ' "${result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${result_path}")" + if jq -e '.summary.fail == 0' "${result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "${local_embed_command_summary}" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "OpenViking local add_resource/find did not produce a valid benchmark result" "${project}.log" "${local_embed_command_summary}" + return + fi + + if rg -q "${local_embed_failure_pattern}" "${log_path}"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "local_embed_install_failed" "OpenViking local add_resource/find failed because pinned llama-cpp-python was unavailable in Docker" "${project}.log" "${local_embed_command_summary}" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "OpenViking pinned local-embed installed, but same-corpus add_resource/find failed in Docker" "${project}.log" "${local_embed_command_summary}" +} + +project_claude_mem() { + local project="claude-mem" + local repo="https://github.com/thedotmack/claude-mem.git" + local log_path="${REPORT_DIR}/${project}.log" + local result_path="${REPORT_DIR}/${project}-search.json" + local driver_path="${REPOS_DIR}/${project}/elf-live-baseline-claude-mem.ts" + local home="${HOME_DIR}/${project}" + local corpus_path + local db_path="${HOME_DIR}/${project}/claude-mem.sqlite" + local head + mkdir -p "${home}" + cat >"${REPORT_DIR}/${project}-adapter.json" <<'JSON' +{ + "schema": "elf.live_baseline.adapter_metadata/v1", + "project": "claude-mem", + "storage": { + "status": "real", + "detail": "The adapter uses claude-mem repository classes with a durable SQLite file inside Docker for same-corpus and lifecycle checks." + }, + "behaviors": { + "same_corpus_retrieval": { + "status": "real", + "surface": "MemoryItemsRepository.create/search over a Docker-local SQLite database" + }, + "update": { + "status": "real", + "surface": "MemoryItemsRepository.update against the stored memory item id" + }, + "delete_or_expire": { + "status": "real", + "surface": "delete from the repository-owned SQLite memory_items table and verify FTS suppression" + }, + "expire": { + "status": "unsupported", + "surface": "no TTL/expiry behavior is encoded in the local adapter" + }, + "cold_start_reload": { + "status": "real", + "surface": "new Database and repository instances over the same Docker-local SQLite file" + }, + "progressive_disclosure": { + "status": "real", + "surface": "search returns bounded memory items and detail/source hydration uses getById plus listSources" + }, + "scale_stress_profile": { + "status": "incomplete", + "surface": "durable smoke lifecycle path is encoded; scale/stress timing and resource thresholds are not yet calibrated" + } + } +} +JSON + head="$(clone_project "${project}" "${repo}" "${log_path}")" || { + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "clone failed" "${project}.log" "git clone" + return + } + + if ! run_cmd "${project}: install/build" 420 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && (npm ci || npm install --no-audit --no-fund) && npm run build --if-present"; then + json_record "${project}" "${repo}" "${head}" "incomplete" "not_run" "npm install/build failed" "${project}.log" "npm install/build" + return + fi + corpus_path="$(prepare_project_corpus "${project}")" + + cat >"${driver_path}" <<'TS' +import { readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { Database } from "bun:sqlite"; +import { MemoryItemsRepository } from "./src/storage/sqlite/memory-items.ts"; +import { ProjectsRepository } from "./src/storage/sqlite/projects.ts"; + +const outPath = Bun.argv[2]; +const corpusPath = Bun.argv[3]; +const queriesPath = Bun.argv[4]; +const dbPath = Bun.argv[5]; +if (!outPath || !corpusPath || !queriesPath || !dbPath) { + throw new Error("output path, corpus path, query path, and database path are required"); +} + +type QueryCase = { + id: string; + query: string; + expected_doc: string; + expected_terms: string[]; +}; + +function plainText(markdown: string): string { + return markdown + .split(/\r?\n/) + .filter((line) => !line.trimStart().startsWith("#")) + .join(" ") + .replace(/\s+/g, " ") + .trim(); +} + +function titleFrom(markdown: string, file: string): string { + const heading = markdown + .split(/\r?\n/) + .find((line) => line.trimStart().startsWith("# ")); + return heading ? heading.replace(/^#\s+/, "").trim() : file; +} + +function conceptsFor(file: string): string[] { + return file + .replace(/\.md$/i, "") + .split(/[^A-Za-z0-9]+/) + .map((part) => part.toLowerCase()) + .filter(Boolean); +} + +function resultMatches(results: unknown[], query: QueryCase): boolean { + return results.some((entry) => { + const files = (entry as { filesRead?: string[] }).filesRead ?? []; + const entryText = JSON.stringify(entry).toLowerCase(); + return ( + files.includes(query.expected_doc) && + query.expected_terms.every((term) => + entryText.includes(term.toLowerCase()), + ) + ); + }); +} + +function resultEntriesForSource(results: unknown[], source: string): unknown[] { + return results.filter((entry) => { + const files = (entry as { filesRead?: string[] }).filesRead ?? []; + return files.includes(source); + }); +} + +function makeCheck( + name: string, + status: + | "pass" + | "wrong_result" + | "lifecycle_fail" + | "incomplete" + | "blocked" + | "not_encoded", + reason: string, + evidence: unknown, +) { + return { name, status, reason, evidence }; +} + +function summarizeChecks(checks: Array<{ status: string }>) { + const wrongResult = checks.filter((check) => check.status === "wrong_result") + .length; + const lifecycleFail = checks.filter( + (check) => check.status === "lifecycle_fail", + ).length; + return { + total: checks.length, + pass: checks.filter((check) => check.status === "pass").length, + fail: wrongResult + lifecycleFail, + wrong_result: wrongResult, + lifecycle_fail: lifecycleFail, + incomplete: checks.filter((check) => check.status === "incomplete").length, + blocked: checks.filter((check) => check.status === "blocked").length, + not_encoded: checks.filter((check) => check.status === "not_encoded") + .length, + }; +} + +function markerQuery(query: QueryCase): string { + return query.expected_terms.join(" "); +} + +const db = new Database(dbPath); +db.run("PRAGMA foreign_keys = ON"); + +try { + const projects = new ProjectsRepository(db); + const memories = new MemoryItemsRepository(db); + const project = projects.create({ + name: "elfbench", + slug: "elfbench", + rootPath: "/bench/corpus", + metadata: { source: "elf-live-baseline" }, + }); + + const docs = readdirSync(corpusPath) + .filter((file) => file.endsWith(".md")) + .sort() + .map((file) => { + const raw = readFileSync(join(corpusPath, file), "utf8"); + return { + title: titleFrom(raw, file), + text: plainText(raw), + concepts: conceptsFor(file), + file, + }; + }); + const queries = JSON.parse(readFileSync(queriesPath, "utf8")).queries as QueryCase[]; + const topK = Number(process.env.ELF_BASELINE_TOP_K ?? "10"); + + const created = []; + const createdBySource = new Map<string, ReturnType<MemoryItemsRepository["create"]>>(); + for (const doc of docs) { + const item = memories.create({ + projectId: project.id, + kind: "manual", + type: "fact", + title: doc.title, + text: doc.text, + narrative: doc.text, + facts: [doc.text], + concepts: doc.concepts, + filesRead: [doc.file], + metadata: { source: doc.file }, + }); + const source = memories.addSource({ + memoryItemId: item.id, + sourceType: "import", + sourceUri: `file://${doc.file}`, + metadata: { source: doc.file }, + }); + created.push({ item, source }); + createdBySource.set(doc.file, item); + } + + const queryResults = queries.map((query) => { + const results = memories.search(project.id, query.query, topK); + return { + id: query.id, + query: query.query, + expected_doc: query.expected_doc, + expected_terms: query.expected_terms, + matched: resultMatches(results, query), + results, + }; + }); + const pass = queryResults.filter((result) => result.matched).length; + const checks = [ + makeCheck( + "same_corpus_retrieval", + pass === queryResults.length ? "pass" : "wrong_result", + pass === queryResults.length + ? "claude-mem repository search returned expected evidence for every query." + : "claude-mem repository search missed one or more expected results.", + { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + ), + ]; + + const auth = createdBySource.get("auth-memory.md"); + if (!auth) { + checks.push( + makeCheck( + "update_replaces_note_text", + "incomplete", + "The auth memory item was not created, so update replacement could not be exercised.", + { source: "auth-memory.md" }, + ), + ); + } else { + const updateText = + "Rotated auth middleware validates JWT tokens with key id `kid-v4` under `RotatedJwtKeyPlan`. It still requires tenant scope `project_shared` for deployment operations after the emergency key rotation."; + const update = memories.update(auth.id, { + title: "Auth Memory Updated", + text: updateText, + narrative: updateText, + facts: [updateText], + concepts: conceptsFor("auth-memory.md"), + filesRead: ["auth-memory.md"], + metadata: { source: "auth-memory.md", lifecycle: "updated" }, + }); + const updateQuery: QueryCase = { + id: "lifecycle-update-new-marker", + query: "Which rotated JWT key id does the auth middleware require?", + expected_doc: "auth-memory.md", + expected_terms: ["kid-v4", "RotatedJwtKeyPlan"], + }; + const updateResults = memories.search(project.id, markerQuery(updateQuery), topK); + const updateMatched = resultMatches(updateResults, updateQuery); + const oldMarkerAbsent = resultEntriesForSource(updateResults, "auth-memory.md") + .every((entry) => !JSON.stringify(entry).toLowerCase().includes("kid-v3")); + checks.push( + makeCheck( + "update_replaces_note_text", + updateMatched && oldMarkerAbsent ? "pass" : "lifecycle_fail", + updateMatched && oldMarkerAbsent + ? "claude-mem update returned the new marker and did not return the old marker for the updated memory item." + : "claude-mem update did not cleanly replace the searchable auth memory item text.", + { + memory_item_id: auth.id, + update, + matched_new_marker: updateMatched, + old_marker_absent: oldMarkerAbsent, + results: updateResults, + }, + ), + ); + } + + const deleteQuery = queries.find( + (query) => + query.expected_doc !== "auth-memory.md" && + query.expected_doc !== "database-memory.md" && + createdBySource.has(query.expected_doc), + ); + if (!deleteQuery) { + checks.push( + makeCheck( + "delete_suppresses_retrieval", + "incomplete", + "No non-update, non-recovery memory item was available, so delete suppression could not be exercised.", + { available_sources: Array.from(createdBySource.keys()).sort() }, + ), + ); + } else { + const deleteId = createdBySource.get(deleteQuery.expected_doc)!.id; + const deleteResult = db.prepare("DELETE FROM memory_items WHERE id = ?").run(deleteId); + const deleteResults = memories.search(project.id, markerQuery(deleteQuery), topK); + const deletedStillMatched = resultMatches(deleteResults, deleteQuery); + checks.push( + makeCheck( + "delete_suppresses_retrieval", + deletedStillMatched ? "lifecycle_fail" : "pass", + deletedStillMatched + ? "claude-mem SQLite delete returned success but the deleted memory item was still searchable." + : "claude-mem SQLite delete suppressed the deleted memory item from subsequent FTS search.", + { + memory_item_id: deleteId, + source: deleteQuery.expected_doc, + query: deleteQuery, + changes: deleteResult.changes, + deleted_still_matched: deletedStillMatched, + results: deleteResults, + }, + ), + ); + } + + const progressQuery = + queries.find( + (query) => + query.expected_doc === "database-memory.md" || + (query.expected_doc !== "auth-memory.md" && + query.expected_doc !== deleteQuery?.expected_doc), + ) ?? queries[0]; + const progressResults = memories.search(project.id, markerQuery(progressQuery), topK); + const progressItem = progressResults.find((entry) => + ((entry as { filesRead?: string[] }).filesRead ?? []).includes( + progressQuery.expected_doc, + ), + ); + const detail = progressItem ? memories.getById(progressItem.id) : null; + const sources = detail ? memories.listSources(detail.id) : []; + const detailHasEvidence = + !!detail && + !!detail.text && + detail.facts.length > 0 && + detail.concepts.length > 0 && + detail.filesRead.includes(progressQuery.expected_doc); + const sourceHydrated = sources.some((source) => + source.sourceUri?.includes(progressQuery.expected_doc), + ); + checks.push( + makeCheck( + "progressive_disclosure_detail_hydration", + progressResults.length > 0 && detailHasEvidence && sourceHydrated + ? "pass" + : "lifecycle_fail", + progressResults.length > 0 && detailHasEvidence && sourceHydrated + ? "claude-mem search returned a bounded item that could be hydrated into detail and source evidence." + : "claude-mem search/detail/source hydration did not expose the expected progressive-disclosure evidence.", + { + query: progressQuery, + search_result_count: progressResults.length, + detail_has_evidence: detailHasEvidence, + source_hydrated: sourceHydrated, + detail, + sources, + }, + ), + ); + + db.close(); + + const reopenedDb = new Database(dbPath); + reopenedDb.run("PRAGMA foreign_keys = ON"); + const reopenedProjects = new ProjectsRepository(reopenedDb); + const reopenedMemories = new MemoryItemsRepository(reopenedDb); + const reopenedProject = + reopenedProjects.getByRootPath("/bench/corpus") ?? reopenedProjects.getById(project.id); + const recoveryQuery: QueryCase = { + id: "lifecycle-cold-start-recovery", + query: + "The invoice list N+1 query was fixed by eager loading invoice lines through `InvoiceLineBatcher`. Do not reintroduce per-row SQL calls in invoice rendering.", + expected_doc: "database-memory.md", + expected_terms: ["InvoiceLineBatcher", "N+1"], + }; + const recoveryResults = reopenedProject + ? reopenedMemories.search(reopenedProject.id, markerQuery(recoveryQuery), topK) + : []; + const recoveryMatched = resultMatches(recoveryResults, recoveryQuery); + checks.push( + makeCheck( + "cold_start_recovery_search", + recoveryMatched ? "pass" : "lifecycle_fail", + recoveryMatched + ? "A new claude-mem repository instance reopened the durable SQLite file and retrieved persisted evidence." + : "A new claude-mem repository instance did not retrieve expected persisted evidence from the durable SQLite file.", + { + db_path: dbPath, + expected_doc: recoveryQuery.expected_doc, + matched: recoveryMatched, + results: recoveryResults, + }, + ), + ); + reopenedDb.close(); + + const checkSummary = summarizeChecks(checks); + + writeFileSync( + outPath, + JSON.stringify( + { + schema: "elf.live_baseline.claude_mem_result/v1", + corpus: { + document_count: docs.length, + query_count: queries.length, + }, + created, + summary: { + total: queryResults.length, + pass, + fail: queryResults.length - pass, + }, + check_summary: checkSummary, + checks, + queries: queryResults, + }, + null, + 2, + ), + ); +} catch (err) { + try { + db.close(); + } catch { + // Ignore close errors while surfacing the original benchmark failure. + } + throw err; +} +TS + + if run_cmd "${project}: same-corpus durable sqlite search" 300 "${log_path}" \ + "cd '${REPOS_DIR}/${project}' && bun '${driver_path}' '${result_path}' '${corpus_path}' '${REPORT_DIR}/queries.json' '${db_path}'"; then + if jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then + jq '{check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + fi + if jq -e --argjson query_count "${QUERY_COUNT}" --argjson document_count "${DOCUMENT_COUNT}" ' + .schema == "elf.live_baseline.claude_mem_result/v1" and + .corpus.document_count == $document_count and + .summary.total == $query_count + ' "${result_path}" >/dev/null; then + local typed_status + local retrieval_status + typed_status="$(typed_status_from_result "${result_path}")" + if jq -e '.summary.fail == 0' "${result_path}" >/dev/null; then + retrieval_status="retrieval_pass" + else + retrieval_status="retrieval_wrong_result" + fi + json_record "${project}" "${repo}" "${head}" "${typed_status}" "${retrieval_status}" "$(typed_status_reason "${project}" "${typed_status}")" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" + return + fi + json_record "${project}" "${repo}" "${head}" "incomplete" "invalid_json_result" "claude-mem same-corpus search did not produce a valid benchmark result" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" + return + fi + + json_record "${project}" "${repo}" "${head}" "incomplete" "retrieval_command_failed" "claude-mem built, but same-corpus SQLite search did not pass in Docker" "${project}.log" "npm install/build; MemoryItemsRepository.create/update/search; durable SQLite reopen" +} + +run_project "ELF" project_elf +run_project "agentmemory" project_agentmemory +run_project "qmd" project_qmd +run_project "memsearch" project_memsearch +run_project "mem0" project_mem0 +run_project "OpenViking" project_openviking +run_project "claude-mem" project_claude_mem +finish_report + +jq . "${REPORT}" +echo "Live baseline report: ${REPORT}" + +if [[ "${ELF_BASELINE_STRICT:-0}" == "1" ]]; then + jq -e '.verdict == "pass"' "${REPORT}" >/dev/null +fi diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh new file mode 100755 index 00000000..38ef83ff --- /dev/null +++ b/scripts/live-baseline-report-to-md.sh @@ -0,0 +1,259 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT="${1:-${ELF_BASELINE_REPORT:-${ROOT_DIR}/tmp/live-baseline/live-baseline-report.json}}" +OUT="${2:-${ELF_BASELINE_MARKDOWN_REPORT:-}}" +REPORT_DISPLAY="${REPORT}" +if [[ "${REPORT_DISPLAY}" == "${ROOT_DIR}/"* ]]; then + REPORT_DISPLAY="${REPORT_DISPLAY#"${ROOT_DIR}/"}" +fi + +if ! command -v jq >/dev/null 2>&1; then + echo "Missing jq; cannot render live baseline Markdown report." >&2 + exit 1 +fi + +if [[ ! -f "${REPORT}" ]]; then + echo "Missing report: ${REPORT}" >&2 + exit 1 +fi + +render_report() { + jq -r --arg report_path "${REPORT_DISPLAY}" ' + def dash: + if . == null then "-" else tostring end; + def md: + dash | gsub("\\|"; "\\|") | gsub("\n"; " "); + def checks: + ((.check_summary.pass // 0 | tostring) + "/" + (.check_summary.total // 0 | tostring)); + + "# Live Baseline Benchmark Report", + "", + "Goal: Publish a Markdown summary for one generated live baseline aggregate report.", + "Read this when: You need a durable, reviewable summary of a live baseline JSON report.", + ("Inputs: `" + $report_path + "`."), + "Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`.", + "Verification: Compare this Markdown summary with the source JSON before committing.", + "", + "## Summary", + "", + ("- Run ID: `" + (.run_id | md) + "`"), + ("- Generated at: `" + (.generated_at | md) + "`"), + ("- Verdict: `" + (.verdict | md) + "`"), + ("- Project filter: `" + (.project_filter | md) + "`"), + ("- Corpus profile: `" + (.corpus.profile | md) + "`"), + ("- Corpus track: `" + ((.corpus.track // "generated_public") | md) + "`"), + ( + if (.corpus.manifest_id // null) == null then empty + else "- Corpus manifest: `" + (.corpus.manifest_id | md) + "`" + end + ), + ("- Documents: `" + (.corpus.document_count | tostring) + "`"), + ("- Queries: `" + (.corpus.query_count | tostring) + "`"), + ("- Wrong-result count: `" + ((.wrong_result_count // 0) | tostring) + "`"), + ("- Query latency mean: `" + ((.latency_ms.mean // 0) | tostring) + " ms`"), + ("- Query latency P50/P95/P99: `" + ((.latency_ms.p50 // 0) | tostring) + " ms`, `" + ((.latency_ms.p95 // 0) | tostring) + " ms`, `" + ((.latency_ms.p99 // 0) | tostring) + " ms`"), + ("- Query latency max: `" + ((.latency_ms.max // 0) | tostring) + " ms`"), + ("- Project summary: `" + (.summary.pass // 0 | tostring) + " pass`, `" + (.summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.summary.lifecycle_fail // 0 | tostring) + " lifecycle_fail`, `" + (.summary.blocked // 0 | tostring) + " blocked`, `" + (.summary.incomplete // 0 | tostring) + " incomplete`, `" + (.summary.not_encoded // 0 | tostring) + " not_encoded`"), + ("- Same-corpus summary: `" + (.same_corpus_summary.pass // 0 | tostring) + " pass`, `" + (.same_corpus_summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.same_corpus_summary.blocked // 0 | tostring) + " blocked`, `" + (.same_corpus_summary.incomplete // 0 | tostring) + " incomplete`, `" + (.same_corpus_summary.not_encoded // 0 | tostring) + " not_encoded`"), + ("- Full check summary: `" + (.full_check_summary.pass // 0 | tostring) + "/" + (.full_check_summary.total // 0 | tostring) + " pass`, `" + (.full_check_summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.full_check_summary.lifecycle_fail // 0 | tostring) + " lifecycle_fail`, `" + (.full_check_summary.blocked // 0 | tostring) + " blocked`, `" + (.full_check_summary.incomplete // 0 | tostring) + " incomplete`, `" + (.full_check_summary.not_encoded // 0 | tostring) + " not_encoded`"), + "", + "## Projects", + "", + "| Project | Status | Retrieval | Checks | Elapsed | Reason |", + "| --- | --- | --- | --- | --- | --- |", + ( + .projects[] + | "| " + (.project | md) + + " | `" + (.status | md) + "`" + + " | `" + (.retrieval_status | md) + "`" + + " | `" + checks + "`" + + " | `" + (.elapsed_seconds | tostring) + "s`" + + " | " + (.reason | md) + " |" + ), + "", + ( + [.projects[] | select(.adapter != null)] as $adapters + | if ($adapters | length) > 0 then + "## Adapter Behavior", + "", + "| Project | Storage | Retrieval | Update | Delete/Expire | Cold Start | Scale/Stress |", + "| --- | --- | --- | --- | --- | --- | --- |", + ( + $adapters[] + | "| " + (.project | md) + + " | `" + (.adapter.storage.status | md) + "`" + + " | `" + (.adapter.behaviors.same_corpus_retrieval.status | md) + "`" + + " | `" + (.adapter.behaviors.update.status | md) + "`" + + " | `" + (.adapter.behaviors.delete_or_expire.status | md) + "`" + + " | `" + (.adapter.behaviors.cold_start_reload.status | md) + "`" + + " | `" + ( + .adapter.behaviors.scale_stress_profile.status + // .adapter.behaviors.soak_profile.status + // .adapter.behaviors.resource_envelope.status + | md + ) + "` |" + ), + "" + else empty end + ), + ( + [.projects[] | select(.cost_proxy != null)] as $costed + | if ($costed | length) > 0 then + "## Cost Proxy", + "", + "This is an input-size proxy for planning provider-backed runs, not a billing claim.", + "", + "| Project | Scope | Mode | Estimated Input Tokens | Rate | Estimated Cost |", + "| --- | --- | --- | --- | --- | --- |", + ( + $costed[] + | "| " + (.project | md) + + " | " + (.cost_proxy.scope | md) + + " | `" + (.cost_proxy.embedding_mode | md) + "`" + + " | `" + (.cost_proxy.estimated_input_tokens | tostring) + "`" + + " | `" + ((.cost_proxy.configured_usd_per_1k_tokens // "-") | tostring) + "`" + + " | `" + ((.cost_proxy.estimated_usd // "-") | tostring) + "` |" + ), + "" + else empty end + ), + ( + [.projects[] | select(.resource_envelope != null)] as $resources + | if ($resources | length) > 0 then + "## Resource Usage", + "", + "| Project | Elapsed | RSS KB | Max RSS KB | Postgres Bytes | Corpus Bytes | Report Bytes | Checkpoint Bytes |", + "| --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $resources[] + | "| " + (.project | md) + + " | `" + (.resource_envelope.elapsed_seconds | tostring) + "s`" + + " | `" + ((.resource_envelope.rss_kb // "-") | tostring) + "`" + + " | `" + (.resource_envelope.max_rss_kb | tostring) + "`" + + " | `" + ((.resource_envelope.postgres_database_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.corpus_dir_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.report_dir_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.checkpoint_file_bytes // "-") | tostring) + "` |" + ), + "" + else empty end + ), + ( + [.projects[] | select(.embedding != null)] as $embedded + | if ($embedded | length) > 0 then + "## Embedding", + "", + "| Project | Mode | Provider | Model | Dimensions | Timeout | API Base | Path |", + "| --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $embedded[] + | "| " + (.project | md) + + " | `" + (.embedding.mode | md) + "`" + + " | `" + (.embedding.provider_id | md) + "`" + + " | `" + (.embedding.model | md) + "`" + + " | `" + (.embedding.dimensions | tostring) + "`" + + " | `" + (.embedding.timeout_ms | tostring) + "ms`" + + " | `" + (.embedding.api_base | md) + "`" + + " | `" + (.embedding.path | md) + "` |" + ), + "" + else empty end + ), + ( + [.projects[] | {project, queries: (.queries // [])} | select((.queries | length) > 0)] as $query_projects + | if ($query_projects | length) > 0 then + "## Query Evidence", + "", + "| Project | Query | Trace ID | Task | Expected Evidence | Allowed Alternates | Top Evidence | Matched | Latency |", + "| --- | --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $query_projects[] + | .project as $project + | .queries[] + | "| " + ($project | md) + + " | `" + (.id | md) + "`" + + " | `" + ((.trace_id // "-") | md) + "`" + + " | `" + ((.task // "-") | md) + "`" + + " | `" + (((.expected_evidence_ids // []) | join(", ")) | md) + "`" + + " | `" + (((.allowed_alternate_evidence_ids // []) | join(", ")) | md) + "`" + + " | `" + ((.top_evidence_id // "-") | md) + "`" + + " | `" + (.matched | tostring) + "`" + + " | `" + ((.latency_ms // 0) | tostring) + " ms` |" + ), + "" + else empty end + ), + ( + [.projects[] | select(.backfill != null)] as $backfilled + | if ($backfilled | length) > 0 then + "## Backfill", + "", + "| Project | Sources | Completed | Batch | Workers | Resume | Attempts | Skipped | Duplicates | Backfill Elapsed |", + "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $backfilled[] + | "| " + (.project | md) + + " | `" + (.backfill.source_count | tostring) + "`" + + " | `" + (.backfill.completed_count | tostring) + "`" + + " | `" + (.backfill.batch_size | tostring) + "`" + + " | `" + (.backfill.worker_concurrency | tostring) + "`" + + " | `" + ( + if .backfill.resume.enabled then + "resumed after " + (.backfill.resume.completed_before_resume | tostring) + + "/" + (.backfill.resume.completed_after_resume | tostring) + else + "disabled" + end + ) + "`" + + " | `" + ((.backfill.resume.resume_attempts // 0) | tostring) + "`" + + " | `" + ((.backfill.skipped_completed // 0) | tostring) + "`" + + " | `" + ((.backfill.duplicate_source_notes | length) | tostring) + "`" + + " | `" + (.backfill.elapsed_seconds | tostring) + "s` |" + ), + "" + else empty end + ), + ( + [.ops_cases[]?] as $groups + | if ($groups | length) > 0 then + "## Operational Cases", + "", + "| Project | Case | Default Status | Operator Status | Command | Evidence | Safety |", + "| --- | --- | --- | --- | --- | --- | --- |", + ( + $groups[] + | .project as $project + | .cases[] + | "| " + ($project | md) + + " | `" + (.name | md) + "`" + + " | `" + (.default_status | md) + "`" + + " | `" + (.operator_status | md) + "`" + + " | `" + (.command | md) + "`" + + " | " + (.evidence | md) + + " | " + (.safety | md) + " |" + ), + "" + else empty end + ), + "## Result Semantics", + "", + "- `pass`: every encoded check for the selected project and profile passed.", + "- `wrong_result`: a retrieval check completed but returned the wrong memory or missed expected evidence.", + "- `lifecycle_fail`: same-corpus retrieval may pass, but an encoded update, delete, cold-start, persistence, or related lifecycle check failed.", + "- `incomplete`: setup or a declared check could not complete because install, runtime, dependency, or adapter wiring failed in Docker.", + "- `blocked`: a safe check cannot run without external credentials, manual setup, durable runtime wiring, or host integration outside this run.", + "- `not_encoded`: the capability is not covered by the current adapter, so no pass/fail claim is allowed.", + "", + "`incomplete`, `blocked`, and `not_encoded` are not passes; treat them as benchmark coverage debt." + ' "${REPORT}" +} + +if [[ -n "${OUT}" ]]; then + mkdir -p "$(dirname "${OUT}")" + render_report >"${OUT}" + echo "Wrote ${OUT}" +else + render_report +fi diff --git a/scripts/parity-docker-gate.sh b/scripts/parity-docker-gate.sh new file mode 100755 index 00000000..62fa0ec1 --- /dev/null +++ b/scripts/parity-docker-gate.sh @@ -0,0 +1,256 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_PARITY_REPORT_DIR:-${ROOT_DIR}/tmp/parity}" +RUN_ID="${ELF_PARITY_RUN_ID:-parity-$(date +%Y%m%d%H%M%S)}" + +if [[ ! -f "/.dockerenv" && "${ELF_PARITY_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run parity gate outside Docker. Use cargo make parity-docker." >&2 + exit 1 +fi + +for cmd in cargo curl jq psql; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in parity runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${ROOT_DIR}/tmp" + +ADAPTER_OUT="${REPORT_DIR}/agentmemory-adapter.json" +CONSOLIDATION_LOG="${REPORT_DIR}/consolidation-harness.log" +CONSOLIDATION_BEFORE="${REPORT_DIR}/consolidation-before.json" +CONSOLIDATION_AFTER="${REPORT_DIR}/consolidation-after.json" +REPORT_OUT="${REPORT_DIR}/competitive-parity-report.json" + +write_report() { + local verdict="$1" + local failure_reason="${2:-}" + local adapter_status="${3:-not_run}" + local consolidation_status="${4:-not_run}" + + local note_candidates="0" + local doc_candidates="0" + local baseline_queries="0" + local ignored_items="0" + local provenance_completeness="0" + local unsupported_kind_rejected="false" + local base_recall="0" + local after_recall="0" + local base_context="0" + local after_context="0" + + if [[ -f "${ADAPTER_OUT}" ]]; then + note_candidates="$(jq -r '.summary.note_candidate_count // 0' "${ADAPTER_OUT}")" + doc_candidates="$(jq -r '.summary.doc_candidate_count // 0' "${ADAPTER_OUT}")" + baseline_queries="$(jq -r '.summary.baseline_query_count // 0' "${ADAPTER_OUT}")" + ignored_items="$(jq -r '.summary.ignored_count // 0' "${ADAPTER_OUT}")" + provenance_completeness="$( + jq -r ' + if (.summary.note_candidate_count // 0) == 0 then + 0 + else + ( + [ + .note_candidates[] + | select( + .notes_ingest_item.source_ref.resolver == "agentmemory_fixture/v1" + and (.notes_ingest_item.source_ref.ref.fixture_id | type == "string") + and (.notes_ingest_item.source_ref.ref.session_id | type == "string") + and (.notes_ingest_item.source_ref.ref.memory_id | type == "string") + ) + ] | length + ) / .summary.note_candidate_count + end + ' "${ADAPTER_OUT}" + )" + unsupported_kind_rejected="$( + jq -r '[.ignored_items[]? | select(.reason == "unsupported_memory_kind")] | length > 0' \ + "${ADAPTER_OUT}" + )" + fi + + if [[ -f "${CONSOLIDATION_BEFORE}" ]]; then + base_recall="$(jq -r '.summary.avg_recall_at_k // 0' "${CONSOLIDATION_BEFORE}")" + base_context="$(jq -r '.summary.avg_retrieved_summary_chars // 0' "${CONSOLIDATION_BEFORE}")" + fi + + if [[ -f "${CONSOLIDATION_AFTER}" ]]; then + after_recall="$(jq -r '.summary.avg_recall_at_k // 0' "${CONSOLIDATION_AFTER}")" + after_context="$(jq -r '.summary.avg_retrieved_summary_chars // 0' "${CONSOLIDATION_AFTER}")" + fi + + jq -n \ + --arg schema "elf.competitive_parity_gate.report/v1" \ + --arg gate_schema "elf.competitive_parity_gate/v1" \ + --arg gate_id "${RUN_ID}" \ + --arg verdict "${verdict}" \ + --arg failure_reason "${failure_reason}" \ + --arg adapter_status "${adapter_status}" \ + --arg consolidation_status "${consolidation_status}" \ + --argjson note_candidates "${note_candidates}" \ + --argjson doc_candidates "${doc_candidates}" \ + --argjson baseline_queries "${baseline_queries}" \ + --argjson ignored_items "${ignored_items}" \ + --argjson provenance_completeness "${provenance_completeness}" \ + --argjson unsupported_kind_rejected "${unsupported_kind_rejected}" \ + --argjson base_recall "${base_recall}" \ + --argjson after_recall "${after_recall}" \ + --argjson base_context "${base_context}" \ + --argjson after_context "${after_context}" \ + '{ + schema: $schema, + gate_schema: $gate_schema, + gate_id: $gate_id, + verdict: $verdict, + failure_reason: (if $failure_reason == "" then null else $failure_reason end), + docker_only: true, + baselines: { + agentmemory_fixture: { + status: $adapter_status, + note_candidate_count: $note_candidates, + doc_candidate_count: $doc_candidates, + baseline_query_count: $baseline_queries, + ignored_count: $ignored_items, + provenance_completeness: $provenance_completeness, + unsupported_kind_rejected: $unsupported_kind_rejected + }, + elf_consolidation_harness: { + status: $consolidation_status, + baseline_avg_recall_at_k: $base_recall, + after_avg_recall_at_k: $after_recall, + baseline_avg_retrieved_summary_chars: $base_context, + after_avg_retrieved_summary_chars: $after_context + } + }, + dimensions: { + docker_isolation: {status: "pass"}, + adapter_coverage: { + status: (if $note_candidates == 2 and $doc_candidates == 2 and $baseline_queries == 1 and $ignored_items == 1 then "pass" else "fail" end) + }, + provenance_integrity: { + status: (if $provenance_completeness == 1 then "pass" else "fail" end) + }, + unsafe_rejection: { + status: (if $unsupported_kind_rejected then "pass" else "fail" end) + }, + retrieval_quality: { + status: (if $consolidation_status == "pass" and $after_recall >= $base_recall then "pass" else "fail" end) + }, + context_efficiency: { + status: (if $consolidation_status == "pass" and $after_context <= $base_context then "pass" else "fail" end) + }, + source_safety: { + status: (if $consolidation_status == "pass" then "pass" else "fail" end) + }, + operator_inspectability: { + status: (if $consolidation_status == "pass" then "pass" else "fail" end), + checked_route: "GET /viewer" + }, + cleanup: { + status: "documented", + command: "cargo make clean-parity-docker" + } + }, + thresholds: { + agentmemory_fixture: { + note_candidate_count: 2, + doc_candidate_count: 2, + baseline_query_count: 1, + ignored_count: 1, + provenance_completeness: 1, + requires_unsupported_memory_kind_rejection: true + }, + consolidation: { + after_recall_must_be_at_least_baseline: true, + after_context_chars_must_not_exceed_baseline: true, + viewer_must_return_200: true + } + }, + artifacts: { + adapter_output: "tmp/parity/agentmemory-adapter.json", + consolidation_log: "tmp/parity/consolidation-harness.log", + consolidation_before: "tmp/parity/consolidation-before.json", + consolidation_after: "tmp/parity/consolidation-after.json" + } + }' >"${REPORT_OUT}" +} + +fail_gate() { + local reason="$1" + local adapter_status="${2:-fail}" + local consolidation_status="${3:-fail}" + write_report "fail" "${reason}" "${adapter_status}" "${consolidation_status}" + echo "Parity gate failed: ${reason}" >&2 + echo "Report: ${REPORT_OUT}" >&2 + exit 1 +} + +assert_passing_report() { + jq -e ' + .verdict == "pass" + and ([.dimensions | to_entries[] | select(.key != "cleanup" and .value.status != "pass")] | length == 0) + ' "${REPORT_OUT}" >/dev/null +} + +echo "Waiting for Docker service dependencies." +for _ in $(seq 1 120); do + if psql "${ELF_PG_DSN}" -tAc "SELECT 1" >/dev/null 2>&1 \ + && curl -fsS "${ELF_QDRANT_HTTP_URL}/collections" >/dev/null 2>&1; then + break + fi + sleep 0.5 +done + +if ! psql "${ELF_PG_DSN}" -tAc "SELECT 1" >/dev/null 2>&1; then + fail_gate "postgres dependency did not become reachable" "not_run" "not_run" +fi + +if ! curl -fsS "${ELF_QDRANT_HTTP_URL}/collections" >/dev/null 2>&1; then + fail_gate "qdrant dependency did not become reachable" "not_run" "not_run" +fi + +echo "Running agentmemory fixture adapter gate." +(cd "${ROOT_DIR}" && cargo run -q -p elf-eval --bin agentmemory_fixture_adapter -- \ + --fixture apps/elf-eval/fixtures/agentmemory/sample_session.json \ + --out "${ADAPTER_OUT}") || fail_gate "agentmemory fixture adapter command failed" "fail" "not_run" + +jq -e ' + .schema == "elf.agentmemory_adapter/v1" + and .summary.note_candidate_count == 2 + and .summary.doc_candidate_count == 2 + and .summary.baseline_query_count == 1 + and .summary.ignored_count == 1 + and ( + [ + .note_candidates[] + | select( + .notes_ingest_item.source_ref.resolver != "agentmemory_fixture/v1" + or (.notes_ingest_item.source_ref.ref.fixture_id | type != "string") + or (.notes_ingest_item.source_ref.ref.session_id | type != "string") + or (.notes_ingest_item.source_ref.ref.memory_id | type != "string") + ) + ] | length == 0 + ) + and ([.ignored_items[]? | select(.reason == "unsupported_memory_kind")] | length >= 1) +' "${ADAPTER_OUT}" >/dev/null \ + || fail_gate "agentmemory fixture adapter thresholds failed" "fail" "not_run" + +echo "Running service-backed consolidation parity gate." +( + cd "${ROOT_DIR}" + ELF_HARNESS_CHECK_VIEWER=1 \ + bash scripts/consolidation-harness.sh +) 2>&1 | tee "${CONSOLIDATION_LOG}" \ + || fail_gate "consolidation harness thresholds failed" "pass" "fail" + +cp "${ROOT_DIR}/tmp/elf.consolidation.out.base.json" "${CONSOLIDATION_BEFORE}" +cp "${ROOT_DIR}/tmp/elf.consolidation.out.after.json" "${CONSOLIDATION_AFTER}" + +write_report "pass" "" "pass" "pass" +assert_passing_report || fail_gate "one or more parity report dimensions failed" "pass" "pass" + +echo "Parity gate passed." +echo "Report: ${REPORT_OUT}" diff --git a/scripts/ragflow-docker-evidence-smoke.sh b/scripts/ragflow-docker-evidence-smoke.sh new file mode 100755 index 00000000..17dd572f --- /dev/null +++ b/scripts/ragflow-docker-evidence-smoke.sh @@ -0,0 +1,1278 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +ARTIFACT_DIR="${ELF_RAGFLOW_SMOKE_ARTIFACT_DIR:-${ROOT_DIR}/tmp/real-world-memory/ragflow-smoke}" +OUT="${ELF_RAGFLOW_SMOKE_OUT:-${ARTIFACT_DIR}/ragflow-smoke.json}" +MANIFEST_OUT="${ELF_RAGFLOW_SMOKE_MANIFEST_OUT:-${ARTIFACT_DIR}/memory_projects_manifest.ragflow-smoke.json}" +SUMMARY_OUT="${ELF_RAGFLOW_SMOKE_SUMMARY_OUT:-${ARTIFACT_DIR}/summary.json}" +FIXTURE_DIR="${ELF_RAGFLOW_SMOKE_FIXTURE_DIR:-${ARTIFACT_DIR}/ragflow-fixtures}" +FIXTURE_PATH="${ELF_RAGFLOW_SMOKE_FIXTURE_PATH:-${FIXTURE_DIR}/retrieval/ragflow_evidence_smoke.json}" +REPORT_JSON="${ELF_RAGFLOW_SMOKE_REPORT_JSON:-${ARTIFACT_DIR}/ragflow-report.json}" +REPORT_MD="${ELF_RAGFLOW_SMOKE_REPORT_MD:-${ARTIFACT_DIR}/ragflow-report.md}" +SCORED_BENCHMARK="${ELF_RAGFLOW_SMOKE_SCORED_BENCHMARK:-${ARTIFACT_DIR}/scored-benchmark.json}" +WORK_DIR="${ELF_RAGFLOW_SMOKE_WORK_DIR:-${ARTIFACT_DIR}/work}" +RAGFLOW_REPO_URL="${ELF_RAGFLOW_REPO_URL:-https://github.com/infiniflow/ragflow.git}" +RAGFLOW_REF="${ELF_RAGFLOW_REF:-v0.25.6}" +RAGFLOW_IMAGE="${ELF_RAGFLOW_IMAGE:-infiniflow/ragflow:v0.25.6}" +COMPOSE_PROJECT="${ELF_RAGFLOW_COMPOSE_PROJECT:-elf-ragflow-smoke}" +START_RAGFLOW="${ELF_RAGFLOW_SMOKE_START:-0}" +ACCEPT_RESOURCE_ENVELOPE="${ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE:-0}" +ALLOW_ARM="${ELF_RAGFLOW_SMOKE_ALLOW_ARM:-0}" +PULL_IMAGE="${ELF_RAGFLOW_SMOKE_PULL_IMAGE:-0}" +CLEANUP="${ELF_RAGFLOW_SMOKE_CLEANUP:-1}" +CPU_GPU_MODE="${ELF_RAGFLOW_SMOKE_DEVICE:-cpu}" +API_PORT="${ELF_RAGFLOW_API_PORT:-19380}" +API_BASE="${ELF_RAGFLOW_API_BASE:-http://127.0.0.1:${API_PORT}}" +API_KEY="${ELF_RAGFLOW_API_KEY:-${RAGFLOW_API_KEY:-}}" +STARTUP_ATTEMPTS="${ELF_RAGFLOW_SMOKE_STARTUP_ATTEMPTS:-60}" +STARTUP_INTERVAL_SECONDS="${ELF_RAGFLOW_SMOKE_STARTUP_INTERVAL_SECONDS:-5}" +COMPOSE_TIMEOUT_SECONDS="${ELF_RAGFLOW_SMOKE_COMPOSE_TIMEOUT_SECONDS:-1800}" +RUN_ID="${ELF_RAGFLOW_SMOKE_RUN_ID:-ragflow-docker-smoke-$(date -u +%Y%m%d%H%M%S)}" +EVIDENCE_ID="ragflow-smoke-anchor" +DOCUMENT_NAME="${RUN_ID}.txt" +EVIDENCE_TOKEN="ELF_RAGFLOW_SMOKE_TOKEN_${RUN_ID}" +CORPUS_TEXT="RAGFlow smoke evidence ${EVIDENCE_TOKEN}: the ELF adapter maps returned reference chunks to the ragflow-smoke-anchor evidence id." + +mkdir -p \ + "${ARTIFACT_DIR}" \ + "${WORK_DIR}" \ + "$(dirname "${OUT}")" \ + "$(dirname "${MANIFEST_OUT}")" \ + "$(dirname "${SUMMARY_OUT}")" \ + "$(dirname "${FIXTURE_PATH}")" \ + "$(dirname "${REPORT_JSON}")" \ + "$(dirname "${REPORT_MD}")" \ + "$(dirname "${SCORED_BENCHMARK}")" + +rm -f "${OUT}" "${MANIFEST_OUT}" "${SUMMARY_OUT}" "${REPORT_JSON}" "${REPORT_MD}" "${SCORED_BENCHMARK}" + +DOCKER_INFO="${ARTIFACT_DIR}/docker-info.json" +IMAGE_INSPECT="${ARTIFACT_DIR}/ragflow-image-inspect.json" +STARTUP_ATTEMPTS_JSONL="${ARTIFACT_DIR}/startup-attempts.jsonl" +DATASET_REQUEST="${ARTIFACT_DIR}/dataset-create-request.json" +DATASET_RESPONSE="${ARTIFACT_DIR}/dataset-create-response.json" +DOCUMENT_REQUEST="${ARTIFACT_DIR}/document-create-request.json" +DOCUMENT_RESPONSE="${ARTIFACT_DIR}/document-create-response.json" +CHUNK_REQUEST="${ARTIFACT_DIR}/chunk-create-request.json" +CHUNK_RESPONSE="${ARTIFACT_DIR}/chunk-create-response.json" +RETRIEVAL_REQUEST="${ARTIFACT_DIR}/retrieval-request.json" +RETRIEVAL_RESPONSE="${ARTIFACT_DIR}/retrieval-response.json" +REFERENCE_MAPPING="${ARTIFACT_DIR}/reference-mapping.json" +DOCKER_DF="${ARTIFACT_DIR}/docker-system-df.txt" +COMPOSE_UP_LOG="${ARTIFACT_DIR}/compose-up.log" +COMPOSE_DOWN_LOG="${ARTIFACT_DIR}/compose-down.log" + +printf '[]\n' >"${IMAGE_INSPECT}" +printf '[]\n' >"${REFERENCE_MAPPING}" +for json_file in \ + "${DATASET_REQUEST}" \ + "${DATASET_RESPONSE}" \ + "${DOCUMENT_REQUEST}" \ + "${DOCUMENT_RESPONSE}" \ + "${CHUNK_REQUEST}" \ + "${CHUNK_RESPONSE}" \ + "${RETRIEVAL_REQUEST}" \ + "${RETRIEVAL_RESPONSE}"; do + printf 'null\n' >"${json_file}" +done +: >"${STARTUP_ATTEMPTS_JSONL}" +: >"${DOCKER_DF}" +: >"${COMPOSE_UP_LOG}" +: >"${COMPOSE_DOWN_LOG}" + +SETUP_STATUS="blocked" +RUN_STATUS="not_encoded" +RESULT_STATUS="blocked" +OVERALL_STATUS="blocked" +EVIDENCE_CLASS="research_gate" +FAILURE_CLASS="resource_confirmation_required" +FAILURE_REASON="RAGFlow startup is resource-heavy; set ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 to run the official Docker Compose stack." +STARTUP_TIME_MS="" +STARTED="false" +DATASET_ID="" +DOCUMENT_ID="" +CHUNK_ID="" +VM_MAX_MAP_COUNT="" +VM_MAX_MAP_COUNT_STATUS="not_observed" +VM_MAX_MAP_COUNT_ACTION="not_changed" +IMAGE_PRESENT="false" +IMAGE_SIZE_BYTES="" +HOST_GLOBAL_INSTALLS_REQUIRED="false" +DATASET_STEP_STATUS="not_encoded" +DOCUMENT_STEP_STATUS="not_encoded" +CHUNK_STEP_STATUS="not_encoded" +RETRIEVAL_STEP_STATUS="not_encoded" + +required_command() { + local cmd="$1" + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd}; cannot write RAGFlow smoke artifacts." >&2 + exit 1 + fi +} + +optional_command_status() { + local cmd="$1" + if command -v "${cmd}" >/dev/null 2>&1; then + printf 'available' + else + printf 'missing' + fi +} + +relative_path() { + local path="$1" + if [[ "${path}" == "${ROOT_DIR}/"* ]]; then + printf '%s' "${path#"${ROOT_DIR}/"}" + else + printf '%s' "${path}" + fi +} + +json_status() { + local status="$1" + case "${status}" in + real | mocked | unsupported | blocked | incomplete | wrong_result | lifecycle_fail | pass | not_encoded) + printf '%s' "${status}" + ;; + *) + printf 'incomplete' + ;; + esac +} + +capture_docker_info() { + if docker info --format '{{json .}}' >"${DOCKER_INFO}" 2>"${ARTIFACT_DIR}/docker-info.stderr"; then + return 0 + fi + + jq -n --rawfile stderr "${ARTIFACT_DIR}/docker-info.stderr" '{ + error: "docker_info_failed", + stderr: $stderr + }' >"${DOCKER_INFO}" + return 1 +} + +capture_disk_info() { + docker system df >"${DOCKER_DF}" 2>/dev/null || true +} + +capture_vm_max_map_count() { + if VM_MAX_MAP_COUNT="$(sysctl -n vm.max_map_count 2>/dev/null)"; then + if [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]] && [[ "${VM_MAX_MAP_COUNT}" -ge 262144 ]]; then + VM_MAX_MAP_COUNT_STATUS="pass" + elif [[ "${VM_MAX_MAP_COUNT}" =~ ^[0-9]+$ ]]; then + VM_MAX_MAP_COUNT_STATUS="blocked" + else + VM_MAX_MAP_COUNT_STATUS="not_observed" + fi + else + VM_MAX_MAP_COUNT="" + VM_MAX_MAP_COUNT_STATUS="not_observed" + fi +} + +capture_image_info() { + if [[ "${PULL_IMAGE}" == "1" && "${ACCEPT_RESOURCE_ENVELOPE}" == "1" ]]; then + docker pull "${RAGFLOW_IMAGE}" >"${ARTIFACT_DIR}/docker-pull.log" 2>&1 || true + fi + + if docker image inspect "${RAGFLOW_IMAGE}" >"${IMAGE_INSPECT}" 2>/dev/null; then + IMAGE_PRESENT="true" + IMAGE_SIZE_BYTES="$(jq -r '.[0].Size // ""' "${IMAGE_INSPECT}")" + else + printf '[]\n' >"${IMAGE_INSPECT}" + fi +} + +update_env_var() { + local file="$1" + local key="$2" + local value="$3" + + if grep -q "^${key}=" "${file}"; then + sed -i.bak "s|^${key}=.*|${key}=${value}|" "${file}" + else + printf '\n%s=%s\n' "${key}" "${value}" >>"${file}" + fi +} + +prepare_official_ragflow_repo() { + local repo_dir="${WORK_DIR}/ragflow" + + if [[ ! -d "${repo_dir}/.git" ]]; then + rm -rf "${repo_dir}" + git clone --depth 1 --branch "${RAGFLOW_REF}" "${RAGFLOW_REPO_URL}" "${repo_dir}" \ + >"${ARTIFACT_DIR}/ragflow-git-clone.log" 2>&1 + else + git -C "${repo_dir}" fetch --depth 1 origin "${RAGFLOW_REF}" \ + >"${ARTIFACT_DIR}/ragflow-git-fetch.log" 2>&1 + git -C "${repo_dir}" checkout -f FETCH_HEAD \ + >"${ARTIFACT_DIR}/ragflow-git-checkout.log" 2>&1 + fi + + update_env_var "${repo_dir}/docker/.env" "DEVICE" "${CPU_GPU_MODE}" + update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTP_PORT" "${ELF_RAGFLOW_WEB_HTTP_PORT:-18080}" + update_env_var "${repo_dir}/docker/.env" "SVR_WEB_HTTPS_PORT" "${ELF_RAGFLOW_WEB_HTTPS_PORT:-18443}" + update_env_var "${repo_dir}/docker/.env" "SVR_HTTP_PORT" "${API_PORT}" + update_env_var "${repo_dir}/docker/.env" "ADMIN_SVR_HTTP_PORT" "${ELF_RAGFLOW_ADMIN_PORT:-19381}" + update_env_var "${repo_dir}/docker/.env" "SVR_MCP_PORT" "${ELF_RAGFLOW_MCP_PORT:-19382}" + update_env_var "${repo_dir}/docker/.env" "GO_HTTP_PORT" "${ELF_RAGFLOW_GO_HTTP_PORT:-19384}" + update_env_var "${repo_dir}/docker/.env" "GO_ADMIN_PORT" "${ELF_RAGFLOW_GO_ADMIN_PORT:-19383}" + update_env_var "${repo_dir}/docker/.env" "EXPOSE_MYSQL_PORT" "${ELF_RAGFLOW_MYSQL_PORT:-13306}" + update_env_var "${repo_dir}/docker/.env" "MINIO_CONSOLE_PORT" "${ELF_RAGFLOW_MINIO_CONSOLE_PORT:-19001}" + update_env_var "${repo_dir}/docker/.env" "MINIO_PORT" "${ELF_RAGFLOW_MINIO_PORT:-19000}" + update_env_var "${repo_dir}/docker/.env" "REDIS_PORT" "${ELF_RAGFLOW_REDIS_PORT:-16379}" + update_env_var "${repo_dir}/docker/.env" "ES_PORT" "${ELF_RAGFLOW_ES_PORT:-11200}" + update_env_var "${repo_dir}/docker/.env" "OS_PORT" "${ELF_RAGFLOW_OS_PORT:-11201}" + update_env_var "${repo_dir}/docker/.env" "RAGFLOW_IMAGE" "${RAGFLOW_IMAGE}" + + printf '%s' "${repo_dir}" +} + +run_with_timeout_if_available() { + local seconds="$1" + shift + + if command -v timeout >/dev/null 2>&1; then + timeout "${seconds}" "$@" + else + "$@" + fi +} + +start_ragflow_stack() { + local repo_dir="$1" + local started_at ended_at + started_at="$(date +%s)" + + if ( + cd "${repo_dir}/docker" + run_with_timeout_if_available "${COMPOSE_TIMEOUT_SECONDS}" \ + docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml up -d + ) >"${COMPOSE_UP_LOG}" 2>&1; then + STARTED="true" + SETUP_STATUS="pass" + FAILURE_CLASS="" + FAILURE_REASON="" + else + SETUP_STATUS="incomplete" + OVERALL_STATUS="incomplete" + RESULT_STATUS="incomplete" + FAILURE_CLASS="ragflow_compose_start_failed" + FAILURE_REASON="Official RAGFlow Docker Compose did not start successfully; see compose-up.log in the artifact directory." + fi + + ended_at="$(date +%s)" + STARTUP_TIME_MS="$(((ended_at - started_at) * 1000))" +} + +wait_for_ragflow_api() { + local attempt code + + for attempt in $(seq 1 "${STARTUP_ATTEMPTS}"); do + code="$(curl -sS -o /dev/null -w '%{http_code}' "${API_BASE}/api/v1/system/healthz" 2>/dev/null || true)" + jq -nc --argjson attempt "${attempt}" --arg code "${code}" --arg url "${API_BASE}/api/v1/system/healthz" '{ + attempt: $attempt, + url: $url, + http_code: $code + }' >>"${STARTUP_ATTEMPTS_JSONL}" + + if [[ "${code}" == "200" ]]; then + return 0 + fi + + sleep "${STARTUP_INTERVAL_SECONDS}" + done + + return 1 +} + +api_json_request() { + local method="$1" + local path="$2" + local request_file="$3" + local response_file="$4" + local stderr_file="${response_file}.stderr" + local code + + code="$(curl -sS -X "${method}" \ + -o "${response_file}" \ + -w '%{http_code}' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer ${API_KEY}" \ + --data-binary @"${request_file}" \ + "${API_BASE}${path}" 2>"${stderr_file}" || true)" + + jq -n --arg code "${code}" --rawfile stderr "${stderr_file}" '{ + http_code: $code, + stderr: $stderr + }' >"${response_file}.meta.json" + + [[ "${code}" =~ ^2 ]] +} + +response_code_ok() { + local response_file="$1" + + jq -e '(.code? == 0) or (.id? != null) or (.data? != null)' "${response_file}" >/dev/null 2>&1 +} + +extract_id() { + local response_file="$1" + jq -r ' + .data.id + // .data[0].id + // .data.document_id + // .data.chunk_id + // .id + // empty + ' "${response_file}" +} + +run_api_smoke() { + local dataset_name="${RUN_ID}" + + jq -n --arg name "${dataset_name}" '{ + name: $name, + description: "Generated public ELF RAGFlow Docker evidence smoke corpus.", + permission: "me", + chunk_method: "manual", + parser_config: {"raptor": {"use_raptor": false}} + }' >"${DATASET_REQUEST}" + + if api_json_request POST "/api/v1/datasets" "${DATASET_REQUEST}" "${DATASET_RESPONSE}" \ + && response_code_ok "${DATASET_RESPONSE}"; then + DATASET_STEP_STATUS="pass" + DATASET_ID="$(extract_id "${DATASET_RESPONSE}")" + else + DATASET_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_dataset_create_failed" + FAILURE_REASON="RAGFlow dataset creation did not return a successful response." + return 0 + fi + + if [[ -z "${DATASET_ID}" ]]; then + DATASET_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_dataset_id_missing" + FAILURE_REASON="RAGFlow dataset creation succeeded but no dataset id was found in the response." + return 0 + fi + + jq -n --arg name "${DOCUMENT_NAME}" '{name: $name}' >"${DOCUMENT_REQUEST}" + + if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents?type=empty" \ + "${DOCUMENT_REQUEST}" "${DOCUMENT_RESPONSE}" \ + && response_code_ok "${DOCUMENT_RESPONSE}"; then + DOCUMENT_STEP_STATUS="pass" + DOCUMENT_ID="$(extract_id "${DOCUMENT_RESPONSE}")" + else + DOCUMENT_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_document_create_failed" + FAILURE_REASON="RAGFlow empty document creation did not return a successful response." + return 0 + fi + + if [[ -z "${DOCUMENT_ID}" ]]; then + DOCUMENT_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_document_id_missing" + FAILURE_REASON="RAGFlow empty document creation succeeded but no document id was found in the response." + return 0 + fi + + jq -n \ + --arg content "${CORPUS_TEXT}" \ + --arg token "${EVIDENCE_TOKEN}" \ + '{ + content: $content, + important_keywords: [$token], + questions: ["Which evidence token should map to ragflow-smoke-anchor?"] + }' >"${CHUNK_REQUEST}" + + if api_json_request POST "/api/v1/datasets/${DATASET_ID}/documents/${DOCUMENT_ID}/chunks" \ + "${CHUNK_REQUEST}" "${CHUNK_RESPONSE}" \ + && response_code_ok "${CHUNK_RESPONSE}"; then + CHUNK_STEP_STATUS="pass" + CHUNK_ID="$(extract_id "${CHUNK_RESPONSE}")" + else + CHUNK_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_chunk_create_failed" + FAILURE_REASON="RAGFlow chunk creation did not return a successful response." + return 0 + fi + + jq -n \ + --arg question "Which RAGFlow smoke evidence token maps to ragflow-smoke-anchor?" \ + --arg dataset_id "${DATASET_ID}" \ + --arg document_id "${DOCUMENT_ID}" \ + '{ + question: $question, + dataset_ids: [$dataset_id], + document_ids: [$document_id], + page: 1, + page_size: 5, + similarity_threshold: 0.0, + vector_similarity_weight: 0.0, + top_k: 5, + keyword: true, + highlight: false + }' >"${RETRIEVAL_REQUEST}" + + if api_json_request POST "/api/v1/retrieval" "${RETRIEVAL_REQUEST}" "${RETRIEVAL_RESPONSE}" \ + && response_code_ok "${RETRIEVAL_RESPONSE}"; then + RETRIEVAL_STEP_STATUS="pass" + else + RETRIEVAL_STEP_STATUS="incomplete" + RUN_STATUS="incomplete" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_retrieval_failed" + FAILURE_REASON="RAGFlow retrieval did not return a successful response." + return 0 + fi + + jq \ + --arg evidence_id "${EVIDENCE_ID}" \ + --arg token "${EVIDENCE_TOKEN}" \ + --arg document_name "${DOCUMENT_NAME}" ' + def chunk_array: + if (.data.chunks? | type) == "array" then .data.chunks + elif (.reference.chunks? | type) == "array" then .reference.chunks + else [] end; + chunk_array + | map({ + chunk_id: (.id // .chunk_id // ""), + content: (.content // .content_with_weight // ""), + document_id: (.document_id // .doc_id // ""), + document_name: (.document_name // .document_keyword // .doc_name // .docnm_kwd // ""), + dataset_id: (.dataset_id // .kb_id // ""), + positions: (.positions // []), + similarity: (.similarity // null), + vector_similarity: (.vector_similarity // null), + term_similarity: (.term_similarity // null), + evidence_ids: ( + if (((.content // .content_with_weight // "") | contains($token)) + or ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name)) + then [$evidence_id] + else [] + end + ), + mapping_status: ( + if ((.content // .content_with_weight // "") | contains($token)) then "matched_content" + elif ((.document_name // .document_keyword // .doc_name // .docnm_kwd // "") == $document_name) then "matched_document" + else "unmatched" + end + ) + })' "${RETRIEVAL_RESPONSE}" >"${REFERENCE_MAPPING}" + + RUN_STATUS="pass" + EVIDENCE_CLASS="live_real_world" + + if jq -e --arg evidence_id "${EVIDENCE_ID}" ' + length > 0 and any(.[]; (.evidence_ids // []) | index($evidence_id)) + ' "${REFERENCE_MAPPING}" >/dev/null; then + RESULT_STATUS="pass" + OVERALL_STATUS="pass" + FAILURE_CLASS="" + FAILURE_REASON="" + else + RESULT_STATUS="wrong_result" + OVERALL_STATUS="wrong_result" + FAILURE_CLASS="ragflow_reference_mapping_missing" + FAILURE_REASON="RAGFlow retrieval returned chunks but none mapped to the generated evidence id." + fi +} + +cleanup_stack() { + local repo_dir="${WORK_DIR}/ragflow" + + if [[ "${STARTED}" != "true" || "${CLEANUP}" != "1" || ! -d "${repo_dir}/docker" ]]; then + return 0 + fi + + ( + cd "${repo_dir}/docker" + docker compose -p "${COMPOSE_PROJECT}" -f docker-compose.yml down -v + ) >"${COMPOSE_DOWN_LOG}" 2>&1 || true +} + +write_scored_benchmark() { + if [[ -s "${REPORT_JSON}" ]]; then + jq 'def count($key): (.summary[$key] // 0); + def scored_status: + if count("wrong_result") > 0 then "wrong_result" + elif count("lifecycle_fail") > 0 then "lifecycle_fail" + elif count("incomplete") > 0 then "incomplete" + elif count("blocked") > 0 then "blocked" + elif count("not_encoded") > 0 then "not_encoded" + elif count("pass") > 0 then "pass" + else "not_encoded" + end; + { + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: scored_status, + counts: { + pass: count("pass"), + wrong_result: count("wrong_result"), + lifecycle_fail: count("lifecycle_fail"), + incomplete: count("incomplete"), + blocked: count("blocked"), + not_encoded: count("not_encoded") + }, + job_count: (.summary.job_count // 0), + mean_score: (.summary.mean_score // null), + evidence_coverage: (.summary.evidence_coverage // null) + }' "${REPORT_JSON}" >"${SCORED_BENCHMARK}" + else + jq -n '{ + schema: "elf.scored_benchmark_status/v1", + source: "real_world_job_benchmark", + status: "pending", + reason: "The smoke materialization was written before benchmark scoring completed." + }' >"${SCORED_BENCHMARK}" + fi +} + +write_artifact() { + local generated_at out_rel manifest_rel fixture_rel report_json_rel report_md_rel docker_status git_status curl_status jq_status + generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + out_rel="$(relative_path "${OUT}")" + manifest_rel="$(relative_path "${MANIFEST_OUT}")" + fixture_rel="$(relative_path "${FIXTURE_PATH}")" + report_json_rel="$(relative_path "${REPORT_JSON}")" + report_md_rel="$(relative_path "${REPORT_MD}")" + docker_status="$(optional_command_status docker)" + git_status="$(optional_command_status git)" + curl_status="$(optional_command_status curl)" + jq_status="$(optional_command_status jq)" + + jq -n \ + --arg schema "elf.ragflow_docker_evidence_smoke/v1" \ + --arg run_id "${RUN_ID}" \ + --arg generated_at "${generated_at}" \ + --arg adapter_id "ragflow_docker_evidence_smoke" \ + --arg evidence_class "${EVIDENCE_CLASS}" \ + --arg overall_status "$(json_status "${OVERALL_STATUS}")" \ + --arg setup_status "$(json_status "${SETUP_STATUS}")" \ + --arg run_status "$(json_status "${RUN_STATUS}")" \ + --arg result_status "$(json_status "${RESULT_STATUS}")" \ + --arg failure_class "${FAILURE_CLASS}" \ + --arg failure_reason "${FAILURE_REASON}" \ + --arg out_rel "${out_rel}" \ + --arg manifest_rel "${manifest_rel}" \ + --arg fixture_rel "${fixture_rel}" \ + --arg report_json_rel "${report_json_rel}" \ + --arg report_md_rel "${report_md_rel}" \ + --arg artifact_dir "$(relative_path "${ARTIFACT_DIR}")" \ + --arg work_dir "$(relative_path "${WORK_DIR}")" \ + --arg repo_url "${RAGFLOW_REPO_URL}" \ + --arg ragflow_ref "${RAGFLOW_REF}" \ + --arg ragflow_image "${RAGFLOW_IMAGE}" \ + --arg compose_project "${COMPOSE_PROJECT}" \ + --arg cpu_gpu_mode "${CPU_GPU_MODE}" \ + --arg start_enabled "${START_RAGFLOW}" \ + --arg accept_resource_envelope "${ACCEPT_RESOURCE_ENVELOPE}" \ + --arg allow_arm "${ALLOW_ARM}" \ + --arg pull_image "${PULL_IMAGE}" \ + --arg cleanup "${CLEANUP}" \ + --arg api_base "${API_BASE}" \ + --arg api_key_provided "$([[ -n "${API_KEY}" ]] && printf true || printf false)" \ + --arg startup_time_ms "${STARTUP_TIME_MS}" \ + --arg started "${STARTED}" \ + --arg startup_attempt_count "${STARTUP_ATTEMPTS}" \ + --arg startup_interval_seconds "${STARTUP_INTERVAL_SECONDS}" \ + --arg compose_timeout_seconds "${COMPOSE_TIMEOUT_SECONDS}" \ + --arg evidence_id "${EVIDENCE_ID}" \ + --arg document_name "${DOCUMENT_NAME}" \ + --arg evidence_token "${EVIDENCE_TOKEN}" \ + --arg corpus_text "${CORPUS_TEXT}" \ + --arg dataset_id "${DATASET_ID}" \ + --arg document_id "${DOCUMENT_ID}" \ + --arg chunk_id "${CHUNK_ID}" \ + --arg vm_max_map_count "${VM_MAX_MAP_COUNT}" \ + --arg vm_max_map_count_status "${VM_MAX_MAP_COUNT_STATUS}" \ + --arg vm_max_map_count_action "${VM_MAX_MAP_COUNT_ACTION}" \ + --arg image_present "${IMAGE_PRESENT}" \ + --arg image_size_bytes "${IMAGE_SIZE_BYTES}" \ + --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \ + --arg docker_status "${docker_status}" \ + --arg git_status "${git_status}" \ + --arg curl_status "${curl_status}" \ + --arg jq_status "${jq_status}" \ + --arg dataset_step_status "$(json_status "${DATASET_STEP_STATUS}")" \ + --arg document_step_status "$(json_status "${DOCUMENT_STEP_STATUS}")" \ + --arg chunk_step_status "$(json_status "${CHUNK_STEP_STATUS}")" \ + --arg retrieval_step_status "$(json_status "${RETRIEVAL_STEP_STATUS}")" \ + --slurpfile docker_info "${DOCKER_INFO}" \ + --slurpfile image_inspect "${IMAGE_INSPECT}" \ + --slurpfile reference_mapping "${REFERENCE_MAPPING}" \ + --rawfile docker_df "${DOCKER_DF}" \ + --rawfile compose_up_log "${COMPOSE_UP_LOG}" \ + --rawfile compose_down_log "${COMPOSE_DOWN_LOG}" \ + --slurpfile dataset_response "${DATASET_RESPONSE}" \ + --slurpfile document_response "${DOCUMENT_RESPONSE}" \ + --slurpfile chunk_response "${CHUNK_RESPONSE}" \ + --slurpfile retrieval_response "${RETRIEVAL_RESPONSE}" \ + --slurpfile scored_benchmark "${SCORED_BENCHMARK}" \ + --slurpfile startup_attempts <(jq -s '.' "${STARTUP_ATTEMPTS_JSONL}") \ + '{ + schema: $schema, + run_id: $run_id, + generated_at: $generated_at, + adapter_id: $adapter_id, + evidence_class: $evidence_class, + overall_status: $overall_status, + status_source: "smoke_materialization", + scored_benchmark: $scored_benchmark[0], + no_quality_claim: true, + failure: ( + if $failure_class == "" then null + else { + class: $failure_class, + reason: $failure_reason + } + end + ), + artifacts: { + smoke: $out_rel, + external_adapter_manifest: $manifest_rel, + generated_fixture: $fixture_rel, + scored_report_json: $report_json_rel, + scored_report_markdown: $report_md_rel, + artifact_dir: $artifact_dir, + work_dir: $work_dir + }, + upstream: { + repository: $repo_url, + ref: $ragflow_ref, + quickstart: "https://ragflow.io/docs/", + http_api_reference: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + api_key_guide: "https://ragflow.io/docs/acquire_ragflow_api_key" + }, + docker_boundary: { + status: $setup_status, + official_compose_path: "ragflow/docker/docker-compose.yml", + compose_project: $compose_project, + image: $ragflow_image, + device: $cpu_gpu_mode, + start_enabled: ($start_enabled == "1"), + resource_envelope_accepted: ($accept_resource_envelope == "1"), + allow_arm: ($allow_arm == "1"), + pull_image_requested: ($pull_image == "1"), + cleanup_requested: ($cleanup == "1"), + host_global_installs_required: ($host_global_installs_required == "true"), + tooling: { + docker: $docker_status, + git: $git_status, + curl: $curl_status, + jq: $jq_status + } + }, + setup: { + status: $setup_status, + command: "cargo make smoke-ragflow-docker", + live_command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + started: ($started == "true"), + startup_time_ms: (if $startup_time_ms == "" then null else ($startup_time_ms | tonumber) end), + vm_max_map_count: { + status: $vm_max_map_count_status, + observed: (if $vm_max_map_count == "" then null else $vm_max_map_count end), + required_min: 262144, + action: $vm_max_map_count_action + }, + image: { + present: ($image_present == "true"), + size_bytes: (if $image_size_bytes == "" then null else ($image_size_bytes | tonumber) end), + official_compressed_size_note: "RAGFlow quickstart lists the stable image at about 2 GB compressed.", + official_expanded_size_note: "RAGFlow quickstart says the image expands to about 7 GB once unpacked.", + inspect: ($image_inspect[0] // []) + }, + resource_envelope: { + official_min_cpu_cores: 4, + official_min_ram_gb: 16, + official_min_disk_gb: 50, + docker_info: ($docker_info[0] // {}), + docker_system_df: $docker_df + }, + provider_boundaries: { + ragflow_api_base: $api_base, + ragflow_api_key_provided: ($api_key_provided == "true"), + operator_owned_provider_credentials_used: false, + private_corpus_used: false, + generated_public_corpus_only: true, + external_llm_quality_scoring_claimed: false + }, + retry_behavior: { + startup_poll_attempts_configured: ($startup_attempt_count | tonumber), + startup_interval_seconds: ($startup_interval_seconds | tonumber), + compose_timeout_seconds: ($compose_timeout_seconds | tonumber), + startup_attempts: ($startup_attempts[0] // []) + }, + log_excerpt: { + compose_up: ($compose_up_log | split("\n") | .[0:40]), + compose_down: ($compose_down_log | split("\n") | .[0:20]) + } + }, + corpus: { + profile: "generated_public", + evidence_id: $evidence_id, + document_name: $document_name, + evidence_token: $evidence_token, + text: $corpus_text, + dataset_id: (if $dataset_id == "" then null else $dataset_id end), + document_id: (if $document_id == "" then null else $document_id end), + chunk_id: (if $chunk_id == "" then null else $chunk_id end) + }, + run: { + status: $run_status, + steps: { + dataset_creation: { + status: $dataset_step_status, + request_artifact: "dataset-create-request.json", + response_artifact: "dataset-create-response.json", + response: ($dataset_response[0] // null) + }, + document_creation: { + status: $document_step_status, + request_artifact: "document-create-request.json", + response_artifact: "document-create-response.json", + response: ($document_response[0] // null) + }, + chunk_ingest: { + status: $chunk_step_status, + request_artifact: "chunk-create-request.json", + response_artifact: "chunk-create-response.json", + response: ($chunk_response[0] // null) + }, + retrieval_query: { + status: $retrieval_step_status, + request_artifact: "retrieval-request.json", + response_artifact: "retrieval-response.json", + response: ($retrieval_response[0] // null) + } + } + }, + result: { + status: $result_status, + evidence: "RAGFlow retrieval reference chunks are mapped to real_world_job evidence ids when content or document metadata matches the generated public corpus.", + reference_chunk_count: (($reference_mapping[0] // []) | length), + mapped_reference_chunk_count: (($reference_mapping[0] // []) | map(select((.evidence_ids // []) | length > 0)) | length) + }, + evidence_mapping: { + expected_evidence_ids: [$evidence_id], + reference_chunks: ($reference_mapping[0] // []), + field_mapping: { + "id": "chunk_id", + "document_id": "document_id", + "document_name_or_document_keyword": "document_name", + "dataset_id_or_kb_id": "dataset_id", + "content_or_content_with_weight": "content", + "positions": "positions", + "similarity": "similarity", + "vector_similarity": "vector_similarity", + "term_similarity": "term_similarity" + } + } + }' >"${OUT}" +} + +write_manifest() { + local generated_at out_rel manifest_rel retrieval_suite_status production_ops_status capability_retrieval_status capability_setup_status + generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + out_rel="$(relative_path "${OUT}")" + manifest_rel="$(relative_path "${MANIFEST_OUT}")" + retrieval_suite_status="$(json_status "${RESULT_STATUS}")" + capability_retrieval_status="$(json_status "${RESULT_STATUS}")" + capability_setup_status="$(json_status "${SETUP_STATUS}")" + production_ops_status="not_encoded" + + jq -n \ + --arg generated_at "${generated_at}" \ + --arg manifest_id "ragflow-docker-evidence-smoke-${RUN_ID}" \ + --arg out_rel "${out_rel}" \ + --arg manifest_rel "${manifest_rel}" \ + --arg evidence_class "${EVIDENCE_CLASS}" \ + --arg overall_status "$(json_status "${OVERALL_STATUS}")" \ + --arg setup_status "$(json_status "${SETUP_STATUS}")" \ + --arg run_status "$(json_status "${RUN_STATUS}")" \ + --arg result_status "$(json_status "${RESULT_STATUS}")" \ + --arg retrieval_suite_status "${retrieval_suite_status}" \ + --arg production_ops_status "${production_ops_status}" \ + --arg capability_setup_status "${capability_setup_status}" \ + --arg capability_retrieval_status "${capability_retrieval_status}" \ + --arg ragflow_image "${RAGFLOW_IMAGE}" \ + --arg cpu_gpu_mode "${CPU_GPU_MODE}" \ + --arg failure_reason "${FAILURE_REASON}" \ + --arg host_global_installs_required "${HOST_GLOBAL_INSTALLS_REQUIRED}" \ + '{ + schema: "elf.real_world_external_adapter_manifest/v1", + manifest_id: $manifest_id, + docker_isolation: { + default: true, + compose_file: "official RAGFlow docker/docker-compose.yml", + runner: "scripts/ragflow-docker-evidence-smoke.sh", + artifact_dir: "tmp/real-world-memory/ragflow-smoke", + host_global_installs_required: ($host_global_installs_required == "true"), + notes: [ + "Generated by the RAGFlow evidence-smoke script at " + $generated_at + ".", + "The smoke uses a generated public corpus and does not use private corpus or operator-owned provider credentials." + ] + }, + adapters: [ + { + adapter_id: "ragflow_docker_evidence_smoke", + project: "RAGFlow", + adapter_kind: "docker_service_evidence_smoke", + evidence_class: $evidence_class, + docker_default: true, + host_global_installs_required: ($host_global_installs_required == "true"), + overall_status: $overall_status, + setup: { + status: $setup_status, + evidence: "Official RAGFlow Docker Compose boundary and resource envelope were evaluated for the tiny evidence smoke.", + command: "cargo make smoke-ragflow-docker", + artifact: $out_rel + }, + run: { + status: $run_status, + evidence: "The smoke attempts dataset creation, empty-document corpus ingest, chunk insert, retrieval query, and reference chunk extraction.", + command: "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + artifact: $out_rel + }, + result: { + status: $result_status, + evidence: ( + if $failure_reason == "" then "Returned RAGFlow reference chunks were mapped to generated real_world_job evidence ids for the smoke only." + else $failure_reason + end + ), + artifact: $out_rel + }, + capabilities: [ + { + capability: "official_docker_service_boundary", + status: $capability_setup_status, + evidence: "The script uses the official RAGFlow Docker Compose setup and records image, disk, startup, CPU/GPU, and vm.max_map_count evidence." + }, + { + capability: "dataset_or_chunk_ingest", + status: $run_status, + evidence: "The live path creates a generated public dataset, empty document, and chunk before querying." + }, + { + capability: "retrieval_reference_mapping", + status: $capability_retrieval_status, + evidence: "The script maps returned chunk id, document id, document name, dataset id, positions, and similarity fields to benchmark evidence ids." + }, + { + capability: "quality_or_scale_claim", + status: "not_encoded", + evidence: "The smoke does not run broad RAGFlow quality scoring, scale tests, private corpora, or comparative ranking claims." + } + ], + suites: [ + { + suite_id: "retrieval", + status: $retrieval_suite_status, + evidence: "Only the generated-public RAGFlow evidence-smoke retrieval path is represented." + }, + { + suite_id: "production_ops", + status: $production_ops_status, + evidence: "Resource envelope evidence is recorded, but no production-ops suite scoring is encoded." + }, + { + suite_id: "knowledge_compilation", + status: "not_encoded", + evidence: "RAGFlow page or knowledge-compilation behavior is not part of this smoke." + } + ], + evidence: [ + { + kind: "artifact", + ref: $out_rel, + status: $result_status + }, + { + kind: "manifest", + ref: $manifest_rel, + status: $overall_status + }, + { + kind: "source", + ref: "https://ragflow.io/docs/", + status: "real" + }, + { + kind: "source", + ref: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + status: "real" + } + ], + execution_metadata: { + sources: [ + { + label: "RAGFlow quickstart", + url: "https://ragflow.io/docs/", + evidence: "Official Docker startup, resource envelope, vm.max_map_count, and provider configuration guidance." + }, + { + label: "RAGFlow HTTP API reference", + url: "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + evidence: "Official dataset, document, chunk, retrieval, and reference-chunk field contract." + } + ], + setup_path: "Run the official RAGFlow Docker Compose stack with generated public corpus only.", + runtime_boundary: "Official RAGFlow Docker Compose service boundary; no host-global RAGFlow install.", + resource_expectation: ( + "RAGFlow image " + $ragflow_image + ", CPU/GPU mode " + $cpu_gpu_mode + ", official minimums 4 CPU cores, 16 GB RAM, 50 GB disk, and vm.max_map_count >= 262144." + ), + retry_guidance: [ + "Default command records a typed blocked preflight unless resource-heavy startup is explicitly enabled.", + "Set ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 for a live Docker startup attempt.", + "Provide only a local self-hosted RAGFlow API key; do not use private corpora or operator-owned model provider credentials for this smoke." + ], + research_depth: "D2 feasibility plus XY-885 evidence-smoke implementation; generated artifact decides live evidence class." + }, + notes: [ + "This adapter record is generated by a smoke artifact and must not be generalized into broad RAGFlow quality evidence.", + "Failure before query output remains typed as blocked, incomplete, or not_encoded." + ] + } + ] + }' >"${MANIFEST_OUT}" +} + +write_fixture() { + local result_status reason + result_status="$(json_status "${RESULT_STATUS}")" + reason="${FAILURE_REASON}" + + jq -n \ + --arg run_id "${RUN_ID}" \ + --arg evidence_id "${EVIDENCE_ID}" \ + --arg evidence_token "${EVIDENCE_TOKEN}" \ + --arg corpus_text "${CORPUS_TEXT}" \ + --arg result_status "${result_status}" \ + --arg failure_reason "${reason}" \ + '{ + schema: "elf.real_world_job/v1", + job_id: "ragflow-evidence-smoke-001", + suite: "retrieval", + title: "Map RAGFlow reference chunks to generated evidence", + corpus: { + corpus_id: "ragflow-generated-public-smoke", + profile: "generated_public", + items: [ + { + evidence_id: $evidence_id, + kind: "document", + text: $corpus_text, + source_ref: { + schema: "source_ref/v1", + resolver: "ragflow_smoke/v1", + ref: { + run_id: $run_id, + evidence_token: $evidence_token + } + }, + created_at: "2026-06-10T00:00:00Z" + } + ], + adapter_response: { + adapter_id: "ragflow_docker_evidence_smoke", + answer: { + content: ( + if $result_status == "pass" then + "RAGFlow returned reference chunks that map to the generated ragflow-smoke-anchor evidence id." + else + "" + end + ), + claims: ( + if $result_status == "pass" then + [ + { + claim_id: "ragflow_reference_mapping", + text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id.", + evidence_ids: [$evidence_id], + confidence: "derived_from_ragflow_reference_chunk_mapping" + } + ] + else + [] + end + ), + evidence_ids: (if $result_status == "pass" then [$evidence_id] else [] end), + latency_ms: 0.0, + cost: { + currency: "USD", + amount: 0.0, + input_tokens: 0, + output_tokens: 0 + } + } + } + }, + timeline: [ + { + event_id: "ragflow-smoke-corpus-generated", + ts: "2026-06-10T00:00:00Z", + actor: "system", + action: "generated_public_corpus", + evidence_ids: [$evidence_id], + summary: "The RAGFlow smoke generated a tiny public corpus for reference chunk mapping." + } + ], + prompt: { + role: "user", + content: "Which RAGFlow smoke evidence token maps to the generated reference chunk?", + job_mode: "answer", + constraints: ["cite_evidence", "avoid_broad_quality_claims"] + }, + expected_answer: { + must_include: [ + { + claim_id: "ragflow_reference_mapping", + text: "RAGFlow reference chunks map to the generated ragflow-smoke-anchor evidence id." + } + ], + must_not_include: ["RAGFlow passed a broad graph/RAG quality benchmark."], + evidence_links: { + ragflow_reference_mapping: [$evidence_id] + }, + answer_type: "direct_answer", + accepted_alternates: [], + requires_caveat: true, + requires_refusal: false + }, + required_evidence: [ + { + evidence_id: $evidence_id, + claim_id: "ragflow_reference_mapping", + requirement: "cite", + quote: "ragflow-smoke-anchor evidence id" + } + ], + negative_traps: [], + scoring_rubric: { + dimensions: { + answer_correctness: { + weight: 0.3, + max_points: 1.0, + criteria: "States the generated evidence mapping without broad quality claims." + }, + evidence_grounding: { + weight: 0.45, + max_points: 1.0, + criteria: "Maps returned RAGFlow reference chunks to the generated evidence id." + }, + trap_avoidance: { + weight: 0.15, + max_points: 1.0, + criteria: "Does not claim broad RAGFlow quality from the tiny smoke." + }, + latency_resource: { + weight: 0.1, + max_points: 1.0, + criteria: "Records setup, resource, provider, and reference-mapping boundaries." + } + }, + pass_threshold: 0.75, + hard_fail_rules: [] + }, + allowed_uncertainty: { + can_answer_unknown: false, + acceptable_phrases: ["tiny generated corpus", "reference chunk smoke only"], + fallback_action: "state_blocker" + }, + operator_debug: null, + encoding: {}, + memory_evolution: null, + tags: ["external_adapter", "generated_public", "ragflow", "no_live_claim"] + } + | if ["blocked", "incomplete", "not_encoded"] | index($result_status) then + .encoding = {status: $result_status, reason: $failure_reason} + else + . + end' >"${FIXTURE_PATH}" +} + +write_scored_report() { + ( + cd "${ROOT_DIR}" + cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${FIXTURE_PATH}" \ + --out "${REPORT_JSON}" \ + --run-id real-world-memory-live-ragflow \ + --adapter-id ragflow_docker_evidence_smoke \ + --adapter-name "RAGFlow Docker evidence smoke adapter" \ + --adapter-behavior docker_service_evidence_smoke \ + --adapter-storage-status "$(json_status "${SETUP_STATUS}")" \ + --adapter-runtime-status "$(json_status "${OVERALL_STATUS}")" \ + --adapter-notes "Generated by the RAGFlow Docker evidence smoke; pass or wrong_result requires reference chunks mapped to generated evidence ids, while resource/setup/API-key limits remain typed." \ + --external-adapter-manifest "${MANIFEST_OUT}" + cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_JSON}" \ + --out "${REPORT_MD}" + ) +} + +write_summary() { + jq -n \ + --slurpfile materialization "${OUT}" \ + --slurpfile manifest "${MANIFEST_OUT}" \ + --slurpfile report "${REPORT_JSON}" \ + '{ + schema: "elf.ragflow_docker_smoke_summary/v1", + generated_at: (now | todateiso8601), + adapter_id: "ragflow_docker_evidence_smoke", + evidence_class: $materialization[0].evidence_class, + status_boundary: { + materialization: "setup/run/evidence-mapping state emitted by the smoke runner", + manifest: "external adapter declaration consumed by the scorer", + scored_benchmark: "post-score real_world_job outcome; use this for quality status" + }, + scored_benchmark: $materialization[0].scored_benchmark, + materialization: $materialization[0], + manifest: { + json: ($materialization[0].artifacts.external_adapter_manifest // "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json"), + status_source: "external_adapter_manifest_pre_score", + summary: $manifest[0].adapters[0].overall_status, + suites: $manifest[0].adapters[0].suites + }, + report: { + json: ($materialization[0].artifacts.scored_report_json // "tmp/real-world-memory/ragflow-smoke/ragflow-report.json"), + markdown: ($materialization[0].artifacts.scored_report_markdown // "tmp/real-world-memory/ragflow-smoke/ragflow-report.md"), + summary: $report[0].summary, + suites: $report[0].suites + } + }' >"${SUMMARY_OUT}" +} + +write_outputs() { + write_scored_benchmark + write_artifact + write_manifest + write_fixture + write_scored_report + write_scored_benchmark + write_artifact + write_summary + echo "RAGFlow smoke artifact: ${OUT}" + echo "RAGFlow smoke manifest: ${MANIFEST_OUT}" + echo "RAGFlow smoke report: ${REPORT_JSON}" + echo "RAGFlow smoke summary: ${SUMMARY_OUT}" +} + +for cmd in jq curl; do + required_command "${cmd}" +done + +if ! command -v docker >/dev/null 2>&1; then + jq -n '{error: "docker_missing"}' >"${DOCKER_INFO}" + SETUP_STATUS="incomplete" + OVERALL_STATUS="incomplete" + RESULT_STATUS="incomplete" + FAILURE_CLASS="docker_cli_missing" + FAILURE_REASON="Docker CLI is required for the RAGFlow evidence smoke." + write_outputs + exit 0 +fi + +if ! capture_docker_info; then + SETUP_STATUS="incomplete" + OVERALL_STATUS="incomplete" + RESULT_STATUS="incomplete" + FAILURE_CLASS="docker_unavailable" + FAILURE_REASON="Docker is installed but docker info failed; RAGFlow Docker setup was not attempted." + write_outputs + exit 0 +fi + +capture_disk_info +capture_vm_max_map_count +capture_image_info + +ARCH="$(uname -m)" +if [[ "${ARCH}" != "x86_64" && "${ARCH}" != "amd64" && "${ALLOW_ARM}" != "1" ]]; then + SETUP_STATUS="blocked" + OVERALL_STATUS="blocked" + RESULT_STATUS="blocked" + FAILURE_CLASS="unsupported_ragflow_docker_architecture" + FAILURE_REASON="Official RAGFlow quickstart supports x86 CPU and Nvidia GPU Docker images; set ELF_RAGFLOW_SMOKE_ALLOW_ARM=1 only for an explicitly built ARM image path." + write_outputs + exit 0 +fi + +if [[ "${START_RAGFLOW}" != "1" ]]; then + write_outputs + exit 0 +fi + +if [[ "${ACCEPT_RESOURCE_ENVELOPE}" != "1" ]]; then + write_outputs + exit 0 +fi + +if ! command -v git >/dev/null 2>&1; then + SETUP_STATUS="incomplete" + OVERALL_STATUS="incomplete" + RESULT_STATUS="incomplete" + FAILURE_CLASS="git_missing_for_ragflow_source" + FAILURE_REASON="git is required to fetch the official RAGFlow Docker Compose files for this smoke." + write_outputs + exit 0 +fi + +RAGFLOW_REPO_DIR="" +if RAGFLOW_REPO_DIR="$(prepare_official_ragflow_repo)"; then + start_ragflow_stack "${RAGFLOW_REPO_DIR}" +else + SETUP_STATUS="incomplete" + OVERALL_STATUS="incomplete" + RESULT_STATUS="incomplete" + FAILURE_CLASS="ragflow_source_checkout_failed" + FAILURE_REASON="Failed to fetch the official RAGFlow Docker Compose source." +fi + +if [[ "${SETUP_STATUS}" == "pass" ]]; then + if wait_for_ragflow_api; then + if [[ -z "${API_KEY}" ]]; then + RUN_STATUS="blocked" + RESULT_STATUS="blocked" + OVERALL_STATUS="blocked" + FAILURE_CLASS="ragflow_api_key_required" + FAILURE_REASON="RAGFlow HTTP APIs require a local self-host API key; no private or operator-owned provider credentials were used." + else + run_api_smoke + fi + else + SETUP_STATUS="incomplete" + RUN_STATUS="not_encoded" + RESULT_STATUS="incomplete" + OVERALL_STATUS="incomplete" + FAILURE_CLASS="ragflow_api_startup_timeout" + FAILURE_REASON="RAGFlow Docker services started but the HTTP API did not become healthy within the configured retry window." + fi +fi + +cleanup_stack +write_outputs diff --git a/scripts/ranking-stability-harness.sh b/scripts/ranking-stability-harness.sh new file mode 100755 index 00000000..fefb1a0d --- /dev/null +++ b/scripts/ranking-stability-harness.sh @@ -0,0 +1,454 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +if [[ -f "${ROOT_DIR}/.env" ]]; then + set -a + # shellcheck disable=SC1090 + source "${ROOT_DIR}/.env" + set +a +fi + +: "${ELF_PG_DSN:?Set ELF_PG_DSN to a Postgres DSN (usually .../postgres).}" +: "${ELF_QDRANT_HTTP_URL:?Set ELF_QDRANT_HTTP_URL to the Qdrant REST base URL, for example http://127.0.0.1:51889 (default: http://127.0.0.1:6333).}" + +QDRANT_GRPC_URL="${ELF_QDRANT_GRPC_URL:-${ELF_QDRANT_URL:-}}" +if [[ -z "${QDRANT_GRPC_URL}" ]]; then + echo "Set ELF_QDRANT_GRPC_URL to the Qdrant gRPC base URL, for example http://127.0.0.1:51890 (default: http://127.0.0.1:6334). Legacy alias ELF_QDRANT_URL is deprecated but still supported." + exit 1 +fi + +if command -v jaq >/dev/null 2>&1; then + JSON_TOOL="jaq" +elif command -v jq >/dev/null 2>&1; then + JSON_TOOL="jq" +else + echo "Missing jaq/jq. Install jaq (recommended) or jq." >&2 + exit 1 +fi + +for cmd in curl psql taplo; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd}." >&2 + exit 1 + fi +done + +RUN_ID="${ELF_HARNESS_RUN_ID:-"$(date +%s)-$$"}" + +DB_NAME="${ELF_HARNESS_DB_NAME:-elf_stability}" +QDRANT_COLLECTION="${ELF_HARNESS_COLLECTION:-elf_stability_${RUN_ID}}" +VECTOR_DIM="${ELF_HARNESS_VECTOR_DIM:-4096}" + +NOISE_STD="${ELF_HARNESS_NOISE_STD:-0.08}" +RUNS_PER_QUERY="${ELF_HARNESS_RUNS_PER_QUERY:-8}" +TOP_K="${ELF_HARNESS_TOP_K:-10}" +CANDIDATE_K="${ELF_HARNESS_CANDIDATE_K:-60}" +TARGET_TOP_K="${ELF_HARNESS_TARGET_TOP_K:-10}" + +if [[ "${DB_NAME}" != elf_* ]]; then + echo "ELF_HARNESS_DB_NAME must start with elf_ to avoid deleting real data." >&2 + exit 1 +fi +if [[ "${QDRANT_COLLECTION}" != elf_* ]]; then + echo "ELF_HARNESS_COLLECTION must start with elf_ to avoid deleting real data." >&2 + exit 1 +fi + +HTTP_BIND="${ELF_HARNESS_HTTP_BIND:-127.0.0.1:18189}" +ADMIN_BIND="${ELF_HARNESS_ADMIN_BIND:-127.0.0.1:18190}" +MCP_BIND="${ELF_HARNESS_MCP_BIND:-127.0.0.1:18191}" +HTTP_BASE="http://${HTTP_BIND}" + +PG_DSN_BASE="${ELF_PG_DSN%/*}" +PG_DSN="${PG_DSN_BASE}/${DB_NAME}" + +CFG_BASE="${ROOT_DIR}/tmp/elf.stability.base.toml" +CFG_DET="${ROOT_DIR}/tmp/elf.stability.det.toml" +DATASET="${ROOT_DIR}/tmp/elf.stability.dataset.json" +OUT_JSON="${ROOT_DIR}/tmp/elf.stability.out.json" +WORKER_LOG="${ROOT_DIR}/tmp/elf.stability.worker.log" +API_LOG="${ROOT_DIR}/tmp/elf.stability.api.log" + +WORKER_PID="" +API_PID="" + +cleanup() { + set +e + + if [[ -n "${API_PID}" ]] && kill -0 "${API_PID}" >/dev/null 2>&1; then + kill "${API_PID}" >/dev/null 2>&1 || true + fi + if [[ -n "${WORKER_PID}" ]] && kill -0 "${WORKER_PID}" >/dev/null 2>&1; then + kill "${WORKER_PID}" >/dev/null 2>&1 || true + fi + wait >/dev/null 2>&1 || true + + if [[ "${ELF_HARNESS_KEEP_COLLECTION:-0}" != "1" ]]; then + curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true + fi + + if [[ "${ELF_HARNESS_KEEP_DB:-0}" != "1" ]]; then + psql "${ELF_PG_DSN}" -tAc \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${DB_NAME}' AND pid <> pg_backend_pid();" \ + >/dev/null 2>&1 || true + psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null 2>&1 || true + fi +} + +trap cleanup EXIT + +echo "Recreating database ${DB_NAME}." +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "DROP DATABASE IF EXISTS ${DB_NAME};" >/dev/null +psql "${ELF_PG_DSN}" -v ON_ERROR_STOP=1 -c "CREATE DATABASE ${DB_NAME};" >/dev/null + +echo "Recreating Qdrant collection ${QDRANT_COLLECTION}." +curl -sS -X DELETE "${ELF_QDRANT_HTTP_URL}/collections/${QDRANT_COLLECTION}?wait=true" >/dev/null || true +(cd "${ROOT_DIR}" && ELF_QDRANT_COLLECTION="${QDRANT_COLLECTION}" ELF_QDRANT_VECTOR_DIM="${VECTOR_DIM}" ./qdrant/init.sh >/dev/null) + +VECTOR_DIM_TOML="$(echo "${VECTOR_DIM}" | perl -pe '1 while s/^([0-9]+)([0-9]{3})/$1_$2/')" + +cat >"${CFG_BASE}" <<TOML +[service] +admin_bind = "${ADMIN_BIND}" +http_bind = "${HTTP_BIND}" +log_level = "info" +mcp_bind = "${MCP_BIND}" + +[storage.postgres] +dsn = "${PG_DSN}" +pool_max_conns = 10 + +[storage.qdrant] +collection = "${QDRANT_COLLECTION}" +docs_collection = "${QDRANT_COLLECTION}_docs" +url = "${QDRANT_GRPC_URL}" +vector_dim = ${VECTOR_DIM_TOML} + +[providers.embedding] +api_base = "http://127.0.0.1" +api_key = "local" +dimensions = ${VECTOR_DIM_TOML} +model = "local-hash" +path = "/embeddings" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.rerank] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-token-overlap-noisy@${NOISE_STD}" +path = "/rerank" +provider_id = "local" +timeout_ms = 1_000 + +default_headers = {} + +[providers.llm_extractor] +api_base = "http://127.0.0.1" +api_key = "local" +model = "local-disabled" +path = "/chat/completions" +provider_id = "local" +temperature = 0.0 +timeout_ms = 1_000 + +default_headers = {} + +[scopes] +allowed = ["agent_private", "org_shared", "project_shared"] + +[scopes.read_profiles] +all_scopes = ["agent_private", "org_shared", "project_shared"] +private_only = ["agent_private"] +private_plus_project = ["agent_private", "project_shared"] + +[scopes.precedence] +agent_private = 30 +org_shared = 10 +project_shared = 20 + +[scopes.write_allowed] +agent_private = true +org_shared = true +project_shared = true + +[memory] +candidate_k = ${CANDIDATE_K} +dup_sim_threshold = 0.92 +max_note_chars = 240 +max_notes_per_add_event = 3 +top_k = ${TOP_K} +update_sim_threshold = 0.85 + +[memory.policy] + +[[memory.policy.rules]] +min_confidence = 0.0 +min_importance = 0.0 + +[chunking] +enabled = true +max_tokens = 512 +overlap_tokens = 128 +tokenizer_repo = "gpt2" + +[search.expansion] +include_original = true +max_queries = 4 +mode = "off" + +[search.dynamic] +min_candidates = 10 +min_top_score = 0.12 + +[search.prefilter] +max_candidates = 0 + +[search.cache] +enabled = false +expansion_ttl_days = 7 +rerank_ttl_days = 7 + +[search.explain] +retention_days = 2 +capture_candidates = false +candidate_retention_days = 2 +write_mode = "outbox" + +[search.recursive] +enabled = false +max_children_per_node = 4 +max_depth = 2 +max_nodes_per_scope = 32 +max_total_nodes = 256 + +[search.graph_context] +enabled = false +max_evidence_notes_per_fact = 16 +max_facts_per_item = 16 + +[ranking] +recency_tau_days = 0 +tie_breaker_weight = 0.0 + +[ranking.deterministic] +enabled = false + +[ranking.deterministic.lexical] +enabled = false +max_query_terms = 16 +max_text_terms = 1024 +min_ratio = 0.3 +weight = 0.0 + +[ranking.deterministic.hits] +enabled = false +weight = 0.0 +half_saturation = 8.0 +last_hit_tau_days = 14.0 + +[ranking.deterministic.decay] +enabled = false +tau_days = 30.0 +weight = 0.0 + +[ranking.blend] +enabled = true +rerank_normalization = "rank" +retrieval_normalization = "rank" + +[[ranking.blend.segments]] +max_retrieval_rank = 3 +retrieval_weight = 0.8 + +[[ranking.blend.segments]] +max_retrieval_rank = 10 +retrieval_weight = 0.5 + +[[ranking.blend.segments]] +max_retrieval_rank = 1_000_000 +retrieval_weight = 0.2 + +[ranking.diversity] +enabled = true +max_skips = 64 +mmr_lambda = 0.7 +sim_threshold = 0.88 + +[ranking.retrieval_sources] +fusion_priority = 1 +fusion_weight = 1.0 +structured_field_priority = 0 +structured_field_weight = 1.0 + +[lifecycle.ttl_days] +constraint = 0 +decision = 0 +fact = 180 +plan = 14 +preference = 0 +profile = 0 + +[lifecycle] +purge_deleted_after_days = 30 +purge_deprecated_after_days = 180 + +[security] +auth_mode = "off" +auth_keys = [] +bind_localhost_only = true +evidence_max_quote_chars = 320 +evidence_max_quotes = 2 +evidence_min_quotes = 1 +redact_secrets_on_write = true +reject_non_english = true +TOML + +cp "${CFG_BASE}" "${CFG_DET}" +perl -0777 -i -pe 'BEGIN { $c = 0 } $c += s/\[ranking\.deterministic\]\nenabled\s*=\s*false/[ranking.deterministic]\nenabled = true/s; END { exit($c ? 0 : 1) }' "${CFG_DET}" +perl -0777 -i -pe 'BEGIN { $c = 0 } $c += s/\[ranking\.deterministic\.hits\]\nenabled\s*=\s*false\nweight\s*=\s*0\.0\nhalf_saturation\s*=\s*8\.0\nlast_hit_tau_days\s*=\s*14\.0/[ranking.deterministic.hits]\nenabled = true\nweight = 1.25\nhalf_saturation = 1.0\nlast_hit_tau_days = 30.0/s; END { exit($c ? 0 : 1) }' "${CFG_DET}" + +taplo fmt "${CFG_BASE}" "${CFG_DET}" >/dev/null 2>&1 + +echo "Building harness binaries." +(cd "${ROOT_DIR}" && cargo build -p elf-worker -p elf-api -p elf-eval >/dev/null) + +echo "Starting worker and API (logs: ${WORKER_LOG}, ${API_LOG})." +(cd "${ROOT_DIR}" && cargo run -p elf-worker -- --config "${CFG_BASE}" >"${WORKER_LOG}" 2>&1) & +WORKER_PID="$!" +(cd "${ROOT_DIR}" && cargo run -p elf-api -- --config "${CFG_BASE}" >"${API_LOG}" 2>&1) & +API_PID="$!" + +echo "Waiting for API health check at ${HTTP_BASE}/health." +for _ in $(seq 1 120); do + status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" + if [[ "${status}" == "200" ]]; then + break + fi + sleep 0.5 +done + +status="$(curl -s -o /dev/null -w '%{http_code}' "${HTTP_BASE}/health" 2>/dev/null || true)" +if [[ "${status}" != "200" ]]; then + echo "API did not become healthy in time. Check logs: ${API_LOG}." >&2 + exit 1 +fi + +TENANT_ID="stability-tenant-${RUN_ID}" +PROJECT_ID="stability-project-${RUN_ID}" +AGENT_ID="stability-agent-${RUN_ID}" + +NOTE_PAYLOAD="$( + "${JSON_TOOL}" -n --arg run "ranking-stability-harness" --arg scope "agent_private" --arg query "deployment steps" --argjson count "${CANDIDATE_K}" '{ + scope: $scope, + notes: [range(1; $count + 1) as $i | { + type: "fact", + key: ("stability_" + ($i|tostring)), + text: ("Deployment steps for service. " + $query + ". Candidate " + ($i|tostring) + "."), + importance: 0.2, + confidence: 0.9, + ttl_days: 180, + source_ref: {run: $run} + }] + }' +)" + +echo "Ingesting ${CANDIDATE_K} notes." +NOTE_IDS_RAW="$( + curl -sS "${HTTP_BASE}/v2/notes/ingest" \ + -H 'content-type: application/json' \ + -H "X-ELF-Tenant-Id: ${TENANT_ID}" \ + -H "X-ELF-Project-Id: ${PROJECT_ID}" \ + -H "X-ELF-Agent-Id: ${AGENT_ID}" \ + -d "${NOTE_PAYLOAD}" | "${JSON_TOOL}" -r '.results[].note_id' +)" +mapfile -t NOTE_IDS <<<"${NOTE_IDS_RAW}" + +if [[ "${#NOTE_IDS[@]}" -lt 10 ]]; then + echo "Add-note failed. Check logs: ${API_LOG}." >&2 + exit 1 +fi + +wait_for_outbox_done() { + local note_id="$1" + for _ in $(seq 1 120); do + status="$( + psql "${PG_DSN}" -tAc \ + "SELECT status FROM indexing_outbox WHERE note_id = '${note_id}' ORDER BY created_at DESC LIMIT 1;" \ + | tr -d '[:space:]' + )" + if [[ "${status}" == "DONE" ]]; then + return 0 + fi + sleep 0.5 + done + return 1 +} + +echo "Waiting for indexing jobs to finish." +for id in "${NOTE_IDS[@]}"; do + if ! wait_for_outbox_done "${id}"; then + echo "Timed out waiting for note to index. Check logs: ${WORKER_LOG}." >&2 + exit 1 + fi +done + +TARGET_IDS=("${NOTE_IDS[@]:0:${TARGET_TOP_K}}") + +echo "Boosting hit_count for the first ${TARGET_TOP_K} notes to create a stable target set." +TARGET_LIST="$( + printf "%s\n" "${TARGET_IDS[@]}" | "${JSON_TOOL}" -R -s -c 'split("\n")[:-1]' +)" +TARGET_ARRAY_SQL="{" +for id in "${TARGET_IDS[@]}"; do + TARGET_ARRAY_SQL+="${id}," +done +TARGET_ARRAY_SQL="${TARGET_ARRAY_SQL%,}}" +psql "${PG_DSN}" -v ON_ERROR_STOP=1 -c \ + "UPDATE memory_notes SET hit_count = 100, last_hit_at = now() WHERE note_id = ANY ('${TARGET_ARRAY_SQL}'::uuid[]);" \ + >/dev/null + +cat >"${DATASET}" <<JSON +{ + "name": "ranking-stability-harness", + "defaults": { + "tenant_id": "${TENANT_ID}", + "project_id": "${PROJECT_ID}", + "agent_id": "${AGENT_ID}", + "read_profile": "all_scopes", + "top_k": ${TOP_K}, + "candidate_k": ${CANDIDATE_K} + }, + "queries": [ + { + "id": "q-1", + "query": "deployment steps", + "expected_note_ids": ${TARGET_LIST} + } + ] +} +JSON + +echo "Running eval compare (runs_per_query=${RUNS_PER_QUERY})." +(cd "${ROOT_DIR}" && cargo run -q -p elf-eval -- --config-a "${CFG_BASE}" --config-b "${CFG_DET}" --dataset "${DATASET}" --runs-per-query "${RUNS_PER_QUERY}") \ + | awk 'BEGIN { started = 0 } /^\{/ { started = 1 } { if (started) print }' \ + >"${OUT_JSON}" + +SET_CHURN_A="$("${JSON_TOOL}" -r '.summary_a.stability.avg_set_churn_at_k' "${OUT_JSON}")" +SET_CHURN_B="$("${JSON_TOOL}" -r '.summary_b.stability.avg_set_churn_at_k' "${OUT_JSON}")" +POS_CHURN_A="$("${JSON_TOOL}" -r '.summary_a.stability.avg_positional_churn_at_k' "${OUT_JSON}")" +POS_CHURN_B="$("${JSON_TOOL}" -r '.summary_b.stability.avg_positional_churn_at_k' "${OUT_JSON}")" + +echo "Results (lower churn is better):" +echo "A (deterministic off) set_churn@k=${SET_CHURN_A} positional_churn@k=${POS_CHURN_A}" +echo "B (deterministic on) set_churn@k=${SET_CHURN_B} positional_churn@k=${POS_CHURN_B}" +echo "Output: ${OUT_JSON}" + +awk -v a="${SET_CHURN_A}" -v b="${SET_CHURN_B}" 'BEGIN { exit !(b <= a + 1e-9) }' || { + echo "Expected deterministic ranking to reduce churn, but set churn did not improve." >&2 + exit 1 +} diff --git a/scripts/real-world-consolidation-live-adapter.sh b/scripts/real-world-consolidation-live-adapter.sh new file mode 100755 index 00000000..5d506134 --- /dev/null +++ b/scripts/real-world-consolidation-live-adapter.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_CONSOLIDATION_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-consolidation}" +FIXTURE_DIR="${ELF_CONSOLIDATION_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory/consolidation}" + +if [[ ! -f "/.dockerenv" && "${ELF_CONSOLIDATION_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live consolidation adapter outside Docker. Use cargo make real-world-memory-live-consolidation." >&2 + exit 1 +fi + +for cmd in bash cargo jq; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live consolidation runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" +rm -rf "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-memory-live-consolidation \ + --adapter-id elf_live_real_world \ + --adapter-name "ELF live consolidation service adapter" \ + --adapter-behavior live_real_world_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService consolidation_run_create, worker proposal materialization, and apply/defer/discard review audit transitions; source notes remain immutable derived-output evidence." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/elf-report.json" \ + --out "${REPORT_DIR}/elf-report.md" + +jq -n \ + --slurpfile materialization "${REPORT_DIR}/elf-materialization.json" \ + --slurpfile report "${REPORT_DIR}/elf-report.json" \ + '{ + schema: "elf.real_world_consolidation_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + fixture_dir: (env.ELF_CONSOLIDATION_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory/consolidation"), + artifact_dir: (env.ELF_CONSOLIDATION_LIVE_REPORT_DIR // "tmp/real-world-memory/live-consolidation"), + adapter: { + adapter_id: "elf_live_real_world", + evidence_class: "live_real_world", + materialization: $materialization[0], + report: { + json: "tmp/real-world-memory/live-consolidation/elf-report.json", + markdown: "tmp/real-world-memory/live-consolidation/elf-report.md", + summary: $report[0].summary, + suites: $report[0].suites + } + } + }' >"${REPORT_DIR}/summary.json" diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh new file mode 100755 index 00000000..a6413839 --- /dev/null +++ b/scripts/real-world-docker.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +set -euo pipefail + +profile="${1:-}" +if [ -z "$profile" ]; then + echo "usage: scripts/real-world-docker.sh <profile>" >&2 + exit 2 +fi + +case "$profile" in +job-operator-ux-live-adapters) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR \ + -e ELF_OPERATOR_DEBUG_LIVE_FIXTURES \ + -e ELF_OPERATOR_DEBUG_LIVE_WORK_DIR \ + -e ELF_OPERATOR_DEBUG_QMD_DIR \ + baseline-runner bash scripts/real-world-operator-debug-live-adapters.sh + ;; +memory-live-consolidation) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_CONSOLIDATION_LIVE_REPORT_DIR \ + -e ELF_CONSOLIDATION_LIVE_FIXTURES \ + baseline-runner bash scripts/real-world-consolidation-live-adapter.sh + ;; +memory-live-adapters) + lightrag_start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)" + graphiti_start="$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)" + status=0 + if [ "$lightrag_start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag + fi + if [ "$graphiti_start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb + fi + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW \ + -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG \ + -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG \ + -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP \ + -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY \ + -e ELF_RAGFLOW_SMOKE_START \ + -e ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE \ + -e ELF_RAGFLOW_SMOKE_ALLOW_ARM \ + -e ELF_RAGFLOW_SMOKE_PULL_IMAGE \ + -e ELF_RAGFLOW_SMOKE_CLEANUP \ + -e ELF_RAGFLOW_SMOKE_DEVICE \ + -e ELF_RAGFLOW_API_PORT \ + -e ELF_RAGFLOW_API_BASE \ + -e ELF_RAGFLOW_API_KEY \ + -e RAGFLOW_API_KEY \ + -e ELF_RAGFLOW_SMOKE_STARTUP_ATTEMPTS \ + -e ELF_RAGFLOW_SMOKE_STARTUP_INTERVAL_SECONDS \ + -e ELF_RAGFLOW_SMOKE_COMPOSE_TIMEOUT_SECONDS \ + -e ELF_RAGFLOW_REPO_URL \ + -e ELF_RAGFLOW_REF \ + -e ELF_RAGFLOW_IMAGE \ + -e ELF_RAGFLOW_COMPOSE_PROJECT \ + -e ELF_LIGHTRAG_CONTEXT_START \ + -e ELF_LIGHTRAG_API_BASE \ + -e ELF_LIGHTRAG_ADAPTER_ID \ + -e ELF_LIGHTRAG_ADAPTER_NAME \ + -e ELF_LIGHTRAG_STARTUP_ATTEMPTS \ + -e ELF_LIGHTRAG_STARTUP_INTERVAL_SECONDS \ + -e ELF_LIGHTRAG_INDEX_ATTEMPTS \ + -e ELF_LIGHTRAG_INDEX_INTERVAL_SECONDS \ + -e ELF_GRAPHRAG_SMOKE_RUN \ + -e ELF_GRAPHRAG_SMOKE_WORK_DIR \ + -e ELF_GRAPHRAG_SMOKE_INSTALL \ + -e ELF_GRAPHRAG_VERSION \ + -e ELF_GRAPHRAG_PACKAGE \ + -e ELF_GRAPHRAG_REF \ + -e ELF_GRAPHRAG_CHAT_MODEL \ + -e ELF_GRAPHRAG_EMBEDDING_MODEL \ + -e ELF_GRAPHRAG_API_BASE \ + -e ELF_GRAPHRAG_API_KEY \ + -e ELF_GRAPHRAG_INDEX_METHOD \ + -e ELF_GRAPHRAG_QUERY_METHOD \ + -e ELF_GRAPHRAG_TIMEOUT_SECONDS \ + -e ELF_GRAPHRAG_MAX_DOCS \ + -e ELF_GRAPHRAG_MAX_INPUT_CHARS \ + -e ELF_GRAPHITI_ZEP_SMOKE_START \ + -e ELF_GRAPHITI_ZEP_SMOKE_RUN \ + -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR \ + -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL \ + -e ELF_GRAPHITI_ZEP_VERSION \ + -e ELF_GRAPHITI_ZEP_PACKAGE \ + -e ELF_GRAPHITI_ZEP_REF \ + -e ELF_GRAPHITI_ZEP_API_BASE \ + -e ELF_GRAPHITI_ZEP_API_KEY \ + -e ELF_GRAPHITI_ZEP_LLM_MODEL \ + -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL \ + -e ELF_GRAPHITI_ZEP_FALKORDB_HOST \ + -e ELF_GRAPHITI_ZEP_FALKORDB_PORT \ + -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE \ + -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS \ + -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS \ + -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS \ + -e ELF_GRAPHIFY_SMOKE_RUN \ + -e ELF_GRAPHIFY_SMOKE_WORK_DIR \ + -e ELF_GRAPHIFY_SMOKE_INSTALL \ + -e ELF_GRAPHIFY_PACKAGE \ + -e ELF_GRAPHIFY_REF \ + -e ELF_GRAPHIFY_TIMEOUT_SECONDS \ + -e ELF_GRAPHIFY_QUERY_BUDGET \ + baseline-runner bash scripts/real-world-live-adapters.sh || status=$? + if [ "$lightrag_start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true + fi + if [ "$graphiti_start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true + fi + exit "$status" + ;; +*) + echo "unknown real-world Docker profile: $profile" >&2 + exit 2 + ;; +esac diff --git a/scripts/real-world-live-adapters.sh b/scripts/real-world-live-adapters.sh new file mode 100755 index 00000000..398cae08 --- /dev/null +++ b/scripts/real-world-live-adapters.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-adapters}" +FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +OPERATOR_FIXTURE_DIR="${ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +INPUT_FIXTURE_DIR="${REPORT_DIR}/input-fixtures" +WORK_DIR="${ELF_REAL_WORLD_LIVE_WORK_DIR:-/bench/real-world-live-adapters}" +QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" + +if [[ ! -f "/.dockerenv" && "${ELF_REAL_WORLD_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live real-world adapters outside Docker. Use cargo make real-world-memory-live-adapters." >&2 + exit 1 +fi + +for cmd in bash cargo git jq npm npx; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live adapter runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${WORK_DIR}" +rm -rf "${INPUT_FIXTURE_DIR}" \ + "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/qmd-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/qmd-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/qmd-report.json" \ + "${REPORT_DIR:?}/qmd-report.md" \ + "${REPORT_DIR:?}/ragflow" \ + "${REPORT_DIR:?}/lightrag" \ + "${REPORT_DIR:?}/graphrag" \ + "${REPORT_DIR:?}/graphiti-zep" \ + "${REPORT_DIR:?}/graphify" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +mkdir -p "${INPUT_FIXTURE_DIR}" +cp -R "${FIXTURE_DIR}/." "${INPUT_FIXTURE_DIR}/" +mkdir -p "${INPUT_FIXTURE_DIR}/operator_debugging_ux" +cp -R "${OPERATOR_FIXTURE_DIR}/." "${INPUT_FIXTURE_DIR}/operator_debugging_ux/" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${INPUT_FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-memory-live-elf \ + --adapter-id elf_live_real_world \ + --adapter-name "ELF live real-world service adapter" \ + --adapter-behavior live_real_world_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, and search_raw across the encoded real-world suite corpus; unsupported suite capabilities remain typed non-pass records." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/elf-report.json" \ + --out "${REPORT_DIR}/elf-report.md" + +cargo run -p elf-eval --bin real_world_live_adapter -- qmd \ + --fixtures "${INPUT_FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/qmd-fixtures" \ + --evidence-out "${REPORT_DIR}/qmd-materialization.json" \ + --qmd-dir "${QMD_DIR}" \ + --work-dir "${WORK_DIR}/qmd" + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/qmd-fixtures" \ + --out "${REPORT_DIR}/qmd-report.json" \ + --run-id real-world-memory-live-qmd \ + --adapter-id qmd_live_real_world \ + --adapter-name "qmd live real-world CLI adapter" \ + --adapter-behavior live_real_world_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, and query --json across the encoded real-world suite corpus; unsupported suite capabilities remain typed non-pass records." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/qmd-report.json" \ + --out "${REPORT_DIR}/qmd-report.md" + +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW:-0}" == "1" ]]; then + ELF_RAGFLOW_SMOKE_ARTIFACT_DIR="${REPORT_DIR}/ragflow" \ + bash scripts/ragflow-docker-evidence-smoke.sh +fi + +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG:-0}" == "1" ]]; then + ELF_LIGHTRAG_CONTEXT_REPORT_DIR="${REPORT_DIR}/lightrag" \ + ELF_LIGHTRAG_CONTEXT_FIXTURES="${ELF_LIGHTRAG_CONTEXT_FIXTURES:-${FIXTURE_DIR}/retrieval}" \ + bash scripts/lightrag-docker-context-smoke.sh +fi + +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG:-0}" == "1" ]]; then + ELF_GRAPHRAG_SMOKE_REPORT_DIR="${REPORT_DIR}/graphrag" \ + python3 scripts/graphrag-docker-smoke.py +fi + +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP:-0}" == "1" ]]; then + ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR="${REPORT_DIR}/graphiti-zep" \ + python3 scripts/graphiti-zep-docker-temporal-smoke.py +fi + +if [[ "${ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY:-0}" == "1" ]]; then + ELF_GRAPHIFY_SMOKE_REPORT_DIR="${REPORT_DIR}/graphify" \ + python3 scripts/graphify-docker-graph-report-smoke.py +fi + +jq -n \ + --slurpfile elf_materialization "${REPORT_DIR}/elf-materialization.json" \ + --slurpfile qmd_materialization "${REPORT_DIR}/qmd-materialization.json" \ + --slurpfile elf_report "${REPORT_DIR}/elf-report.json" \ + --slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \ + '{ + schema: "elf.real_world_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_REAL_WORLD_LIVE_REPORT_DIR // "tmp/real-world-memory/live-adapters"), + fixture_dir: (env.ELF_REAL_WORLD_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory"), + operator_debug_fixture_dir: (env.ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES // "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux"), + combined_fixture_dir: "tmp/real-world-memory/live-adapters/input-fixtures", + graph_rag_smoke_controls: { + inclusion_flags: { + ragflow: (env.ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW // "0"), + lightrag: (env.ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG // "0"), + graphrag: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG // "0"), + graphiti_zep: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHITI_ZEP // "0"), + graphify: (env.ELF_REAL_WORLD_LIVE_ENABLE_GRAPHIFY // "0") + }, + live_attempt_boundary: "Inclusion flags only add smoke adapters to this aggregate sweep. Provider, service-start, and resource-heavy live attempts still require each adapter-specific control.", + service_start_controls: { + lightrag: (env.ELF_LIGHTRAG_CONTEXT_START // "0"), + graphiti_zep: (env.ELF_GRAPHITI_ZEP_SMOKE_START // "0") + }, + provider_or_resource_controls_forwarded: [ + "ELF_RAGFLOW_SMOKE_START", + "ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE", + "ELF_GRAPHRAG_SMOKE_RUN", + "ELF_GRAPHRAG_API_KEY", + "ELF_GRAPHITI_ZEP_SMOKE_RUN", + "ELF_GRAPHITI_ZEP_API_KEY", + "ELF_GRAPHIFY_SMOKE_RUN" + ] + }, + adapters: [ + { + adapter_id: "elf_live_real_world", + evidence_class: "live_real_world", + materialization: $elf_materialization[0], + report: { + json: "tmp/real-world-memory/live-adapters/elf-report.json", + markdown: "tmp/real-world-memory/live-adapters/elf-report.md", + summary: $elf_report[0].summary, + suites: $elf_report[0].suites + } + }, + { + adapter_id: "qmd_live_real_world", + evidence_class: "live_real_world", + materialization: $qmd_materialization[0], + report: { + json: "tmp/real-world-memory/live-adapters/qmd-report.json", + markdown: "tmp/real-world-memory/live-adapters/qmd-report.md", + summary: $qmd_report[0].summary, + suites: $qmd_report[0].suites + } + } + ] + }' >"${REPORT_DIR}/summary.json" + +if [[ -f "${REPORT_DIR}/ragflow/summary.json" ]]; then + jq \ + --slurpfile ragflow_summary "${REPORT_DIR}/ragflow/summary.json" \ + '.adapters += [ + { + adapter_id: $ragflow_summary[0].adapter_id, + evidence_class: $ragflow_summary[0].evidence_class, + status_boundary: $ragflow_summary[0].status_boundary, + scored_benchmark: $ragflow_summary[0].scored_benchmark, + materialization: $ragflow_summary[0].materialization, + report: $ragflow_summary[0].report + } + ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + +if [[ -f "${REPORT_DIR}/lightrag/summary.json" ]]; then + jq \ + --slurpfile lightrag_summary "${REPORT_DIR}/lightrag/summary.json" \ + '.adapters += [ + { + adapter_id: $lightrag_summary[0].adapter_id, + evidence_class: $lightrag_summary[0].evidence_class, + status_boundary: $lightrag_summary[0].status_boundary, + scored_benchmark: $lightrag_summary[0].scored_benchmark, + materialization: $lightrag_summary[0].materialization, + report: $lightrag_summary[0].report + } + ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + +if [[ -f "${REPORT_DIR}/graphrag/summary.json" ]]; then + jq \ + --slurpfile graphrag_summary "${REPORT_DIR}/graphrag/summary.json" \ + '.adapters += [ + { + adapter_id: $graphrag_summary[0].adapter_id, + evidence_class: $graphrag_summary[0].evidence_class, + status_boundary: $graphrag_summary[0].status_boundary, + scored_benchmark: $graphrag_summary[0].scored_benchmark, + materialization: $graphrag_summary[0].materialization, + report: $graphrag_summary[0].report + } + ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + +if [[ -f "${REPORT_DIR}/graphiti-zep/summary.json" ]]; then + jq \ + --slurpfile graphiti_summary "${REPORT_DIR}/graphiti-zep/summary.json" \ + '.adapters += [ + { + adapter_id: $graphiti_summary[0].adapter_id, + evidence_class: $graphiti_summary[0].evidence_class, + status_boundary: $graphiti_summary[0].status_boundary, + scored_benchmark: $graphiti_summary[0].scored_benchmark, + materialization: $graphiti_summary[0].materialization, + report: $graphiti_summary[0].report + } + ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + +if [[ -f "${REPORT_DIR}/graphify/summary.json" ]]; then + jq \ + --slurpfile graphify_summary "${REPORT_DIR}/graphify/summary.json" \ + '.adapters += [ + { + adapter_id: $graphify_summary[0].adapter_id, + evidence_class: $graphify_summary[0].evidence_class, + status_boundary: $graphify_summary[0].status_boundary, + scored_benchmark: $graphify_summary[0].scored_benchmark, + materialization: $graphify_summary[0].materialization, + report: $graphify_summary[0].report + } + ]' "${REPORT_DIR}/summary.json" >"${REPORT_DIR}/summary.json.tmp" + mv "${REPORT_DIR}/summary.json.tmp" "${REPORT_DIR}/summary.json" +fi + +echo "Live real-world adapter reports:" +echo " ${REPORT_DIR}/elf-report.json" +echo " ${REPORT_DIR}/elf-report.md" +echo " ${REPORT_DIR}/qmd-report.json" +echo " ${REPORT_DIR}/qmd-report.md" +if [[ -f "${REPORT_DIR}/ragflow/summary.json" ]]; then + echo " ${REPORT_DIR}/ragflow/ragflow-report.json" + echo " ${REPORT_DIR}/ragflow/ragflow-report.md" + echo " ${REPORT_DIR}/ragflow/summary.json" +fi +if [[ -f "${REPORT_DIR}/lightrag/summary.json" ]]; then + echo " ${REPORT_DIR}/lightrag/lightrag-report.json" + echo " ${REPORT_DIR}/lightrag/lightrag-report.md" + echo " ${REPORT_DIR}/lightrag/summary.json" +fi +if [[ -f "${REPORT_DIR}/graphrag/summary.json" ]]; then + echo " ${REPORT_DIR}/graphrag/graphrag-report.json" + echo " ${REPORT_DIR}/graphrag/graphrag-report.md" + echo " ${REPORT_DIR}/graphrag/graphrag-smoke.json" + echo " ${REPORT_DIR}/graphrag/summary.json" +fi +if [[ -f "${REPORT_DIR}/graphiti-zep/summary.json" ]]; then + echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-report.json" + echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-report.md" + echo " ${REPORT_DIR}/graphiti-zep/graphiti-zep-smoke.json" + echo " ${REPORT_DIR}/graphiti-zep/summary.json" +fi +if [[ -f "${REPORT_DIR}/graphify/summary.json" ]]; then + echo " ${REPORT_DIR}/graphify/graphify-report.json" + echo " ${REPORT_DIR}/graphify/graphify-report.md" + echo " ${REPORT_DIR}/graphify/graphify-smoke.json" + echo " ${REPORT_DIR}/graphify/summary.json" +fi +echo " ${REPORT_DIR}/summary.json" diff --git a/scripts/real-world-operator-debug-live-adapters.sh b/scripts/real-world-operator-debug-live-adapters.sh new file mode 100755 index 00000000..f027fe4d --- /dev/null +++ b/scripts/real-world-operator-debug-live-adapters.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-job/operator-ux-live-adapters}" +FIXTURE_DIR="${ELF_OPERATOR_DEBUG_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +WORK_DIR="${ELF_OPERATOR_DEBUG_LIVE_WORK_DIR:-/bench/operator-debug-live-adapters}" +QMD_DIR="${ELF_OPERATOR_DEBUG_QMD_DIR:-/bench/repos/qmd}" + +if [[ ! -f "/.dockerenv" && "${ELF_OPERATOR_DEBUG_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run operator-debug live adapters outside Docker. Use cargo make real-world-job-operator-ux-live-adapters." >&2 + exit 1 +fi + +for cmd in bash cargo git jq npm npx; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in operator-debug live adapter runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" "${WORK_DIR}" +rm -rf "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/qmd-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/qmd-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/qmd-report.json" \ + "${REPORT_DIR:?}/qmd-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml \ + --adapter-id elf_operator_debug_live + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-operator-debug-live-elf \ + --adapter-id elf_operator_debug_live \ + --adapter-name "ELF live operator-debug service adapter" \ + --adapter-behavior live_operator_debug_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService, worker indexing, search_raw trace ids, and operator-debug trace metadata." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/elf-report.json" \ + --out "${REPORT_DIR}/elf-report.md" + +cargo run -p elf-eval --bin real_world_live_adapter -- qmd \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/qmd-fixtures" \ + --evidence-out "${REPORT_DIR}/qmd-materialization.json" \ + --qmd-dir "${QMD_DIR}" \ + --work-dir "${WORK_DIR}/qmd" \ + --adapter-id qmd_operator_debug_live + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/qmd-fixtures" \ + --out "${REPORT_DIR}/qmd-report.json" \ + --run-id real-world-operator-debug-live-qmd \ + --adapter-id qmd_operator_debug_live \ + --adapter-name "qmd live operator-debug CLI adapter" \ + --adapter-behavior live_operator_debug_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through qmd collection add, update, embed, query --json, and local replay command metadata; ELF trace/viewer surfaces are not inferred." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/qmd-report.json" \ + --out "${REPORT_DIR}/qmd-report.md" + +jq -n \ + --slurpfile elf_materialization "${REPORT_DIR}/elf-materialization.json" \ + --slurpfile qmd_materialization "${REPORT_DIR}/qmd-materialization.json" \ + --slurpfile elf_report "${REPORT_DIR}/elf-report.json" \ + --slurpfile qmd_report "${REPORT_DIR}/qmd-report.json" \ + '{ + schema: "elf.real_world_operator_debug_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_OPERATOR_DEBUG_LIVE_REPORT_DIR // "tmp/real-world-job/operator-ux-live-adapters"), + fixture_dir: (env.ELF_OPERATOR_DEBUG_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux"), + adapters: [ + { + adapter_id: "elf_operator_debug_live", + evidence_class: "live_real_world", + materialization: $elf_materialization[0], + report: { + json: "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + markdown: "tmp/real-world-job/operator-ux-live-adapters/elf-report.md", + summary: $elf_report[0].summary, + suites: $elf_report[0].suites + } + }, + { + adapter_id: "qmd_operator_debug_live", + evidence_class: "live_real_world", + materialization: $qmd_materialization[0], + report: { + json: "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + markdown: "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md", + summary: $qmd_report[0].summary, + suites: $qmd_report[0].suites + } + } + ], + scenario_dimensions: [ + "trace_available", + "replay_command_available", + "candidate_drop_visibility", + "repair_action_clarity", + "raw_sql_needed" + ], + boundary: "This narrow sweep scores operator-debugging fixtures only. It does not change core ranking, launch OpenMemory or claude-mem UI flows, or convert fixture-only UX evidence into broad product superiority." + }' >"${REPORT_DIR}/summary.json" + +echo "Operator-debug live adapter reports:" +echo " ${REPORT_DIR}/elf-report.json" +echo " ${REPORT_DIR}/elf-report.md" +echo " ${REPORT_DIR}/qmd-report.json" +echo " ${REPORT_DIR}/qmd-report.md" +echo " ${REPORT_DIR}/summary.json" diff --git a/scripts/smoke-docker.sh b/scripts/smoke-docker.sh new file mode 100755 index 00000000..6aa816a8 --- /dev/null +++ b/scripts/smoke-docker.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +set -euo pipefail + +smoke="${1:-}" +if [ -z "$smoke" ]; then + echo "usage: scripts/smoke-docker.sh <smoke>" >&2 + exit 2 +fi + +case "$smoke" in +graphify-docker-graph-report) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_GRAPHIFY_SMOKE_RUN \ + -e ELF_GRAPHIFY_SMOKE_REPORT_DIR \ + -e ELF_GRAPHIFY_SMOKE_WORK_DIR \ + -e ELF_GRAPHIFY_SMOKE_INSTALL \ + -e ELF_GRAPHIFY_PACKAGE \ + -e ELF_GRAPHIFY_REF \ + -e ELF_GRAPHIFY_TIMEOUT_SECONDS \ + -e ELF_GRAPHIFY_QUERY_BUDGET \ + baseline-runner python3 scripts/graphify-docker-graph-report-smoke.py + ;; +graphiti-zep-docker-temporal) + start="$(printenv ELF_GRAPHITI_ZEP_SMOKE_START || true)" + status=0 + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb + fi + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_GRAPHITI_ZEP_SMOKE_RUN \ + -e ELF_GRAPHITI_ZEP_SMOKE_REPORT_DIR \ + -e ELF_GRAPHITI_ZEP_SMOKE_WORK_DIR \ + -e ELF_GRAPHITI_ZEP_SMOKE_INSTALL \ + -e ELF_GRAPHITI_ZEP_VERSION \ + -e ELF_GRAPHITI_ZEP_PACKAGE \ + -e ELF_GRAPHITI_ZEP_REF \ + -e ELF_GRAPHITI_ZEP_API_BASE \ + -e ELF_GRAPHITI_ZEP_API_KEY \ + -e ELF_GRAPHITI_ZEP_LLM_MODEL \ + -e ELF_GRAPHITI_ZEP_EMBEDDING_MODEL \ + -e ELF_GRAPHITI_ZEP_FALKORDB_HOST \ + -e ELF_GRAPHITI_ZEP_FALKORDB_PORT \ + -e ELF_GRAPHITI_ZEP_FALKORDB_DATABASE \ + -e ELF_GRAPHITI_ZEP_TIMEOUT_SECONDS \ + -e ELF_GRAPHITI_ZEP_STARTUP_ATTEMPTS \ + -e ELF_GRAPHITI_ZEP_STARTUP_INTERVAL_SECONDS \ + baseline-runner python3 scripts/graphiti-zep-docker-temporal-smoke.py || status=$? + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile graphiti-zep stop graphiti-falkordb >/dev/null 2>&1 || true + fi + exit "$status" + ;; +graphrag-docker) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_GRAPHRAG_SMOKE_RUN \ + -e ELF_GRAPHRAG_SMOKE_REPORT_DIR \ + -e ELF_GRAPHRAG_SMOKE_WORK_DIR \ + -e ELF_GRAPHRAG_SMOKE_INSTALL \ + -e ELF_GRAPHRAG_VERSION \ + -e ELF_GRAPHRAG_PACKAGE \ + -e ELF_GRAPHRAG_REF \ + -e ELF_GRAPHRAG_CHAT_MODEL \ + -e ELF_GRAPHRAG_EMBEDDING_MODEL \ + -e ELF_GRAPHRAG_API_BASE \ + -e ELF_GRAPHRAG_API_KEY \ + -e ELF_GRAPHRAG_INDEX_METHOD \ + -e ELF_GRAPHRAG_QUERY_METHOD \ + -e ELF_GRAPHRAG_TIMEOUT_SECONDS \ + -e ELF_GRAPHRAG_MAX_DOCS \ + -e ELF_GRAPHRAG_MAX_INPUT_CHARS \ + baseline-runner python3 scripts/graphrag-docker-smoke.py + ;; +lightrag-docker-context) + start="$(printenv ELF_LIGHTRAG_CONTEXT_START || true)" + status=0 + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile lightrag up -d lightrag + fi + docker compose -f docker-compose.baseline.yml run --build --rm \ + baseline-runner bash scripts/lightrag-docker-context-smoke.sh || status=$? + if [ "$start" = "1" ]; then + docker compose -f docker-compose.baseline.yml --profile lightrag stop lightrag lightrag-mock-provider >/dev/null 2>&1 || true + fi + exit "$status" + ;; +*) + echo "unknown smoke: $smoke" >&2 + exit 2 + ;; +esac diff --git a/scripts/trace-gate.sh b/scripts/trace-gate.sh new file mode 100755 index 00000000..5cbdd52e --- /dev/null +++ b/scripts/trace-gate.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +DSN="${TRACE_GATE_PG_DSN:-${PG_DSN:-postgres://postgres:postgres@127.0.0.1:5432/elf}}" +VECTOR_DIM="${TRACE_GATE_VECTOR_DIM:-4}" +SCHEMA_PATH="tmp/trace_gate.schema.sql" +REPORT_PATH="${TRACE_GATE_REPORT_PATH:-tmp/trace_gate.report.json}" + +mkdir -p tmp + +TRACE_GATE_VECTOR_DIM="${VECTOR_DIM}" python3 - <<'PY' > "${SCHEMA_PATH}" +import os +from pathlib import Path + +vector_dim = int(os.environ["TRACE_GATE_VECTOR_DIM"]) +root = Path(".") +sql_dir = root / "sql" + +out = [] +for raw_line in (sql_dir / "init.sql").read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if line.startswith(r"\ir "): + rel = line[len(r"\ir ") :].strip() + out.append((sql_dir / rel).read_text(encoding="utf-8")) + else: + out.append(raw_line) + +expanded = "\n".join(out) + "\n" +print(expanded.replace("<VECTOR_DIM>", str(vector_dim)), end="") +PY + +psql "${DSN}" -v ON_ERROR_STOP=1 -f "${SCHEMA_PATH}" +psql "${DSN}" -v ON_ERROR_STOP=1 -f .github/fixtures/trace_gate/fixture.sql +cargo run -p elf-eval --bin trace_regression_gate -- \ + --config .github/fixtures/trace_gate/config.toml \ + --gate .github/fixtures/trace_gate/gate.json \ + --out "${REPORT_PATH}" diff --git a/sql/init.sql b/sql/init.sql index dad79477..99641a31 100644 --- a/sql/init.sql +++ b/sql/init.sql @@ -1,11 +1,42 @@ \ir 00_extensions.sql \ir tables/001_memory_notes.sql +\ir tables/016_graph_entities.sql +\ir tables/017_graph_entity_aliases.sql +\ir tables/020_graph_predicates.sql +\ir tables/021_graph_predicate_aliases.sql +\ir tables/018_graph_facts.sql +\ir tables/019_graph_fact_evidence.sql +\ir tables/022_graph_fact_supersessions.sql +\ir tables/013_memory_note_fields.sql \ir tables/009_memory_note_chunks.sql \ir tables/010_note_chunk_embeddings.sql +\ir tables/014_note_field_embeddings.sql \ir tables/002_note_embeddings.sql \ir tables/003_memory_note_versions.sql +\ir tables/023_memory_ingest_decisions.sql +\ir tables/024_memory_space_grants.sql \ir tables/004_memory_hits.sql \ir tables/005_indexing_outbox.sql \ir tables/006_search_traces.sql +\ir tables/012_search_trace_candidates.sql +\ir tables/015_search_trace_stages.sql \ir tables/007_search_trace_outbox.sql \ir tables/008_llm_cache.sql +\ir tables/011_search_sessions.sql +\ir tables/025_doc_documents.sql +\ir tables/026_doc_chunks.sql +\ir tables/027_doc_chunk_embeddings.sql +\ir tables/028_doc_indexing_outbox.sql +\ir tables/029_memory_ingestion_profiles.sql +\ir tables/030_memory_ingestion_profile_defaults.sql +\ir tables/031_consolidation_runs.sql +\ir tables/032_consolidation_proposals.sql +\ir tables/033_consolidation_proposal_reviews.sql +\ir tables/034_consolidation_run_jobs.sql +\ir tables/035_knowledge_pages.sql +\ir tables/036_knowledge_page_sections.sql +\ir tables/037_knowledge_page_source_refs.sql +\ir tables/038_knowledge_page_lint_findings.sql +\ir tables/039_core_memory_blocks.sql +\ir tables/040_core_memory_block_attachments.sql +\ir tables/041_core_memory_block_events.sql diff --git a/sql/tables/001_memory_notes.sql b/sql/tables/001_memory_notes.sql index be3b11e3..e98be7e2 100644 --- a/sql/tables/001_memory_notes.sql +++ b/sql/tables/001_memory_notes.sql @@ -1,28 +1,28 @@ CREATE TABLE IF NOT EXISTS memory_notes ( - note_id uuid PRIMARY KEY, - tenant_id text NOT NULL, - project_id text NOT NULL, - agent_id text NOT NULL, - scope text NOT NULL, - type text NOT NULL, - key text NULL, - text text NOT NULL, - importance real NOT NULL, - confidence real NOT NULL, - status text NOT NULL, - created_at timestamptz NOT NULL, - updated_at timestamptz NOT NULL, - expires_at timestamptz NULL, - embedding_version text NOT NULL, - source_ref jsonb NOT NULL, - hit_count bigint NOT NULL DEFAULT 0, - last_hit_at timestamptz NULL + note_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + scope text NOT NULL, + type text NOT NULL, + key text NULL, + text text NOT NULL, + importance real NOT NULL, + confidence real NOT NULL, + status text NOT NULL, + created_at timestamptz NOT NULL, + updated_at timestamptz NOT NULL, + expires_at timestamptz NULL, + embedding_version text NOT NULL, + source_ref jsonb NOT NULL, + hit_count bigint NOT NULL DEFAULT 0, + last_hit_at timestamptz NULL ); CREATE INDEX IF NOT EXISTS idx_notes_scope_status - ON memory_notes (tenant_id, project_id, scope, status); + ON memory_notes (tenant_id, project_id, scope, status); CREATE INDEX IF NOT EXISTS idx_notes_key - ON memory_notes (tenant_id, project_id, agent_id, scope, type, key) - WHERE key IS NOT NULL; + ON memory_notes (tenant_id, project_id, agent_id, scope, type, key) + WHERE key IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_notes_expires - ON memory_notes (expires_at); + ON memory_notes (expires_at); diff --git a/sql/tables/002_note_embeddings.sql b/sql/tables/002_note_embeddings.sql index 6fdd9269..8499fe30 100644 --- a/sql/tables/002_note_embeddings.sql +++ b/sql/tables/002_note_embeddings.sql @@ -1,8 +1,8 @@ CREATE TABLE IF NOT EXISTS note_embeddings ( - note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, - embedding_version text NOT NULL, - embedding_dim int NOT NULL, - vec vector(<VECTOR_DIM>) NOT NULL, - created_at timestamptz NOT NULL DEFAULT now(), - PRIMARY KEY (note_id, embedding_version) + note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, + embedding_version text NOT NULL, + embedding_dim int NOT NULL, + vec vector(<VECTOR_DIM>) NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (note_id, embedding_version) ); diff --git a/sql/tables/003_memory_note_versions.sql b/sql/tables/003_memory_note_versions.sql index 5ced3886..ac11ddd9 100644 --- a/sql/tables/003_memory_note_versions.sql +++ b/sql/tables/003_memory_note_versions.sql @@ -1,10 +1,10 @@ CREATE TABLE IF NOT EXISTS memory_note_versions ( - version_id uuid PRIMARY KEY, - note_id uuid NOT NULL, - op text NOT NULL, - prev_snapshot jsonb NULL, - new_snapshot jsonb NULL, - reason text NOT NULL, - actor text NOT NULL, - ts timestamptz NOT NULL DEFAULT now() + version_id uuid PRIMARY KEY, + note_id uuid NOT NULL, + op text NOT NULL, + prev_snapshot jsonb NULL, + new_snapshot jsonb NULL, + reason text NOT NULL, + actor text NOT NULL, + ts timestamptz NOT NULL DEFAULT now() ); diff --git a/sql/tables/004_memory_hits.sql b/sql/tables/004_memory_hits.sql index e3b1a0f0..c72f5567 100644 --- a/sql/tables/004_memory_hits.sql +++ b/sql/tables/004_memory_hits.sql @@ -1,11 +1,9 @@ CREATE TABLE IF NOT EXISTS memory_hits ( - hit_id uuid PRIMARY KEY, - note_id uuid NOT NULL, - query_hash text NOT NULL, - rank int NOT NULL, - final_score real NOT NULL, - ts timestamptz NOT NULL DEFAULT now() + hit_id uuid PRIMARY KEY, + note_id uuid NOT NULL, + chunk_id uuid NULL, + query_hash text NOT NULL, + rank int NOT NULL, + final_score real NOT NULL, + ts timestamptz NOT NULL DEFAULT now() ); - -ALTER TABLE memory_hits - ADD COLUMN IF NOT EXISTS chunk_id uuid NULL; diff --git a/sql/tables/005_indexing_outbox.sql b/sql/tables/005_indexing_outbox.sql index e4dec5a9..18f1c0cb 100644 --- a/sql/tables/005_indexing_outbox.sql +++ b/sql/tables/005_indexing_outbox.sql @@ -1,17 +1,17 @@ CREATE TABLE IF NOT EXISTS indexing_outbox ( - outbox_id uuid PRIMARY KEY, - note_id uuid NOT NULL, - op text NOT NULL, - embedding_version text NOT NULL, - status text NOT NULL, - attempts int NOT NULL DEFAULT 0, - last_error text NULL, - available_at timestamptz NOT NULL DEFAULT now(), - created_at timestamptz NOT NULL DEFAULT now(), - updated_at timestamptz NOT NULL DEFAULT now() + outbox_id uuid PRIMARY KEY, + note_id uuid NOT NULL, + op text NOT NULL, + embedding_version text NOT NULL, + status text NOT NULL, + attempts int NOT NULL DEFAULT 0, + last_error text NULL, + available_at timestamptz NOT NULL DEFAULT now(), + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_outbox_status_available - ON indexing_outbox (status, available_at); + ON indexing_outbox (status, available_at); CREATE INDEX IF NOT EXISTS idx_outbox_note_op_status - ON indexing_outbox (note_id, op, status); + ON indexing_outbox (note_id, op, status); diff --git a/sql/tables/006_search_traces.sql b/sql/tables/006_search_traces.sql index 27fa32c3..5c5cc3ea 100644 --- a/sql/tables/006_search_traces.sql +++ b/sql/tables/006_search_traces.sql @@ -1,45 +1,37 @@ CREATE TABLE IF NOT EXISTS search_traces ( - trace_id uuid PRIMARY KEY, - tenant_id text NOT NULL, - project_id text NOT NULL, - agent_id text NOT NULL, - read_profile text NOT NULL, - query text NOT NULL, - expansion_mode text NOT NULL, - expanded_queries jsonb NOT NULL, - allowed_scopes jsonb NOT NULL, - candidate_count int NOT NULL, - top_k int NOT NULL, - config_snapshot jsonb NOT NULL, - trace_version int NOT NULL, - created_at timestamptz NOT NULL, - expires_at timestamptz NOT NULL + trace_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + read_profile text NOT NULL, + query text NOT NULL, + expansion_mode text NOT NULL, + expanded_queries jsonb NOT NULL, + allowed_scopes jsonb NOT NULL, + candidate_count int NOT NULL, + top_k int NOT NULL, + config_snapshot jsonb NOT NULL, + trace_version int NOT NULL, + created_at timestamptz NOT NULL, + expires_at timestamptz NOT NULL ); CREATE INDEX IF NOT EXISTS idx_search_traces_expires - ON search_traces (expires_at); + ON search_traces (expires_at); CREATE INDEX IF NOT EXISTS idx_search_traces_context - ON search_traces (tenant_id, project_id, created_at); + ON search_traces (tenant_id, project_id, created_at); CREATE TABLE IF NOT EXISTS search_trace_items ( - item_id uuid PRIMARY KEY, - trace_id uuid NOT NULL REFERENCES search_traces(trace_id) ON DELETE CASCADE, - note_id uuid NOT NULL, - rank int NOT NULL, - retrieval_score real NULL, - retrieval_rank int NULL, - rerank_score real NOT NULL, - tie_breaker_score real NOT NULL, - final_score real NOT NULL, - boosts jsonb NOT NULL, - matched_terms jsonb NOT NULL, - matched_fields jsonb NOT NULL + item_id uuid PRIMARY KEY, + trace_id uuid NOT NULL REFERENCES search_traces(trace_id) ON DELETE CASCADE, + note_id uuid NOT NULL, + chunk_id uuid NULL, + rank int NOT NULL, + final_score real NOT NULL, + explain jsonb NOT NULL ); -ALTER TABLE search_trace_items - ADD COLUMN IF NOT EXISTS chunk_id uuid NULL; - CREATE INDEX IF NOT EXISTS idx_search_trace_items_trace - ON search_trace_items (trace_id, rank); + ON search_trace_items (trace_id, rank); CREATE INDEX IF NOT EXISTS idx_search_trace_items_note - ON search_trace_items (note_id); + ON search_trace_items (note_id); diff --git a/sql/tables/007_search_trace_outbox.sql b/sql/tables/007_search_trace_outbox.sql index e5972e64..8e441f36 100644 --- a/sql/tables/007_search_trace_outbox.sql +++ b/sql/tables/007_search_trace_outbox.sql @@ -1,16 +1,16 @@ CREATE TABLE IF NOT EXISTS search_trace_outbox ( - outbox_id uuid PRIMARY KEY, - trace_id uuid NOT NULL, - status text NOT NULL, - attempts int NOT NULL DEFAULT 0, - last_error text NULL, - available_at timestamptz NOT NULL DEFAULT now(), - payload jsonb NOT NULL, - created_at timestamptz NOT NULL DEFAULT now(), - updated_at timestamptz NOT NULL DEFAULT now() + outbox_id uuid PRIMARY KEY, + trace_id uuid NOT NULL, + status text NOT NULL, + attempts int NOT NULL DEFAULT 0, + last_error text NULL, + available_at timestamptz NOT NULL DEFAULT now(), + payload jsonb NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_trace_outbox_status_available - ON search_trace_outbox (status, available_at); + ON search_trace_outbox (status, available_at); CREATE INDEX IF NOT EXISTS idx_trace_outbox_trace_status - ON search_trace_outbox (trace_id, status); + ON search_trace_outbox (trace_id, status); diff --git a/sql/tables/008_llm_cache.sql b/sql/tables/008_llm_cache.sql index 727bbccb..7f2e172c 100644 --- a/sql/tables/008_llm_cache.sql +++ b/sql/tables/008_llm_cache.sql @@ -1,15 +1,15 @@ CREATE TABLE IF NOT EXISTS llm_cache ( - cache_id uuid PRIMARY KEY, - cache_kind text NOT NULL, - cache_key text NOT NULL, - payload jsonb NOT NULL, - created_at timestamptz NOT NULL, - last_accessed_at timestamptz NOT NULL, - expires_at timestamptz NOT NULL, - hit_count bigint NOT NULL DEFAULT 0 + cache_id uuid PRIMARY KEY, + cache_kind text NOT NULL, + cache_key text NOT NULL, + payload jsonb NOT NULL, + created_at timestamptz NOT NULL, + last_accessed_at timestamptz NOT NULL, + expires_at timestamptz NOT NULL, + hit_count bigint NOT NULL DEFAULT 0 ); CREATE UNIQUE INDEX IF NOT EXISTS idx_llm_cache_key - ON llm_cache (cache_kind, cache_key); + ON llm_cache (cache_kind, cache_key); CREATE INDEX IF NOT EXISTS idx_llm_cache_expires - ON llm_cache (expires_at); + ON llm_cache (expires_at); diff --git a/sql/tables/009_memory_note_chunks.sql b/sql/tables/009_memory_note_chunks.sql index f5a15811..fb5bd790 100644 --- a/sql/tables/009_memory_note_chunks.sql +++ b/sql/tables/009_memory_note_chunks.sql @@ -1,15 +1,15 @@ CREATE TABLE IF NOT EXISTS memory_note_chunks ( - chunk_id uuid PRIMARY KEY, - note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, - chunk_index int NOT NULL, - start_offset int NOT NULL, - end_offset int NOT NULL, - text text NOT NULL, - embedding_version text NOT NULL, - created_at timestamptz NOT NULL DEFAULT now() + chunk_id uuid PRIMARY KEY, + note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, + chunk_index int NOT NULL, + start_offset int NOT NULL, + end_offset int NOT NULL, + text text NOT NULL, + embedding_version text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now() ); CREATE INDEX IF NOT EXISTS idx_note_chunks_note - ON memory_note_chunks (note_id); + ON memory_note_chunks (note_id); CREATE INDEX IF NOT EXISTS idx_note_chunks_note_index - ON memory_note_chunks (note_id, chunk_index); + ON memory_note_chunks (note_id, chunk_index); diff --git a/sql/tables/010_note_chunk_embeddings.sql b/sql/tables/010_note_chunk_embeddings.sql index 088dff26..7a04625d 100644 --- a/sql/tables/010_note_chunk_embeddings.sql +++ b/sql/tables/010_note_chunk_embeddings.sql @@ -1,8 +1,8 @@ CREATE TABLE IF NOT EXISTS note_chunk_embeddings ( - chunk_id uuid NOT NULL REFERENCES memory_note_chunks(chunk_id) ON DELETE CASCADE, - embedding_version text NOT NULL, - embedding_dim int NOT NULL, - vec vector(<VECTOR_DIM>) NOT NULL, - created_at timestamptz NOT NULL DEFAULT now(), - PRIMARY KEY (chunk_id, embedding_version) + chunk_id uuid NOT NULL REFERENCES memory_note_chunks(chunk_id) ON DELETE CASCADE, + embedding_version text NOT NULL, + embedding_dim int NOT NULL, + vec vector(<VECTOR_DIM>) NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (chunk_id, embedding_version) ); diff --git a/sql/tables/011_search_sessions.sql b/sql/tables/011_search_sessions.sql new file mode 100644 index 00000000..f8a1d8e9 --- /dev/null +++ b/sql/tables/011_search_sessions.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS search_sessions ( + search_session_id uuid PRIMARY KEY, + trace_id uuid NOT NULL, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + read_profile text NOT NULL, + mode text NOT NULL, + query text NOT NULL, + trajectory_summary jsonb, + query_plan jsonb, + items jsonb NOT NULL, + created_at timestamptz NOT NULL, + expires_at timestamptz NOT NULL +); + +ALTER TABLE search_sessions + ADD COLUMN IF NOT EXISTS trajectory_summary jsonb; + +CREATE INDEX IF NOT EXISTS idx_search_sessions_expires + ON search_sessions (expires_at); +CREATE INDEX IF NOT EXISTS idx_search_sessions_context + ON search_sessions (tenant_id, project_id, created_at); diff --git a/sql/tables/012_search_trace_candidates.sql b/sql/tables/012_search_trace_candidates.sql new file mode 100644 index 00000000..548604f2 --- /dev/null +++ b/sql/tables/012_search_trace_candidates.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS search_trace_candidates ( + candidate_id uuid PRIMARY KEY, + trace_id uuid NOT NULL REFERENCES search_traces(trace_id) ON DELETE CASCADE, + note_id uuid NOT NULL, + chunk_id uuid NOT NULL, + chunk_index int NOT NULL, + snippet text NOT NULL, + candidate_snapshot jsonb NOT NULL, + retrieval_rank int NOT NULL, + rerank_score real NOT NULL, + note_scope text NOT NULL, + note_importance real NOT NULL, + note_updated_at timestamptz NOT NULL, + note_hit_count bigint NOT NULL, + note_last_hit_at timestamptz, + created_at timestamptz NOT NULL, + expires_at timestamptz NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_search_trace_candidates_expires + ON search_trace_candidates (expires_at); +CREATE INDEX IF NOT EXISTS idx_search_trace_candidates_trace + ON search_trace_candidates (trace_id, retrieval_rank); diff --git a/sql/tables/013_memory_note_fields.sql b/sql/tables/013_memory_note_fields.sql new file mode 100644 index 00000000..81bf1750 --- /dev/null +++ b/sql/tables/013_memory_note_fields.sql @@ -0,0 +1,17 @@ +CREATE TABLE IF NOT EXISTS memory_note_fields ( + field_id uuid PRIMARY KEY, + note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, + field_kind text NOT NULL, + item_index int NOT NULL, + text text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_note_fields_note_kind_index + ON memory_note_fields (note_id, field_kind, item_index); +CREATE INDEX IF NOT EXISTS idx_note_fields_note + ON memory_note_fields (note_id); +CREATE INDEX IF NOT EXISTS idx_note_fields_kind + ON memory_note_fields (field_kind); + diff --git a/sql/tables/014_note_field_embeddings.sql b/sql/tables/014_note_field_embeddings.sql new file mode 100644 index 00000000..1ffc56b3 --- /dev/null +++ b/sql/tables/014_note_field_embeddings.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS note_field_embeddings ( + field_id uuid NOT NULL REFERENCES memory_note_fields(field_id) ON DELETE CASCADE, + embedding_version text NOT NULL, + embedding_dim int NOT NULL, + vec vector(<VECTOR_DIM>) NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (field_id, embedding_version) +); diff --git a/sql/tables/015_search_trace_stages.sql b/sql/tables/015_search_trace_stages.sql new file mode 100644 index 00000000..1aa4aacd --- /dev/null +++ b/sql/tables/015_search_trace_stages.sql @@ -0,0 +1,25 @@ +CREATE TABLE IF NOT EXISTS search_trace_stages ( + stage_id uuid PRIMARY KEY, + trace_id uuid NOT NULL REFERENCES search_traces(trace_id) ON DELETE CASCADE, + stage_order int NOT NULL, + stage_name text NOT NULL, + stage_payload jsonb NOT NULL, + created_at timestamptz NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_search_trace_stages_trace_order + ON search_trace_stages (trace_id, stage_order); +CREATE INDEX IF NOT EXISTS idx_search_trace_stages_trace_name + ON search_trace_stages (trace_id, stage_name); + +CREATE TABLE IF NOT EXISTS search_trace_stage_items ( + id uuid PRIMARY KEY, + stage_id uuid NOT NULL REFERENCES search_trace_stages(stage_id) ON DELETE CASCADE, + item_id uuid NULL, + note_id uuid NULL, + chunk_id uuid NULL, + metrics jsonb NOT NULL +); + +CREATE INDEX IF NOT EXISTS idx_search_trace_stage_items_stage_item + ON search_trace_stage_items (stage_id, item_id); diff --git a/sql/tables/016_graph_entities.sql b/sql/tables/016_graph_entities.sql new file mode 100644 index 00000000..4785fec5 --- /dev/null +++ b/sql/tables/016_graph_entities.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS graph_entities ( + entity_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + canonical text NOT NULL, + canonical_norm text NOT NULL, + kind text NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_graph_entities_tenant_project_canonical_norm + ON graph_entities (tenant_id, project_id, canonical_norm); + diff --git a/sql/tables/017_graph_entity_aliases.sql b/sql/tables/017_graph_entity_aliases.sql new file mode 100644 index 00000000..cc38b815 --- /dev/null +++ b/sql/tables/017_graph_entity_aliases.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS graph_entity_aliases ( + alias_id uuid PRIMARY KEY, + entity_id uuid NOT NULL REFERENCES graph_entities(entity_id) ON DELETE CASCADE, + alias text NOT NULL, + alias_norm text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_graph_entity_aliases_entity_alias_norm + ON graph_entity_aliases (entity_id, alias_norm); +CREATE INDEX IF NOT EXISTS idx_graph_entity_aliases_alias_norm + ON graph_entity_aliases (alias_norm); + diff --git a/sql/tables/018_graph_facts.sql b/sql/tables/018_graph_facts.sql new file mode 100644 index 00000000..db11cef1 --- /dev/null +++ b/sql/tables/018_graph_facts.sql @@ -0,0 +1,47 @@ +CREATE TABLE IF NOT EXISTS graph_facts ( + fact_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + scope text NOT NULL, + subject_entity_id uuid NOT NULL REFERENCES graph_entities(entity_id), + predicate text NOT NULL, + predicate_id uuid NULL REFERENCES graph_predicates(predicate_id), + object_entity_id uuid NULL REFERENCES graph_entities(entity_id), + object_value text NULL, + valid_from timestamptz NOT NULL, + valid_to timestamptz NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + CONSTRAINT graph_facts_object_exactly_one_source + CHECK ((object_entity_id IS NULL AND object_value IS NOT NULL) + OR (object_entity_id IS NOT NULL AND object_value IS NULL)), + CONSTRAINT graph_facts_valid_window + CHECK (valid_to IS NULL OR valid_to > valid_from) +); + +ALTER TABLE graph_facts ADD COLUMN IF NOT EXISTS predicate_id uuid NULL; + +ALTER TABLE graph_facts DROP CONSTRAINT IF EXISTS graph_facts_predicate_id_fkey; +ALTER TABLE graph_facts + ADD CONSTRAINT graph_facts_predicate_id_fkey + FOREIGN KEY (predicate_id) REFERENCES graph_predicates(predicate_id); + +DROP INDEX IF EXISTS idx_graph_facts_tenant_project_subject_predicate; +DROP INDEX IF EXISTS uq_graph_facts_active_entity_object; +DROP INDEX IF EXISTS uq_graph_facts_active_entity_value; + +CREATE INDEX IF NOT EXISTS idx_graph_facts_tenant_project_subject_predicate + ON graph_facts (tenant_id, project_id, subject_entity_id, predicate_id); +CREATE INDEX IF NOT EXISTS idx_graph_facts_tenant_project_valid_to + ON graph_facts (tenant_id, project_id, valid_to); +CREATE INDEX IF NOT EXISTS idx_graph_facts_tenant_project_object_entity + ON graph_facts (tenant_id, project_id, object_entity_id) + WHERE object_entity_id IS NOT NULL; + +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_facts_active_entity_object + ON graph_facts (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_entity_id) + WHERE valid_to IS NULL AND object_entity_id IS NOT NULL; +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_facts_active_entity_value + ON graph_facts (tenant_id, project_id, scope, subject_entity_id, predicate_id, object_value) + WHERE valid_to IS NULL AND object_value IS NOT NULL; diff --git a/sql/tables/019_graph_fact_evidence.sql b/sql/tables/019_graph_fact_evidence.sql new file mode 100644 index 00000000..0eee36dd --- /dev/null +++ b/sql/tables/019_graph_fact_evidence.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS graph_fact_evidence ( + evidence_id uuid PRIMARY KEY, + fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE, + note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, + created_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_fact_evidence_fact_note + ON graph_fact_evidence (fact_id, note_id); +CREATE INDEX IF NOT EXISTS idx_graph_fact_evidence_note + ON graph_fact_evidence (note_id); +CREATE INDEX IF NOT EXISTS idx_graph_fact_evidence_fact + ON graph_fact_evidence (fact_id); + diff --git a/sql/tables/020_graph_predicates.sql b/sql/tables/020_graph_predicates.sql new file mode 100644 index 00000000..626868b6 --- /dev/null +++ b/sql/tables/020_graph_predicates.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS graph_predicates ( + predicate_id uuid PRIMARY KEY, + scope_key text NOT NULL, + tenant_id text NULL, + project_id text NULL, + canonical text NOT NULL, + canonical_norm text NOT NULL, + cardinality text NOT NULL, + status text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + CONSTRAINT graph_predicates_cardinality_check + CHECK (cardinality IN ('single', 'multi')), + CONSTRAINT graph_predicates_status_check + CHECK (status IN ('pending', 'active', 'deprecated')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_predicates_scope_canonical_norm + ON graph_predicates (scope_key, canonical_norm); + +CREATE INDEX IF NOT EXISTS idx_graph_predicates_tenant_project_status + ON graph_predicates (tenant_id, project_id, status); + diff --git a/sql/tables/021_graph_predicate_aliases.sql b/sql/tables/021_graph_predicate_aliases.sql new file mode 100644 index 00000000..fca0a420 --- /dev/null +++ b/sql/tables/021_graph_predicate_aliases.sql @@ -0,0 +1,18 @@ +CREATE TABLE IF NOT EXISTS graph_predicate_aliases ( + alias_id uuid PRIMARY KEY, + predicate_id uuid NOT NULL REFERENCES graph_predicates(predicate_id) ON DELETE CASCADE, + scope_key text NOT NULL, + alias text NOT NULL, + alias_norm text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_predicate_aliases_scope_alias_norm + ON graph_predicate_aliases (scope_key, alias_norm); + +CREATE INDEX IF NOT EXISTS idx_graph_predicate_aliases_predicate + ON graph_predicate_aliases (predicate_id); + +CREATE INDEX IF NOT EXISTS idx_graph_predicate_aliases_alias_norm + ON graph_predicate_aliases (alias_norm); + diff --git a/sql/tables/022_graph_fact_supersessions.sql b/sql/tables/022_graph_fact_supersessions.sql new file mode 100644 index 00000000..ef53e1c5 --- /dev/null +++ b/sql/tables/022_graph_fact_supersessions.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS graph_fact_supersessions ( + supersession_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + from_fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE, + to_fact_id uuid NOT NULL REFERENCES graph_facts(fact_id) ON DELETE CASCADE, + note_id uuid NOT NULL REFERENCES memory_notes(note_id) ON DELETE CASCADE, + effective_at timestamptz NOT NULL, + created_at timestamptz NOT NULL DEFAULT now() +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_graph_fact_supersessions_from_to_note + ON graph_fact_supersessions (from_fact_id, to_fact_id, note_id); + +CREATE INDEX IF NOT EXISTS idx_graph_fact_supersessions_from_fact + ON graph_fact_supersessions (from_fact_id); +CREATE INDEX IF NOT EXISTS idx_graph_fact_supersessions_to_fact + ON graph_fact_supersessions (to_fact_id); +CREATE INDEX IF NOT EXISTS idx_graph_fact_supersessions_note + ON graph_fact_supersessions (note_id); + diff --git a/sql/tables/023_memory_ingest_decisions.sql b/sql/tables/023_memory_ingest_decisions.sql new file mode 100644 index 00000000..b08843c6 --- /dev/null +++ b/sql/tables/023_memory_ingest_decisions.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS memory_ingest_decisions ( + decision_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + scope text NOT NULL, + pipeline text NOT NULL, + note_type text NOT NULL, + note_key text NULL, + note_id uuid NULL, + note_version_id uuid NULL, + base_decision text NOT NULL, + policy_decision text NOT NULL, + note_op text NOT NULL, + reason_code text NULL, + details jsonb NOT NULL DEFAULT '{}'::jsonb, + ts timestamptz NOT NULL DEFAULT now(), + CONSTRAINT ck_memory_ingest_decisions_pipeline + CHECK (pipeline IN ('add_note', 'add_event')), + CONSTRAINT ck_memory_ingest_decisions_base_decision + CHECK (base_decision IN ('remember', 'update', 'ignore', 'reject')), + CONSTRAINT ck_memory_ingest_decisions_policy_decision + CHECK (policy_decision IN ('remember', 'update', 'ignore', 'reject')), + CONSTRAINT ck_memory_ingest_decisions_note_op + CHECK (note_op IN ('ADD', 'UPDATE', 'NONE', 'DELETE', 'REJECTED')) +); + +ALTER TABLE memory_ingest_decisions + ADD COLUMN IF NOT EXISTS note_version_id uuid NULL; + +CREATE INDEX IF NOT EXISTS idx_memory_ingest_decisions_context + ON memory_ingest_decisions (tenant_id, project_id, agent_id, ts desc); +CREATE INDEX IF NOT EXISTS idx_memory_ingest_decisions_note_id + ON memory_ingest_decisions (note_id); +CREATE INDEX IF NOT EXISTS idx_memory_ingest_decisions_note_version_id + ON memory_ingest_decisions (note_version_id); +CREATE INDEX IF NOT EXISTS idx_memory_ingest_decisions_policy_decision + ON memory_ingest_decisions (policy_decision); +CREATE INDEX IF NOT EXISTS idx_memory_ingest_decisions_pipeline + ON memory_ingest_decisions (pipeline); diff --git a/sql/tables/024_memory_space_grants.sql b/sql/tables/024_memory_space_grants.sql new file mode 100644 index 00000000..dd336fce --- /dev/null +++ b/sql/tables/024_memory_space_grants.sql @@ -0,0 +1,50 @@ +CREATE TABLE IF NOT EXISTS memory_space_grants ( + grant_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + scope text NOT NULL, + space_owner_agent_id text NOT NULL, + grantee_kind text NOT NULL, + grantee_agent_id text NULL, + granted_by_agent_id text NOT NULL, + granted_at timestamptz NOT NULL DEFAULT now(), + revoked_by_agent_id text NULL, + revoked_at timestamptz NULL, + CONSTRAINT ck_memory_space_grants_scope + CHECK (scope IN ('project_shared', 'org_shared')), + CONSTRAINT ck_memory_space_grants_grantee_kind + CHECK (grantee_kind IN ('agent', 'project')), + CONSTRAINT ck_memory_space_grants_grantee_agent_id_by_kind + CHECK ( + (grantee_kind = 'agent' AND grantee_agent_id IS NOT NULL) + OR (grantee_kind = 'project' AND grantee_agent_id IS NULL) + ), + CONSTRAINT ck_memory_space_grants_owner_not_grantee_agent + CHECK (NOT (grantee_kind = 'agent' AND space_owner_agent_id = grantee_agent_id)) +); + +DROP INDEX IF EXISTS uq_memory_space_grants_active_grant; + +CREATE UNIQUE INDEX IF NOT EXISTS uq_memory_space_grants_active_agent_grant + ON memory_space_grants ( + tenant_id, + project_id, + scope, + space_owner_agent_id, + grantee_agent_id + ) + WHERE revoked_at IS NULL AND grantee_kind = 'agent'; + +CREATE UNIQUE INDEX IF NOT EXISTS uq_memory_space_grants_active_project_grant + ON memory_space_grants ( + tenant_id, + project_id, + scope, + space_owner_agent_id + ) + WHERE revoked_at IS NULL AND grantee_kind = 'project'; + +CREATE INDEX IF NOT EXISTS idx_memory_space_grants_lookup_by_grantee + ON memory_space_grants (tenant_id, project_id, grantee_kind, grantee_agent_id, scope); +CREATE INDEX IF NOT EXISTS idx_memory_space_grants_lookup_by_owner + ON memory_space_grants (tenant_id, project_id, scope, space_owner_agent_id); diff --git a/sql/tables/025_doc_documents.sql b/sql/tables/025_doc_documents.sql new file mode 100644 index 00000000..c698bd26 --- /dev/null +++ b/sql/tables/025_doc_documents.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS doc_documents ( + doc_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + scope text NOT NULL, + doc_type text NOT NULL DEFAULT 'knowledge', + status text NOT NULL, + title text NULL, + source_ref jsonb NULL, + content text NOT NULL, + content_bytes int NOT NULL, + content_hash text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE doc_documents + ADD COLUMN IF NOT EXISTS doc_type text NOT NULL DEFAULT 'knowledge'; + +ALTER TABLE doc_documents + DROP CONSTRAINT IF EXISTS ck_doc_documents_scope; +ALTER TABLE doc_documents + ADD CONSTRAINT ck_doc_documents_scope + CHECK (scope IN ('agent_private', 'project_shared', 'org_shared')); + +ALTER TABLE doc_documents + DROP CONSTRAINT IF EXISTS ck_doc_documents_doc_type; +ALTER TABLE doc_documents + ADD CONSTRAINT ck_doc_documents_doc_type + CHECK (doc_type IN ('knowledge', 'chat', 'search', 'dev')); + +ALTER TABLE doc_documents + DROP CONSTRAINT IF EXISTS ck_doc_documents_status; +ALTER TABLE doc_documents + ADD CONSTRAINT ck_doc_documents_status + CHECK (status IN ('active', 'deleted')); + +CREATE INDEX IF NOT EXISTS idx_doc_documents_tenant_project_scope_status_updated + ON doc_documents (tenant_id, project_id, scope, status, updated_at DESC); diff --git a/sql/tables/026_doc_chunks.sql b/sql/tables/026_doc_chunks.sql new file mode 100644 index 00000000..6f906000 --- /dev/null +++ b/sql/tables/026_doc_chunks.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS doc_chunks ( + chunk_id uuid PRIMARY KEY, + doc_id uuid NOT NULL REFERENCES doc_documents(doc_id) ON DELETE CASCADE, + chunk_index int NOT NULL, + start_offset int NOT NULL, + end_offset int NOT NULL, + chunk_text text NOT NULL, + chunk_hash text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE doc_chunks + DROP CONSTRAINT IF EXISTS ck_doc_chunks_offsets; +ALTER TABLE doc_chunks + ADD CONSTRAINT ck_doc_chunks_offsets + CHECK (start_offset >= 0 AND end_offset >= start_offset); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_doc_chunks_doc_index + ON doc_chunks (doc_id, chunk_index); + +CREATE INDEX IF NOT EXISTS idx_doc_chunks_doc_id + ON doc_chunks (doc_id); + diff --git a/sql/tables/027_doc_chunk_embeddings.sql b/sql/tables/027_doc_chunk_embeddings.sql new file mode 100644 index 00000000..5d132bcc --- /dev/null +++ b/sql/tables/027_doc_chunk_embeddings.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS doc_chunk_embeddings ( + chunk_id uuid NOT NULL REFERENCES doc_chunks(chunk_id) ON DELETE CASCADE, + embedding_version text NOT NULL, + embedding_dim int NOT NULL, + vec vector(<VECTOR_DIM>) NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (chunk_id, embedding_version) +); + diff --git a/sql/tables/028_doc_indexing_outbox.sql b/sql/tables/028_doc_indexing_outbox.sql new file mode 100644 index 00000000..ecabba4d --- /dev/null +++ b/sql/tables/028_doc_indexing_outbox.sql @@ -0,0 +1,33 @@ +CREATE TABLE IF NOT EXISTS doc_indexing_outbox ( + outbox_id uuid PRIMARY KEY, + doc_id uuid NOT NULL REFERENCES doc_documents(doc_id) ON DELETE CASCADE, + chunk_id uuid NOT NULL REFERENCES doc_chunks(chunk_id) ON DELETE CASCADE, + op text NOT NULL, + embedding_version text NOT NULL, + status text NOT NULL, + attempts int NOT NULL DEFAULT 0, + last_error text NULL, + available_at timestamptz NOT NULL DEFAULT now(), + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE doc_indexing_outbox + DROP CONSTRAINT IF EXISTS ck_doc_indexing_outbox_op; +ALTER TABLE doc_indexing_outbox + ADD CONSTRAINT ck_doc_indexing_outbox_op + CHECK (op IN ('UPSERT', 'DELETE')); + +ALTER TABLE doc_indexing_outbox + DROP CONSTRAINT IF EXISTS ck_doc_indexing_outbox_status; +ALTER TABLE doc_indexing_outbox + ADD CONSTRAINT ck_doc_indexing_outbox_status + CHECK (status IN ('PENDING', 'CLAIMED', 'DONE', 'FAILED')); + +CREATE INDEX IF NOT EXISTS idx_doc_outbox_status_available + ON doc_indexing_outbox (status, available_at); +CREATE INDEX IF NOT EXISTS idx_doc_outbox_doc_op_status + ON doc_indexing_outbox (doc_id, op, status); +CREATE INDEX IF NOT EXISTS idx_doc_outbox_chunk_op_status + ON doc_indexing_outbox (chunk_id, op, status); + diff --git a/sql/tables/029_memory_ingestion_profiles.sql b/sql/tables/029_memory_ingestion_profiles.sql new file mode 100644 index 00000000..6004406f --- /dev/null +++ b/sql/tables/029_memory_ingestion_profiles.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS memory_ingestion_profiles ( + tenant_id text NOT NULL, + project_id text NOT NULL, + pipeline text NOT NULL, + profile_id text NOT NULL, + version integer NOT NULL, + profile jsonb NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + created_by text NOT NULL DEFAULT 'system', + CONSTRAINT pk_memory_ingestion_profiles + PRIMARY KEY (tenant_id, project_id, pipeline, profile_id, version), + CONSTRAINT ck_memory_ingestion_profiles_pipeline + CHECK (pipeline IN ('add_event')), + CONSTRAINT ck_memory_ingestion_profiles_version + CHECK (version > 0), + CONSTRAINT ck_memory_ingestion_profiles_profile + CHECK (jsonb_typeof(profile) = 'object') +); + +CREATE INDEX IF NOT EXISTS idx_memory_ingestion_profiles_lookup + ON memory_ingestion_profiles (tenant_id, project_id, pipeline, profile_id, version DESC); diff --git a/sql/tables/030_memory_ingestion_profile_defaults.sql b/sql/tables/030_memory_ingestion_profile_defaults.sql new file mode 100644 index 00000000..99f40b36 --- /dev/null +++ b/sql/tables/030_memory_ingestion_profile_defaults.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS memory_ingestion_profile_defaults ( + tenant_id text NOT NULL, + project_id text NOT NULL, + pipeline text NOT NULL, + profile_id text NOT NULL, + version integer NULL, + updated_at timestamptz NOT NULL DEFAULT now(), + CONSTRAINT pk_memory_ingestion_profile_defaults + PRIMARY KEY (tenant_id, project_id, pipeline), + CONSTRAINT ck_memory_ingestion_profile_defaults_pipeline + CHECK (pipeline IN ('add_event')) +); + +CREATE INDEX IF NOT EXISTS idx_memory_ingestion_profile_defaults_lookup + ON memory_ingestion_profile_defaults (tenant_id, project_id, pipeline); diff --git a/sql/tables/031_consolidation_runs.sql b/sql/tables/031_consolidation_runs.sql new file mode 100644 index 00000000..ca7504d2 --- /dev/null +++ b/sql/tables/031_consolidation_runs.sql @@ -0,0 +1,52 @@ +CREATE TABLE IF NOT EXISTS consolidation_runs ( + run_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + contract_schema text NOT NULL, + job_kind text NOT NULL, + status text NOT NULL, + input_refs jsonb NOT NULL, + source_snapshot jsonb NOT NULL, + lineage jsonb NOT NULL, + error jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + completed_at timestamptz NULL +); + +ALTER TABLE consolidation_runs + DROP CONSTRAINT IF EXISTS ck_consolidation_runs_status; +ALTER TABLE consolidation_runs + ADD CONSTRAINT ck_consolidation_runs_status + CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')); + +ALTER TABLE consolidation_runs + DROP CONSTRAINT IF EXISTS ck_consolidation_runs_input_refs; +ALTER TABLE consolidation_runs + ADD CONSTRAINT ck_consolidation_runs_input_refs + CHECK (jsonb_typeof(input_refs) = 'array'); + +ALTER TABLE consolidation_runs + DROP CONSTRAINT IF EXISTS ck_consolidation_runs_source_snapshot; +ALTER TABLE consolidation_runs + ADD CONSTRAINT ck_consolidation_runs_source_snapshot + CHECK (jsonb_typeof(source_snapshot) = 'object'); + +ALTER TABLE consolidation_runs + DROP CONSTRAINT IF EXISTS ck_consolidation_runs_lineage; +ALTER TABLE consolidation_runs + ADD CONSTRAINT ck_consolidation_runs_lineage + CHECK (jsonb_typeof(lineage) = 'object'); + +ALTER TABLE consolidation_runs + DROP CONSTRAINT IF EXISTS ck_consolidation_runs_error; +ALTER TABLE consolidation_runs + ADD CONSTRAINT ck_consolidation_runs_error + CHECK (jsonb_typeof(error) = 'object'); + +CREATE INDEX IF NOT EXISTS idx_consolidation_runs_context_created + ON consolidation_runs (tenant_id, project_id, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_consolidation_runs_status_updated + ON consolidation_runs (tenant_id, project_id, status, updated_at DESC); diff --git a/sql/tables/032_consolidation_proposals.sql b/sql/tables/032_consolidation_proposals.sql new file mode 100644 index 00000000..bdb470b4 --- /dev/null +++ b/sql/tables/032_consolidation_proposals.sql @@ -0,0 +1,116 @@ +CREATE TABLE IF NOT EXISTS consolidation_proposals ( + proposal_id uuid PRIMARY KEY, + run_id uuid NOT NULL REFERENCES consolidation_runs(run_id) ON DELETE CASCADE, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + contract_schema text NOT NULL, + proposal_kind text NOT NULL, + apply_intent text NOT NULL, + review_state text NOT NULL, + source_refs jsonb NOT NULL, + source_snapshot jsonb NOT NULL, + lineage jsonb NOT NULL, + diff jsonb NOT NULL, + confidence real NOT NULL, + unsupported_claim_flags jsonb NOT NULL DEFAULT '[]'::jsonb, + contradiction_markers jsonb NOT NULL DEFAULT '[]'::jsonb, + staleness_markers jsonb NOT NULL DEFAULT '[]'::jsonb, + target_ref jsonb NOT NULL DEFAULT '{}'::jsonb, + proposed_payload jsonb NOT NULL DEFAULT '{}'::jsonb, + reviewer_agent_id text NULL, + review_comment text NULL, + reviewed_at timestamptz NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_apply_intent; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_apply_intent + CHECK ( + apply_intent IN ( + 'create_derived_note', + 'update_derived_note', + 'create_derived_knowledge_page', + 'update_derived_knowledge_page', + 'create_derived_graph_view', + 'no_op' + ) + ); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_review_state; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_review_state + CHECK (review_state IN ('proposed', 'approved', 'rejected', 'applied', 'archived')); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_source_refs; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_source_refs + CHECK (jsonb_typeof(source_refs) = 'array'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_source_snapshot; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_source_snapshot + CHECK (jsonb_typeof(source_snapshot) = 'object'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_lineage; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_lineage + CHECK (jsonb_typeof(lineage) = 'object'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_diff; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_diff + CHECK (jsonb_typeof(diff) = 'object'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_confidence; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_confidence + CHECK (confidence >= 0.0 AND confidence <= 1.0); + +ALTER TABLE consolidation_proposals + ADD COLUMN IF NOT EXISTS unsupported_claim_flags jsonb NOT NULL DEFAULT '[]'::jsonb; + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_unsupported_claim_flags; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_unsupported_claim_flags + CHECK (jsonb_typeof(unsupported_claim_flags) = 'array'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_contradiction_markers; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_contradiction_markers + CHECK (jsonb_typeof(contradiction_markers) = 'array'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_staleness_markers; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_staleness_markers + CHECK (jsonb_typeof(staleness_markers) = 'array'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_target_ref; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_target_ref + CHECK (jsonb_typeof(target_ref) = 'object'); + +ALTER TABLE consolidation_proposals + DROP CONSTRAINT IF EXISTS ck_consolidation_proposals_proposed_payload; +ALTER TABLE consolidation_proposals + ADD CONSTRAINT ck_consolidation_proposals_proposed_payload + CHECK (jsonb_typeof(proposed_payload) = 'object'); + +CREATE INDEX IF NOT EXISTS idx_consolidation_proposals_run_created + ON consolidation_proposals (run_id, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_consolidation_proposals_context_state_created + ON consolidation_proposals (tenant_id, project_id, review_state, created_at DESC); diff --git a/sql/tables/033_consolidation_proposal_reviews.sql b/sql/tables/033_consolidation_proposal_reviews.sql new file mode 100644 index 00000000..1ce15c73 --- /dev/null +++ b/sql/tables/033_consolidation_proposal_reviews.sql @@ -0,0 +1,37 @@ +CREATE TABLE IF NOT EXISTS consolidation_proposal_reviews ( + review_id uuid PRIMARY KEY, + proposal_id uuid NOT NULL REFERENCES consolidation_proposals(proposal_id) ON DELETE CASCADE, + run_id uuid NOT NULL REFERENCES consolidation_runs(run_id) ON DELETE CASCADE, + tenant_id text NOT NULL, + project_id text NOT NULL, + reviewer_agent_id text NOT NULL, + action text NOT NULL, + from_review_state text NOT NULL, + to_review_state text NOT NULL, + review_comment text NULL, + created_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE consolidation_proposal_reviews + DROP CONSTRAINT IF EXISTS ck_consolidation_proposal_reviews_action; +ALTER TABLE consolidation_proposal_reviews + ADD CONSTRAINT ck_consolidation_proposal_reviews_action + CHECK (action IN ('approve', 'apply', 'discard', 'defer')); + +ALTER TABLE consolidation_proposal_reviews + DROP CONSTRAINT IF EXISTS ck_consolidation_proposal_reviews_from_state; +ALTER TABLE consolidation_proposal_reviews + ADD CONSTRAINT ck_consolidation_proposal_reviews_from_state + CHECK (from_review_state IN ('proposed', 'approved', 'rejected', 'applied', 'archived')); + +ALTER TABLE consolidation_proposal_reviews + DROP CONSTRAINT IF EXISTS ck_consolidation_proposal_reviews_to_state; +ALTER TABLE consolidation_proposal_reviews + ADD CONSTRAINT ck_consolidation_proposal_reviews_to_state + CHECK (to_review_state IN ('proposed', 'approved', 'rejected', 'applied', 'archived')); + +CREATE INDEX IF NOT EXISTS idx_consolidation_proposal_reviews_proposal_created + ON consolidation_proposal_reviews (proposal_id, created_at ASC, review_id ASC); + +CREATE INDEX IF NOT EXISTS idx_consolidation_proposal_reviews_context_created + ON consolidation_proposal_reviews (tenant_id, project_id, created_at DESC); diff --git a/sql/tables/034_consolidation_run_jobs.sql b/sql/tables/034_consolidation_run_jobs.sql new file mode 100644 index 00000000..600bf102 --- /dev/null +++ b/sql/tables/034_consolidation_run_jobs.sql @@ -0,0 +1,33 @@ +CREATE TABLE IF NOT EXISTS consolidation_run_jobs ( + job_id uuid PRIMARY KEY, + run_id uuid NOT NULL REFERENCES consolidation_runs(run_id) ON DELETE CASCADE, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + job_kind text NOT NULL, + status text NOT NULL, + payload jsonb NOT NULL, + attempts int NOT NULL DEFAULT 0, + last_error text NULL, + available_at timestamptz NOT NULL DEFAULT now(), + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE consolidation_run_jobs + DROP CONSTRAINT IF EXISTS ck_consolidation_run_jobs_status; +ALTER TABLE consolidation_run_jobs + ADD CONSTRAINT ck_consolidation_run_jobs_status + CHECK (status IN ('PENDING', 'CLAIMED', 'DONE', 'FAILED')); + +ALTER TABLE consolidation_run_jobs + DROP CONSTRAINT IF EXISTS ck_consolidation_run_jobs_payload; +ALTER TABLE consolidation_run_jobs + ADD CONSTRAINT ck_consolidation_run_jobs_payload + CHECK (jsonb_typeof(payload) = 'object'); + +CREATE INDEX IF NOT EXISTS idx_consolidation_run_jobs_status_available + ON consolidation_run_jobs (status, available_at); + +CREATE INDEX IF NOT EXISTS idx_consolidation_run_jobs_run_status + ON consolidation_run_jobs (run_id, status); diff --git a/sql/tables/035_knowledge_pages.sql b/sql/tables/035_knowledge_pages.sql new file mode 100644 index 00000000..a13f3cbe --- /dev/null +++ b/sql/tables/035_knowledge_pages.sql @@ -0,0 +1,54 @@ +CREATE TABLE IF NOT EXISTS knowledge_pages ( + page_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + page_kind text NOT NULL, + page_key text NOT NULL, + title text NOT NULL, + contract_schema text NOT NULL, + status text NOT NULL, + rebuild_source_hash text NOT NULL, + content_hash text NOT NULL, + source_coverage jsonb NOT NULL DEFAULT '{}'::jsonb, + source_snapshot jsonb NOT NULL DEFAULT '{}'::jsonb, + rebuild_metadata jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + rebuilt_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE knowledge_pages + DROP CONSTRAINT IF EXISTS ck_knowledge_pages_page_kind; +ALTER TABLE knowledge_pages + ADD CONSTRAINT ck_knowledge_pages_page_kind + CHECK (page_kind IN ('project', 'entity', 'concept', 'issue', 'decision')); + +ALTER TABLE knowledge_pages + DROP CONSTRAINT IF EXISTS ck_knowledge_pages_status; +ALTER TABLE knowledge_pages + ADD CONSTRAINT ck_knowledge_pages_status + CHECK (status IN ('active', 'stale', 'archived')); + +ALTER TABLE knowledge_pages + DROP CONSTRAINT IF EXISTS ck_knowledge_pages_source_coverage; +ALTER TABLE knowledge_pages + ADD CONSTRAINT ck_knowledge_pages_source_coverage + CHECK (jsonb_typeof(source_coverage) = 'object'); + +ALTER TABLE knowledge_pages + DROP CONSTRAINT IF EXISTS ck_knowledge_pages_source_snapshot; +ALTER TABLE knowledge_pages + ADD CONSTRAINT ck_knowledge_pages_source_snapshot + CHECK (jsonb_typeof(source_snapshot) = 'object'); + +ALTER TABLE knowledge_pages + DROP CONSTRAINT IF EXISTS ck_knowledge_pages_rebuild_metadata; +ALTER TABLE knowledge_pages + ADD CONSTRAINT ck_knowledge_pages_rebuild_metadata + CHECK (jsonb_typeof(rebuild_metadata) = 'object'); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_knowledge_pages_context_key + ON knowledge_pages (tenant_id, project_id, page_kind, page_key); + +CREATE INDEX IF NOT EXISTS idx_knowledge_pages_context_updated + ON knowledge_pages (tenant_id, project_id, updated_at DESC); diff --git a/sql/tables/036_knowledge_page_sections.sql b/sql/tables/036_knowledge_page_sections.sql new file mode 100644 index 00000000..0312f5e4 --- /dev/null +++ b/sql/tables/036_knowledge_page_sections.sql @@ -0,0 +1,32 @@ +CREATE TABLE IF NOT EXISTS knowledge_page_sections ( + section_id uuid PRIMARY KEY, + page_id uuid NOT NULL REFERENCES knowledge_pages(page_id) ON DELETE CASCADE, + section_key text NOT NULL, + heading text NOT NULL, + role text NOT NULL, + content text NOT NULL, + ordinal int NOT NULL, + citations jsonb NOT NULL DEFAULT '[]'::jsonb, + unsupported_reason text NULL, + content_hash text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE knowledge_page_sections + DROP CONSTRAINT IF EXISTS ck_knowledge_page_sections_citations; +ALTER TABLE knowledge_page_sections + ADD CONSTRAINT ck_knowledge_page_sections_citations + CHECK (jsonb_typeof(citations) = 'array'); + +ALTER TABLE knowledge_page_sections + DROP CONSTRAINT IF EXISTS ck_knowledge_page_sections_cited_or_unsupported; +ALTER TABLE knowledge_page_sections + ADD CONSTRAINT ck_knowledge_page_sections_cited_or_unsupported + CHECK (jsonb_array_length(citations) > 0 OR unsupported_reason IS NOT NULL); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_knowledge_page_sections_page_key + ON knowledge_page_sections (page_id, section_key); + +CREATE INDEX IF NOT EXISTS idx_knowledge_page_sections_page_ordinal + ON knowledge_page_sections (page_id, ordinal); diff --git a/sql/tables/037_knowledge_page_source_refs.sql b/sql/tables/037_knowledge_page_source_refs.sql new file mode 100644 index 00000000..d157c563 --- /dev/null +++ b/sql/tables/037_knowledge_page_source_refs.sql @@ -0,0 +1,37 @@ +CREATE TABLE IF NOT EXISTS knowledge_page_source_refs ( + ref_id uuid PRIMARY KEY, + page_id uuid NOT NULL REFERENCES knowledge_pages(page_id) ON DELETE CASCADE, + section_id uuid NULL REFERENCES knowledge_page_sections(section_id) ON DELETE CASCADE, + source_kind text NOT NULL, + source_id uuid NOT NULL, + source_status text NULL, + source_updated_at timestamptz NULL, + source_content_hash text NULL, + source_snapshot jsonb NOT NULL DEFAULT '{}'::jsonb, + citation_metadata jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE knowledge_page_source_refs + DROP CONSTRAINT IF EXISTS ck_knowledge_page_source_refs_source_kind; +ALTER TABLE knowledge_page_source_refs + ADD CONSTRAINT ck_knowledge_page_source_refs_source_kind + CHECK (source_kind IN ('note', 'event', 'relation', 'proposal')); + +ALTER TABLE knowledge_page_source_refs + DROP CONSTRAINT IF EXISTS ck_knowledge_page_source_refs_source_snapshot; +ALTER TABLE knowledge_page_source_refs + ADD CONSTRAINT ck_knowledge_page_source_refs_source_snapshot + CHECK (jsonb_typeof(source_snapshot) = 'object'); + +ALTER TABLE knowledge_page_source_refs + DROP CONSTRAINT IF EXISTS ck_knowledge_page_source_refs_citation_metadata; +ALTER TABLE knowledge_page_source_refs + ADD CONSTRAINT ck_knowledge_page_source_refs_citation_metadata + CHECK (jsonb_typeof(citation_metadata) = 'object'); + +CREATE INDEX IF NOT EXISTS idx_knowledge_page_source_refs_page + ON knowledge_page_source_refs (page_id, source_kind, source_id); + +CREATE INDEX IF NOT EXISTS idx_knowledge_page_source_refs_source + ON knowledge_page_source_refs (source_kind, source_id); diff --git a/sql/tables/038_knowledge_page_lint_findings.sql b/sql/tables/038_knowledge_page_lint_findings.sql new file mode 100644 index 00000000..e76a5aa2 --- /dev/null +++ b/sql/tables/038_knowledge_page_lint_findings.sql @@ -0,0 +1,33 @@ +CREATE TABLE IF NOT EXISTS knowledge_page_lint_findings ( + finding_id uuid PRIMARY KEY, + page_id uuid NOT NULL REFERENCES knowledge_pages(page_id) ON DELETE CASCADE, + section_id uuid NULL REFERENCES knowledge_page_sections(section_id) ON DELETE SET NULL, + finding_type text NOT NULL, + severity text NOT NULL, + source_kind text NULL, + source_id uuid NULL, + message text NOT NULL, + details jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE knowledge_page_lint_findings + DROP CONSTRAINT IF EXISTS ck_knowledge_page_lint_findings_severity; +ALTER TABLE knowledge_page_lint_findings + ADD CONSTRAINT ck_knowledge_page_lint_findings_severity + CHECK (severity IN ('info', 'warning', 'error')); + +ALTER TABLE knowledge_page_lint_findings + DROP CONSTRAINT IF EXISTS ck_knowledge_page_lint_findings_source_kind; +ALTER TABLE knowledge_page_lint_findings + ADD CONSTRAINT ck_knowledge_page_lint_findings_source_kind + CHECK (source_kind IS NULL OR source_kind IN ('note', 'event', 'relation', 'proposal')); + +ALTER TABLE knowledge_page_lint_findings + DROP CONSTRAINT IF EXISTS ck_knowledge_page_lint_findings_details; +ALTER TABLE knowledge_page_lint_findings + ADD CONSTRAINT ck_knowledge_page_lint_findings_details + CHECK (jsonb_typeof(details) = 'object'); + +CREATE INDEX IF NOT EXISTS idx_knowledge_page_lint_findings_page + ON knowledge_page_lint_findings (page_id, severity, created_at DESC); diff --git a/sql/tables/039_core_memory_blocks.sql b/sql/tables/039_core_memory_blocks.sql new file mode 100644 index 00000000..76ad8604 --- /dev/null +++ b/sql/tables/039_core_memory_blocks.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS core_memory_blocks ( + block_id uuid PRIMARY KEY, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + scope text NOT NULL, + key text NOT NULL, + title text NOT NULL, + content text NOT NULL, + source_ref jsonb NOT NULL, + status text NOT NULL, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now(), + CONSTRAINT ck_core_memory_blocks_scope + CHECK (scope IN ('agent_private', 'project_shared', 'org_shared')), + CONSTRAINT ck_core_memory_blocks_status + CHECK (status IN ('active', 'archived')), + CONSTRAINT ck_core_memory_blocks_source_ref_object + CHECK (jsonb_typeof(source_ref) = 'object') +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_core_memory_blocks_active_key + ON core_memory_blocks (tenant_id, project_id, agent_id, scope, key) + WHERE status = 'active'; + +CREATE INDEX IF NOT EXISTS idx_core_memory_blocks_scope_status + ON core_memory_blocks (tenant_id, project_id, scope, status); diff --git a/sql/tables/040_core_memory_block_attachments.sql b/sql/tables/040_core_memory_block_attachments.sql new file mode 100644 index 00000000..55fc0229 --- /dev/null +++ b/sql/tables/040_core_memory_block_attachments.sql @@ -0,0 +1,24 @@ +CREATE TABLE IF NOT EXISTS core_memory_block_attachments ( + attachment_id uuid PRIMARY KEY, + block_id uuid NOT NULL REFERENCES core_memory_blocks(block_id) ON DELETE CASCADE, + tenant_id text NOT NULL, + project_id text NOT NULL, + agent_id text NOT NULL, + read_profile text NOT NULL, + attached_by_agent_id text NOT NULL, + attached_at timestamptz NOT NULL DEFAULT now(), + detached_by_agent_id text NULL, + detached_at timestamptz NULL, + CONSTRAINT ck_core_memory_block_attachments_read_profile + CHECK (read_profile IN ('private_only', 'private_plus_project', 'all_scopes')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_core_memory_block_attachments_active + ON core_memory_block_attachments (tenant_id, project_id, agent_id, read_profile, block_id) + WHERE detached_at IS NULL; + +CREATE INDEX IF NOT EXISTS idx_core_memory_block_attachments_read + ON core_memory_block_attachments (tenant_id, project_id, agent_id, read_profile, detached_at); + +CREATE INDEX IF NOT EXISTS idx_core_memory_block_attachments_block + ON core_memory_block_attachments (block_id, detached_at); diff --git a/sql/tables/041_core_memory_block_events.sql b/sql/tables/041_core_memory_block_events.sql new file mode 100644 index 00000000..b6033847 --- /dev/null +++ b/sql/tables/041_core_memory_block_events.sql @@ -0,0 +1,30 @@ +CREATE TABLE IF NOT EXISTS core_memory_block_events ( + event_id uuid PRIMARY KEY, + block_id uuid NOT NULL REFERENCES core_memory_blocks(block_id) ON DELETE CASCADE, + attachment_id uuid NULL REFERENCES core_memory_block_attachments(attachment_id) ON DELETE SET NULL, + tenant_id text NOT NULL, + project_id text NOT NULL, + actor_agent_id text NOT NULL, + event_type text NOT NULL, + target_agent_id text NULL, + read_profile text NULL, + prev_snapshot jsonb NULL, + new_snapshot jsonb NULL, + reason text NOT NULL, + ts timestamptz NOT NULL DEFAULT now(), + CONSTRAINT ck_core_memory_block_events_event_type + CHECK ( + event_type IN ( + 'block_created', + 'block_updated', + 'attachment_added', + 'attachment_removed' + ) + ) +); + +CREATE INDEX IF NOT EXISTS idx_core_memory_block_events_block_ts + ON core_memory_block_events (block_id, ts); + +CREATE INDEX IF NOT EXISTS idx_core_memory_block_events_attachment_ts + ON core_memory_block_events (attachment_id, ts);