diff --git a/Makefile.toml b/Makefile.toml index 02654763..0f76e427 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -1114,9 +1114,9 @@ args = [ "--", "run", "--cursor", - "docs/research/external_memory_pattern_radar/cursor.json", + "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", "--summary", - "docs/research/external_memory_pattern_radar/latest.md", + "docs/evidence/external_memory_pattern_radar_latest.md", ] [tasks.external-memory-radar-artifact] @@ -1138,7 +1138,7 @@ args = [ "--", "run", "--cursor", - "docs/research/external_memory_pattern_radar/cursor.json", + "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", "--out-cursor", "tmp/external-memory-pattern-radar/cursor.json", "--summary", @@ -1181,7 +1181,7 @@ args = [ "--mode", "offline", "--cursor", - "docs/research/external_memory_pattern_radar/cursor.json", + "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", "--out-cursor", "tmp/external-memory-pattern-radar/cursor.json", "--summary", @@ -1215,7 +1215,7 @@ args = [ "--", "validate", "--cursor", - "docs/research/external_memory_pattern_radar/cursor.json", + "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", ] # Smoke diff --git a/README.md b/README.md index 5649d0d6..3628775b 100644 --- a/README.md +++ b/README.md @@ -36,11 +36,11 @@ ELF is a memory service for LLM agents that stores short, evidence-linked facts ## Quickstart -Use the canonical setup guide: +Use the canonical setup runbook: -- `docs/guide/getting_started.md` +- `docs/runbook/getting_started.md` - For single-user production operation, backup, restore, and Qdrant rebuild, use - [docs/guide/single_user_production.md](docs/guide/single_user_production.md). + [docs/runbook/single_user_production.md](docs/runbook/single_user_production.md). Fast path: @@ -259,24 +259,24 @@ provider-backed ELF evidence was required. 
Detailed evidence and interpretation: -- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) -- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) -- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) -- [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) -- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) -- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) -- [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md) -- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) -- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) -- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) -- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) -- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) -- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) -- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) -- [Proactive Brief Scoring Report - June 16, 
2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) -- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) -- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) -- [Single-User Production Runbook](docs/guide/single_user_production.md) +- [Live Baseline Benchmark Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md) +- [Real-World Comparison Report - June 10, 2026](docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md) +- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md) +- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md) +- [qmd and OpenViking Strength-Profile Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) +- 
[First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) +- [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) +- [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: [Real-World Agent Memory Benchmark v1](docs/spec/real_world_agent_memory_benchmark_v1.md). This contract defines job-level suites for agent work. `cargo make real-world-memory` @@ -341,31 +341,33 @@ Project signature strengths (what each does especially well): Detailed comparison, mechanism-level analysis, and source map: -- [Live Baseline Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-live-baseline-report.md) -- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-corpus-report.md) -- [Production Adoption Gate Report - June 9, 2026](docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md) -- [Real-World Comparison Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md) -- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md) -- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md) -- [Competitor Strength Evidence Matrix - June 11, 2026](docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md) -- [Temporal History Competitor Gap Report - June 11, 
2026](docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) -- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) -- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) -- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) -- [Capture/Write-Policy Live Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md) -- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) -- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) -- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) -- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) -- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) -- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) -- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) -- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) -- [Detailed External Comparison](docs/guide/research/comparison_external_projects.md) -- [Research Projects Inventory](docs/guide/research/research_projects_inventory.md) -- [Agent Memory Selection Research Run](docs/research/2026-06-08-agent-memory-selection.json) -- [Real-World Benchmark Dimension Research 
Run](docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json) -- [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json) +- [Live Baseline Benchmark Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-live-baseline-report.md) +- [Synthetic Production Corpus Benchmark Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-production-corpus-report.md) +- [Production Adoption Gate Report - June 9, 2026](docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md) +- [Real-World Comparison Report - June 10, 2026](docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md) +- [Live Real-World Adapter Sweep Report - June 10, 2026](docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md) +- [Post-Adapter Production Adoption Refresh - June 10, 2026](docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md) +- [Competitor Strength Evidence Matrix - June 11, 2026](docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md) +- [Temporal History Competitor Gap Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md) +- [ELF/qmd Trace Replay Diagnostics Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md) +- [Graph/RAG Scored Smoke Adapter Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md) +- [mem0/OpenMemory History and UI Export Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md) +- [Capture/Write-Policy Live Report - June 11, 2026](docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md) +- [Live Consolidation Proposal Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md) +- [First-Generation OSS Continuity and Source-Store Report - 
June 11, 2026](docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) +- [Live Temporal Reconciliation Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) +- [Proactive Brief Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) +- [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) +- [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md) +- [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md) +- [Detailed External Comparison](docs/evidence/external_memory/comparison_external_projects.md) +- [Research Projects Inventory](docs/evidence/external_memory/research_projects_inventory.md) +- [Agent Memory Selection Decision](docs/decisions/2026-06-08-agent-memory-selection.md) +- [Real-World Agent Memory Benchmark Spec](docs/spec/real_world_agent_memory_benchmark_v1.md) +- [Graph/RAG Adapter Follow-Up Research](docs/research/graph_rag_adapter_followup.md) +- [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md) +- [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) Latest real-world benchmark report: June 16, 2026. Latest external research refresh: June 11, 2026; June 16 adds live temporal reconciliation, live consolidation @@ -374,17 +376,18 @@ self-check evidence, and fixture-backed scheduled-memory task scoring. 
## Documentation - Start here: `docs/index.md` -- Operational guide index: `docs/guide/index.md` +- Runbook index: `docs/runbook/index.md` - Single-user production runbook: - [docs/guide/single_user_production.md](docs/guide/single_user_production.md) -- Benchmarking guides and reports: `docs/guide/benchmarking/index.md` -- Research index: `docs/guide/research/index.md` + [docs/runbook/single_user_production.md](docs/runbook/single_user_production.md) +- Benchmarking runbooks: `docs/runbook/benchmarking/index.md` +- Benchmarking evidence: `docs/evidence/benchmarking/index.md` +- External memory evidence: `docs/evidence/external_memory/index.md` - Specifications: `docs/spec/index.md` - System contract: `docs/spec/system_elf_memory_service_v2.md` - Ingest policy: `policy_decision` values (`remember`, `update`, `ignore`, `reject`) are returned for each note result in `add_note` and `add_event`. - All ingest decisions are also written to `memory_ingest_decisions` with policy inputs and thresholds for auditability. -- Evaluation guide: `docs/guide/evaluation.md` -- Integration testing: `docs/guide/integration-testing.md` +- Evaluation runbook: `docs/runbook/evaluation.md` +- Integration testing: `docs/runbook/integration-testing.md` ## Development @@ -394,7 +397,7 @@ cargo make check cargo make test-rust ``` -For integration and E2E workflows, use `docs/guide/getting_started.md` and `docs/guide/integration-testing.md`. +For integration and E2E workflows, use `docs/runbook/getting_started.md` and `docs/runbook/integration-testing.md`. 
## Support Me diff --git a/docs/guide/eval-sample.json b/apps/elf-eval/fixtures/evaluation/eval-sample.json similarity index 100% rename from docs/guide/eval-sample.json rename to apps/elf-eval/fixtures/evaluation/eval-sample.json diff --git a/docs/guide/eval-structured-facts-sample.json b/apps/elf-eval/fixtures/evaluation/eval-structured-facts-sample.json similarity index 100% rename from docs/guide/eval-structured-facts-sample.json rename to apps/elf-eval/fixtures/evaluation/eval-structured-facts-sample.json diff --git a/docs/research/external_memory_pattern_radar/cursor.json b/apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json similarity index 91% rename from docs/research/external_memory_pattern_radar/cursor.json rename to apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json index 2ce50573..936d9086 100644 --- a/docs/research/external_memory_pattern_radar/cursor.json +++ b/apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json @@ -3,9 +3,9 @@ "cadence": "weekly", "generated_at": "2026-06-10T08:32:00.790878Z", "source_docs": [ - "docs/guide/research/external_memory_improvement_plan.md", - "docs/guide/research/comparison_external_projects.md", - "docs/guide/research/research_projects_inventory.md", + "docs/evidence/external_memory/external_memory_improvement_plan.md", + "docs/evidence/external_memory/comparison_external_projects.md", + "docs/evidence/external_memory/research_projects_inventory.md", "docs/spec/external_memory_pattern_radar_v1.md" ], "projects": [ @@ -20,14 +20,14 @@ "rw.lifecycle-staleness" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", - "docs/research/2026-06-08-agent-memory-selection.json", - "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + "docs/evidence/external_memory/comparison_external_projects.md", + "docs/decisions/2026-06-08-agent-memory-selection.md", + "docs/spec/real_world_agent_memory_benchmark_v1.md" ], "coverage_evidence": [ { 
"label": "adapter evidence boundary", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "agentmemory is tracked for operator continuity and resume evidence, but current benchmark evidence does not prove durable lifecycle quality." } ], @@ -58,13 +58,13 @@ "rw.operator-continuity" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", - "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + "docs/evidence/external_memory/comparison_external_projects.md", + "docs/spec/real_world_agent_memory_benchmark_v1.md" ], "coverage_evidence": [ { "label": "lifecycle and graph reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "mem0 remains the ecosystem and entity-scoped lifecycle reference while ELF keeps deterministic evidence-bound writes." } ], @@ -95,13 +95,13 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", + "docs/evidence/external_memory/comparison_external_projects.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "retrieval-debug baseline", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "qmd is the strongest local retrieval-debug reference and has targeted live real-world adapter evidence." 
} ], @@ -132,13 +132,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", - "docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json" + "docs/evidence/external_memory/comparison_external_projects.md", + "docs/spec/real_world_agent_memory_benchmark_v1.md" ], "coverage_evidence": [ { "label": "progressive disclosure UX reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "claude-mem remains a product reference for progressive disclosure and viewer workflow, not a proven ELF replacement." } ], @@ -169,13 +169,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", + "docs/evidence/external_memory/comparison_external_projects.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "trajectory reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "OpenViking informs hierarchical context trajectory while current adapter evidence remains incomplete." } ], @@ -205,13 +205,13 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", + "docs/evidence/external_memory/comparison_external_projects.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "temporal graph reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "Graphiti/Zep remains the broader temporal graph workflow reference for current-versus-historical facts." 
} ], @@ -241,13 +241,13 @@ "rw.operator-continuity" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", + "docs/evidence/external_memory/comparison_external_projects.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "core versus archival memory reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "Letta informs core memory block ergonomics while ELF keeps archival notes source-of-truth bound." } ], @@ -278,13 +278,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/research_projects_inventory.md", + "docs/evidence/external_memory/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "LightRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." } ], @@ -315,13 +315,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/research_projects_inventory.md", + "docs/evidence/external_memory/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "GraphRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." 
} ], @@ -352,13 +352,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/research_projects_inventory.md", + "docs/evidence/external_memory/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "RAGFlow is a D0 watch item with a research gate; no adapter strength claim is allowed yet." } ], @@ -389,12 +389,12 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md" + "docs/evidence/external_memory/comparison_external_projects.md" ], "coverage_evidence": [ { "label": "markdown-first reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "memsearch remains a source-transparency reference while current adapter evidence is incomplete or wrong-result typed." } ], @@ -424,12 +424,12 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md" + "docs/evidence/external_memory/comparison_external_projects.md" ], "coverage_evidence": [ { "label": "replay regression reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "LangGraph informs replay and checkpoint regression workflows; ELF traces do not replace full agent-state replay." 
} ], @@ -459,13 +459,13 @@ "rw.retrieval-debug" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md", + "docs/evidence/external_memory/comparison_external_projects.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" ], "coverage_evidence": [ { "label": "typed graph ergonomics reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "nanograph is a typed graph DX reference, not a full memory backend benchmark claim." } ], @@ -495,12 +495,12 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md" + "docs/evidence/external_memory/comparison_external_projects.md" ], "coverage_evidence": [ { "label": "derived knowledge pages reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "llm-wiki informs rebuildable cited knowledge pages and lint/repair loops." } ], @@ -530,12 +530,12 @@ "rw.operator-continuity" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md" + "docs/evidence/external_memory/comparison_external_projects.md" ], "coverage_evidence": [ { "label": "operational brain reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "gbrain informs current-truth and timeline presentation while ELF source notes remain authoritative." 
} ], @@ -562,12 +562,12 @@ "rw.resume-evidence" ], "primary_references": [ - "docs/guide/research/comparison_external_projects.md" + "docs/evidence/external_memory/comparison_external_projects.md" ], "coverage_evidence": [ { "label": "graph-compressed navigation reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "graphify informs rebuildable graph reports and pre-search guidance without replacing ELF storage." } ], @@ -612,7 +612,7 @@ "duplicate_coverage_evidence": [ { "label": "adapter evidence boundary", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "agentmemory is tracked for operator continuity and resume evidence, but current benchmark evidence does not prove durable lifecycle quality." } ], @@ -648,7 +648,7 @@ "duplicate_coverage_evidence": [ { "label": "lifecycle and graph reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "mem0 remains the ecosystem and entity-scoped lifecycle reference while ELF keeps deterministic evidence-bound writes." } ], @@ -684,7 +684,7 @@ "duplicate_coverage_evidence": [ { "label": "retrieval-debug baseline", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "qmd is the strongest local retrieval-debug reference and has targeted live real-world adapter evidence." 
} ], @@ -720,7 +720,7 @@ "duplicate_coverage_evidence": [ { "label": "progressive disclosure UX reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "claude-mem remains a product reference for progressive disclosure and viewer workflow, not a proven ELF replacement." } ], @@ -756,7 +756,7 @@ "duplicate_coverage_evidence": [ { "label": "trajectory reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "OpenViking informs hierarchical context trajectory while current adapter evidence remains incomplete." } ], @@ -792,7 +792,7 @@ "duplicate_coverage_evidence": [ { "label": "temporal graph reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "Graphiti/Zep remains the broader temporal graph workflow reference for current-versus-historical facts." } ], @@ -828,7 +828,7 @@ "duplicate_coverage_evidence": [ { "label": "core versus archival memory reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "Letta informs core memory block ergonomics while ELF keeps archival notes source-of-truth bound." } ], @@ -864,7 +864,7 @@ "duplicate_coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "LightRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." 
} ], @@ -900,7 +900,7 @@ "duplicate_coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "GraphRAG is a D0 watch item with a research gate; no adapter strength claim is allowed yet." } ], @@ -936,7 +936,7 @@ "duplicate_coverage_evidence": [ { "label": "research gate", - "path": "docs/guide/research/research_projects_inventory.md", + "path": "docs/evidence/external_memory/research_projects_inventory.md", "summary": "RAGFlow is a D0 watch item with a research gate; no adapter strength claim is allowed yet." } ], @@ -972,7 +972,7 @@ "duplicate_coverage_evidence": [ { "label": "markdown-first reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "memsearch remains a source-transparency reference while current adapter evidence is incomplete or wrong-result typed." } ], @@ -1008,7 +1008,7 @@ "duplicate_coverage_evidence": [ { "label": "replay regression reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "LangGraph informs replay and checkpoint regression workflows; ELF traces do not replace full agent-state replay." } ], @@ -1044,7 +1044,7 @@ "duplicate_coverage_evidence": [ { "label": "typed graph ergonomics reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "nanograph is a typed graph DX reference, not a full memory backend benchmark claim." 
} ], @@ -1080,7 +1080,7 @@ "duplicate_coverage_evidence": [ { "label": "derived knowledge pages reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "llm-wiki informs rebuildable cited knowledge pages and lint/repair loops." } ], @@ -1116,7 +1116,7 @@ "duplicate_coverage_evidence": [ { "label": "operational brain reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "gbrain informs current-truth and timeline presentation while ELF source notes remain authoritative." } ], @@ -1151,7 +1151,7 @@ "duplicate_coverage_evidence": [ { "label": "graph-compressed navigation reference", - "path": "docs/guide/research/comparison_external_projects.md", + "path": "docs/evidence/external_memory/comparison_external_projects.md", "summary": "graphify informs rebuildable graph reports and pre-search guidance without replacing ELF storage." 
} ], diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 0ba49733..00490fc1 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -373,7 +373,7 @@ "result": { "status": "pass", "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", - "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" }, "capabilities": [ { @@ -973,8 +973,8 @@ ], "evidence": [ { - "kind": "guide", - "ref": "docs/guide/research/agentmemory_adapter.md", + "kind": "evidence", + "ref": "docs/evidence/external_memory/agentmemory_adapter.md", "status": "real" }, { @@ -1107,9 +1107,9 @@ "status": "pass", "elf_position": "loses", "comparison_outcome": "loss", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. 
The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "entity_scoped_personalization", @@ -1117,9 +1117,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. 
This is a measured tie on the current scoped-preference surface.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" }, { "scenario_id": "delete_audit_readback", @@ -1127,9 +1127,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. 
The current measured delete-audit comparison is a tie.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "local_get_all_export_readback", @@ -1333,7 +1333,7 @@ "result": { "status": "wrong_result", "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", - "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" }, "capabilities": [ { @@ -1593,7 +1593,7 @@ "result": { "status": "not_encoded", "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", - "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" }, "capabilities": [ { @@ -1678,7 +1678,7 @@ "result": { "status": "blocked", "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", - "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" }, "capabilities": [ { diff --git a/docs/research/2026-06-11-capture-write-policy-live-report.json 
b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-capture-write-policy-live-report.json similarity index 100% rename from docs/research/2026-06-11-capture-write-policy-live-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-capture-write-policy-live-report.json diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json similarity index 88% rename from docs/research/2026-06-11-competitor-strength-adoption-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json index 6404bc35..01f0831e 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json @@ -39,7 +39,7 @@ "source_artifacts": [ { "command": "cargo make real-world-memory", - "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", "claim": "ELF fixture aggregate covers 60 jobs across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs, 1 passing memory_summary source-trace job, 4 passing proactive_brief suggestion jobs plus 1 private-corpus blocker, and 4 passing scheduled_memory task-readback jobs plus 1 private/provider scheduler blocker." 
}, { @@ -64,12 +64,12 @@ }, { "command": "cargo make real-world-memory-live-adapters", - "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", "claim": "ELF live service adapter reports 22 pass, 5 wrong_result, 2 blocked, and 11 not_encoded jobs; qmd reports 17 pass, 6 wrong_result, 2 blocked, and 15 not_encoded jobs." }, { "command": "cargo make real-world-memory-live-adapters", - "artifact": "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md", "claim": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage; qmd remains not_encoded, while agentmemory and claude-mem capture breadth are blocked until durable hook/viewer evidence exists." }, { @@ -79,27 +79,27 @@ }, { "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", - "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", "claim": "mem0/OpenMemory and memsearch pass basic local baseline smokes; agentmemory remains lifecycle_fail and claude-mem remains wrong_result on same-corpus retrieval." }, { "command": "cargo make real-world-first-generation-oss", - "artifact": "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", "claim": "First-generation OSS fixture slice reports 6 jobs: 4 pass, 2 blocked, full evidence/source-ref/quote coverage, and manifest scenario outcomes across win, tie, loss, not_tested, blocked, and non_goal without promoting smoke evidence into live suite passes." 
}, { "command": "cargo make openmemory-ui-export-readback", - "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", "claim": "mem0 local OSS passes preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history; OpenMemory export-helper setup emits a separate blocked artifact with DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER, and hosted Platform export remains non-goal." }, { "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", - "artifact": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", "claim": "Graphiti/Zep temporal smoke remains blocked by provider_api_key_missing when live provider execution is explicitly enabled without credentials." }, { "command": "cargo make smoke-graphify-docker-graph-report", - "artifact": "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", "claim": "graphify reaches tiny Docker graph/report scoring but remains wrong_result; broad graph/RAG quality is not tested." }, { @@ -109,12 +109,12 @@ }, { "command": "cargo make baseline-production-synthetic, cargo make baseline-backfill-docker, backup/restore plus Qdrant rebuild proof", - "artifact": "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", + "artifact": "docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md", "claim": "ELF has provider synthetic, stress, backfill, restore, and rebuild evidence, while private-corpus proof remains blocked by missing operator-owned manifest." 
}, { "command": "ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker plus ELF trace-bundle and qmd CLI replay commands", - "artifact": "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", "claim": "Retrieval correctness remains tied, but qmd wins current immediate top-10/replay artifact ergonomics; ELF trace/admin surfaces are useful but not yet hydrated into the default stress artifact." } ], @@ -130,8 +130,8 @@ ], "measured_claim": "ELF has the strongest measured source-of-truth and rebuild story: Postgres is authoritative, Qdrant is rebuildable, trust_source_of_truth passes in fixture and live sweeps, and production restore/rebuild proof exists.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md" ], "follow_up_issues": [], "caveat": "XY-925 encodes fixture-backed memsearch canonical Markdown source-store prompts, but no live memsearch real_world_job runtime adapter pass is claimed." @@ -149,9 +149,9 @@ ], "measured_claim": "ELF and qmd both pass the encoded live work_resume jobs. 
XY-925 selects agentmemory's durable local path but keeps it blocked until the SDK KV/index and observation log survive a fresh process; claude-mem work_resume remains not_encoded, and OpenViking continuity trajectory remains blocked.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ "XY-928" @@ -170,7 +170,7 @@ ], "measured_claim": "ELF and qmd both pass encoded project_decisions jobs. The new ELF core_archival_memory fixture also scores project-decision recovery through core routing plus archival rationale, but Letta-style comparison remains blocked without contained export evidence.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md" ], "follow_up_issues": [ "XY-927" @@ -188,8 +188,8 @@ ], "measured_claim": "ELF and qmd both pass the encoded live retrieval suite and both pass stress/same-corpus retrieval evidence.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md" ], "follow_up_issues": [ "XY-923" @@ -208,9 +208,9 @@ ], "measured_claim": "The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. 
ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", - "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" ], "follow_up_issues": [ "XY-923" @@ -230,8 +230,8 @@ ], "measured_claim": "ELF fixture memory_evolution passes, but live ELF passes only the delete/TTL job and reports five wrong_result jobs where evidence is retrieved but current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" ], "follow_up_issues": [ "XY-905" @@ -250,8 +250,8 @@ ], "measured_claim": "ELF fixture consolidation passes, and XY-934 adds live service-backed proposal materialization, source lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. 
Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", - "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json" + "docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md" ], "follow_up_issues": [ "XY-934" @@ -272,8 +272,8 @@ ], "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], "follow_up_issues": [ "XY-926", @@ -296,8 +296,8 @@ "tmp/real-world-job/operator-ux-live-adapters/summary.json", "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", - "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ "XY-926" @@ -317,10 +317,10 @@ ], "measured_claim": "ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. 
qmd remains not_encoded; XY-925 records agentmemory and claude-mem hook capture as typed blockers until Docker-contained hook observations and write-policy/viewer readback artifacts exist.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "follow_up_issues": [ "XY-933", @@ -339,8 +339,8 @@ ], "measured_claim": "ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence are checked in.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", - "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + "docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md", + "docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md" ], "follow_up_issues": [ "XY-930" @@ -356,8 +356,8 @@ ], "measured_claim": "The private production profile fails closed without an operator-owned manifest, and provider-backed production-ops gates require explicit credentials.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md", - "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md" + "docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md", + 
"docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md" ], "follow_up_issues": [ "XY-930" @@ -376,8 +376,8 @@ ], "measured_claim": "ELF and qmd both pass the single encoded live personalization job. mem0 local OSS now passes entity-scoped personalization, so scoped preference behavior is a measured tie; preference correction history remains a separate ELF loss.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md" ], "follow_up_issues": [ "XY-927" @@ -397,7 +397,7 @@ ], "measured_claim": "OpenViking reaches the pinned Docker local embedding path and now exposes expected/matched/missing evidence ids, but same-corpus evidence is still wrong_result; staged trajectory, hierarchy selection, and recursive expansion are encoded as blocked fixtures, not scored comparisons.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" ], "follow_up_issues": [ "XY-928" @@ -440,7 +440,7 @@ ], "measured_claim": "cargo make real-world-memory-graph-rag adds representative citation, graph-summary, temporal-validity, graph-report, stale-source-lint, and unsupported-claim fixtures. The slice is typed non-pass: RAGFlow, GraphRAG, and Graphiti/Zep are blocked; LightRAG is incomplete with comparison blocked; graphify is wrong_result; llm-wiki is not_tested; gbrain is blocked. 
Broad graph/RAG navigation and citation quality remain not_tested.", "command_artifacts": [ - "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" + "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], "follow_up_issues": [ "XY-929" diff --git a/docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-memory-evolution-diagnostic.json similarity index 100% rename from docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-memory-evolution-diagnostic.json diff --git a/docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-retrieval-debug-profile.json similarity index 100% rename from docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-retrieval-debug-profile.json diff --git a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json similarity index 92% rename from docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json index 84a38938..28de9b09 100644 --- a/docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json @@ -5,12 +5,12 @@ "created_at": "2026-06-11", "scope": "ELF versus qmd trace-level replay and wrong-result diagnostics, with retrieval correctness kept as a separate guardrail.", "inputs": [ - "docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", - "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", - 
"docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", - "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", - "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json", + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md", "scripts/live-baseline-benchmark.sh", "apps/elf-eval/src/app.rs", "docs/spec/system_elf_memory_service_v2.md" @@ -99,7 +99,7 @@ "outcome": "tie", "diagnostic_judgment": "Both systems pass encoded retrieval and stress same-corpus checks; this row does not score debugging ergonomics.", "artifacts": [ - "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", "tmp/live-baseline/live-baseline-report.json" ] }, @@ -114,7 +114,7 @@ "diagnostic_judgment": "qmd exposes file, score, line/snippet, and distractor rows directly; ELF records trace ids and top evidence but not the full candidate list in the report.", "artifacts": [ "tmp/live-baseline/qmd-query.json", - "docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -230,7 +230,7 @@ "outcome": "not_tested", "diagnostic_judgment": "No comparable artifact shows expansion variants or dynamic expansion decisions for both systems.", "artifacts": [ - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + 
"docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -257,7 +257,7 @@ "outcome": "not_tested", "diagnostic_judgment": "No comparable artifact shows fusion inputs, RRF or weighted-fusion contribution, or fusion-stage candidate drops.", "artifacts": [ - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -271,7 +271,7 @@ "diagnostic_judgment": "The current qmd stress and materializer paths use --no-rerank; no rerank-on comparison is claimed.", "artifacts": [ "scripts/live-baseline-benchmark.sh", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -287,8 +287,8 @@ "retrieved_but_dropped" ], "artifacts": [ - "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json", - "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" ] }, { @@ -305,7 +305,7 @@ "contradicted_by_lifecycle_evidence" ], "artifacts": [ - "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" ] }, { @@ -322,7 +322,7 @@ "contradicted_by_lifecycle_evidence" ], "artifacts": [ - "docs/research/2026-06-11-elf-qmd-memory-evolution-diagnostic.json" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" ] } ], diff --git a/docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-first-generation-oss-continuity-source-store-report.json similarity index 100% rename from docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json rename to 
apps/elf-eval/fixtures/report_snapshots/2026-06-11-first-generation-oss-continuity-source-store-report.json diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-measurement-coverage-audit.json similarity index 100% rename from docs/research/2026-06-11-measurement-coverage-audit.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-measurement-coverage-audit.json diff --git a/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-qmd-openviking-strength-profile-report.json similarity index 95% rename from docs/research/2026-06-11-qmd-openviking-strength-profile-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-qmd-openviking-strength-profile-report.json index decee8e7..e38783a2 100644 --- a/docs/research/2026-06-11-qmd-openviking-strength-profile-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-qmd-openviking-strength-profile-report.json @@ -4,9 +4,9 @@ "created_at": "2026-06-11", "scope": "Scenario-level qmd retrieval-debug and OpenViking context-trajectory strength profile outcomes for XY-899.", "inputs": [ - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md", "docs/spec/real_world_agent_memory_benchmark_v1.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", "scripts/real-world-live-adapters.sh" @@ -73,7 +73,7 @@ "source_artifacts": [ "tmp/real-world-memory/live-adapters/elf-report.json", 
"tmp/real-world-memory/live-adapters/qmd-report.json", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -88,7 +88,7 @@ "debug_replay_ergonomics": "qmd stress artifacts expose per-query top-10 files, line numbers, snippets, scores, and distractor density; ELF stress artifacts expose trace ids and top evidence but do not hydrate an equivalent candidate list in the checked-in report, so this surface is not scored as a comparative ELF loss.", "source_artifacts": [ "scripts/live-baseline-benchmark.sh", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -103,7 +103,7 @@ "debug_replay_ergonomics": "The qmd materializer and stress baseline use structured lex/vec query input with --no-rerank; no scenario scores expansion, fusion, or rerank superiority for either system.", "source_artifacts": [ "scripts/real-world-live-adapters.sh", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -119,7 +119,7 @@ "source_artifacts": [ "apps/elf-eval/fixtures/real_world_memory/retrieval/current_vs_obsolete.json", "apps/elf-eval/fixtures/real_world_memory/retrieval/distractor_heavy.json", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -134,7 +134,7 @@ "debug_replay_ergonomics": "ELF has additional service lifecycle, backfill, rebuild, and resource evidence, but the equivalent qmd strength surface is a tie.", "source_artifacts": [ "tmp/live-baseline/live-baseline-report.json", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -165,7 +165,7 @@ 
"debug_replay_ergonomics": "qmd's observed replay path is collection add, update, embed -f, and query --json in a fresh CLI process; ELF has service traces and admin bundle endpoints, but no scored replayability rule compares the two surfaces yet.", "source_artifacts": [ "scripts/live-baseline-benchmark.sh", - "docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md" ] }, { @@ -179,7 +179,7 @@ "retrieval_quality": "The memory-evolution diagnostic classifies qmd misses and selected-but-not-narrated lifecycle failures from produced evidence; candidate-drop classification remains untested because qmd live job artifacts do not expose candidate-stage traces.", "debug_replay_ergonomics": "The report taxonomy supports absent evidence, retrieved-but-dropped evidence, selected-but-not-narrated evidence, and lifecycle-contradicted evidence. Current qmd data exercises absent and selected-but-not-narrated classes; retrieved-but-dropped remains not observed.", "source_artifacts": [ - "docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md" ] } ], diff --git a/docs/research/2026-06-11-temporal-history-competitor-gap-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-temporal-history-competitor-gap-report.json similarity index 100% rename from docs/research/2026-06-11-temporal-history-competitor-gap-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-temporal-history-competitor-gap-report.json diff --git a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-897-competitor-strength-matrix.json similarity index 98% rename from docs/research/2026-06-11-xy-897-competitor-strength-matrix.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-897-competitor-strength-matrix.json index 
f74e0d45..031bf5a6 100644 --- a/docs/research/2026-06-11-xy-897-competitor-strength-matrix.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-897-competitor-strength-matrix.json @@ -5,13 +5,13 @@ "authority": "XY-897", "purpose": "Keep competitor-strength claims tied to measured evidence classes, typed blockers, and next benchmark gates.", "source_inputs": [ - "docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md", - "docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md", - "docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md", - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", - "docs/guide/research/external_memory_improvement_plan.md", - "docs/guide/research/research_projects_inventory.md", + "docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md", + "docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md", + "docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md", + "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/evidence/external_memory/external_memory_improvement_plan.md", + "docs/evidence/external_memory/research_projects_inventory.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", "Makefile.toml" ], @@ -338,7 +338,7 @@ "measured_status": "not_encoded", "proof": { "command": null, - "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "artifact": "docs/research/graph_rag_adapter_followup.md" }, "unsupported_or_blocked_status": { "state": "unsupported", @@ -358,7 +358,7 @@ "measured_status": "not_encoded", "proof": { "command": null, - "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "artifact": "docs/research/graph_rag_adapter_followup.md" }, 
"unsupported_or_blocked_status": { "state": "unsupported", @@ -378,7 +378,7 @@ "measured_status": "not_encoded", "proof": { "command": null, - "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "artifact": "docs/research/graph_rag_adapter_followup.md" }, "unsupported_or_blocked_status": { "state": "unsupported", @@ -398,7 +398,7 @@ "measured_status": "not_encoded", "proof": { "command": null, - "artifact": "docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json" + "artifact": "docs/research/graph_rag_adapter_followup.md" }, "unsupported_or_blocked_status": { "state": "blocked", diff --git a/docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json similarity index 95% rename from docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json index 81e9179c..cea12c00 100644 --- a/docs/research/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-898-first-generation-oss-adapter-promotion.json @@ -5,11 +5,11 @@ "date": "2026-06-11", "scope": "Scenario-level adapter evidence for agentmemory, mem0/OpenMemory, memsearch, and claude-mem without ELF optimization changes.", "source_inputs": [ - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/research/2026-06-11-temporal-history-competitor-gap-report.json", - "docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md", 
"docs/spec/real_world_agent_memory_benchmark_v1.md", - "docs/guide/benchmarking/live_baseline_benchmark.md", + "docs/runbook/benchmarking/live_baseline_benchmark.md", "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", "tmp/live-baseline/live-baseline-report.json" ], diff --git a/docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-931-openmemory-ui-export-readback.json similarity index 100% rename from docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-931-openmemory-ui-export-readback.json diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json similarity index 90% rename from docs/research/2026-06-16-dreaming-readiness-stage-ledger.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json index ea5d1bcf..cbd7c1ed 100644 --- a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-dreaming-readiness-stage-ledger.json @@ -87,11 +87,11 @@ } ], "evidence_files": [ - "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", - "docs/research/2026-06-16-live-temporal-reconciliation-report.json", - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/research/2026-06-11-temporal-history-competitor-gap-report.json", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + 
"docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" ], "baseline_counts": { "pass": 1, @@ -132,7 +132,7 @@ }, { "command": "cargo make openmemory-ui-export-readback", - "artifact": "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "artifact": "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", "purpose": "External comparison boundary for mem0/OpenMemory preference correction and export-style history." } ], @@ -151,11 +151,11 @@ } ], "evidence_files": [ - "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", - "docs/research/2026-06-16-live-temporal-reconciliation-report.json", - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", - "docs/research/2026-06-11-temporal-history-competitor-gap-report.json" + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" ], "baseline_counts": { "pass": 0, @@ -206,10 +206,10 @@ } ], "evidence_files": [ - "docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", - "docs/research/2026-06-16-live-temporal-reconciliation-report.json", - "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", - "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md" + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md", + "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + 
"docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md" ], "baseline_counts": { "pass": 1, @@ -260,8 +260,8 @@ ], "evidence_files": [ "docs/spec/system_consolidation_proposals_v1.md", - "docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", - "docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json", + "docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", + "docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md", "apps/elf-eval/fixtures/real_world_memory/consolidation/" ], "baseline_counts": { @@ -323,7 +323,7 @@ "evidence_files": [ "docs/spec/system_memory_summary_v1.md", "apps/elf-eval/fixtures/real_world_memory/memory_summary/", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", "apps/elf-eval/fixtures/real_world_memory/knowledge/", "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/" ], @@ -380,12 +380,12 @@ } ], "evidence_files": [ - "docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md", - "docs/research/2026-06-16-proactive-brief-scoring-report.json", + "docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md", + "docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md", "apps/elf-eval/fixtures/real_world_memory/proactive_brief/", - "docs/research/2026-06-08-agent-memory-selection.json", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + "docs/decisions/2026-06-08-agent-memory-selection.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" ], "baseline_counts": { "pass": 0, @@ -443,10 +443,10 @@ ], "evidence_files": [ 
"apps/elf-eval/fixtures/real_world_memory/scheduled_memory/", - "docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md", - "docs/research/2026-06-16-scheduled-memory-task-scoring-report.json", + "docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md", + "docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md", "docs/spec/system_consolidation_proposals_v1.md", - "docs/research/2026-06-08-agent-memory-selection.json" + "docs/decisions/2026-06-08-agent-memory-selection.md" ], "baseline_counts": { "pass": 0, @@ -534,10 +534,10 @@ } ], "evidence_files": [ - "docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md", - "docs/research/2026-06-11-competitor-strength-adoption-report.json", - "docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", - "docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md", + "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md", + "docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md" ], "baseline_counts": { "pass": 22, diff --git a/docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-consolidation-proposal-scoring-report.json similarity index 100% rename from docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-consolidation-proposal-scoring-report.json diff --git a/docs/research/2026-06-16-live-temporal-reconciliation-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json similarity index 98% rename from 
docs/research/2026-06-16-live-temporal-reconciliation-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json index e6620577..55ddc931 100644 --- a/docs/research/2026-06-16-live-temporal-reconciliation-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-live-temporal-reconciliation-report.json @@ -25,7 +25,7 @@ } ], "baseline": { - "source": "docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", + "source": "docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md", "elf_memory_evolution": { "encoded_jobs": 6, "job_status_counts": { diff --git a/docs/research/2026-06-16-proactive-brief-scoring-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-proactive-brief-scoring-report.json similarity index 100% rename from docs/research/2026-06-16-proactive-brief-scoring-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-16-proactive-brief-scoring-report.json diff --git a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-scheduled-memory-task-scoring-report.json similarity index 99% rename from docs/research/2026-06-16-scheduled-memory-task-scoring-report.json rename to apps/elf-eval/fixtures/report_snapshots/2026-06-16-scheduled-memory-task-scoring-report.json index 9bdae08b..35a5ba78 100644 --- a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-16-scheduled-memory-task-scoring-report.json @@ -456,7 +456,7 @@ "result": { "status": "pass", "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", - "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" }, "capabilities": [ { @@ -1058,8 +1058,8 @@ ], "evidence": [ { - "kind": 
"guide", - "ref": "docs/guide/research/agentmemory_adapter.md", + "kind": "evidence", + "ref": "docs/evidence/external_memory/agentmemory_adapter.md", "status": "real" }, { @@ -1192,9 +1192,9 @@ "status": "pass", "elf_position": "loses", "comparison_outcome": "loss", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. 
The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "entity_scoped_personalization", @@ -1202,9 +1202,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. 
This is a measured tie on the current scoped-preference surface.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" }, { "scenario_id": "delete_audit_readback", @@ -1212,9 +1212,9 @@ "status": "pass", "elf_position": "ties", "comparison_outcome": "tie", - "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. 
The current measured delete-audit comparison is a tie.", "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", - "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" }, { "scenario_id": "local_get_all_export_readback", @@ -1418,7 +1418,7 @@ "result": { "status": "wrong_result", "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", - "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" }, "capabilities": [ { @@ -1679,7 +1679,7 @@ "result": { "status": "not_encoded", "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", - "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" }, "capabilities": [ { @@ -1765,7 +1765,7 @@ "result": { "status": "blocked", "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", - "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" }, "capabilities": [ { diff --git a/apps/elf-eval/src/bin/external_memory_pattern_radar.rs b/apps/elf-eval/src/bin/external_memory_pattern_radar.rs index 
9a843a7b..208ca3fe 100644 --- a/apps/elf-eval/src/bin/external_memory_pattern_radar.rs +++ b/apps/elf-eval/src/bin/external_memory_pattern_radar.rs @@ -19,8 +19,8 @@ use time::{OffsetDateTime, format_description::well_known::Rfc3339}; const CURSOR_SCHEMA: &str = "elf.external_memory_pattern_radar_cursor/v1"; const RUN_SCHEMA: &str = "elf.external_memory_pattern_radar_run/v1"; -const DEFAULT_CURSOR: &str = "docs/research/external_memory_pattern_radar/cursor.json"; -const DEFAULT_SUMMARY: &str = "docs/research/external_memory_pattern_radar/latest.md"; +const DEFAULT_CURSOR: &str = "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json"; +const DEFAULT_SUMMARY: &str = "docs/evidence/external_memory_pattern_radar_latest.md"; #[derive(Debug, Parser)] #[command( @@ -634,13 +634,36 @@ fn validate_create_issue(decision: &RadarDecision, errors: &mut Vec) { fn render_summary(cursor: &RadarCursor) -> Result { let run = cursor.last_run.as_ref().ok_or_else(|| eyre::eyre!("cursor has no last_run"))?; + let last_verified = run.generated_at.get(..10).unwrap_or("unknown"); let mut out = String::new(); + out.push_str("---\n"); + out.push_str("type: Evidence\n"); + out.push_str("title: \"External Memory Pattern Radar Summary\"\n"); + out.push_str("description: \"Latest weekly ELF external memory pattern radar outcome.\"\n"); + out.push_str("resource: docs/evidence/external_memory_pattern_radar_latest.md\n"); + out.push_str("status: active\n"); + out.push_str("authority: current_state\n"); + out.push_str("owner: evidence\n"); + out.push_str(&format!("last_verified: {last_verified}\n")); + out.push_str("tags:\n"); + out.push_str(" - docs\n"); + out.push_str(" - external-memory-pattern-radar\n"); + out.push_str(" - evidence\n"); + out.push_str("source_refs: []\n"); + out.push_str("code_refs:\n"); + out.push_str(" - apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json\n"); + out.push_str(" - apps/elf-eval/src/bin/external_memory_pattern_radar.rs\n"); + 
out.push_str("related: []\n"); + out.push_str("drift_watch:\n"); + out.push_str(" - apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json\n"); + out.push_str(" - apps/elf-eval/src/bin/external_memory_pattern_radar.rs\n"); + out.push_str("---\n\n"); out.push_str("# External Memory Pattern Radar Summary\n\n"); out.push_str("Goal: Preserve the latest weekly ELF external memory pattern radar outcome.\n"); out.push_str("Read this when: Feeding the next full comparison report or deciding whether a watched upstream memory project created an ELF follow-up.\n"); - out.push_str("Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes.\n"); - out.push_str("Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/guide/research/external_memory_pattern_radar.md`.\n"); + out.push_str("Inputs: `apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes.\n"); + out.push_str("Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/runbook/external_memory_pattern_radar.md`.\n"); out.push_str("Outputs: Latest no-issue, rejection, or issue-ready radar decisions.\n\n"); out.push_str(&format!("- Run id: `{}`\n", run.run_id)); out.push_str(&format!("- Generated at: `{}`\n", run.generated_at)); diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs index d20ea4dd..c1a87143 100644 --- a/apps/elf-eval/src/bin/live_baseline_elf.rs +++ b/apps/elf-eval/src/bin/live_baseline_elf.rs @@ -1182,7 +1182,7 @@ fn operational_cases() -> Vec { "compose_start_stop_upgrade", "documented", "runbook", - "docs/guide/single_user_production.md Sections 2, 4, and 5", + "docs/runbook/single_user_production.md Sections 2, 4, and 5", "storage health, API health, migration check, and post-upgrade search smoke", "Backup 
Postgres before binary/config upgrade; rollback restores the previous backup and rebuilds Qdrant.", ), @@ -1190,7 +1190,7 @@ fn operational_cases() -> Vec { "postgres_restore_qdrant_rebuild", "documented", "runbook_or_clean_volume_proof", - "docs/guide/single_user_production.md Sections 6 through 9", + "docs/runbook/single_user_production.md Sections 6 through 9", "Postgres restored row count, admin qdrant rebuild counts, and search-after-restore response", "Qdrant remains derived and rebuild uses Postgres-held vectors without embedding provider calls.", ), @@ -1198,7 +1198,7 @@ fn operational_cases() -> Vec { "migration_rollback", "documented", "runbook", - "docs/guide/single_user_production.md Section 5 rollback path", + "docs/runbook/single_user_production.md Section 5 rollback path", "pre-upgrade backup path, restored source rows, qdrant rebuild, and health check", "No reverse migration is claimed; rollback means previous binary/config plus restored Postgres backup.", ), diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index a9a6a8f7..532add8b 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -105,17 +105,23 @@ fn collapse_whitespace(text: &str) -> String { text.split_whitespace().collect::>().join(" ") } -fn strength_profile_report_path() -> Result { +fn report_snapshot_path(file_name: &str) -> Result { Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-qmd-openviking-strength-profile-report.json")) + .join("apps") + .join("elf-eval") + .join("fixtures") + .join("report_snapshots") + .join(file_name)) +} + +fn strength_profile_report_path() -> Result { + report_snapshot_path("2026-06-11-qmd-openviking-strength-profile-report.json") } fn strength_profile_markdown_path() -> Result { Ok(workspace_root()? 
.join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-qmd-openviking-strength-profile-report.md")) } @@ -123,36 +129,27 @@ fn strength_profile_markdown_path() -> Result { fn measurement_coverage_audit_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-measurement-coverage-audit.md")) } fn measurement_coverage_audit_json_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-measurement-coverage-audit.json")) + report_snapshot_path("2026-06-11-measurement-coverage-audit.json") } fn retrieval_debug_profile_json_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-elf-qmd-retrieval-debug-profile.json")) + report_snapshot_path("2026-06-11-elf-qmd-retrieval-debug-profile.json") } fn trace_replay_diagnostics_report_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json")) + report_snapshot_path("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json") } fn trace_replay_diagnostics_markdown_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")) } @@ -160,81 +157,63 @@ fn trace_replay_diagnostics_markdown_path() -> Result { fn competitor_strength_adoption_report_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-competitor-strength-adoption-report.md")) } fn competitor_strength_adoption_report_json_path() -> Result { - Ok(workspace_root()? 
- .join("docs") - .join("research") - .join("2026-06-11-competitor-strength-adoption-report.json")) + report_snapshot_path("2026-06-11-competitor-strength-adoption-report.json") } fn capture_write_policy_live_report_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-capture-write-policy-live-report.json")) + report_snapshot_path("2026-06-11-capture-write-policy-live-report.json") } fn capture_write_policy_live_markdown_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-capture-write-policy-live-report.md")) } fn live_consolidation_proposal_scoring_report_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-16-live-consolidation-proposal-scoring-report.json")) + report_snapshot_path("2026-06-16-live-consolidation-proposal-scoring-report.json") } fn live_consolidation_proposal_scoring_markdown_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-16-live-consolidation-proposal-scoring-report.md")) } fn temporal_history_competitor_gap_json_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-temporal-history-competitor-gap-report.json")) + report_snapshot_path("2026-06-11-temporal-history-competitor-gap-report.json") } fn dreaming_readiness_stage_ledger_json_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-16-dreaming-readiness-stage-ledger.json")) + report_snapshot_path("2026-06-16-dreaming-readiness-stage-ledger.json") } fn dreaming_readiness_stage_ledger_markdown_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-16-dreaming-readiness-stage-ledger.md")) } fn live_temporal_reconciliation_report_json_path() -> Result { - Ok(workspace_root()? 
- .join("docs") - .join("research") - .join("2026-06-16-live-temporal-reconciliation-report.json")) + report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") } fn live_temporal_reconciliation_report_markdown_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-16-live-temporal-reconciliation-report.md")) } @@ -242,16 +221,13 @@ fn live_temporal_reconciliation_report_markdown_path() -> Result { fn competitor_strength_matrix_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-competitor-strength-evidence-matrix.md")) } fn competitor_strength_matrix_json_path() -> Result { - Ok(workspace_root()? - .join("docs") - .join("research") - .join("2026-06-11-xy-897-competitor-strength-matrix.json")) + report_snapshot_path("2026-06-11-xy-897-competitor-strength-matrix.json") } fn readme_path() -> Result { @@ -261,19 +237,19 @@ fn readme_path() -> Result { fn comparison_external_projects_path() -> Result { Ok(workspace_root()? .join("docs") - .join("guide") - .join("research") + .join("evidence") + .join("external_memory") .join("comparison_external_projects.md")) } fn benchmarking_index_path() -> Result { - Ok(workspace_root()?.join("docs").join("guide").join("benchmarking").join("index.md")) + Ok(workspace_root()?.join("docs").join("evidence").join("benchmarking").join("index.md")) } fn iteration_direction_report_path() -> Result { Ok(workspace_root()? 
.join("docs") - .join("guide") + .join("evidence") .join("benchmarking") .join("2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md")) } @@ -916,7 +892,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { ); assert_eq!( qmd_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + Some("docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md") ); assert_eq!( openviking_deep.pointer("/adapter_kind").and_then(Value::as_str), @@ -927,7 +903,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { assert_eq!( openviking_deep.pointer("/result/artifact").and_then(Value::as_str), - Some("docs/research/2026-06-11-qmd-openviking-strength-profile-report.json") + Some("docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md") ); Ok(()) @@ -1724,9 +1700,9 @@ fn openmemory_ui_export_probe_has_dedicated_docker_task() -> Result<()> { let docker_script = fs::read_to_string(workspace_root.join("scripts/baseline-docker.sh"))?; let compose = fs::read_to_string(workspace_root.join("docker-compose.baseline.yml"))?; let script = fs::read_to_string(workspace_root.join("scripts/live-baseline-benchmark.sh"))?; - let report = serde_json::from_str::(&fs::read_to_string( - workspace_root.join("docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json"), - )?)?; + let report = serde_json::from_str::(&fs::read_to_string(workspace_root.join( + "apps/elf-eval/fixtures/report_snapshots/2026-06-11-xy-931-openmemory-ui-export-readback.json", + ))?)?; assert!(makefile.contains("[tasks.openmemory-ui-export-readback]")); assert!(makefile.contains("scripts/baseline-docker.sh")); @@ -2085,10 +2061,10 @@ fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result< let markdown = fs::read_to_string(live_consolidation_proposal_scoring_markdown_path()?)?; let benchmarking_index = 
fs::read_to_string(benchmarking_index_path()?)?; let readme = fs::read_to_string(readme_path()?)?; - let benchmark_guide = fs::read_to_string( + let benchmark_runbook = fs::read_to_string( workspace .join("docs") - .join("guide") + .join("runbook") .join("benchmarking") .join("real_world_agent_memory_benchmark.md"), )?; @@ -2181,8 +2157,8 @@ fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result< ); assert!(readme.contains("Live Consolidation Proposal Scoring Report - June 16, 2026")); assert!(readme.contains("real-world-memory-live-consolidation")); - assert!(benchmark_guide.contains("Current live consolidation increment")); - assert!(benchmark_guide.contains("tmp/real-world-memory/live-consolidation/summary.json")); + assert!(benchmark_runbook.contains("Current live consolidation increment")); + assert!(benchmark_runbook.contains("tmp/real-world-memory/live-consolidation/summary.json")); assert!(makefile.contains("[tasks.real-world-memory-live-consolidation]")); assert!(makefile.contains("scripts/real-world-docker.sh")); @@ -3081,7 +3057,7 @@ fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { assert!(array_contains_str( local_debug, "/command_artifacts", - "docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" )?); assert!(array_contains_str( adoption, diff --git a/docs/decisions/2026-06-08-agent-memory-selection.md b/docs/decisions/2026-06-08-agent-memory-selection.md new file mode 100644 index 00000000..e21800b3 --- /dev/null +++ b/docs/decisions/2026-06-08-agent-memory-selection.md @@ -0,0 +1,101 @@ +--- +type: Decision +title: "Agent Memory Selection" +description: "Accepted decision to keep ELF as the evidence-bound memory core while borrowing external memory systems only as adapters, baselines, and derived patterns." 
+resource: docs/decisions/2026-06-08-agent-memory-selection.md +status: active +authority: normative +owner: decisions +last_verified: 2026-06-18 +tags: + - docs + - decision + - memory + - research-promotion +source_refs: [] +code_refs: + - docs/evidence/external_memory/comparison_external_projects.md + - docs/evidence/external_memory/research_projects_inventory.md +related: [] +drift_watch: + - docs/evidence/external_memory/comparison_external_projects.md + - docs/evidence/external_memory/research_projects_inventory.md + - docs/spec/system_competitive_parity_gate_v1.md + - docs/spec/system_consolidation_proposals_v1.md +--- +# Agent Memory Selection + +Purpose: Preserve the accepted June 2026 decision about ELF's relationship to +external agent-memory systems. +Status: normative +Read this when: You are deciding whether ELF should adopt, replace, or integrate with +agentmemory, managed dreaming systems, or adjacent memory projects. +Not this document: A live benchmark result, upstream market survey, or adapter +implementation plan. +Defines: ELF remains the evidence-bound memory core; external systems are optional +capture, benchmark, viewer, and derived-consolidation inputs. + +## Decision + +Continue ELF as the evidence-bound memory core. Do not replace ELF with agentmemory, +managed dreaming APIs, or another external memory product. + +Borrow external systems only where they preserve ELF's source-of-truth boundary: + +- optional capture/import adapters +- benchmark baselines +- viewer and operator UX references +- reviewable derived consolidation patterns +- graph, timeline, and knowledge-page presentation patterns + +## Rationale + +ELF's durable advantage is the explicit evidence contract: deterministic writes, +scoped service semantics, Postgres as the source of truth, rebuildable derived indexes, +and provenance-oriented evaluation. External systems reviewed in June 2026 are useful +but do not replace that contract. 
+ +agentmemory is valuable for coding-agent continuity, hooks, MCP/REST packaging, a +viewer, and benchmark UX. That value supports an adapter and benchmark baseline, not a +core replacement. + +Dreaming-style systems are valuable because OpenAI, Anthropic, and Google converge on +background memory curation as a product direction. The safe shared pattern is +reviewable derived output over immutable input evidence, not destructive rewriting of +authoritative memory. + +## Rejected Options + +- Replace ELF with agentmemory. +- Replace ELF's roadmap with managed dreaming APIs. +- Pause ELF core development until the agent-memory market stabilizes. + +## Promotion + +This decision promotes the accepted conclusion from the retired +`2026-06-08-agent-memory-selection` research run. Settled facts are now owned by this +decision, `docs/evidence/external_memory/comparison_external_projects.md`, +`docs/spec/system_competitive_parity_gate_v1.md`, and +`docs/spec/system_consolidation_proposals_v1.md`. + +Remaining unresolved value points are tracked as active research contracts instead of +raw JSON artifacts: + +- `docs/research/derived_knowledge_page_followup.md` +- `docs/research/dreaming_product_surface_followup.md` +- `docs/research/graph_rag_adapter_followup.md` + +## Drift Watch + +Revisit this decision only if an external project provides an ELF-equivalent +evidence-bound deterministic write contract, source-of-truth storage, multi-tenant +service semantics, and lower integration risk, or if a self-hostable managed dreaming +system provides portable, reviewable, evidence-linked memory stores that satisfy ELF's +governance boundary. 
+ +## Citations + +- `docs/evidence/external_memory/comparison_external_projects.md` +- `docs/evidence/external_memory/research_projects_inventory.md` +- `docs/spec/system_competitive_parity_gate_v1.md` +- `docs/spec/system_consolidation_proposals_v1.md` diff --git a/docs/decisions/index.md b/docs/decisions/index.md new file mode 100644 index 00000000..2427f7f5 --- /dev/null +++ b/docs/decisions/index.md @@ -0,0 +1,13 @@ +# Decision Index + +Purpose: Route agents to accepted rationale and durable decision records. +Read this when: You need to understand why an accepted repository direction exists. +Not this document: Latent research, operational runbooks, or raw machine artifacts. +Routes to: Decision concepts under `docs/decisions/` and historical decision-shaped +planning artifacts under `docs/reference/plans/`. + +## Concepts + +- `2026-06-08-agent-memory-selection.md`: Accepted decision to keep ELF as the + evidence-bound memory core while using external memory systems as adapters, + baselines, and derived patterns. diff --git a/docs/evidence/2026-06-18-docs-okf-self-check.md b/docs/evidence/2026-06-18-docs-okf-self-check.md new file mode 100644 index 00000000..234263e0 --- /dev/null +++ b/docs/evidence/2026-06-18-docs-okf-self-check.md @@ -0,0 +1,71 @@ +--- +type: Drift Audit +title: "Docs OKF Self-Check" +description: "Drift audit anchoring the documentation bundle migration to the current OKF and LLM Wiki profile." +resource: docs/evidence/2026-06-18-docs-okf-self-check.md +status: active +authority: current_state +owner: docs +last_verified: 2026-06-18 +tags: + - docs + - drift-audit + - okf +source_refs: [] +code_refs: + - Makefile.toml + - scripts/check-docs.py +related: [] +drift_watch: + - docs/ + - Makefile.toml + - scripts/check-docs.py +--- +# Docs OKF Self-Check + +Purpose: Anchor the documentation structure migration against the current +Markdown-only OKF and LLM Wiki profile. 
+Read this when: You need the evidence boundary for the docs readiness claim. +Not this document: Product behavior validation, benchmark result interpretation, or +runtime proof. + +## Watched Claims + +- `docs/` is a Markdown-only OKF and LLM Wiki bundle. +- Required root files and lane indexes exist. +- Machine-readable JSON artifacts are outside `docs/`; legacy research JSON artifacts + were promoted, moved to app fixtures, or moved as active tool state. +- Repository-native docs validation still runs through `cargo make check-docs`. +- Decodex profile validation runs through `decodex docs check`. + +## Evidence Anchors + +- `docs/policy.md` owns the current docs profile. +- `docs/log.md` records the migration. +- `docs/evidence/2026-06-18-research-artifact-disposition.md` records the legacy + research JSON disposition. +- `Makefile.toml` defines `check-docs` as the repository-native docs task. +- `scripts/check-docs.py` validates repository Markdown links and cargo-make task + references. + +## Reverse Checks + +- Search `docs/` for non-Markdown files before claiming readiness. +- Search docs references for stale legacy JSON paths after artifact moves. +- Run both `decodex docs check` and `cargo make check-docs`. + +## Verdict + +pass + +## Required Updates + +- Re-run `decodex docs check` after material docs or research layout changes. +- Record any remaining intentional limitations in the final handoff. 
+ +## Citations + +- `docs/policy.md` +- `docs/log.md` +- `Makefile.toml` +- `scripts/check-docs.py` diff --git a/docs/evidence/2026-06-18-research-artifact-disposition.md b/docs/evidence/2026-06-18-research-artifact-disposition.md new file mode 100644 index 00000000..c222e820 --- /dev/null +++ b/docs/evidence/2026-06-18-research-artifact-disposition.md @@ -0,0 +1,92 @@ +--- +type: Evidence +title: "Research Artifact Disposition" +description: "Evidence record for promoting, carrying forward, or deleting legacy research JSON artifacts during the OKF and LLM Wiki migration." +resource: docs/evidence/2026-06-18-research-artifact-disposition.md +status: active +authority: current_state +owner: docs +last_verified: 2026-06-18 +tags: + - docs + - evidence + - research-promotion + - okf +source_refs: [] +code_refs: + - docs/policy.md + - apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json +related: [] +drift_watch: + - docs/research/ + - docs/evidence/external_memory/ + - docs/evidence/benchmarking/ +--- +# Research Artifact Disposition + +Purpose: Record how legacy research JSON artifacts were handled while forming the +Markdown-only OKF and LLM Wiki bundle. +Read this when: You need to know whether an old research JSON was promoted, carried +forward, moved as tool state, or deleted. +Not this document: Raw research payload storage or a benchmark result. + +## Disposition Rules + +- Settled decisions move to `docs/decisions/`, `docs/spec/`, `docs/runbook/`, or + `docs/evidence/`. +- Unresolved but valuable points move to new `docs/research/` contracts. +- Machine reports already represented by Markdown benchmark reports leave the + research lane; test-required structured snapshots move to app-owned fixtures. +- Tool cursor state moves outside `docs/` and outside the research lane. + +## Promoted Research Runs + +| Retired artifact | Disposition | New owner | +| --- | --- | --- | +| `2026-06-08-agent-memory-selection` | Accepted decision promoted. 
| `docs/decisions/2026-06-08-agent-memory-selection.md` | +| `2026-06-09-xy-841-external-memory-benchmark-dimensions` | Benchmark-dimension conclusions promoted. | `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/evidence/external_memory/comparison_external_projects.md`; `docs/evidence/external_memory/research_projects_inventory.md` | +| `2026-06-10-xy-882-rag-graph-adapter-feasibility` | Accepted verdicts promoted; unresolved follow-up preserved. | `docs/evidence/external_memory/research_projects_inventory.md`; `docs/research/graph_rag_adapter_followup.md`; `docs/research/derived_knowledge_page_followup.md` | + +## Rehomed Machine Reports + +The June 11 and June 16 JSON reports were removed from `docs/research/` because their +settled content is already owned by Markdown benchmark reports under +`docs/evidence/benchmarking/` and by the relevant specs or fixtures. Structured snapshots +that Rust boundary tests still parse now live under +`apps/elf-eval/fixtures/report_snapshots/`; they are app fixtures, not documentation +owners or research contracts. + +Representative owners: + +- `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` +- `docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md` +- `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md` +- `docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md` +- `docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md` +- `docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md` + +## Carried Forward Research + +Unresolved value points now live as explicit research contracts: + +- `docs/research/graph_rag_adapter_followup.md` +- `docs/research/derived_knowledge_page_followup.md` +- `docs/research/dreaming_product_surface_followup.md` + +## Tool State + +The external memory pattern radar cursor is active tool state, not a research +conclusion. 
It now lives at +`apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json`. + +## Verdict + +pass + +## Citations + +- `docs/policy.md` +- `docs/decisions/2026-06-08-agent-memory-selection.md` +- `docs/research/graph_rag_adapter_followup.md` +- `docs/research/derived_knowledge_page_followup.md` +- `docs/research/dreaming_product_surface_followup.md` diff --git a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md b/docs/evidence/benchmarking/2026-06-09-live-baseline-report.md similarity index 94% rename from docs/guide/benchmarking/2026-06-09-live-baseline-report.md rename to docs/evidence/benchmarking/2026-06-09-live-baseline-report.md index 9551adeb..a4af7442 100644 --- a/docs/guide/benchmarking/2026-06-09-live-baseline-report.md +++ b/docs/evidence/benchmarking/2026-06-09-live-baseline-report.md @@ -1,10 +1,24 @@ +--- +type: Evidence +title: "Live Baseline Benchmark Report - 2026-06-09" +description: "Checked-in benchmark evidence record: Live Baseline Benchmark Report - 2026-06-09." +resource: docs/evidence/benchmarking/2026-06-09-live-baseline-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Live Baseline Benchmark Report - 2026-06-09 Goal: Preserve the checked-in evidence snapshot behind the README benchmark claims. Read this when: You need the June 9, 2026 live baseline result, pass/fail reasons, or the next benchmark iteration backlog. Inputs: Docker-only benchmark reports generated by `cargo make baseline-live-docker`. -Depends on: `docs/guide/benchmarking/live_baseline_benchmark.md`, +Depends on: `docs/runbook/benchmarking/live_baseline_benchmark.md`, `docker-compose.baseline.yml`, `scripts/live-baseline-benchmark.sh`, and `scripts/live-baseline-report-to-md.sh`. Verification: Re-run the commands in this report and compare @@ -186,7 +200,7 @@ overhead. 
Whether that is acceptable depends on the production workflow: it is a cold/backfill measurement, not an interactive-ingest target. This report is benchmark evidence, not the production operating procedure. Use -`docs/guide/single_user_production.md` for Docker Compose production start, stop, +`docs/runbook/single_user_production.md` for Docker Compose production start, stop, health, backup, restore, Qdrant rebuild, rollback, provider config handling, and cleanup commands. @@ -223,7 +237,7 @@ cargo make baseline-live-docker Convert the latest JSON report into Markdown: ```sh -ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +ELF_BASELINE_MARKDOWN_REPORT=docs/evidence/benchmarking/YYYY-MM-DD-live-baseline-report.md \ cargo make baseline-live-report ``` diff --git a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md b/docs/evidence/benchmarking/2026-06-09-operator-debugging-ux-report.md similarity index 89% rename from docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md rename to docs/evidence/benchmarking/2026-06-09-operator-debugging-ux-report.md index 4b7944c6..08688011 100644 --- a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md +++ b/docs/evidence/benchmarking/2026-06-09-operator-debugging-ux-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Real-World Job Benchmark Report" +description: "Checked-in benchmark evidence record: Real-World Job Benchmark Report." +resource: docs/evidence/benchmarking/2026-06-09-operator-debugging-ux-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Real-World Job Benchmark Report Goal: Publish a Markdown summary for one generated real_world_job benchmark report. @@ -85,11 +99,11 @@ The real-world job runner is fixture-backed. 
This section separates encoded evid | Job | Failure Mode | Trace Evidence | Steps | Raw SQL | Dropped Candidate Visibility | Trace Completeness | Repair Clarity | UX Gaps | | --- | --- | --- | ---: | --- | --- | --- | --- | --- | -| operator-debug-dropped-evidence-001 | expected_evidence_dropped | `11111111-1111-4111-8111-111111111111`
[viewer](/viewer?trace_id=11111111-1111-4111-8111-111111111111)
[bundle](/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | visible in Retrieval Funnel and Replay Candidates | `complete` | `clear` | `none` | -| operator-debug-provider-latency-001 | provider_latency_or_failure | `33333333-3333-4333-8333-333333333333`
[viewer](/viewer?trace_id=33333333-3333-4333-8333-333333333333)
[bundle](/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | visible as low recall counts rather than a post-recall drop | `complete` | `clear` | `none` | -| operator-debug-rebuild-changed-results-001 | rebuild_changed_results | `44444444-4444-4444-8444-444444444444`
[viewer](/viewer?trace_id=44444444-4444-4444-8444-444444444444)
[bundle](/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 5 | `false` | visible by comparing before and after trace candidates | `complete` | `clear` | `none` | -| operator-debug-relation-context-mislead-001 | relation_context_misled_search | `55555555-5555-4555-8555-555555555555`
[viewer](/viewer?trace_id=55555555-5555-4555-8555-555555555555)
[bundle](/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 4 | `false` | not dropped; misleading context is visible on selected result | `complete` | `clear` | `none` | -| operator-debug-rerank-bad-candidate-001 | rerank_promoted_bad_candidate | `22222222-2222-4222-8222-222222222222`
[viewer](/viewer?trace_id=22222222-2222-4222-8222-222222222222)
[bundle](/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200) | 3 | `false` | not dropped; visible with lower final rank in Replay Candidates | `complete` | `clear` | `none` | +| operator-debug-dropped-evidence-001 | expected_evidence_dropped | `11111111-1111-4111-8111-111111111111`
viewer: `/viewer?trace_id=11111111-1111-4111-8111-111111111111`
bundle: `/v2/admin/traces/11111111-1111-4111-8111-111111111111/bundle?mode=full&stage_items_limit=128&candidates_limit=200` | 4 | `false` | visible in Retrieval Funnel and Replay Candidates | `complete` | `clear` | `none` | +| operator-debug-provider-latency-001 | provider_latency_or_failure | `33333333-3333-4333-8333-333333333333`
viewer: `/viewer?trace_id=33333333-3333-4333-8333-333333333333`
bundle: `/v2/admin/traces/33333333-3333-4333-8333-333333333333/bundle?mode=full&stage_items_limit=128&candidates_limit=200` | 3 | `false` | visible as low recall counts rather than a post-recall drop | `complete` | `clear` | `none` | +| operator-debug-rebuild-changed-results-001 | rebuild_changed_results | `44444444-4444-4444-8444-444444444444`
viewer: `/viewer?trace_id=44444444-4444-4444-8444-444444444444`
bundle: `/v2/admin/traces/44444444-4444-4444-8444-444444444444/bundle?mode=full&stage_items_limit=128&candidates_limit=200` | 5 | `false` | visible by comparing before and after trace candidates | `complete` | `clear` | `none` | +| operator-debug-relation-context-mislead-001 | relation_context_misled_search | `55555555-5555-4555-8555-555555555555`
viewer: `/viewer?trace_id=55555555-5555-4555-8555-555555555555`
bundle: `/v2/admin/traces/55555555-5555-4555-8555-555555555555/bundle?mode=full&stage_items_limit=128&candidates_limit=200` | 4 | `false` | not dropped; misleading context is visible on selected result | `complete` | `clear` | `none` | +| operator-debug-rerank-bad-candidate-001 | rerank_promoted_bad_candidate | `22222222-2222-4222-8222-222222222222`
viewer: `/viewer?trace_id=22222222-2222-4222-8222-222222222222`
bundle: `/v2/admin/traces/22222222-2222-4222-8222-222222222222/bundle?mode=full&stage_items_limit=128&candidates_limit=200` | 3 | `false` | not dropped; visible with lower final rank in Replay Candidates | `complete` | `clear` | `none` | ### Operator Debug Details diff --git a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md b/docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md similarity index 96% rename from docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md rename to docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md index 5dda8783..0b4f38f6 100644 --- a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md +++ b/docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Production Adoption Gate Report - June 9, 2026" +description: "Checked-in benchmark evidence record: Production Adoption Gate Report - June 9, 2026." +resource: docs/evidence/benchmarking/2026-06-09-production-adoption-gate-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Production Adoption Gate Report - June 9, 2026 Goal: Record the XY-836 full comparison gate and personal production adoption decision. 
@@ -130,7 +144,7 @@ Single-user restore proof: ```sh awk '/^bash <<'\''EOF'\''$/{flag=1; next} flag && /^EOF$/{exit} flag {print}' \ - docs/guide/single_user_production.md \ + docs/runbook/single_user_production.md \ | perl -0pe 's#tmp/single-user-restore-proof#tmp/xy836-single-user-restore-proof#g; s/51988/52988/g; s/51989/52989/g; s/51990/52990/g; s/51991/52991/g; s/51992/52992/g; s/51993/52993/g; s/elf-restore-proof/elf-xy836-restore-proof/g' \ > tmp/xy836-restore-proof.sh bash tmp/xy836-restore-proof.sh diff --git a/docs/guide/benchmarking/2026-06-09-production-corpus-report.md b/docs/evidence/benchmarking/2026-06-09-production-corpus-report.md similarity index 84% rename from docs/guide/benchmarking/2026-06-09-production-corpus-report.md rename to docs/evidence/benchmarking/2026-06-09-production-corpus-report.md index b050f1df..46143cf9 100644 --- a/docs/guide/benchmarking/2026-06-09-production-corpus-report.md +++ b/docs/evidence/benchmarking/2026-06-09-production-corpus-report.md @@ -1,9 +1,23 @@ +--- +type: Evidence +title: "Live Baseline Benchmark Report" +description: "Checked-in benchmark evidence record: Live Baseline Benchmark Report." +resource: docs/evidence/benchmarking/2026-06-09-production-corpus-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Live Baseline Benchmark Report Goal: Publish a Markdown summary for one generated live baseline aggregate report. Read this when: You need a durable, reviewable summary of a live baseline JSON report. Inputs: `tmp/live-baseline/live-baseline-report.json`. -Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`. +Depends on: `scripts/live-baseline-benchmark.sh` and `docs/runbook/benchmarking/live_baseline_benchmark.md`. Verification: Compare this Markdown summary with the source JSON before committing. 
## Summary @@ -24,7 +38,7 @@ Verification: Compare this Markdown summary with the source JSON before committi - Full check summary: `7/7 pass` This report is production-corpus benchmark evidence only. Use -`docs/guide/single_user_production.md` for the single-user Docker Compose production +`docs/runbook/single_user_production.md` for the single-user Docker Compose production runbook, including backup, restore, Qdrant rebuild, rollback, provider config handling, and cleanup commands. diff --git a/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md b/docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md similarity index 88% rename from docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md rename to docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md index 7a3dfa4e..04b766aa 100644 --- a/docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md +++ b/docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Live Real-World Adapter Sweep Report - June 10, 2026" +description: "Checked-in benchmark evidence record: Live Real-World Adapter Sweep Report - June 10, 2026." +resource: docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Live Real-World Adapter Sweep Report - June 10, 2026 Goal: Publish the XY-880 full-suite live real-world sweep evidence for ELF and qmd. @@ -7,8 +21,8 @@ Inputs: `cargo make real-world-memory-live-adapters`, `apps/elf-eval/fixtures/real_world_memory/`, and `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, and -`docs/guide/benchmarking/live_baseline_benchmark.md`. 
+`docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md`, and +`docs/runbook/benchmarking/live_baseline_benchmark.md`. Verification: `cargo make real-world-memory-live-adapters` ran on branch `y/elf-xy-880` and wrote the generated reports under `tmp/real-world-memory/live-adapters/`. diff --git a/docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md b/docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md similarity index 95% rename from docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md rename to docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md index 5826e2f2..1cb7f69d 100644 --- a/docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md +++ b/docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Post-Adapter Production Adoption Refresh - June 10, 2026" +description: "Checked-in benchmark evidence record: Post-Adapter Production Adoption Refresh - June 10, 2026." +resource: docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Post-Adapter Production Adoption Refresh - June 10, 2026 Goal: Publish the XY-884 post-adapter production adoption refresh after the live @@ -7,11 +21,11 @@ production use under the latest checked-in benchmark evidence. Inputs: `2026-06-09-production-adoption-gate-report.md`, `2026-06-10-real-world-comparison-report.md`, `2026-06-10-live-real-world-sweep-report.md`, -`docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`, and +`docs/research/graph_rag_adapter_followup.md`, and `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`docs/guide/benchmarking/live_baseline_benchmark.md`, and -`docs/guide/single_user_production.md`. 
+`docs/runbook/benchmarking/live_baseline_benchmark.md`, and +`docs/runbook/single_user_production.md`. Outputs: Current production adoption decision, evidence-class separation, accepted caveats, and follow-up issue routing. diff --git a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md b/docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md similarity index 96% rename from docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md rename to docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md index 2868b4b8..8b48c3bb 100644 --- a/docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md +++ b/docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Real-World Comparison Report - June 10, 2026" +description: "Checked-in benchmark evidence record: Real-World Comparison Report - June 10, 2026." +resource: docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Real-World Comparison Report - June 10, 2026 Goal: Publish the post-P1 real-world agent memory benchmark evidence and adoption @@ -6,10 +20,10 @@ Read this when: You need the checked-in evidence behind README-level real-world benchmark claims after XY-833 and XY-861 through XY-864 landed. Inputs: Generated reports under `tmp/real-world-memory/` and `tmp/real-world-job/`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, -and the live-baseline reports linked from this guide. +and the live-baseline reports linked from this evidence record. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and -`docs/guide/benchmarking/live_baseline_benchmark.md`. 
+`docs/runbook/benchmarking/real_world_agent_memory_benchmark.md`, and +`docs/runbook/benchmarking/live_baseline_benchmark.md`. Verification: The original commands listed below were run from branch `y/elf-xy-865`. XY-881 refreshed `cargo make real-world-memory`, `cargo make real-world-memory-production-ops`, and `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker` from branch @@ -18,7 +32,7 @@ dependency boundary is discussed. Postscript: XY-880 superseded the live-adapter state in this report for ELF and qmd. The successor evidence is -`docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`: ELF and qmd now +`docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md`: ELF and qmd now emit full-suite live sweep records, but neither has a full-suite live pass. ## Context diff --git a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md b/docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md similarity index 91% rename from docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md rename to docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md index 185ab65b..a06dd616 100644 --- a/docs/guide/benchmarking/2026-06-11-capture-write-policy-live-report.md +++ b/docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Capture/Write-Policy Live Report - June 11, 2026" +description: "Checked-in benchmark evidence record: Capture/Write-Policy Live Report - June 11, 2026." 
+resource: docs/evidence/benchmarking/2026-06-11-capture-write-policy-live-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Capture/Write-Policy Live Report - June 11, 2026 Goal: Record the XY-933 live capture/write-policy evidence and competitor claim diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md similarity index 97% rename from docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md rename to docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 12aeeb01..14007b4e 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Competitor-Strength Adoption Report - June 11, 2026" +description: "Checked-in benchmark evidence record: Competitor-Strength Adoption Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Competitor-Strength Adoption Report - June 11, 2026 Goal: Publish the final benchmark vNext adoption decision and scenario matrix for @@ -16,7 +30,7 @@ Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md` and the current external adapter manifest. Outputs: Adoption decision, evidence-class boundaries, scenario matrix, follow-up optimization queue, and the machine-readable companion file -`docs/research/2026-06-11-competitor-strength-adoption-report.json`. +`apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json`.
## Adoption Decision diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md similarity index 91% rename from docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md rename to docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 6402b188..84dea005 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -1,24 +1,38 @@ +--- +type: Evidence +title: "Competitor-Strength Evidence Matrix - June 11, 2026" +description: "Checked-in benchmark evidence record: Competitor-Strength Evidence Matrix - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Competitor-Strength Evidence Matrix - June 11, 2026 Goal: Define a durable competitor-strength matrix so ELF benchmark claims are tied to measured evidence classes, typed blockers, and explicit next measurement gates. Read this when: You need to decide whether ELF can claim a win, tie, loss, gap, or non-claim against a tracked memory, RAG, or graph project. 
-Inputs: `docs/guide/benchmarking/2026-06-10-production-adoption-refresh.md`, -`docs/guide/benchmarking/2026-06-10-real-world-comparison-report.md`, -`docs/guide/benchmarking/2026-06-10-live-real-world-sweep-report.md`, -`docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md`, -`docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`, -`docs/guide/research/external_memory_improvement_plan.md`, -`docs/guide/research/research_projects_inventory.md`, +Inputs: `docs/evidence/benchmarking/2026-06-10-production-adoption-refresh.md`, +`docs/evidence/benchmarking/2026-06-10-real-world-comparison-report.md`, +`docs/evidence/benchmarking/2026-06-10-live-real-world-sweep-report.md`, +`docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md`, +`docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`, +`docs/evidence/external_memory/external_memory_improvement_plan.md`, +`docs/evidence/external_memory/research_projects_inventory.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, and `Makefile.toml`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`docs/guide/benchmarking/live_baseline_benchmark.md`, and the current external adapter +`docs/runbook/benchmarking/live_baseline_benchmark.md`, and the current external adapter manifest. Outputs: Human-readable matrix, claim boundaries, scenario next-measurement gates, and the Markdown report owner -`docs/research/2026-06-11-xy-897-competitor-strength-matrix.json`. +`docs/evidence/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md`. ## Decision Boundary @@ -95,10 +109,10 @@ lifecycle-fail -> `lifecycle_fail`, and not-encoded -> `not_encoded`. | GraphRAG | GraphRAG indexing, graph summaries, and document/text-unit evidence tables. | `research_gate`. | `blocked`: `ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker`, `tmp/real-world-memory/graphrag-smoke/summary.json`.
| `blocked`: indexing resource envelope and source citation mapping are not proven. | XY-887 cost-bounded Docker adapter over a tiny corpus and scored output tables. | Graph summary artifacts, local/global search separation, and source table evidence mapping. | | Graphiti/Zep | Temporal graph memory with current, historical, and future fact validity windows. | `research_gate`. | `blocked`: `ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal`, `tmp/real-world-memory/graphiti-zep-smoke/summary.json`. | `blocked`: Docker graph-store and temporal adapter are not proven. | XY-888 Docker-local temporal graph adapter scoring current/historical fact validity. | Temporal fact windows, invalidation/supersession semantics, and graph fact provenance. | | Letta | Core memory blocks versus archival memory with explicit operating-context surfaces. | `research_gate`. | `blocked`: the selected comparison contract is a Docker-only benchmark-created agent export that returns core block JSON, archival search/readback JSON, and source ids; no materialized export exists yet. | `blocked`: no Letta materializer currently creates the benchmark agent, imports the ELF `core_archival_memory` fixture corpus, or exports comparable core and archival evidence. | Implement and run the contained export/readback adapter before any Letta win, tie, or loss claim; keep personalization and project-decision scenarios blocked or not tested until that evidence exists. | Core memory block ergonomics, archival separation, and shared operating context readback. | -| LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a standalone memory backend adapter. | Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. 
| Checkpoint replay, deterministic regression, and state-diff evaluation patterns. | -| nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | -| llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | -| gbrain | Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops. | `research_gate`. | `not_encoded`: `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`. | `blocked`: Docker-local brain repo and database path are missing. | Prove Docker-local repository/database setup, then encode compiled_truth/timeline and operator-continuity jobs. | Compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation. | +| LangGraph | Checkpoint/replay regression workflow and durable state replay for agent runs. | `research_gate`. | `not_encoded`: `docs/research/graph_rag_adapter_followup.md`. | `unsupported`: not a standalone memory backend adapter. | Non-goal for direct win/loss until a standalone memory output contract exists; use replay jobs as benchmark infrastructure reference. | Checkpoint replay, deterministic regression, and state-diff evaluation patterns. 
| +| nanograph | Typed graph schema and query ergonomics for graph-lite developer experience. | `research_gate`. | `not_encoded`: `docs/research/graph_rag_adapter_followup.md`. | `unsupported`: not a memory backend comparison target. | Non-goal for direct win/loss unless a contained memory-backed target emerges; measure ELF graph-lite DX instead. | Typed relation schema, query ergonomics, and small graph developer experience. | +| llm-wiki | LLM-maintained wiki or knowledge-page workflow with query-save and lint loops. | `research_gate`. | `not_encoded`: `docs/research/graph_rag_adapter_followup.md`. | `unsupported`: no live service runtime for adapter proof. | Select contained plugin or instruction harness, then score knowledge pages for citations, unsupported claims, rebuild, and stale-source lint. | Maintained wiki workflows, page lint, query-save loops, and topic-scoped navigation. | +| gbrain | Operational knowledge brain with compiled_truth pages, timelines, enrichment, and maintenance loops. | `research_gate`. | `not_encoded`: `docs/research/graph_rag_adapter_followup.md`. | `blocked`: Docker-local brain repo and database path are missing. | Prove Docker-local repository/database setup, then encode compiled_truth/timeline and operator-continuity jobs. | Compiled truth pages, timeline maintenance, and human-operable knowledge-brain navigation. | | graphify | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT` evidence outputs. | Scored tiny `live_real_world` smoke; not broad graph-quality proof. | `wrong_result`: `cargo make smoke-graphify-docker-graph-report`, `tmp/real-world-memory/graphify-smoke/graphify-report.json`. | `not_encoded`: broad graph navigation, multimodal, private-corpus, and large-corpus quality remain outside the tiny smoke. | Expand beyond the generated smoke only after graph/report output maps to scored evidence on representative graph/RAG jobs. 
| Graph compression, source-location graph reports, and navigation hints for large code or document spaces. | ## Scenario Matrix diff --git a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md b/docs/evidence/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md similarity index 97% rename from docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md rename to docs/evidence/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md index 7c03cb74..59f6cf39 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +++ b/docs/evidence/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "ELF Iteration Direction From Competitor Benchmarks - June 11, 2026" +description: "Checked-in benchmark evidence record: ELF Iteration Direction From Competitor Benchmarks - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # ELF Iteration Direction From Competitor Benchmarks - June 11, 2026 Goal: Convert the current benchmark evidence and competitor-strength matrix into an @@ -9,7 +23,7 @@ Inputs: `2026-06-11-competitor-strength-evidence-matrix.md`, `2026-06-10-production-adoption-refresh.md`, `2026-06-10-real-world-comparison-report.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, -and `docs/guide/research/external_memory_improvement_plan.md`. +and `docs/evidence/external_memory/external_memory_improvement_plan.md`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`. Outputs: Current measured data, scenario gaps, and a prioritized optimization direction for future ELF work. 
diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md b/docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md similarity index 96% rename from docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md rename to docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md index bf4e53a1..1fbe20c0 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md +++ b/docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "ELF/qmd Memory-Evolution Diagnostic - June 11, 2026" +description: "Checked-in benchmark evidence record: ELF/qmd Memory-Evolution Diagnostic - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-elf-qmd-memory-evolution-diagnostic.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # ELF/qmd Memory-Evolution Diagnostic - June 11, 2026 Goal: Explain the fresh live memory-evolution failures for ELF and qmd, and turn the diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md b/docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md similarity index 96% rename from docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md rename to docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md index 8054b3fe..2ade8802 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md +++ b/docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "ELF/qmd Retrieval-Debug Profile - June 11, 2026" +description: "Checked-in benchmark evidence record: ELF/qmd Retrieval-Debug Profile - June 11, 2026." 
+resource: docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # ELF/qmd Retrieval-Debug Profile - June 11, 2026 Goal: Compare the measured retrieval-debug evidence for ELF and qmd without turning diff --git a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md b/docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md similarity index 94% rename from docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md rename to docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md index 189566c2..cf2ef71d 100644 --- a/docs/guide/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md +++ b/docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "ELF/qmd Trace Replay Diagnostics Report - June 11, 2026" +description: "Checked-in benchmark evidence record: ELF/qmd Trace Replay Diagnostics Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # ELF/qmd Trace Replay Diagnostics Report - June 11, 2026 Goal: Compare ELF and qmd on trace-level replay and wrong-result diagnostics while @@ -10,8 +24,8 @@ runner, ELF trace replay code, and the ELF service trace/admin contract. Outputs: Scenario-level `win`, `tie`, `loss`, `not_tested`, `blocked`, or `non_goal` outcomes plus concrete replay commands and artifact paths. -Machine-readable companion: -`docs/research/2026-06-11-elf-qmd-trace-replay-diagnostics-report.json`. +Markdown report owner: +`docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`. 
## Executive Judgment @@ -49,11 +63,11 @@ This is not a broad qmd-over-ELF claim. It is a scored local-debug artifact gap. | System | Replay surface | Command | Artifact | | --- | --- | --- | --- | -| ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| ELF | Stress guardrail with trace ids | `ELF_BASELINE_PROJECTS=ELF,qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json`; summarized in `docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md` | | ELF | Admin trace bundle hydration | `curl -fsS 'http://127.0.0.1:51891/v2/admin/traces/<trace_id>/bundle?mode=full&stage_items_limit=256&candidates_limit=200' -H 'X-ELF-Tenant-Id: <tenant_id>' -H 'X-ELF-Project-Id: <project_id>' -H 'X-ELF-Agent-Id: <agent_id>'` | `elf.trace_bundle/v1` response from the admin service | | ELF | Trace ranking replay | `cargo run -p elf-eval -- --config-a config/local/elf.docker.toml --config-b config/local/elf.docker.toml --trace-id <trace_id>` | JSON trace compare output over `search_trace_candidates` | | ELF | Operator-debug live trace slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` and `summary.json` | -| qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/research/2026-06-11-elf-qmd-retrieval-debug-profile.json` | +| qmd | Stress guardrail and top-10 rows | `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker` | `tmp/live-baseline/qmd-query.json`; summarized in `docs/evidence/benchmarking/2026-06-11-elf-qmd-retrieval-debug-profile.md` | | qmd | Per-query CLI replay | `npx tsx src/cli/qmd.ts query 'lex: <query>\nvec: <query>' -c
elfbench --json --no-rerank --min-score 0 -n 10` | JSON top-10 rows with `file`, line/snippet/score fields when qmd returns them | | qmd | Lifecycle replay | `npx tsx src/cli/qmd.ts update && npx tsx src/cli/qmd.ts embed -f -c elfbench && npx tsx src/cli/qmd.ts query ... --json --no-rerank` | `tmp/live-baseline/qmd-query.json` checks for update, delete, and cold-start recovery | | qmd | Operator-debug live replay slice | `cargo make real-world-job-operator-ux-live-adapters` | `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` and `summary.json` | diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md b/docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md similarity index 94% rename from docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md rename to docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md index 63b44b2b..1865dac8 100644 --- a/docs/guide/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md +++ b/docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "First-Generation OSS Adapter Promotion Report - June 11, 2026" +description: "Checked-in benchmark evidence record: First-Generation OSS Adapter Promotion Report - June 11, 2026." 
+resource: docs/evidence/benchmarking/2026-06-11-first-generation-oss-adapter-promotion-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # First-Generation OSS Adapter Promotion Report - June 11, 2026 Goal: Promote first-generation OSS memory baselines into scenario-level adapter diff --git a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md b/docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md similarity index 89% rename from docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md rename to docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md index 80e944cc..47b4e103 100644 --- a/docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md +++ b/docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "First-Generation OSS Continuity and Source-Store Report - June 11, 2026" +description: "Checked-in benchmark evidence record: First-Generation OSS Continuity and Source-Store Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # First-Generation OSS Continuity and Source-Store Report - June 11, 2026 Goal: Expand first-generation OSS adapter coverage for durable continuity, @@ -95,5 +109,5 @@ Not allowed: checked-in prompt and blocker fixtures. - `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`: updated scenario rows and explicit `comparison_outcome` values. 
-- `docs/research/2026-06-11-first-generation-oss-continuity-source-store-report.json`: +- `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md`: Markdown report owner. diff --git a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md b/docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md similarity index 95% rename from docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md rename to docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md index 290092d3..2440786e 100644 --- a/docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md +++ b/docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Graph/RAG Scored Smoke Adapter Report - June 11, 2026" +description: "Checked-in benchmark evidence record: Graph/RAG Scored Smoke Adapter Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Graph/RAG Scored Smoke Adapter Report - June 11, 2026 Goal: Record the XY-900 promotion of graph/RAG Docker smokes and the XY-929 diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md similarity index 97% rename from docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md rename to docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md index 841e945f..4d3cbe91 100644 --- a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md +++ b/docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "ELF Benchmark Measurement Coverage Audit - June 11, 2026" +description: "Checked-in
benchmark evidence record: ELF Benchmark Measurement Coverage Audit - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # ELF Benchmark Measurement Coverage Audit - June 11, 2026 Goal: Record what is actually measured today, where competitor comparisons are still @@ -103,7 +117,7 @@ live adapter or competitor runtime can complete those jobs. `cargo make real-world-memory-live-adapters` produced: XY-934 update: the June 11 consolidation row below is superseded for ELF by -`docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`. +`docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`. ELF now has live service-backed consolidation proposal scoring for the 4 checked-in consolidation jobs; qmd remains typed `not_encoded` for this suite. diff --git a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md b/docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md similarity index 92% rename from docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md rename to docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md index 9200bb86..943e2380 100644 --- a/docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md +++ b/docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "mem0/OpenMemory History and UI Export Report - June 11, 2026" +description: "Checked-in benchmark evidence record: mem0/OpenMemory History and UI Export Report - June 11, 2026." 
+resource: docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # mem0/OpenMemory History and UI Export Report - June 11, 2026 Goal: Add scenario-level mem0/OpenMemory history, personalization, deletion-audit, @@ -15,7 +29,7 @@ Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`. Outputs: Per-scenario outcomes using `win`, `tie`, `loss`, `not_tested`, `blocked`, and `non_goal`, plus command and artifact evidence for each measured claim. -Machine-readable companion: `docs/research/2026-06-11-xy-931-openmemory-ui-export-readback.json`. +Markdown report owner: `docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`. ## Executive Judgment @@ -64,9 +78,9 @@ mem0/OpenMemory rows in this report contain eight scenarios: `loss=1`, | Scenario | mem0/OpenMemory evidence | ELF comparison outcome | Status | Command | Artifact | | --- | --- | --- | --- | --- | --- | | Basic local lifecycle | mem0 passes same-corpus retrieval, update, delete, and cold-start reload in the prior first-generation baseline. | `tie` | `pass` | `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker` | `tmp/live-baseline/live-baseline-report.json` | -| Preference correction history | `Memory.history` exposes explicit `ADD` and `UPDATE` preference records; search returns only the current correction. 
| `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | -| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | -| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Preference correction history | `Memory.history` exposes explicit `ADD` and `UPDATE` preference records; search returns only the current correction. | `loss` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| Entity-scoped personalization | `search()` with `user_id`, `agent_id`, and `run_id` filters returns the ELF-scoped preference and omits a PubFi-scoped preference. 
| `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Delete audit readback | `Memory.history` exposes a `DELETE` event and post-delete search suppresses the deleted memory. | `tie` | `pass` | mem0: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`; ELF: `cargo make real-world-memory-live-adapters` | mem0: `tmp/live-baseline/mem0-checks.json`; ELF: `tmp/real-world-memory/live-adapters/`, `docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | Local SDK export-style readback | `Memory.get_all` returns the current scoped preference and omits the other scope. | `not_tested` | `pass` | `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker` | `tmp/live-baseline/mem0-checks.json` | | OpenMemory UI/export readback | The bounded export-helper setup probe finds OpenMemory product files but the export helper cannot run because Docker is unavailable inside the baseline runner. It does not reach browser/dashboard readback or same-corpus product app database validation. | `blocked` | `blocked` | `cargo make openmemory-ui-export-readback` | `tmp/live-baseline/mem0-openmemory-ui-export.json`, `tmp/live-baseline/mem0-openmemory-export-attempt.log` | | Hosted mem0 Platform export | Hosted Platform export is outside local OSS evidence. | `non_goal` | `unsupported` | Not run; local OSS comparison only. 
| `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | diff --git a/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md b/docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md similarity index 94% rename from docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md rename to docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md index 693ce98d..549bb430 100644 --- a/docs/guide/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md +++ b/docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "qmd and OpenViking Strength-Profile Report - June 11, 2026" +description: "Checked-in benchmark evidence record: qmd and OpenViking Strength-Profile Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # qmd and OpenViking Strength-Profile Report - June 11, 2026 Goal: Compare ELF against qmd and OpenViking on their actual strengths without @@ -11,8 +25,8 @@ Outputs: Scenario-level win/tie/loss/not-tested judgments, qmd wrong-result diagnosis taxonomy, OpenViking typed trajectory blockers, blocked context-trajectory jobs, and claim boundaries. -Machine-readable companion: -`docs/research/2026-06-11-qmd-openviking-strength-profile-report.json`. +Markdown report owner: +`docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md`. 
## Executive Judgment diff --git a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md b/docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md similarity index 97% rename from docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md rename to docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md index 40fca7fa..01c166fb 100644 --- a/docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +++ b/docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Temporal/History Competitor Gap Report - June 11, 2026" +description: "Checked-in benchmark evidence record: Temporal/History Competitor Gap Report - June 11, 2026." +resource: docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Temporal/History Competitor Gap Report - June 11, 2026 Goal: Turn the latest live measurements into a clear competitor-gap report and diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md similarity index 82% rename from docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md rename to docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index 9d1f9f7b..e6e0e379 100644 --- a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Dreaming-Readiness Stage Ledger - June 16, 2026" +description: "Checked-in benchmark evidence record: Dreaming-Readiness Stage Ledger - June 16, 2026." 
+resource: docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Dreaming-Readiness Stage Ledger - June 16, 2026 Goal: Define the Decodex benchmark gate for Dreaming-inspired ELF memory-system @@ -5,7 +19,7 @@ optimization stages. Read this when: You are starting or finishing a staged memory improvement lane and need the baseline command matrix, typed evidence status, post-stage outcome, and report shape required before claiming the stage improved. -Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 +Inputs: `docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md`, the June 11 competitor-strength, temporal-history, and iteration-direction reports, the XY-905 June 16 live temporal reconciliation report, the consolidation proposal spec, the memory summary spec, the XY-953 proactive brief scoring report, the XY-954 scheduled @@ -65,7 +79,7 @@ provider-backed private-corpus quality, or silent source mutation safety. - Every downstream Dreaming or competitor-improvement stage must write a post-stage JSON report and Markdown summary before claiming phase completion. - The report must compare against the baseline counts in - `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. + `docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md`. - The comparison judgment must be one of `improved`, `regressed`, `unchanged`, `blocked`, or `not_tested`. - Typed non-pass labels stay typed. Do not collapse `wrong_result`, `blocked`, @@ -93,14 +107,14 @@ provider-backed private-corpus quality, or silent source mutation safety. 
| Stage | Evidence file(s) | | --- | --- | -| Current-vs-historical correctness | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | -| Preference evolution and correction history | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md`; `docs/research/2026-06-11-temporal-history-competitor-gap-report.json` | -| Deletion, TTL, and tombstone behavior | `docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/research/2026-06-16-live-temporal-reconciliation-report.json`; `docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md` | -| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | -| Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | -| Proactive brief readiness | `docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; 
`docs/research/2026-06-16-proactive-brief-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | -| Scheduled memory task readiness | `docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `docs/research/2026-06-16-scheduled-memory-task-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/research/2026-06-08-agent-memory-selection.json` | -| Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | +| Current-vs-historical correctness | `docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Preference evolution and correction history | `docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/evidence/benchmarking/2026-06-11-mem0-openmemory-history-ui-export-report.md` | +| Deletion, TTL, and tombstone behavior | `docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`; `docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md`; `docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md` | +| Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; 
`docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md` | +| Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| Proactive brief readiness | `docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/decisions/2026-06-08-agent-memory-selection.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | +| Scheduled memory task readiness | `docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/decisions/2026-06-08-agent-memory-selection.md` | +| Final competitor retest status | `docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/evidence/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | ## Report Shape For Downstream Issues diff --git a/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md b/docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md similarity index 91% rename from docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md rename to docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md index 4e7f8302..91599dde 100644 --- a/docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md +++ b/docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Live Consolidation Proposal 
Scoring Report - June 16, 2026" +description: "Checked-in benchmark evidence record: Live Consolidation Proposal Scoring Report - June 16, 2026." +resource: docs/evidence/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Live Consolidation Proposal Scoring Report - June 16, 2026 Goal: Record the XY-934 live consolidation proposal scoring evidence and product diff --git a/docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md b/docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md similarity index 91% rename from docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md rename to docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md index f4385ad3..3d55c14e 100644 --- a/docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md +++ b/docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Live Temporal Reconciliation Report - June 16, 2026" +description: "Checked-in benchmark evidence record: Live Temporal Reconciliation Report - June 16, 2026." +resource: docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Live Temporal Reconciliation Report - June 16, 2026 Goal: Record the XY-905 live memory-evolution before/after result and trace contract. @@ -5,7 +19,7 @@ Read this when: You need the current evidence for ELF live current-vs-historical supersession, rationale, tombstone, and invalidation behavior. Inputs: `cargo make real-world-memory-evolution`, `cargo make real-world-memory-live-adapters`, and -`docs/research/2026-06-16-live-temporal-reconciliation-report.json`. 
+`docs/evidence/benchmarking/2026-06-16-live-temporal-reconciliation-report.md`. Outputs: A scoped benchmark result for ELF live `memory_evolution` only. ## Executive Judgment diff --git a/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md b/docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md similarity index 90% rename from docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md rename to docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md index 255c544d..99a7dc10 100644 --- a/docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md +++ b/docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Proactive Brief Scoring Report - June 16, 2026" +description: "Checked-in benchmark evidence record: Proactive Brief Scoring Report - June 16, 2026." +resource: docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Proactive Brief Scoring Report - June 16, 2026 Purpose: Publish the XY-953 fixture-backed proactive project brief scoring result. @@ -6,7 +20,7 @@ Read this when: You need the current proactive-brief fixture evidence, stage-led delta, and claim boundaries. Not this document: A scheduler design, morning-dashboard UI, private-corpus run, or hosted managed-memory comparison. -Source: `docs/research/2026-06-16-proactive-brief-scoring-report.json`. +Report owner: `docs/evidence/benchmarking/2026-06-16-proactive-brief-scoring-report.md`. 
## Summary diff --git a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md b/docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md similarity index 97% rename from docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md rename to docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md index f0d5dedd..0e825852 100644 --- a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md +++ b/docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Real-World Job Benchmark Report" +description: "Checked-in benchmark evidence record: Real-World Job Benchmark Report." +resource: docs/evidence/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - benchmarking +--- # Real-World Job Benchmark Report Goal: Publish a Markdown summary for one generated real_world_job benchmark report. @@ -68,17 +82,17 @@ This section is manifest-backed. It records external adapter coverage and blocke | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | ELF | `elf_real_world_memory_fixture` | `fixture_backed` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`project_decisions`: `pass`
`retrieval`: `pass`
`memory_evolution`: `pass`
`consolidation`: `pass`
`memory_summary`: `pass`
`proactive_brief`: `blocked`
`scheduled_memory`: `blocked`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`core_archival_memory`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory`
result: `tmp/real-world-memory/real-world-memory-report.md` | | ELF | `elf_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `pass`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/elf-report.md` | -| qmd | `qmd_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `retrieval`: `not_encoded`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker`
result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| qmd | `qmd_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `retrieval`: `not_encoded`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker`
result: `docs/runbook/benchmarking/live_baseline_benchmark.md` | | qmd | `qmd_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `not_encoded`
`knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `wrong_result`
`capture_integration`: `not_encoded`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/qmd-report.md` | | ELF | `elf_operator_debug_live` | `live_real_world` | `pass` | `pass` | `pass` | `pass` | `true` | `operator_debugging_ux`: `pass` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/elf-report.md` | | qmd | `qmd_operator_debug_live` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `operator_debugging_ux`: `wrong_result` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.md` | | agentmemory | `agentmemory_live_baseline` | `live_baseline_only` | `lifecycle_fail` | `pass` | `lifecycle_fail` | `lifecycle_fail` | `true` | `work_resume`: `blocked`
`capture_integration`: `blocked`
`memory_evolution`: `blocked` | setup: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | | mem0/OpenMemory | `mem0_openmemory_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `memory_evolution`: `not_encoded`
`personalization`: `not_encoded`
`operator_debugging_ux`: `blocked` | setup: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | | memsearch | `memsearch_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `trust_source_of_truth`: `not_encoded`
`retrieval`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | -| OpenViking | `openviking_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `retrieval`: `wrong_result`
`work_resume`: `not_encoded`
`context_trajectory`: `blocked` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| OpenViking | `openviking_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `retrieval`: `wrong_result`
`work_resume`: `not_encoded`
`context_trajectory`: `blocked` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/runbook/benchmarking/live_baseline_benchmark.md` | | claude-mem | `claude_mem_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `work_resume`: `not_encoded`
`operator_debugging_ux`: `blocked`
`capture_integration`: `blocked` | setup: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | -| qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | -| OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`
`context_trajectory`: `blocked`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | +| qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`
result: `docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md` | +| OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`
`context_trajectory`: `blocked`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md` | | RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`knowledge_compilation`: `not_encoded`
`production_ops`: `blocked` | setup: `cargo make smoke-ragflow-docker`
result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | | LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `cargo make smoke-lightrag-docker-context`
result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | | GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `cargo make smoke-graphrag-docker`
result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | @@ -222,9 +236,9 @@ This section is manifest-backed. It records external adapter coverage and blocke | `agentmemory_live_baseline` | `durable_work_resume_local_path` | `work_resume` | `blocked` | `blocked` | The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | | `agentmemory_live_baseline` | `capture_write_policy_hooks` | `capture_integration` | `blocked` | `blocked` | agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | | `mem0_openmemory_live_baseline` | `basic_local_lifecycle` | `memory_evolution` | `pass` | `tie` | Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | -| `mem0_openmemory_live_baseline` | `preference_correction_history` | `personalization` | `pass` | `loss` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | -| `mem0_openmemory_live_baseline` | `entity_scoped_personalization` | `personalization` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | -| `mem0_openmemory_live_baseline` | `delete_audit_readback` | `memory_evolution` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `preference_correction_history` | `personalization` | `pass` | `loss` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `entity_scoped_personalization` | `personalization` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| `mem0_openmemory_live_baseline` | `delete_audit_readback` | `memory_evolution` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | | `mem0_openmemory_live_baseline` | `local_get_all_export_readback` | `operator_debugging_ux` | `pass` | `not_tested` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.
command: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
artifact: `tmp/live-baseline/mem0-checks.json` | | `mem0_openmemory_live_baseline` | `openmemory_ui_export_readback` | `operator_debugging_ux` | `blocked` | `blocked` | The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.
command: `cargo make openmemory-ui-export-readback`
artifact: `tmp/live-baseline/mem0-openmemory-ui-export.json` | | `mem0_openmemory_live_baseline` | `hosted_platform_export` | `operator_debugging_ux` | `unsupported` | `non_goal` | Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md new file mode 100644 index 00000000..e8f581b6 --- /dev/null +++ b/docs/evidence/benchmarking/index.md @@ -0,0 +1,37 @@ +# Benchmarking Evidence Index + +Purpose: Route agents to checked-in benchmark reports, matrices, diagnostics, and +adoption evidence. +Read this when: You need public-safe evidence behind benchmark or production-readiness +claims. +Not this document: Commands for running benchmarks or governing benchmark schemas. +Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. + +## Concepts + +- `2026-06-09-live-baseline-report.md`: Live Baseline Benchmark Report - 2026-06-09. +- `2026-06-09-operator-debugging-ux-report.md`: Real-World Job Benchmark Report. +- `2026-06-09-production-adoption-gate-report.md`: Production Adoption Gate Report - June 9, 2026. +- `2026-06-09-production-corpus-report.md`: Live Baseline Benchmark Report. +- `2026-06-10-live-real-world-sweep-report.md`: Live Real-World Adapter Sweep Report - June 10, 2026. +- `2026-06-10-production-adoption-refresh.md`: Post-Adapter Production Adoption Refresh - June 10, 2026. +- `2026-06-10-real-world-comparison-report.md`: Real-World Comparison Report - June 10, 2026. +- `2026-06-11-capture-write-policy-live-report.md`: Capture/Write-Policy Live Report - June 11, 2026. +- `2026-06-11-competitor-strength-adoption-report.md`: Competitor-Strength Adoption Report - June 11, 2026. +- `2026-06-11-competitor-strength-evidence-matrix.md`: Competitor-Strength Evidence Matrix - June 11, 2026. +- `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`: ELF Iteration Direction From Competitor Benchmarks - June 11, 2026. +- `2026-06-11-elf-qmd-memory-evolution-diagnostic.md`: ELF/qmd Memory-Evolution Diagnostic - June 11, 2026. 
+- `2026-06-11-elf-qmd-retrieval-debug-profile.md`: ELF/qmd Retrieval-Debug Profile - June 11, 2026. +- `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: ELF/qmd Trace Replay Diagnostics Report - June 11, 2026; qmd top-10/replay artifact evidence is compared with ELF trace/admin surfaces. +- `2026-06-11-first-generation-oss-adapter-promotion-report.md`: First-Generation OSS Adapter Promotion Report - June 11, 2026. +- `2026-06-11-first-generation-oss-continuity-source-store-report.md`: First-Generation OSS Continuity and Source-Store Report - June 11, 2026. +- `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: Graph/RAG Scored Smoke Adapter Report - June 11, 2026. +- `2026-06-11-measurement-coverage-audit.md`: ELF Benchmark Measurement Coverage Audit - June 11, 2026. +- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: mem0/OpenMemory History and UI Export Report - June 11, 2026. +- `2026-06-11-qmd-openviking-strength-profile-report.md`: qmd and OpenViking Strength-Profile Report - June 11, 2026; separates qmd retrieval quality from debug/replay ergonomics, preserves XY-928 OpenViking evidence, and keeps context-trajectory surfaces as blocked/not-tested until scored staged evidence exists. +- `2026-06-11-temporal-history-competitor-gap-report.md`: Temporal/History Competitor Gap Report - June 11, 2026. +- `2026-06-16-dreaming-readiness-stage-ledger.md`: Dreaming-Readiness Stage Ledger - June 16, 2026. +- `2026-06-16-live-consolidation-proposal-scoring-report.md`: Live Consolidation Proposal Scoring Report - June 16, 2026. +- `2026-06-16-live-temporal-reconciliation-report.md`: Live Temporal Reconciliation Report - June 16, 2026. +- `2026-06-16-proactive-brief-scoring-report.md`: Proactive Brief Scoring Report - June 16, 2026. +- `2026-06-16-scheduled-memory-task-scoring-report.md`: Real-World Job Benchmark Report. 
diff --git a/docs/guide/research/agentmemory_adapter.md b/docs/evidence/external_memory/agentmemory_adapter.md similarity index 92% rename from docs/guide/research/agentmemory_adapter.md rename to docs/evidence/external_memory/agentmemory_adapter.md index 65d51662..81355ffc 100644 --- a/docs/guide/research/agentmemory_adapter.md +++ b/docs/evidence/external_memory/agentmemory_adapter.md @@ -1,3 +1,17 @@ +--- +type: Evidence +title: "Agentmemory Fixture Adapter" +description: "Evidence record for the agentmemory fixture adapter boundary." +resource: docs/evidence/external_memory/agentmemory_adapter.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - external_memory +--- # Agentmemory Fixture Adapter Goal: Convert sanitized agentmemory-style session exports into ELF-owned note/doc @@ -6,7 +20,7 @@ Read this when: You need to compare coding-agent memory capture against ELF with running an agentmemory server or bypassing ELF ingestion. Inputs: A local JSON fixture with agentmemory-style sessions, observations, memories, and retrieval cases. -Depends on: `elf-eval`, `docs/research/2026-06-08-agent-memory-selection.json`, +Depends on: `elf-eval`, `docs/decisions/2026-06-08-agent-memory-selection.md`, `docs/spec/system_elf_memory_service_v2.md`, `docs/spec/system_doc_source_ref_v1.md`, and `docs/spec/system_source_ref_doc_pointer_v1.md`. Outputs: A deterministic `elf.agentmemory_adapter/v1` JSON bundle with note candidates, @@ -161,7 +175,7 @@ Then run `elf-eval` as usual: cargo run -p elf-eval -- -c ./elf.toml --dataset tmp/agentmemory-eval.json ``` -For config-to-config comparisons or trace replay, follow `docs/guide/evaluation.md`. +For config-to-config comparisons or trace replay, follow `docs/runbook/evaluation.md`. 
## Verification diff --git a/docs/guide/research/comparison_external_projects.md b/docs/evidence/external_memory/comparison_external_projects.md similarity index 98% rename from docs/guide/research/comparison_external_projects.md rename to docs/evidence/external_memory/comparison_external_projects.md index 42a861f8..3cbb583f 100644 --- a/docs/guide/research/comparison_external_projects.md +++ b/docs/evidence/external_memory/comparison_external_projects.md @@ -1,17 +1,31 @@ +--- +type: Evidence +title: "External Memory Project Comparison" +description: "Provide a detailed, evidence-backed comparison between ELF and adjacent memory projects." +resource: docs/evidence/external_memory/comparison_external_projects.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - external_memory +--- # External Memory Project Comparison Goal: Provide a detailed, evidence-backed comparison between ELF and adjacent memory projects. Read this when: You are evaluating architecture directions, positioning claims, or adoption trade-offs. Inputs: Current ELF docs/code and public documentation for the compared external projects. -Depends on: `docs/spec/system_elf_memory_service_v2.md` and `docs/guide/research/research_projects_inventory.md`. +Depends on: `docs/spec/system_elf_memory_service_v2.md` and `docs/evidence/external_memory/research_projects_inventory.md`. Outputs: A comparison matrix and trade-off summary suitable for follow-up design decisions. Scope note: This document is intentionally detailed and source-heavy. Keep `README.md` concise and link here for full analysis. -For a full list of reviewed and pending projects, see `docs/guide/research/research_projects_inventory.md`. +For a full list of reviewed and pending projects, see `docs/evidence/external_memory/research_projects_inventory.md`. For the June 2026 agentmemory and dreaming decision run, see -`docs/research/2026-06-08-agent-memory-selection.json`. 
+`docs/decisions/2026-06-08-agent-memory-selection.md`. For the June 2026 real-world benchmark-dimension refresh, see -`docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`. +`docs/spec/real_world_agent_memory_benchmark_v1.md`. Comparison focuses on shared capabilities, ELF distinctives, and objective trade-offs. These projects solve adjacent problems, but their primary storage units and default workflows differ. diff --git a/docs/guide/research/external_memory_improvement_plan.md b/docs/evidence/external_memory/external_memory_improvement_plan.md similarity index 96% rename from docs/guide/research/external_memory_improvement_plan.md rename to docs/evidence/external_memory/external_memory_improvement_plan.md index 6ad45be2..e63e9515 100644 --- a/docs/guide/research/external_memory_improvement_plan.md +++ b/docs/evidence/external_memory/external_memory_improvement_plan.md @@ -1,9 +1,23 @@ +--- +type: Evidence +title: "External Memory Improvement Plan - June 9, 2026" +description: "Convert the June 2026 live benchmark, external memory-system research, and Dexter radar operating pattern into an issue-ready ELF improvement plan." +resource: docs/evidence/external_memory/external_memory_improvement_plan.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - external_memory +--- # External Memory Improvement Plan - June 9, 2026 Goal: Convert the June 2026 live benchmark, external memory-system research, and Dexter radar operating pattern into an issue-ready ELF improvement plan. Read this when: Deciding what to implement next before using ELF as a personal production memory system. -Inputs: `README.md`, `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`, `docs/guide/research/comparison_external_projects.md`, `docs/guide/research/research_projects_inventory.md`, current Linear readback, and the local Dexter Pattern Radar automation pattern. 
-Depends on: `docs/governance.md`, `docs/spec/system_elf_memory_service_v2.md`, and the checked-in live baseline runner. +Inputs: `README.md`, `docs/evidence/benchmarking/2026-06-09-live-baseline-report.md`, `docs/evidence/external_memory/comparison_external_projects.md`, `docs/evidence/external_memory/research_projects_inventory.md`, current Linear readback, and the local Dexter Pattern Radar automation pattern. +Depends on: `docs/policy.md`, `docs/spec/system_elf_memory_service_v2.md`, and the checked-in live baseline runner. Outputs: Prioritized gaps, issue queue, parallelization plan, acceptance criteria, and follow-up radar model. ## Summary Judgment @@ -24,7 +38,7 @@ So the answer is not "ELF is universally better." The current evidence supports ### Live Benchmark Evidence -Checked-in report: `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`. +Checked-in report: `docs/evidence/benchmarking/2026-06-09-live-baseline-report.md`. Current encoded result: diff --git a/docs/evidence/external_memory/index.md b/docs/evidence/external_memory/index.md new file mode 100644 index 00000000..dcc806ca --- /dev/null +++ b/docs/evidence/external_memory/index.md @@ -0,0 +1,16 @@ +# External Memory Evidence Index + +Purpose: Route agents to promoted external memory-system comparison evidence. +Read this when: You need accepted comparison inputs, reviewed-project inventory, or +external adapter evidence that is no longer latent research. +Not this document: Active research contracts or radar run commands. +Routes to: External memory evidence concepts under `docs/evidence/external_memory/`. + +## Concepts + +- `research_projects_inventory.md`: audited and pending external memory/context + projects. +- `comparison_external_projects.md`: detailed external memory-system comparison. +- `external_memory_improvement_plan.md`: June 2026 improvement backlog and adoption + evidence synthesis. +- `agentmemory_adapter.md`: agentmemory fixture adapter boundary and evidence. 
diff --git a/docs/guide/research/research_projects_inventory.md b/docs/evidence/external_memory/research_projects_inventory.md similarity index 74% rename from docs/guide/research/research_projects_inventory.md rename to docs/evidence/external_memory/research_projects_inventory.md index be322238..d19ad1d7 100644 --- a/docs/guide/research/research_projects_inventory.md +++ b/docs/evidence/external_memory/research_projects_inventory.md @@ -1,9 +1,23 @@ +--- +type: Evidence +title: "External Project Research Inventory" +description: "Maintain a single, auditable inventory of external memory/context projects reviewed for ELF architecture decisions." +resource: docs/evidence/external_memory/research_projects_inventory.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - evidence + - external_memory +--- # External Project Research Inventory Goal: Maintain a single, auditable inventory of external memory/context projects reviewed for ELF architecture decisions. Read this when: You need to know which external projects have already been reviewed or still need a deep dive. Inputs: Existing research notes, open architecture questions, and tracked adoption threads. -Depends on: `docs/guide/research/comparison_external_projects.md`. +Depends on: `docs/evidence/external_memory/comparison_external_projects.md`. Outputs: A current inventory of reviewed and pending external projects. Last updated: June 11, 2026. @@ -18,26 +32,26 @@ Last updated: June 11, 2026. 
| Project | Research depth | Current status | Benchmark dimension role | Why it matters to ELF | Primary reference | | ------- | -------------- | -------------- | ------------------------ | --------------------- | ----------------- | -| [agentmemory](https://github.com/rohitg00/agentmemory) | D1 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent coding-memory hooks, MCP/REST surface, viewer, consolidation lifecycle, and external benchmark target | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [OpenAI ChatGPT Memory Dreaming](https://openai.com/index/chatgpt-memory-dreaming/) | D1 | Reviewed | `rw.consolidation-review` | Background memory synthesis and staleness repair as a product direction | `docs/research/2026-06-08-agent-memory-selection.json` | -| [Claude Managed Agents Dreams](https://platform.claude.com/docs/en/managed-agents/dreams) | D1 | Reviewed | `rw.consolidation-review` | Reviewable derived memory-store output over past sessions; strong safety shape for ELF consolidation | `docs/research/2026-06-08-agent-memory-selection.json` | -| [Gemini CLI Auto Memory](https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Background session mining with project-local review inbox for memory patches and skills | `docs/research/2026-06-08-agent-memory-selection.json` | -| [mem0](https://github.com/mem0ai/mem0) | D2 | Reviewed | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity` | Graph memory as additive context, memory history and async mode trade-offs | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [memsearch](https://github.com/zilliztech/memsearch) | D2 | Reviewed | 
`rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown-first SoT + rebuildable index pattern | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [qmd](https://github.com/tobi/qmd) | D2 | Reviewed | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Retrieval routing, weighted fusion, and local-first explainability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [claude-mem](https://github.com/thedotmack/claude-mem) | D2 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive disclosure and strong operator workflow | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [OpenViking](https://github.com/volcengine/OpenViking) | D2 | Reviewed | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | Filesystem context paradigm, hierarchical retrieval, trajectory observability | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [llm-wiki](https://github.com/nvk/llm-wiki) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.knowledge-synthesis`, `rw.resume-evidence` | LLM-maintained wiki pattern, topic-scoped knowledge bases, query-save and lint workflows | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and 
database proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` | -| [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate`; XY-889 adds Docker graph/report smoke | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references; current ELF evidence is a generated-corpus Docker smoke, not broad graph-quality proof | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | -| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only`; XY-927 selects blocked contained export/readback path | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; compare only after a Docker-only benchmark-created agent export returns core block JSON, archival readback JSON, and source ids | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json`; 
`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | -| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | `docs/guide/research/comparison_external_projects.md`; `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [RAGFlow](https://github.com/infiniflow/ragflow) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no live strength claim | Docker setup is resource-heavy but documented; API references expose document/chunk evidence handles for a tiny-corpus adapter smoke | 
`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [LightRAG](https://github.com/HKUDS/LightRAG) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no live strength claim | Docker compose path, context-only query modes, and source file-path citation shape support an implementation follow-up | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | -| [GraphRAG](https://github.com/microsoft/graphrag) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no live strength claim | Cost-bounded CLI/API path and parquet output tables expose document, text-unit, and graph-summary handles for evidence mapping | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` | +| [agentmemory](https://github.com/rohitg00/agentmemory) | D1 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.lifecycle-staleness` | Cross-agent coding-memory hooks, MCP/REST surface, viewer, consolidation lifecycle, and external benchmark target | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/decisions/2026-06-08-agent-memory-selection.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [OpenAI ChatGPT Memory Dreaming](https://openai.com/index/chatgpt-memory-dreaming/) | D1 | Reviewed | `rw.consolidation-review` | Background memory synthesis and staleness repair as a product direction | `docs/decisions/2026-06-08-agent-memory-selection.md` | +| [Claude Managed Agents Dreams](https://platform.claude.com/docs/en/managed-agents/dreams) | D1 | 
Reviewed | `rw.consolidation-review` | Reviewable derived memory-store output over past sessions; strong safety shape for ELF consolidation | `docs/decisions/2026-06-08-agent-memory-selection.md` | +| [Gemini CLI Auto Memory](https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Background session mining with project-local review inbox for memory patches and skills | `docs/decisions/2026-06-08-agent-memory-selection.md` | +| [mem0](https://github.com/mem0ai/mem0) | D2 | Reviewed | `rw.lifecycle-staleness`, `rw.graph-temporal`, `rw.operator-continuity` | Graph memory as additive context, memory history and async mode trade-offs | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [memsearch](https://github.com/zilliztech/memsearch) | D2 | Reviewed | `rw.lifecycle-staleness`, `rw.retrieval-debug`, `rw.resume-evidence` | Markdown-first SoT + rebuildable index pattern | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [qmd](https://github.com/tobi/qmd) | D2 | Reviewed | `rw.retrieval-debug`, `rw.lifecycle-staleness`, `rw.resume-evidence` | Retrieval routing, weighted fusion, and local-first explainability | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [claude-mem](https://github.com/thedotmack/claude-mem) | D2 | Reviewed | `rw.operator-continuity`, `rw.resume-evidence`, `rw.retrieval-debug` | Progressive disclosure and strong operator workflow | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [OpenViking](https://github.com/volcengine/OpenViking) | D2 | Reviewed | `rw.context-trajectory`, `rw.resume-evidence`, `rw.retrieval-debug` | Filesystem context paradigm, hierarchical retrieval, 
trajectory observability | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md` | +| [llm-wiki](https://github.com/nvk/llm-wiki) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.knowledge-synthesis`, `rw.resume-evidence` | LLM-maintained wiki pattern, topic-scoped knowledge bases, query-save and lint workflows | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/derived_knowledge_page_followup.md` | +| [gbrain](https://github.com/garrytan/gbrain) | D1 | Reviewed; XY-882 verdict `blocked` | `rw.knowledge-synthesis`, `rw.operator-continuity` | Operational knowledge brain, `compiled_truth` + timeline pages, enrichment and maintenance loops; blocked on Docker-local brain repo and database proof | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/derived_knowledge_page_followup.md` | +| [Always-On Memory Agent](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent) | D1 | Reviewed | `rw.consolidation-review`, `rw.operator-continuity` | Always-on multimodal ingest + scheduled consolidation loop with simple local ops surface | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/dreaming_product_surface_followup.md` | +| [graphify](https://github.com/safishamsi/graphify) | D1 | Reviewed; XY-882 verdict `adapter_candidate`; XY-889 adds Docker graph/report smoke | `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.resume-evidence` | Multimodal graph compression, deterministic code extraction, and graph/report outputs with source-file/source-location references; current ELF evidence is a generated-corpus Docker smoke, not broad graph-quality proof | `docs/evidence/external_memory/comparison_external_projects.md`; 
`docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/graph_rag_adapter_followup.md`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| [Letta](https://github.com/letta-ai/letta) | D1 | Reviewed; XY-882 verdict `research_only`; XY-927 selects blocked contained export/readback path | `rw.core-archival`, `rw.operator-continuity` | Core vs archival memory split, shared blocks; compare only after a Docker-only benchmark-created agent export returns core block JSON, archival readback JSON, and source ids | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/graph_rag_adapter_followup.md`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| [LangGraph](https://docs.langchain.com/oss/python/langgraph/persistence) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.replay-regression`, `rw.resume-evidence` | Checkpoint/replay mindset for quality regression workflows; not a standalone memory backend adapter | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/graph_rag_adapter_followup.md` | +| [Graphiti / Zep](https://help.getzep.com/graphiti/core-concepts/temporal-awareness) | D1 | Reviewed; XY-882 verdict `adapter_candidate` | `rw.graph-temporal`, `rw.resume-evidence` | Temporal fact validity model with Docker-local graph-store options and UUID/fact/validity-window output | `docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `docs/research/graph_rag_adapter_followup.md` | +| [nanograph](https://github.com/nanograph/nanograph) | D1 | Reviewed; XY-882 verdict `research_only` | `rw.graph-temporal`, `rw.retrieval-debug` | Typed schema + typed query ergonomics for graph-lite developer experience; official shape is no server/no Docker | 
`docs/evidence/external_memory/comparison_external_projects.md`; `docs/spec/real_world_agent_memory_benchmark_v1.md`; `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/graph_rag_adapter_followup.md` | +| [RAGFlow](https://github.com/infiniflow/ragflow) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.resume-evidence`, `rw.graph-navigation`, `rw.retrieval-debug`; no live strength claim | Docker setup is resource-heavy but documented; API references expose document/chunk evidence handles for a tiny-corpus adapter smoke | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/graph_rag_adapter_followup.md` | +| [LightRAG](https://github.com/HKUDS/LightRAG) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.graph-temporal`, `rw.retrieval-debug`; no live strength claim | Docker compose path, context-only query modes, and source file-path citation shape support an implementation follow-up | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/graph_rag_adapter_followup.md` | +| [GraphRAG](https://github.com/microsoft/graphrag) | D2 feasibility gate | Research gate remains; XY-882 verdict `adapter_candidate` | Candidate `rw.graph-navigation`, `rw.knowledge-synthesis`, `rw.retrieval-debug`; no live strength claim | Cost-bounded CLI/API path and parquet output tables expose document, text-unit, and graph-summary handles for evidence mapping | `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`; `docs/research/graph_rag_adapter_followup.md` | ## June 10, 2026 Adapter Feasibility Verdicts @@ -91,10 +105,12 @@ replacing ELF's evidence-bound service contract. 
- [XY-40](https://linear.app/hack-ink/issue/XY-40/vision-track-elf-as-a-high-trust-memory-system-for-singlemulti-agent) - [XY-51](https://linear.app/hack-ink/issue/XY-51/agent-memory-ux-mcp-surface-skills-doc-pointers-epic) - [XY-63](https://linear.app/hack-ink/issue/XY-63/research-openviking-as-optional-doc-backend-integration-sketch) -- Current June 2026 research runs: - - `docs/research/2026-06-08-agent-memory-selection.json` - - `docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json` - - `docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json` +- Promoted June 2026 research: + - `docs/decisions/2026-06-08-agent-memory-selection.md` + - `docs/spec/real_world_agent_memory_benchmark_v1.md` + - `docs/research/graph_rag_adapter_followup.md` + - `docs/research/derived_knowledge_page_followup.md` + - `docs/research/dreaming_product_surface_followup.md` ## Notes diff --git a/docs/research/external_memory_pattern_radar/latest.md b/docs/evidence/external_memory_pattern_radar_latest.md similarity index 84% rename from docs/research/external_memory_pattern_radar/latest.md rename to docs/evidence/external_memory_pattern_radar_latest.md index 00cb8fa7..cad1348c 100644 --- a/docs/research/external_memory_pattern_radar/latest.md +++ b/docs/evidence/external_memory_pattern_radar_latest.md @@ -1,9 +1,30 @@ +--- +type: Evidence +title: "External Memory Pattern Radar Summary" +description: "Preserve the latest weekly ELF external memory pattern radar outcome." 
+resource: docs/evidence/external_memory_pattern_radar_latest.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-18 +tags: + - docs + - external-memory-pattern-radar + - evidence +source_refs: [] +code_refs: + - apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json + - apps/elf-eval/src/bin/external_memory_pattern_radar.rs +related: [] +drift_watch: + - docs/evidence/external_memory_pattern_radar_latest.md +--- # External Memory Pattern Radar Summary Goal: Preserve the latest weekly ELF external memory pattern radar outcome. Read this when: Feeding the next full comparison report or deciding whether a watched upstream memory project created an ELF follow-up. -Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes. -Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/guide/research/external_memory_pattern_radar.md`. +Inputs: `apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, checked-in ELF comparison evidence, and any Codex source-review notes. +Depends on: `docs/spec/external_memory_pattern_radar_v1.md` and `docs/runbook/external_memory_pattern_radar.md`. Outputs: Latest no-issue, rejection, or issue-ready radar decisions. - Run id: `external-memory-pattern-radar-2026-06-10` diff --git a/docs/evidence/index.md b/docs/evidence/index.md new file mode 100644 index 00000000..c726739e --- /dev/null +++ b/docs/evidence/index.md @@ -0,0 +1,21 @@ +# Evidence Index + +Purpose: Route agents to public-safe proof, validation evidence, and semantic drift +audits. +Read this when: You need evidence behind documentation readiness, checked claims, or +drift review. +Not this document: Raw machine-readable benchmark JSON or latent research contracts. +Routes to: Drift audits and evidence concepts under `docs/evidence/`. 
+ +## Concepts + +- `benchmarking/index.md`: checked-in benchmark reports, matrices, diagnostics, and + adoption evidence. +- `external_memory/index.md`: external memory-system comparisons, inventories, and + promoted research evidence. +- `2026-06-18-docs-okf-self-check.md`: Drift audit for the docs OKF and LLM Wiki + migration. +- `2026-06-18-research-artifact-disposition.md`: Evidence record for promoted, + carried-forward, moved, and deleted legacy research JSON artifacts. +- `external_memory_pattern_radar_latest.md`: Latest weekly external memory pattern + radar summary. diff --git a/docs/governance.md b/docs/governance.md deleted file mode 100644 index e2b3fe1e..00000000 --- a/docs/governance.md +++ /dev/null @@ -1,105 +0,0 @@ -# Documentation Governance - -Purpose: Define how agent-facing documentation is organized, updated, and kept consistent -across this repository. -Status: normative -Read this when: You are creating, moving, splitting, or revising repository documentation. -Not this document: System behavior contracts or operational runbooks for one subsystem. -Defines: Document classes, placement rules, routing headers, and docs update workflow. - -Audience: All documentation under `docs/` is written for AI agents and LLM workflows. -The split between `spec` and `guide` is by task shape, not by reader type. - -## Principles - -- Optimize for retrieval, routing, and execution. -- Keep one authoritative document per topic. -- Separate normative truth from procedural steps. -- Prefer explicit section labels and stable links over prose-heavy narrative. -- Let structure emerge from real topics. Avoid premature folder taxonomies. - -## Document classes - -| Class | Location | Answers | Source of truth for | Update trigger | -| --- | --- | --- | --- | --- | -| Spec | `docs/spec/` | What must be true? | Contracts, schemas, invariants, required behavior | Any behavior or schema change | -| Guide | `docs/guide/` | What should I do? 
| Runbooks, migrations, validation, troubleshooting | Any procedure or operational change | -| Research runs | `docs/research/` | Which evidence-backed research run reached what state? | Machine-readable hypotheses, evidence, trade-offs, challenge records, and terminal decision state | A research workflow needs durable replayable state | -| Plan artifacts | `docs/plans/` | Which saved plan artifact should a planning tool or execution workflow use? | Tool-managed planning outputs | As emitted or updated by the relevant tool | - -## Placement rules - -- If a document defines correctness, it belongs in `docs/spec/`. -- If a document defines actions, it belongs in `docs/guide/`. -- If a document is non-normative decision support, comparison, or research input, treat it - as guide-class material and store it under `docs/guide/`. -- If a research workflow requires a machine-readable run file with replayable events, - store that run file under `docs/research/` and link to it from the relevant guide. -- Do not treat `docs/plans/` as a general-purpose docs bucket. -- Use `docs/plans/` only for artifacts produced or consumed by planning tools or - workflows that explicitly depend on saved plan files. -- Do not duplicate the same authoritative content across documents. Link to the source - of truth instead. -- A guide may summarize why a step exists, but normative statements still live in the - governing spec. - -## Document contracts - -Every document should start with a short routing header. - -Spec header: - -- `Purpose` -- `Status: normative` -- `Read this when` -- `Not this document` -- `Defines` - -Guide header: - -- `Goal` -- `Read this when` -- `Inputs` or `Preconditions` -- `Depends on` -- `Outputs` or `Verification` - -## Structure rules - -- Prefer shallow paths by default. -- Add subfolders only when they mirror stable system boundaries or improve retrieval. -- Use descriptive `snake_case` file names. 
-- Do not require fixed filename prefixes unless a real ambiguity appears. -- Do not create empty folders, empty indexes, or placeholder documents to satisfy a - taxonomy. - -## Canonical entry points - -- Unified documentation router: `docs/index.md` -- Normative router: `docs/spec/index.md` -- Procedural router: `docs/guide/index.md` -- Repo task and automation entrypoints: `Makefile.toml` - -## LLM reading guidance - -When answering a repository question: - -1. Read `docs/index.md` for routing. -2. Route by question type: - - "What must be true?" -> `docs/spec/index.md` - - "What should I do?" -> `docs/guide/index.md` -3. Read `Makefile.toml` when the task depends on repository automation or named tasks. -4. Use `docs/research/` only when the task explicitly concerns a machine-readable - research run file used by a research workflow. -5. Use `docs/plans/` only when the task explicitly concerns a saved plan artifact used by - a planning tool or execution workflow. - -## Update workflow - -- Behavior or schema change: update the relevant spec. -- Procedure change: update the relevant guide. -- If a change touches both truth and procedure, update both documents and keep their - boundary explicit. -- When a guide starts carrying normative content, move that content into spec and link - to it. -- Do not impose local document-header requirements on files under `docs/plans/`; those - files are owned by the planning tool or workflow that created them. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md deleted file mode 100644 index 56de3357..00000000 --- a/docs/guide/benchmarking/index.md +++ /dev/null @@ -1,157 +0,0 @@ -# Benchmarking Guide Index - -Goal: Route agents to live benchmark runbooks, report publication steps, and checked-in -benchmark evidence. -Read this when: You need to run, publish, interpret, or extend ELF benchmark evidence -against external memory systems. 
-Inputs: The benchmark question, selected corpus profile, and whether you need a runbook -or a saved evidence snapshot. -Depends on: `docs/index.md`, `docs/guide/index.md`, and `docs/governance.md`. -Outputs: The smallest benchmarking guide or report needed to continue. - -## Use This Index When - -- You need to run the live Docker-only benchmark matrix. -- You need to publish a Markdown report from a generated benchmark JSON report. -- You need the checked-in benchmark evidence behind README claims. -- You need to extend the benchmark matrix with new projects, profiles, or lifecycle - checks. - -Do not use benchmark commands as the production operating procedure. For single-user -Docker Compose production start, stop, backup, restore, Qdrant rebuild, rollback, and -cleanup, use `docs/guide/single_user_production.md`. - -## Guides And Reports - -- `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live - Docker-only benchmark matrix, including generated public and production-corpus - profiles, private addendum publication, opt-in 10k/100k backfill, and soak - profiles. -- `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9, - 2026 ELF production-provider stress run and all-project smoke comparison. -- `2026-06-09-production-corpus-report.md`: checked-in synthetic production-corpus - ELF adoption benchmark report with task queries and evidence IDs. -- `2026-06-09-production-adoption-gate-report.md`: XY-836 production adoption - decision report with fresh provider-backed synthetic, stress, backfill, restore, and - external adapter evidence. -- `2026-06-09-operator-debugging-ux-report.md`: checked-in real-world job - operator-debugging UX report with trace/viewer links, raw-SQL avoidance, root-cause - step counts, dropped-candidate visibility, and repair-action clarity. 
-- `2026-06-10-real-world-comparison-report.md`: checked-in post-P1 real-world - comparison report with aggregate fixture evidence, external-adapter evidence classes, - remaining typed gaps, and adoption implications. -- `2026-06-10-live-real-world-sweep-report.md`: XY-880 full-suite live real-world - sweep report for ELF and qmd, showing per-suite live pass and typed non-pass states - without claiming full-suite live parity. -- `2026-06-10-production-adoption-refresh.md`: XY-884 post-adapter production - adoption refresh that keeps the decision at adopt with bounded caveats and separates - fixture, live adapter, private corpus, credentialed, blocked, and research-gate - evidence. -- `2026-06-11-competitor-strength-evidence-matrix.md`: XY-897 competitor-strength - matrix contract that maps every tracked memory/RAG/graph project to its strongest - scenario, current evidence class, typed blockers, next measurement gate, and ELF - borrow-if-stronger direction. -- `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`: current - optimization-direction report that translates measured benchmark data and competitor - strengths into prioritized ELF iteration themes and explicit non-claims. -- `2026-06-11-measurement-coverage-audit.md`: fresh coverage audit that separates - current measured ELF/qmd data, fixture evidence including the XY-927 - `core_archival_memory` suite, external adapter ledger coverage, scenario non-claims, - and the next measurement reports needed before stronger competitor claims. -- `2026-06-11-elf-qmd-retrieval-debug-profile.md`: fresh ELF/qmd retrieval-debug - profile with real-world retrieval-suite evidence, 480-document stress baseline - evidence, qmd top-10 artifact inspection, and explicit rerank/fusion non-claims. 
-- `2026-06-11-elf-qmd-memory-evolution-diagnostic.md`: fresh ELF/qmd - memory-evolution diagnostic showing fixture pass, live ELF/qmd current-vs-historical - wrong-result patterns, qmd tombstone evidence miss, and temporal-reconciliation - iteration directions. -- `2026-06-11-temporal-history-competitor-gap-report.md`: fresh report-only - temporal/history competitor-gap report that updates the mem0 basic lifecycle result, - records Graphiti/Zep and Letta claim boundaries, and turns qmd, mem0/OpenMemory, - Graphiti/Zep, Letta, and adjacent project strengths into benchmark-gated ELF - optimization directions. -- `2026-06-11-qmd-openviking-strength-profile-report.md`: XY-899 strength-profile - report that separates qmd retrieval quality from debug/replay ergonomics, records - qmd wrong-result diagnosis classes, and preserves XY-928 OpenViking - context-trajectory surfaces as blocked/not-tested until scored staged, - hierarchical, and recursive evidence exists. -- `2026-06-11-elf-qmd-trace-replay-diagnostics-report.md`: XY-923 trace-level - replay and wrong-result diagnostics report that scores qmd top-10/replay artifact - ergonomics against ELF trace/admin surfaces while keeping retrieval correctness, - rerank, fusion, candidate-drop, and typed non-pass boundaries separate. -- `2026-06-11-first-generation-oss-adapter-promotion-report.md`: XY-898 - first-generation OSS adapter promotion report that updates agentmemory, - mem0/OpenMemory, memsearch, and claude-mem with fresh scenario-level baseline - evidence and ELF win/tie/loss/untested positions without converting baseline-only - evidence into real-world suite wins. 
-- `2026-06-11-first-generation-oss-continuity-source-store-report.md`: XY-925 - follow-up report that adds first-generation OSS fixture-backed prompt coverage and - typed blockers for agentmemory durable continuity, memsearch canonical Markdown - source-store/debug jobs, and claude-mem progressive-disclosure, retrieval-repair, - hook, and viewer/operator surfaces. -- `2026-06-11-graph-rag-scored-smoke-adapter-report.md`: XY-900 graph/RAG - scored-smoke adapter report, updated by XY-929 with a representative - graph/RAG fixture slice, that keeps RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, - graphify, llm-wiki, and gbrain outputs as scored or typed non-pass - `real_world_job` evidence without converting smoke or representative - non-pass evidence into quality claims. -- `2026-06-11-competitor-strength-adoption-report.md`: XY-901 final - competitor-strength adoption report, updated by XY-927 with fixture-backed - core-vs-archival coverage and by XY-929 with representative graph/RAG - typed non-pass fixtures, plus the bounded personal-production decision, - scenario-level win/tie/loss/not-tested matrix, claim boundaries, and - optimization issue queue. -- `2026-06-11-capture-write-policy-live-report.md`: XY-933 live capture/write-policy - report that scores ELF redaction, exclusions, source ids, evidence binding, and no - secret leakage while preserving typed blocked/untested boundaries for agentmemory - and claude-mem capture breadth. -- `2026-06-16-live-consolidation-proposal-scoring-report.md`: XY-934 live - consolidation proposal scoring report that separates fixture-backed consolidation - passes from service-backed live proposal materialization, lineage, confidence, - unsupported-claim flags, and apply/defer/discard audit evidence. 
-- `2026-06-11-mem0-openmemory-history-ui-export-report.md`: XY-924 plus XY-931 - mem0/OpenMemory local OSS history, preference-correction, deletion-audit, - personalization, and export-readback comparison with normalized - win/tie/loss/not-tested/blocked/non-goal outcomes and explicit hosted/UI/graph - non-claims. -- `2026-06-16-dreaming-readiness-stage-ledger.md`: XY-951 stage-gate ledger for - Dreaming-inspired memory improvements, with the required current baseline, - post-stage command matrix, typed improved/regressed/unchanged/blocked/not-tested - buckets, and machine-readable companion file - `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`. -- `2026-06-16-proactive-brief-scoring-report.md`: XY-953 fixture-backed proactive - project brief scoring report with source refs, freshness/currentness markers, - reject/defer rationale, stale/tombstone guards, and the private-corpus blocker tied - to XY-930. -- `2026-06-16-scheduled-memory-task-scoring-report.md`: XY-954 fixture-backed - scheduled-memory task scoring report with source refs, freshness/currentness - markers, action rationale, execution trace/readback, source-mutation guards, and - the private/provider scheduler blocker tied to XY-930. -- `2026-06-16-live-temporal-reconciliation-report.md`: XY-905 live temporal - reconciliation follow-up showing ELF live `memory_evolution` moving from - `pass=1`, `wrong_result=5` to `pass=6`, `wrong_result=0`, with trace/readback - fields for selected current, historical, rationale, tombstone, invalidation, - dropped, and non-narrated evidence. -- `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world - agent memory benchmark contract, including suite taxonomy, typed report states, - knowledge-compilation fixture tasks, and the production-ops fixture target. 
-- `real_world_memory_evolution.md`: run and interpret the checked-in memory evolution - jobs for current facts, historical facts, stale traps, conflicts, update rationales, - and temporal graph limitations. - -## Update Rules - -- Add a dated report when a new run changes README-level claims. -- Keep generated raw JSON under `tmp/live-baseline/`; commit only reviewed Markdown - summaries and durable scripts. -- Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; - commit fixture schemas, smoke fixtures, runner code, and durable docs only. -- Keep generated real-world memory trust/personalization/knowledge/production-ops JSON - and Markdown under `tmp/real-world-memory/`; commit fixtures, runner code, and - durable docs only. -- Link the newest decision-relevant report from README and this index. -- When benchmark semantics change, update `live_baseline_benchmark.md` and the - relevant spec before publishing a new result. -- Real-world job benchmark changes are governed by - `docs/spec/real_world_agent_memory_benchmark_v1.md`; keep this guide as routing and - do not duplicate the normative schema here. diff --git a/docs/guide/index.md b/docs/guide/index.md deleted file mode 100644 index bbeeec91..00000000 --- a/docs/guide/index.md +++ /dev/null @@ -1,73 +0,0 @@ -# Guide Index - -Goal: Route agents to procedural documents that tell them how to execute work safely and -repeatably. -Read this when: You know the question is operational and need the best execution path. -Inputs: The current task shape, subsystem, and whether you need background research. -Depends on: `docs/index.md` and `docs/governance.md`. -Outputs: The smallest guide or guide subfolder needed to continue execution. - -Question this index answers: "what should I do?" - -## Use this index when - -- You need a runbook, how-to, migration sequence, validation flow, troubleshooting - path, or maintenance procedure. 
-- You already know the relevant spec and need the operational steps. -- You need a bounded sequence with prerequisites and verification. -- You need external comparisons or research notes that inform an implementation choice. - -## Do not use this index when - -- You need the authoritative contract, schema, or invariant. -- You need a planning-tool artifact or a saved execution plan under `docs/plans/`. -- You need broad documentation policy or repo task-entrypoint rules; read - `docs/governance.md` or `Makefile.toml` instead. - -## What belongs in `docs/guide/` - -- Task-oriented runbooks. -- Validation and test procedures. -- Migration, rollout, rollback, and recovery sequences. -- Troubleshooting flows and operator checklists. -- Short implementation recipes that depend on a governing spec. -- Decision-support research and external comparisons that inform implementation choices. - -## Guide document contract - -Start each guide with a compact routing header: - -- `Goal` -- `Read this when` -- `Inputs` or `Preconditions` -- `Depends on` -- `Outputs` or `Verification` - -Then structure the body for execution: - -- Write steps in the order an agent should perform them. -- Keep commands, checks, and rollback points explicit. -- Link to specs for normative truth instead of restating contracts. -- Include failure branches only when they change the next action. -- End with verification so an agent can tell whether the guide succeeded. - -## Structure policy - -- Group guides by workflow or subsystem only when multiple guides exist and the grouping - improves retrieval. -- Do not create empty category folders or placeholder section headings. -- Prefer titles that encode the task or outcome, such as `validate_release.md` or - `rerun_ingest_job.md`. -- Keep the guide index as a router, not a dumping ground for long explanations. 
- -## Guide subfolders - -- `docs/guide/single_user_production.md` for the single-user production runbook, - backup/restore path, migration checks, and Qdrant rebuild proof. -- `docs/guide/benchmarking/` for live benchmark runbooks, report publication steps, - and checked-in benchmark evidence. -- `docs/guide/competitive_parity_testing.md` for running the Docker-only adoption - gate against external memory-system baselines. -- `docs/guide/development/` for repository-development workflows. -- `docs/guide/research/` for external comparisons and decision-support materials that are - non-normative. diff --git a/docs/guide/research/index.md b/docs/guide/research/index.md deleted file mode 100644 index cf11bc56..00000000 --- a/docs/guide/research/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# Research Guide Index - -Goal: Route agents to external comparison and decision-support research for ELF memory architecture. -Read this when: You need to compare ELF with adjacent memory, context, RAG, or consolidation systems. -Inputs: Current ELF docs/code, public external project docs, tracker state, and checked-in research run files. -Depends on: `docs/index.md`, `docs/governance.md`, and `docs/research/` for machine-readable research runs. -Outputs: The smallest comparison or inventory document needed for implementation decisions. - -## Documents - -- `research_projects_inventory.md`: audited and pending external projects, research depth, and current planning surface. -- `comparison_external_projects.md`: detailed capability comparison, project trade-offs, source map, and research-backed ELF directions. -- `external_memory_improvement_plan.md`: prioritized June 2026 improvement backlog, issue queue, parallelization plan, and production-adoption gate from benchmark and external-project evidence. -- `agentmemory_adapter.md`: fixture-backed agentmemory import and baseline adapter boundary for `elf-eval`. 
-- `external_memory_pattern_radar.md`: weekly radar runbook for upstream memory-system - deltas, no-issue decisions, and issue-ready pattern evidence. - -## Machine-Readable Runs - -Machine-authoritative research run JSON files live under `docs/research/`. -Use those files when a research conclusion needs replayable hypotheses, evidence, -trade-offs, challenge records, and terminal decision state. diff --git a/docs/index.md b/docs/index.md index 1d364989..c4a952cb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,37 +3,47 @@ Purpose: Route agents to the smallest correct document set for the current task. Read this when: You are starting from repository docs and need to choose the right lane. Not this document: Detailed subsystem contracts, step-by-step runbooks, research run state, or saved plan artifacts. -Routes to: `docs/governance.md`, `docs/spec/`, `docs/guide/`, `docs/research/`, `docs/plans/`, and `Makefile.toml`. +Routes to: `docs/policy.md`, `docs/spec/`, `docs/runbook/`, `docs/reference/`, +`docs/decisions/`, `docs/research/`, `docs/evidence/`, and `Makefile.toml`. Audience: All documentation in this repository is written for AI agents and LLM workflows. The split below is by question type, not by human-versus-agent audience. ## Read order -- Read `docs/governance.md` for document contracts and placement rules. +- Read `docs/policy.md` for document contracts and placement rules. - Read `Makefile.toml` when the task depends on repo task names or execution entrypoints. - Then choose one primary lane: - `docs/spec/index.md` when the question is "what must be true?" - - `docs/guide/index.md` when the question is "what should I do?" -- Use `docs/research/` only when a research workflow explicitly points to a - machine-readable research run file there. -- Use `docs/plans/` only when a planning tool or execution workflow explicitly points to - a saved plan artifact there. + - `docs/runbook/index.md` when the question is "what should I do?" 
+- Use `docs/reference/` for current non-procedural orientation and retained + historical plan artifacts. +- Use `docs/decisions/` for accepted rationale. +- Use `docs/research/` for active OKF research contracts. Machine-readable artifacts + stay outside `docs/` and are cited only when an active owner still needs them. +- Use `docs/evidence/` for proof records, benchmark reports, external comparison + evidence, drift audits, and promoted research evidence. ## Routing matrix - Need contracts, invariants, schemas, enums, state machines, or required behavior -> `docs/spec/` - Need runbooks, migrations, validation steps, troubleshooting, or operational sequences -> - `docs/guide/` + `docs/runbook/` - Need the single-user production backup, restore, and Qdrant rebuild path -> - `docs/guide/single_user_production.md` -- Need external comparisons or architecture research inputs -> `docs/guide/research/` -- Need machine-readable research run state, evidence, trade-offs, and decision status -> - `docs/research/` + `docs/runbook/single_user_production.md` +- Need benchmark commands or interpretation steps -> `docs/runbook/benchmarking/` +- Need checked-in benchmark reports -> `docs/evidence/benchmarking/` +- Need external comparisons or architecture research inputs -> + `docs/evidence/external_memory/` +- Need external-memory radar commands -> `docs/runbook/external_memory_pattern_radar.md` +- Need research provenance, evidence, trade-offs, or decision status -> + `docs/research/`, `docs/decisions/`, and `docs/evidence/` depending on whether the + point is latent, accepted, or audit evidence. 
- Need repo task names or automation entrypoints -> `Makefile.toml` -- Need documentation placement or authoring rules -> `docs/governance.md` -- Need a planning-tool artifact or saved execution plan -> `docs/plans/` +- Need documentation placement or authoring rules -> `docs/policy.md` +- Need a retained planning-tool artifact or saved execution plan -> + `docs/reference/plans/` ## Retrieval rules diff --git a/docs/log.md b/docs/log.md new file mode 100644 index 00000000..8f352cad --- /dev/null +++ b/docs/log.md @@ -0,0 +1,34 @@ +# Documentation Maintenance Log + +Purpose: Record material OKF and LLM Wiki navigation, promotion, naming, and +maintenance changes. +Read this when: You need to understand why documentation structure changed. +Not this document: Detailed subsystem history, raw research state, or plan execution +logs. + +## 2026-06-18 + +- Adopted the Decodex Markdown-only OKF and LLM Wiki profile for `docs/`. +- Added `docs/policy.md` as the canonical documentation-shape owner. +- Added required lane indexes for `decisions`, `evidence`, `reference`, `research`, + and `runbook`. +- Moved raw JSON research and evaluation artifacts out of `docs/` so docs can remain + Markdown-only while preserving machine-readable evidence. +- Promoted settled legacy research JSON into decision, spec, runbook, and evidence + owners; moved test-required machine reports to app fixtures after Markdown reports + became the docs owners. +- Carried unresolved but valuable points forward as explicit research contracts under + `docs/research/`. +- Moved the external-memory pattern radar cursor to + `apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json` because it is + active tool state rather than a research conclusion. +- Moved the latest external-memory pattern radar summary to + `docs/evidence/external_memory_pattern_radar_latest.md` because it is evidence, not + latent research. +- Added a docs self-check drift audit under `docs/evidence/`. 
+- Removed the legacy guide top-level lane. Procedural documents now live under + `docs/runbook/`; checked reports and external comparison inputs live under + `docs/evidence/`. +- Moved retained plan artifacts from the legacy plans top-level lane to + `docs/reference/plans/` so the + top-level docs directories match the Decodex docs lane set. diff --git a/docs/plans/.gitkeep b/docs/plans/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/policy.md b/docs/policy.md new file mode 100644 index 00000000..006a4f47 --- /dev/null +++ b/docs/policy.md @@ -0,0 +1,93 @@ +--- +type: Policy +title: "Documentation OKF Policy" +description: "Canonical Markdown-only OKF and LLM Wiki policy for repository documentation." +resource: docs/policy.md +status: active +authority: normative +owner: docs +last_verified: 2026-06-18 +tags: + - docs + - okf + - llm-wiki +source_refs: [] +code_refs: + - Makefile.toml + - scripts/check-docs.py +related: [] +drift_watch: + - docs/ + - Makefile.toml + - scripts/check-docs.py +--- +# Documentation OKF Policy + +Purpose: Own the repository documentation shape, lane policy, and validation gates for +the Markdown-only OKF and LLM Wiki bundle. +Status: normative +Read this when: You are creating, moving, splitting, promoting, or validating +repository documentation. +Not this document: Product behavior contracts, operational runbooks for one +subsystem, or raw machine-readable research artifacts. +Defines: OKF concept shape, LLM Wiki lane ownership, docs validation, and research +artifact placement. + +## Bundle Contract + +- `docs/` is a Markdown-only OKF and LLM Wiki bundle. +- `docs/index.md` is the root router. +- `docs/policy.md` owns documentation shape and validation policy. +- `docs/log.md` records material navigation, promotion, naming, and maintenance changes. +- Every populated directory under `docs/` has an `index.md`. +- Non-index, non-log Markdown files are OKF concepts with YAML frontmatter. 
+- Machine-readable JSON research state, benchmark reports, cursors, and sample datasets + live outside `docs/`; docs concepts link to or name those artifacts as evidence. + +## Required Lanes + +- `docs/spec/`: normative contracts, schemas, invariants, and required behavior. +- `docs/runbook/`: procedural runbooks, migrations, validation flows, and + operational sequences. +- `docs/reference/`: current structure references and non-procedural orientation. +- `docs/decisions/`: accepted rationale and durable decision records. +- `docs/research/`: latent research contracts and research evidence candidates. +- `docs/evidence/`: public-safe proof, validation evidence, and drift audits. + +Historical plan artifacts may live under `docs/reference/plans/` while they remain useful +for repository navigation. They are reference concepts, not a top-level docs lane. + +## Concept Frontmatter + +Every OKF concept requires: + +- `type` +- `title` +- `description` +- `status` +- `authority` +- `owner` +- `last_verified` + +Allowed concept types are `Decision`, `Drift Audit`, `Evidence`, `Policy`, +`Reference`, `Research Contract`, `Runbook`, and `Spec`. + +Use `tags`, `source_refs`, `code_refs`, `related`, `promotes_to`, and `drift_watch` +when they improve owner discovery or drift review. + +## Research Boundary + +Research concepts are latent until explicitly promoted. A research concept may cite a +machine-readable artifact outside `docs/`, but the raw artifact is not the docs owner. +Promote accepted facts into `docs/spec/`, `docs/runbook/`, `docs/reference/`, +`docs/decisions/`, or `docs/evidence/`; retire stale raw artifacts once their settled +content has an owner; then update indexes, links, and `docs/log.md`. + +## Validation + +- Run `decodex docs check` before claiming the OKF and LLM Wiki bundle is ready. +- Run `cargo make check-docs` for the repository-native Markdown link and task-name + check.
+- When docs claims touch commands, config, code, schemas, generated outputs, or runtime + behavior, perform a semantic drift audit and record the evidence under + `docs/evidence/`. diff --git a/docs/reference/index.md b/docs/reference/index.md new file mode 100644 index 00000000..7c5d7b49 --- /dev/null +++ b/docs/reference/index.md @@ -0,0 +1,10 @@ +# Reference Index + +Purpose: Route agents to current structure references and non-procedural orientation. +Read this when: You need a stable overview that is not a normative spec or runbook. +Not this document: Correctness contracts, execution steps, or latent research. +Routes to: Reference concepts under `docs/reference/`. + +## Concepts + +- `plans/index.md`: retained historical planning artifacts kept as reference concepts. diff --git a/docs/plans/2026-02-02-cli-alignment-design.md b/docs/reference/plans/2026-02-02-cli-alignment-design.md similarity index 90% rename from docs/plans/2026-02-02-cli-alignment-design.md rename to docs/reference/plans/2026-02-02-cli-alignment-design.md index c3dc24ef..3e57c2b6 100644 --- a/docs/plans/2026-02-02-cli-alignment-design.md +++ b/docs/reference/plans/2026-02-02-cli-alignment-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "CLI Alignment Implementation Plan" +description: "Retained historical plan artifact: CLI Alignment Implementation Plan." +resource: docs/reference/plans/2026-02-02-cli-alignment-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # CLI Alignment Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
diff --git a/docs/plans/2026-02-02-project-cleanup-design.md b/docs/reference/plans/2026-02-02-project-cleanup-design.md similarity index 79% rename from docs/plans/2026-02-02-project-cleanup-design.md rename to docs/reference/plans/2026-02-02-project-cleanup-design.md index 4f6d6cf4..e6db9839 100644 --- a/docs/plans/2026-02-02-project-cleanup-design.md +++ b/docs/reference/plans/2026-02-02-project-cleanup-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Project Cleanup Architecture Design" +description: "Retained historical plan artifact: Project Cleanup Architecture Design." +resource: docs/reference/plans/2026-02-02-project-cleanup-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Project Cleanup Architecture Design **Goal:** Restructure each app into a library-plus-binary layout, remove `#[path]` test imports, and make `cargo make lint-rust` pass without suppressing lints. diff --git a/docs/plans/2026-02-02-project-cleanup.md b/docs/reference/plans/2026-02-02-project-cleanup.md similarity index 95% rename from docs/plans/2026-02-02-project-cleanup.md rename to docs/reference/plans/2026-02-02-project-cleanup.md index a0ef40d4..02599554 100644 --- a/docs/plans/2026-02-02-project-cleanup.md +++ b/docs/reference/plans/2026-02-02-project-cleanup.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Project Cleanup Implementation Plan" +description: "Retained historical plan artifact: Project Cleanup Implementation Plan." +resource: docs/reference/plans/2026-02-02-project-cleanup.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Project Cleanup Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
diff --git a/docs/plans/2026-02-03-search-expansion-design.md b/docs/reference/plans/2026-02-03-search-expansion-design.md similarity index 88% rename from docs/plans/2026-02-03-search-expansion-design.md rename to docs/reference/plans/2026-02-03-search-expansion-design.md index 4f8c99e6..542d2132 100644 --- a/docs/plans/2026-02-03-search-expansion-design.md +++ b/docs/reference/plans/2026-02-03-search-expansion-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Search Expansion and Multi-Query Fusion Design" +description: "Retained historical plan artifact: Search Expansion and Multi-Query Fusion Design." +resource: docs/reference/plans/2026-02-03-search-expansion-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Search Expansion and Multi-Query Fusion Design ## Overview diff --git a/docs/plans/2026-02-04-chunked-embeddings-design.md b/docs/reference/plans/2026-02-04-chunked-embeddings-design.md similarity index 94% rename from docs/plans/2026-02-04-chunked-embeddings-design.md rename to docs/reference/plans/2026-02-04-chunked-embeddings-design.md index c90f1126..54a3b3bb 100644 --- a/docs/plans/2026-02-04-chunked-embeddings-design.md +++ b/docs/reference/plans/2026-02-04-chunked-embeddings-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Chunked Embeddings (Chunk-First Retrieval) Design" +description: "Retained historical plan artifact: Chunked Embeddings (Chunk-First Retrieval) Design." +resource: docs/reference/plans/2026-02-04-chunked-embeddings-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Chunked Embeddings (Chunk-First Retrieval) Design **Goal:** Deliver a chunk-first retrieval architecture that maximizes recall and precision while keeping indexing and updates efficient. 
diff --git a/docs/plans/2026-02-04-chunked-embeddings-implementation.md b/docs/reference/plans/2026-02-04-chunked-embeddings-implementation.md similarity index 96% rename from docs/plans/2026-02-04-chunked-embeddings-implementation.md rename to docs/reference/plans/2026-02-04-chunked-embeddings-implementation.md index 87f560b0..d6d27477 100644 --- a/docs/plans/2026-02-04-chunked-embeddings-implementation.md +++ b/docs/reference/plans/2026-02-04-chunked-embeddings-implementation.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Chunked Embeddings Implementation Plan" +description: "Retained historical plan artifact: Chunked Embeddings Implementation Plan." +resource: docs/reference/plans/2026-02-04-chunked-embeddings-implementation.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Chunked Embeddings Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. @@ -617,7 +631,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"api","summary":"Return **Files:** - Modify: `docs/spec/system_elf_memory_service_v1.md` -- Modify: `docs/guide/integration-testing.md` +- Modify: `docs/runbook/integration-testing.md` **Step 1: Write the failing test** @@ -634,9 +648,9 @@ Add a doc lint placeholder (if no doc lint exists, skip this test step). 
**Step 3: Commit** ```bash -git add docs/spec/system_elf_memory_service_v1.md docs/guide/integration-testing.md +git add docs/spec/system_elf_memory_service_v1.md docs/runbook/integration-testing.md -git commit -m '{"schema":"cmsg/1","type":"docs","scope":"global","summary":"Document chunk-first retrieval","intent":"Align specs and guides with chunk embeddings","impact":"Specs reflect new schema and API","breaking":false,"risk":"low","refs":[]}' +git commit -m '{"schema":"cmsg/1","type":"docs","scope":"global","summary":"Document chunk-first retrieval","intent":"Align specs and runbooks with chunk embeddings","impact":"Specs reflect new schema and API","breaking":false,"risk":"low","refs":[]}' ``` --- @@ -650,7 +664,7 @@ Expected: PASS (integration tests may be ignored if external services are not se ## Execution Handoff -Plan complete and saved to `docs/plans/2026-02-04-chunked-embeddings-implementation.md`. +Plan complete and saved to `docs/reference/plans/2026-02-04-chunked-embeddings-implementation.md`. Two execution options: diff --git a/docs/plans/2026-02-04-llm-cache-design.md b/docs/reference/plans/2026-02-04-llm-cache-design.md similarity index 90% rename from docs/plans/2026-02-04-llm-cache-design.md rename to docs/reference/plans/2026-02-04-llm-cache-design.md index 3a6bde14..b4b5d17f 100644 --- a/docs/plans/2026-02-04-llm-cache-design.md +++ b/docs/reference/plans/2026-02-04-llm-cache-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "LLM Cache for Query Expansion and Reranking Design" +description: "Retained historical plan artifact: LLM Cache for Query Expansion and Reranking Design." 
+resource: docs/reference/plans/2026-02-04-llm-cache-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # LLM Cache for Query Expansion and Reranking Design Date: 2026-02-04 diff --git a/docs/plans/2026-02-04-llm-cache-implementation-plan.md b/docs/reference/plans/2026-02-04-llm-cache-implementation-plan.md similarity index 97% rename from docs/plans/2026-02-04-llm-cache-implementation-plan.md rename to docs/reference/plans/2026-02-04-llm-cache-implementation-plan.md index 5a5bd692..597aa623 100644 --- a/docs/plans/2026-02-04-llm-cache-implementation-plan.md +++ b/docs/reference/plans/2026-02-04-llm-cache-implementation-plan.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "LLM Cache Implementation Plan" +description: "Retained historical plan artifact: LLM Cache Implementation Plan." +resource: docs/reference/plans/2026-02-04-llm-cache-implementation-plan.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # LLM Cache Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. diff --git a/docs/plans/2026-02-04-search-explainability-design.md b/docs/reference/plans/2026-02-04-search-explainability-design.md similarity index 88% rename from docs/plans/2026-02-04-search-explainability-design.md rename to docs/reference/plans/2026-02-04-search-explainability-design.md index d419303a..98f0344a 100644 --- a/docs/plans/2026-02-04-search-explainability-design.md +++ b/docs/reference/plans/2026-02-04-search-explainability-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Search Explainability Outputs Design" +description: "Retained historical plan artifact: Search Explainability Outputs Design." 
+resource: docs/reference/plans/2026-02-04-search-explainability-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Search Explainability Outputs Design Date: 2026-02-04 diff --git a/docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md b/docs/reference/plans/2026-02-09-ranking-harness-trace-policy-compare.md similarity index 90% rename from docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md rename to docs/reference/plans/2026-02-09-ranking-harness-trace-policy-compare.md index 35787537..37b81e7e 100644 --- a/docs/plans/2026-02-09-ranking-harness-trace-policy-compare.md +++ b/docs/reference/plans/2026-02-09-ranking-harness-trace-policy-compare.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Trace-Based Ranking Harness: Next Steps" +description: "Retained historical plan artifact: Trace-Based Ranking Harness: Next Steps." +resource: docs/reference/plans/2026-02-09-ranking-harness-trace-policy-compare.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Trace-Based Ranking Harness: Next Steps ## Context diff --git a/docs/plans/2026-02-10-search-ranking-explain-v2-design.md b/docs/reference/plans/2026-02-10-search-ranking-explain-v2-design.md similarity index 87% rename from docs/plans/2026-02-10-search-ranking-explain-v2-design.md rename to docs/reference/plans/2026-02-10-search-ranking-explain-v2-design.md index 06d27d2b..995994ff 100644 --- a/docs/plans/2026-02-10-search-ranking-explain-v2-design.md +++ b/docs/reference/plans/2026-02-10-search-ranking-explain-v2-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Search Ranking Explain v2 (Additive Terms, v2-Only)" +description: "Retained historical plan artifact: Search Ranking Explain v2 (Additive Terms, v2-Only)." 
+resource: docs/reference/plans/2026-02-10-search-ranking-explain-v2-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Search Ranking Explain v2 (Additive Terms, v2-Only) ## Goal diff --git a/docs/plans/2026-02-10-structured-memory-fields-design.md b/docs/reference/plans/2026-02-10-structured-memory-fields-design.md similarity index 86% rename from docs/plans/2026-02-10-structured-memory-fields-design.md rename to docs/reference/plans/2026-02-10-structured-memory-fields-design.md index ac896740..895c5dc5 100644 --- a/docs/plans/2026-02-10-structured-memory-fields-design.md +++ b/docs/reference/plans/2026-02-10-structured-memory-fields-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Structured Memory Fields With Field-Level Embeddings" +description: "Retained historical plan artifact: Structured Memory Fields With Field-Level Embeddings." +resource: docs/reference/plans/2026-02-10-structured-memory-fields-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Structured Memory Fields With Field-Level Embeddings ## Goal diff --git a/docs/plans/2026-02-22-org-shared-design.md b/docs/reference/plans/2026-02-22-org-shared-design.md similarity index 94% rename from docs/plans/2026-02-22-org-shared-design.md rename to docs/reference/plans/2026-02-22-org-shared-design.md index 7b839bf4..7b47ec93 100644 --- a/docs/plans/2026-02-22-org-shared-design.md +++ b/docs/reference/plans/2026-02-22-org-shared-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Org-Shared (Tenant-Wide) Semantics Design" +description: "Retained historical plan artifact: Org-Shared (Tenant-Wide) Semantics Design." 
+resource: docs/reference/plans/2026-02-22-org-shared-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Org-Shared (Tenant-Wide) Semantics Design Date: 2026-02-22 diff --git a/docs/plans/2026-02-22-org-shared-implementation-plan.md b/docs/reference/plans/2026-02-22-org-shared-implementation-plan.md similarity index 92% rename from docs/plans/2026-02-22-org-shared-implementation-plan.md rename to docs/reference/plans/2026-02-22-org-shared-implementation-plan.md index 0bdcaf0f..129f5c34 100644 --- a/docs/plans/2026-02-22-org-shared-implementation-plan.md +++ b/docs/reference/plans/2026-02-22-org-shared-implementation-plan.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Org-Shared (Tenant-Wide) Semantics Implementation Plan" +description: "Retained historical plan artifact: Org-Shared (Tenant-Wide) Semantics Implementation Plan." +resource: docs/reference/plans/2026-02-22-org-shared-implementation-plan.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Org-Shared (Tenant-Wide) Semantics Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. @@ -147,7 +161,7 @@ git commit -m '{"schema":"cmsg/1","type":"feat","scope":"sharing","summary":"Def --- -Plan complete and saved to `docs/plans/2026-02-22-org-shared-implementation-plan.md`. +Plan complete and saved to `docs/reference/plans/2026-02-22-org-shared-implementation-plan.md`. 
Two execution options: 1) **Subagent-Driven (this session)** — execute tasks one-by-one with review checkpoints diff --git a/docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md b/docs/reference/plans/2026-02-23-agent-memory-mcp-skills-backlog.md similarity index 94% rename from docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md rename to docs/reference/plans/2026-02-23-agent-memory-mcp-skills-backlog.md index 8ffbf71a..2ab08560 100644 --- a/docs/plans/2026-02-23-agent-memory-mcp-skills-backlog.md +++ b/docs/reference/plans/2026-02-23-agent-memory-mcp-skills-backlog.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Agent Memory (MCP + Skills) Backlog" +description: "Retained historical plan artifact: Agent Memory (MCP + Skills) Backlog." +resource: docs/reference/plans/2026-02-23-agent-memory-mcp-skills-backlog.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Agent Memory (MCP + Skills) Backlog Date: 2026-02-23 @@ -49,7 +63,7 @@ Proposed `source_ref` shape (v0): - `access`: optional hint for how to fetch (e.g. `"s3" | "http" | "local_fs"`) Acceptance criteria: -- Add a spec/guide page describing the schema and forward/backward compatibility rules. +- Add a spec/runbook page describing the schema and forward/backward compatibility rules. - Provide at least one reference implementation of encoding/decoding in an agent-side “skill”. 
### Issue 3: Add a document hydration component (Doc Store and/or Doc MCP) diff --git a/docs/plans/2026-02-24-doc-ext-v1-design.md b/docs/reference/plans/2026-02-24-doc-ext-v1-design.md similarity index 93% rename from docs/plans/2026-02-24-doc-ext-v1-design.md rename to docs/reference/plans/2026-02-24-doc-ext-v1-design.md index 6f54e8c7..8cf35a2b 100644 --- a/docs/plans/2026-02-24-doc-ext-v1-design.md +++ b/docs/reference/plans/2026-02-24-doc-ext-v1-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Doc Extension v1 (Evidence Store) — Design" +description: "Retained historical plan artifact: Doc Extension v1 (Evidence Store) — Design." +resource: docs/reference/plans/2026-02-24-doc-ext-v1-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Doc Extension v1 (Evidence Store) — Design **Status:** Approved (v1 scope locked) diff --git a/docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md b/docs/reference/plans/2026-02-24-doc-ext-v1-implementation-plan.md similarity index 93% rename from docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md rename to docs/reference/plans/2026-02-24-doc-ext-v1-implementation-plan.md index 15ffebea..d66b7b26 100644 --- a/docs/plans/2026-02-24-doc-ext-v1-implementation-plan.md +++ b/docs/reference/plans/2026-02-24-doc-ext-v1-implementation-plan.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Doc Extension v1 (Evidence Store) Implementation Plan" +description: "Retained historical plan artifact: Doc Extension v1 (Evidence Store) Implementation Plan." +resource: docs/reference/plans/2026-02-24-doc-ext-v1-implementation-plan.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Doc Extension v1 (Evidence Store) Implementation Plan > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
diff --git a/docs/plans/2026-02-25-agent-skills-cookbook-design.md b/docs/reference/plans/2026-02-25-agent-skills-cookbook-design.md similarity index 79% rename from docs/plans/2026-02-25-agent-skills-cookbook-design.md rename to docs/reference/plans/2026-02-25-agent-skills-cookbook-design.md index 29b73aaf..c564fb07 100644 --- a/docs/plans/2026-02-25-agent-skills-cookbook-design.md +++ b/docs/reference/plans/2026-02-25-agent-skills-cookbook-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Agent Skills Cookbook (MCP-first) — Design" +description: "Retained historical plan artifact: Agent Skills Cookbook (MCP-first) — Design." +resource: docs/reference/plans/2026-02-25-agent-skills-cookbook-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Agent Skills Cookbook (MCP-first) — Design Status: Proposed @@ -19,7 +33,7 @@ Ship a non-normative "skills cookbook" that standardizes how an agent should use - Long-form evidence via Doc Extension v1 (store documents; hydrate bounded excerpts on demand). - Multi-agent sharing through explicit scopes and grants. -This cookbook is a guide/playbook, not a system contract. It must not change ELF Core semantics. +This cookbook is a runbook/playbook, not a system contract. It must not change ELF Core semantics. ## Core vs Skills contract @@ -45,9 +59,9 @@ Skills define agent-side workflows and policies, such as: ## Deliverable -Add a single guide document: +Add a single runbook document: -- `docs/guide/agent_skills_cookbook.md` +- `docs/runbook/agent_skills_cookbook.md` It should include: @@ -65,4 +79,3 @@ It should include: - No new server features or new endpoints (this is documentation only). - No changes to normative specs. - No attempt to ship a general-purpose doc/search platform in Core. 
- diff --git a/docs/plans/2026-02-25-ci-services-checks-design.md b/docs/reference/plans/2026-02-25-ci-services-checks-design.md similarity index 88% rename from docs/plans/2026-02-25-ci-services-checks-design.md rename to docs/reference/plans/2026-02-25-ci-services-checks-design.md index 92b8765d..56d1549e 100644 --- a/docs/plans/2026-02-25-ci-services-checks-design.md +++ b/docs/reference/plans/2026-02-25-ci-services-checks-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "CI Service-Backed Checks Design" +description: "Retained historical plan artifact: CI Service-Backed Checks Design." +resource: docs/reference/plans/2026-02-25-ci-services-checks-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # CI Service-Backed Checks Design **Date:** 2026-02-25 @@ -16,8 +30,8 @@ Today the repository already runs: Local developer guidance for service-backed testing lives in: -- `docs/guide/integration-testing.md` -- `docs/guide/testing.md` +- `docs/runbook/integration-testing.md` +- `docs/runbook/testing.md` ## Requirements diff --git a/docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md b/docs/reference/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md similarity index 85% rename from docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md rename to docs/reference/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md index b2b84bce..1b5a85be 100644 --- a/docs/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md +++ b/docs/reference/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Reflection & Consolidation Loop: Evaluation Scenarios" +description: "Retained historical plan artifact: Reflection & Consolidation Loop: Evaluation Scenarios." 
+resource: docs/reference/plans/2026-03-01-reflection-consolidation-loop-eval-scenarios.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Reflection & Consolidation Loop: Evaluation Scenarios ## Decision diff --git a/docs/plans/2026-03-04-search-modes-design.md b/docs/reference/plans/2026-03-04-search-modes-design.md similarity index 85% rename from docs/plans/2026-03-04-search-modes-design.md rename to docs/reference/plans/2026-03-04-search-modes-design.md index f83a06d4..d3c696d4 100644 --- a/docs/plans/2026-03-04-search-modes-design.md +++ b/docs/reference/plans/2026-03-04-search-modes-design.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "Search Modes: `quick_find` vs `planned_search` (Design)" +description: "Retained historical plan artifact: Search Modes: `quick_find` vs `planned_search` (Design)." +resource: docs/reference/plans/2026-03-04-search-modes-design.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # Search Modes: `quick_find` vs `planned_search` (Design) Date: 2026-03-04 diff --git a/docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md b/docs/reference/plans/2026-06-08-elf-hardening-evaluation-decisions.md similarity index 92% rename from docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md rename to docs/reference/plans/2026-06-08-elf-hardening-evaluation-decisions.md index 77e0d95a..0477ee1a 100644 --- a/docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md +++ b/docs/reference/plans/2026-06-08-elf-hardening-evaluation-decisions.md @@ -1,3 +1,17 @@ +--- +type: Reference +title: "ELF Hardening Evaluation Decisions" +description: "Retained historical plan artifact: ELF Hardening Evaluation Decisions." 
+resource: docs/reference/plans/2026-06-08-elf-hardening-evaluation-decisions.md +status: active +authority: current_state +owner: reference +last_verified: 2026-06-18 +tags: + - docs + - reference + - plans +--- # ELF Hardening Evaluation Decisions **Date:** 2026-06-08 diff --git a/docs/reference/plans/index.md b/docs/reference/plans/index.md new file mode 100644 index 00000000..7397d1af --- /dev/null +++ b/docs/reference/plans/index.md @@ -0,0 +1,38 @@ +# Plan Artifact Index + +Purpose: Route agents to retained planning artifacts. +Read this when: A planning tool, execution workflow, or historical design question +explicitly needs a saved plan. +Not this document: Current specs, runbooks, or accepted decisions. +Routes to: Markdown plan artifacts under `docs/reference/plans/`. + +## Boundary + +Plan artifacts are retained as OKF concepts for retrieval and provenance. Promote +accepted durable claims into `docs/spec/`, `docs/runbook/`, `docs/decisions/`, +`docs/reference/`, or `docs/evidence/` before treating them as current authority. + +## Concepts + +- `2026-02-02-cli-alignment-design.md`: retained historical plan artifact. +- `2026-02-02-project-cleanup-design.md`: retained historical plan artifact. +- `2026-02-02-project-cleanup.md`: retained historical plan artifact. +- `2026-02-03-search-expansion-design.md`: retained historical plan artifact. +- `2026-02-04-chunked-embeddings-design.md`: retained historical plan artifact. +- `2026-02-04-chunked-embeddings-implementation.md`: retained historical plan artifact. +- `2026-02-04-llm-cache-design.md`: retained historical plan artifact. +- `2026-02-04-llm-cache-implementation-plan.md`: retained historical plan artifact. +- `2026-02-04-search-explainability-design.md`: retained historical plan artifact. +- `2026-02-09-ranking-harness-trace-policy-compare.md`: retained historical plan artifact. +- `2026-02-10-search-ranking-explain-v2-design.md`: retained historical plan artifact. 
+- `2026-02-10-structured-memory-fields-design.md`: retained historical plan artifact. +- `2026-02-22-org-shared-design.md`: retained historical plan artifact. +- `2026-02-22-org-shared-implementation-plan.md`: retained historical plan artifact. +- `2026-02-23-agent-memory-mcp-skills-backlog.md`: retained historical plan artifact. +- `2026-02-24-doc-ext-v1-design.md`: retained historical plan artifact. +- `2026-02-24-doc-ext-v1-implementation-plan.md`: retained historical plan artifact. +- `2026-02-25-agent-skills-cookbook-design.md`: retained historical plan artifact. +- `2026-02-25-ci-services-checks-design.md`: retained historical plan artifact. +- `2026-03-01-reflection-consolidation-loop-eval-scenarios.md`: retained historical plan artifact. +- `2026-03-04-search-modes-design.md`: retained historical plan artifact. +- `2026-06-08-elf-hardening-evaluation-decisions.md`: retained historical plan artifact. diff --git a/docs/research/2026-06-08-agent-memory-selection.json b/docs/research/2026-06-08-agent-memory-selection.json deleted file mode 100644 index 0e4c6899..00000000 --- a/docs/research/2026-06-08-agent-memory-selection.json +++ /dev/null @@ -1,221 +0,0 @@ -{ - "schema": "research-run/2", - "run_id": "2026-06-08-agent-memory-selection", - "question": "Given agentmemory, current monitored memory projects, and OpenAI/Anthropic/Google dreaming-style memory consolidation, should ELF continue building its own memory system or adopt an external system?", - "success_criteria": [ - "Use current ELF main-branch evidence, current Decodex/Linear state, and current external sources.", - "Compare continue-build, adopt-agentmemory, and adopt-managed-dreaming options.", - "Return guidance that can shape the next ELF Linear issues without relaxing evidence/provenance requirements." 
- ], - "constraints": [ - "Do not treat external benchmark or README claims as independently verified unless ELF has reproduced them.", - "Do not recommend destructive memory rewriting without reviewable derived output and provenance.", - "Keep ELF source-of-truth semantics separate from optional adapters and derived views." - ], - "stop_rule": "Stop once the recommendation is decision-ready for issue shaping or the remaining uncertainty would require implementation benchmarks beyond this research pass.", - "primary_hypothesis": "ELF should continue as the evidence-bound core memory service and borrow or integrate external systems only at the capture, evaluation, viewer, and derived-consolidation layers.", - "rival_hypotheses": [ - "Replace ELF with agentmemory because it already packages cross-agent hooks, MCP tools, benchmarks, viewer, and consolidation.", - "Replace ELF's roadmap with managed dreaming APIs because large vendors are converging on background memory curation.", - "Pause ELF core development until the agent-memory market stabilizes." - ], - "falsifiers": [ - "If agentmemory or another external project exposes ELF-equivalent evidence-bound deterministic write contracts, multi-tenant service semantics, and rebuildable source-of-truth storage with lower integration risk, replacement becomes viable.", - "If managed dreaming APIs provide portable, self-hostable, reviewable, evidence-linked memory stores that can satisfy ELF governance boundaries, adopting them as core becomes viable.", - "If ELF's own hardening and validation surface is not operational after the June 2026 work, continuing core development should be deferred until reliability is restored." 
- ], - "coverage": { - "mode": "broad_external", - "min_source_families": 4 - }, - "continuation": { - "mode": "auto_if_not_decision_ready", - "attempt": 1, - "max_attempts": 2, - "session_id": "2026-06-08-agent-memory-selection" - }, - "events": [ - { - "seq": 1, - "type": "probe_completed", - "remaining_option_count": 3, - "independent_option_questions": [ - "Should ELF continue as the core memory service or be replaced by agentmemory?", - "Should dreaming-style consolidation become authoritative or derived/reviewed?", - "Which current ELF backlog items become higher priority after the refresh?" - ], - "external_slices": [] - }, - { - "seq": 2, - "type": "evidence_recorded", - "evidence": [ - { - "id": "E1", - "kind": "observation", - "summary": "Current ELF main presents itself as evidence-linked fact memory with deterministic add_note and LLM-driven add_event separation, Postgres source-of-truth, rebuildable Qdrant index, multi-tenant scoped APIs, HTTP/MCP surfaces, graph-lite relation context, and evaluation tooling.", - "source_family": "repo_docs", - "source_locator": "README.md; config/local/elf.docker.toml; docker-compose.yml; Makefile.toml" - }, - { - "id": "E2", - "kind": "observation", - "summary": "The June 2026 ELF hardening sequence landed local service gates, MCP default-set PUT forwarding, getting-started docs, utoipa/Scalar API docs, strict config field presence, Docker Compose dependencies, and a checked-in decision record.", - "source_family": "repo_docs", - "source_locator": "docs/plans/2026-06-08-elf-hardening-evaluation-decisions.md" - }, - { - "id": "E3", - "kind": "observation", - "summary": "GitHub and Linear current-state checks show PRs #109-#113 merged and XY-789, XY-790, XY-791, XY-792, and XY-798 completed; Decodex top-level live status has zero active, running, queued, waiting, and attention lanes, although old attempt history still includes a stale XY-790 needs_attention ledger.", - "source_family": "tracker_runtime", - 
"source_locator": "gh pr view 109-113; Linear issue(id) query; decodex status --live --json --config /Users/x/.codex/decodex/projects/elf" - }, - { - "id": "E4", - "kind": "observation", - "summary": "agentmemory is a fast-moving Apache-2.0 coding-agent memory project with cross-agent MCP/REST/hook integration, advertised hybrid BM25/vector/graph retrieval, lifecycle/consolidation claims, a local viewer, iii console observability, v0.9.27 release, and recent push activity. Its own roadmap still lists governance, benchmark CI, session replay UI, enterprise trust, and v1.0 stability as future work.", - "source_family": "external_project", - "source_locator": "https://github.com/rohitg00/agentmemory; https://raw.githubusercontent.com/rohitg00/agentmemory/main/ROADMAP.md; GitHub API snapshot 2026-06-08T06:01:57Z" - }, - { - "id": "E5", - "kind": "observation", - "summary": "OpenAI describes dreaming as a background memory curation process that synthesizes memory state from conversations, improves preference use, and keeps memory current over time rather than treating old memories as static facts.", - "source_family": "vendor_docs", - "source_locator": "https://openai.com/index/chatgpt-memory-dreaming/" - }, - { - "id": "E6", - "kind": "observation", - "summary": "Anthropic Claude Dreams treats dreaming as an asynchronous research-preview job over a memory store plus 1-100 past sessions. 
It produces a separate output memory store, never modifies the input store, exposes progress/session events, and expects review, attach, discard, archive, or delete decisions after completion.", - "source_family": "vendor_docs", - "source_locator": "https://platform.claude.com/docs/en/managed-agents/dreams" - }, - { - "id": "E7", - "kind": "observation", - "summary": "Google examples split into two useful patterns: Always-On Memory Agent productizes file/API/dashboard ingest plus timer-based consolidation, while Gemini CLI Auto Memory keeps background extraction review-gated by writing patches and skill drafts to a project-local inbox before any approval.", - "source_family": "vendor_docs", - "source_locator": "https://github.com/GoogleCloudPlatform/generative-ai/tree/main/gemini/agents/always-on-memory-agent; https://github.com/google-gemini/gemini-cli/blob/main/docs/cli/auto-memory.md" - }, - { - "id": "E8", - "kind": "observation", - "summary": "The monitored project set remains active as of 2026-06-08. 
GitHub API snapshots showed recent pushes for agentmemory, mem0, qmd, claude-mem, OpenViking, gbrain, graphify, LangGraph, Graphiti, RAGFlow, LightRAG, and GraphRAG, with agentmemory at 21,783 stars and v0.9.27, mem0 at 58,005 stars, claude-mem at 81,157 stars, graphify at 62,294 stars, and RAGFlow at 82,150 stars.", - "source_family": "external_project", - "source_locator": "GitHub API repository metadata snapshot 2026-06-08T06:01:57Z" - }, - { - "id": "E9", - "kind": "observation", - "summary": "The existing ELF vNext backlog already has directly relevant Backlog issues for knowledge memory pages with provenance and lint (XY-286), read-only viewer (XY-19), retrieval observability panels (XY-27), and graph-lite typed query/DX (XY-70).", - "source_family": "tracker_runtime", - "source_locator": "Linear issue(id) query for XY-286, XY-19, XY-27, XY-70" - } - ] - }, - { - "seq": 3, - "type": "tradeoffs_recorded", - "tradeoffs": [ - { - "id": "T1", - "summary": "Continuing ELF preserves the evidence-bound, deterministic, scoped service contract that external coding-agent products do not clearly replace; the trade-off is slower product UX unless viewer and capture adapters are prioritized.", - "supporting_evidence_ids": [ - "E1", - "E4", - "E8" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T2", - "summary": "Dreaming-style consolidation is now validated by major vendors as a product direction, but the safest shared pattern is separate or review-gated output rather than destructive authoritative rewriting.", - "supporting_evidence_ids": [ - "E5", - "E6", - "E7" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T3", - "summary": "agentmemory should be treated as an integration and benchmark target for coding-agent session capture, not as a core replacement, because its strongest value is hooks, viewer, tool breadth, and packaged local UX while ELF's strongest value is provenance and service governance.", - "supporting_evidence_ids": [ - "E1", - "E4" - 
], - "disconfirming_evidence_ids": [] - }, - { - "id": "T4", - "summary": "The refreshed evidence reorders ELF priorities toward viewer/observability and derived consolidation before more automatic memory authority, because operators need to inspect what was remembered, why, and how consolidation proposals were formed.", - "supporting_evidence_ids": [ - "E4", - "E6", - "E7", - "E9" - ], - "disconfirming_evidence_ids": [] - } - ] - }, - { - "seq": 4, - "type": "judgment_candidate_created", - "judgment_payload": { - "decision_claim": "Continue ELF as the evidence-bound memory core. Do not replace it with agentmemory or managed dreaming. Use agentmemory and managed dreaming systems as comparison baselines and optional adapters while prioritizing reviewable derived consolidation, operator viewer/observability, and graph-lite/knowledge-memory work in ELF.", - "implementation_order": [ - "Persist the research refresh and use it as the source for issue shaping.", - "Build a reviewed, derived consolidation pipeline over immutable evidence-bound notes and traces.", - "Ship the read-only viewer and retrieval observability panels before expanding automatic consolidation authority.", - "Add an optional agentmemory import/baseline adapter for coding-agent session observations.", - "Advance graph-lite typed query and derived knowledge pages with provenance and lint." 
- ], - "judgment_type": "recommend", - "key_evidence_ids": [ - "E1", - "E2", - "E3", - "E4", - "E5", - "E6", - "E7", - "E8" - ], - "key_tradeoff_ids": [ - "T1", - "T2", - "T3", - "T4" - ], - "preferred_option": "continue-elf-core-with-dreaming-inspired-derived-consolidation-and-agentmemory-baseline-integration", - "rejected_options": [ - "replace-elf-with-agentmemory", - "replace-elf-with-managed-dreaming", - "pause-elf-core-development-until-the-market-settles" - ] - }, - "judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9" - }, - { - "seq": 5, - "type": "worker_completed", - "worker": "skeptic", - "target_judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9", - "summary": "The strongest objection is that agentmemory's product surface is already ahead of ELF for coding-agent continuity. That does not defeat the judgment because it supports an adapter/baseline and viewer priority, not replacement of ELF's stricter source-of-truth and evidence contract.", - "objections": [] - }, - { - "seq": 6, - "type": "finalized_decision_ready", - "judgment_hash": "sha256:854918f581d32764fad76ac0481e58a72701bc348a827afa2a2b76978cc341f9", - "confidence": "medium", - "missing_evidence": [ - "ELF has not independently reproduced agentmemory's benchmark claims.", - "The next implementation pass still needs issue-local design for the consolidation data model and adapter boundaries." 
- ] - } - ] -} diff --git a/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json b/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json deleted file mode 100644 index 198df1af..00000000 --- a/docs/research/2026-06-09-xy-841-external-memory-benchmark-dimensions.json +++ /dev/null @@ -1,136 +0,0 @@ -{ - "schema": "research-run/2", - "run_id": "2026-06-09-xy-841-external-memory-benchmark-dimensions", - "question": "How should ELF map reviewed external memory projects to real-world benchmark dimensions without overstating docs-only evidence as benchmark proof?", - "success_criteria": [ - "Map every reviewed external project in the issue scope to one or more real-world benchmark suites.", - "Separate benchmark-grounded adapter evidence from docs-grounded research claims.", - "Identify dimensions where ELF should not be treated as the reference yet.", - "Keep pending D0 projects as watch items unless current evidence is gathered in scope." - ], - "constraints": [ - "Do not implement benchmark adapters or change ELF runtime behavior.", - "Do not make benchmark pass/fail claims without runnable evidence from checked-in reports.", - "Use existing reviewed docs and benchmark reports as the authority for this docs-only refresh." 
- ], - "stop_rule": "Stop once the comparison and inventory can route future real_world_job benchmark design without implying unproven external quality claims.", - "primary_hypothesis": "The capability map should treat qmd, claude-mem, agentmemory, mem0/OpenMemory, OpenViking, memsearch, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph as dimension references only where docs or benchmark evidence supports the fit; D0 RAG projects should remain watch items.", - "rival_hypotheses": [ - "Use the current smoke benchmark status alone to rank external projects.", - "Treat official external README claims as sufficient benchmark-quality evidence.", - "Drop pending RAGFlow, LightRAG, and GraphRAG from the map until adapters exist." - ], - "falsifiers": [ - "If a current runnable adapter report exists for a broader dimension, docs-only confidence would be too conservative.", - "If a listed project lacks any documented mechanism matching the assigned suite, the suite map would overstate its reference role.", - "If D0 watch items are assigned strengths, the map would violate the no-current-evidence boundary." - ], - "coverage": { - "mode": "repo_docs_and_existing_external_research", - "min_source_families": 3 - }, - "events": [ - { - "seq": 1, - "type": "probe_completed", - "remaining_option_count": 3, - "independent_option_questions": [ - "Which benchmark dimensions are already proven by ELF's checked-in adapter evidence?", - "Which projects should be treated as docs-grounded references for unencoded dimensions?", - "Which pending projects must stay as watch items?" 
- ], - "external_slices": [] - }, - { - "seq": 2, - "type": "evidence_recorded", - "evidence": [ - { - "id": "E1", - "kind": "observation", - "summary": "README states that the June 9 Docker live baseline and production adoption gate prove a bounded ELF production-provider path, while the all-project smoke has ELF and qmd passing encoded checks and other external projects retaining typed failure or incomplete states.", - "source_family": "repo_docs", - "source_locator": "README.md" - }, - { - "id": "E2", - "kind": "observation", - "summary": "The production adoption gate explicitly bounds external comparison as an objective adapter matrix, not an overall superiority claim, and records qmd pass, agentmemory lifecycle_fail, and memsearch/mem0/OpenViking/claude-mem incomplete or wrong-result states.", - "source_family": "benchmark_report", - "source_locator": "docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md" - }, - { - "id": "E3", - "kind": "observation", - "summary": "The live baseline runbook defines pass, wrong_result, lifecycle_fail, incomplete, blocked, and not_encoded semantics, and warns that incomplete, blocked, and not_encoded are not passes.", - "source_family": "repo_runbook", - "source_locator": "docs/guide/benchmarking/live_baseline_benchmark.md" - }, - { - "id": "E4", - "kind": "observation", - "summary": "The existing comparison contains D1/D2 docs-grounded mechanism research for agentmemory, qmd, claude-mem, mem0/OpenMemory, memsearch, OpenViking, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph.", - "source_family": "repo_research_docs", - "source_locator": "docs/guide/research/comparison_external_projects.md" - }, - { - "id": "E5", - "kind": "observation", - "summary": "The inventory marks RAGFlow, LightRAG, and GraphRAG as D0 pending deep dives, so they can only be watch items in this lane.", - "source_family": "repo_research_docs", - "source_locator": 
"docs/guide/research/research_projects_inventory.md" - } - ] - }, - { - "seq": 3, - "type": "tradeoffs_recorded", - "tradeoffs": [ - { - "id": "T1", - "summary": "Using only current smoke results would hide useful future benchmark dimensions such as operator continuity, temporal graph validity, core/archival memory, and knowledge synthesis.", - "supporting_evidence_ids": [ - "E2", - "E4" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T2", - "summary": "Using docs-grounded references without labels would overstate external project quality because the benchmark runner has not reproduced most broader claims.", - "supporting_evidence_ids": [ - "E2", - "E3" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T3", - "summary": "Keeping D0 RAG projects as watch items preserves future coverage without pretending that adapter feasibility, resource envelope, or evidence quality has been audited.", - "supporting_evidence_ids": [ - "E3", - "E5" - ], - "disconfirming_evidence_ids": [] - } - ] - }, - { - "seq": 4, - "type": "challenge_recorded", - "summary": "The main risk is that a broad suite map could read like a quality ranking. The mitigation is to label evidence class per project, repeat that only current adapter reports can support pass/fail claims, and call out ELF gaps by reference dimension instead of claiming overall superiority.", - "resolved": true - }, - { - "seq": 5, - "type": "finalized_decision_ready", - "confidence": "medium", - "decision": "Update the comparison and inventory with a real-world benchmark-dimension map. Treat qmd, claude-mem, agentmemory, mem0/OpenMemory, memsearch, OpenViking, llm-wiki, gbrain, Always-On Memory Agent, graphify, Letta, LangGraph, Graphiti/Zep, and nanograph as reference projects for specific dimensions, but separate benchmark-grounded evidence from docs-grounded suite fit. 
Keep RAGFlow, LightRAG, and GraphRAG as D0 watch items.", - "missing_evidence": [ - "No new upstream source refresh was performed in this lane.", - "No new benchmark adapter or real_world_job suite was executed.", - "Most non-smoke dimensions remain docs-grounded until future adapter evidence exists." - ] - } - ] -} diff --git a/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json b/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json deleted file mode 100644 index 9f42812b..00000000 --- a/docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json +++ /dev/null @@ -1,348 +0,0 @@ -{ - "schema": "research-run/2", - "run_id": "2026-06-10-xy-882-rag-graph-adapter-feasibility", - "question": "Which RAG and graph-memory research gates should become Docker-bounded adapter implementation candidates for ELF real-world benchmarks?", - "success_criteria": [ - "Give RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, Letta, LangGraph, nanograph, llm-wiki, gbrain, and graphify one explicit verdict: adapter_candidate, research_only, blocked, or reject.", - "Separate setup/resource feasibility from product quality; heavy setup is not treated as a quality failure.", - "Require adapter_candidate projects to have both a Docker-contained path and an evidence-linked output contract.", - "Keep all researched projects in the research_gate evidence class until a Docker adapter executes real_world_job scoring." - ], - "constraints": [ - "Do not implement adapters in this issue.", - "Do not use host-global installs as proof.", - "Do not claim live adapter pass evidence from source or docs review.", - "Create implementation follow-ups only for adapter candidates with a scoped Docker boundary and evidence-linked output." 
- ], - "stop_rule": "Stop when every target project has a verdict, adapter candidates have scoped follow-up issue titles, and the docs/manifest still label these records as research gates rather than live evidence.", - "primary_hypothesis": "RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify have enough Docker-bounded setup and evidence-output shape to justify implementation follow-ups; Letta, LangGraph, nanograph, and llm-wiki remain research-only references; gbrain remains blocked until a Docker-local brain repo/database path is proven.", - "rival_hypotheses": [ - "All projects should remain research-only because none has executed in the benchmark runner.", - "All projects with official Docker or CLI instructions should become adapter candidates.", - "RAGFlow should be rejected because its official resource envelope is large." - ], - "falsifiers": [ - "If a candidate cannot run without host-global state, it is not an adapter implementation candidate for this benchmark lane.", - "If a candidate cannot emit source IDs, document IDs, file locations, citations, or equivalent evidence handles, it cannot support real_world_job scoring.", - "If a project is a useful architecture reference but not a standalone memory/retrieval output path, it should remain research_only." - ], - "coverage": { - "mode": "primary_source_docs_and_existing_repo_contracts", - "min_source_families": 4 - }, - "events": [ - { - "seq": 1, - "type": "probe_completed", - "remaining_option_count": 4, - "independent_option_questions": [ - "Does the project expose a Docker-contained setup path?", - "Does the project expose corpus ingest and query output that can map back to source evidence?", - "Is the project a direct adapter candidate, a reference-only design input, blocked by missing Docker proof, or rejected?" 
- ], - "external_slices": [ - "RAGFlow", - "LightRAG", - "GraphRAG", - "Graphiti/Zep", - "Letta", - "LangGraph", - "nanograph", - "llm-wiki", - "gbrain", - "graphify" - ] - }, - { - "seq": 2, - "type": "evidence_recorded", - "evidence": [ - { - "id": "E1", - "kind": "contract", - "summary": "The real-world benchmark spec defines research_gate records as source/setup/runtime/resource/retry metadata for future implementation; research gates must not count as fixture-backed, live-baseline, or live-real-world evidence.", - "source_family": "repo_spec", - "source_locator": "docs/spec/real_world_agent_memory_benchmark_v1.md" - }, - { - "id": "E2", - "kind": "setup", - "summary": "RAGFlow official quickstart documents Docker startup, 4 CPU / 16 GB RAM / 50 GB disk prerequisites, x86/Nvidia support, image-size caveats, dataset creation, chunk visibility, and citation-backed retrieval testing.", - "source_family": "upstream_docs", - "source_locator": "https://ragflow.io/docs/" - }, - { - "id": "E3", - "kind": "output_contract", - "summary": "RAGFlow HTTP API can include reference metadata and returns reference chunks containing chunk id, content, document id, document name, document metadata, dataset id, positions, and similarity scores.", - "source_family": "upstream_docs", - "source_locator": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md" - }, - { - "id": "E4", - "kind": "setup", - "summary": "LightRAG Docker docs describe docker compose startup, generated compose files, persistent data paths, environment-driven LLM and embedding configuration, and optional Docker-local vLLM embedding/rerank services.", - "source_family": "upstream_docs", - "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/DockerDeployment.md" - }, - { - "id": "E5", - "kind": "output_contract", - "summary": "LightRAG supports query prefixes including context-only modes, can return the context prepared for the LLM, supports 
inserting documents with stable ids, and traces sources through file_paths.", - "source_family": "upstream_docs", - "source_locator": "https://raw.githubusercontent.com/HKUDS/LightRAG/main/docs/LightRAG-API-Server.md" - }, - { - "id": "E6", - "kind": "output_contract", - "summary": "GraphRAG writes parquet output tables with UUIDs and human-readable ids; communities and reports carry text_unit_ids, and text_units carry raw text plus document ids and relationship/entity ids.", - "source_family": "upstream_docs", - "source_locator": "https://microsoft.github.io/graphrag/index/outputs/" - }, - { - "id": "E7", - "kind": "setup", - "summary": "GraphRAG input and query docs describe a CLI/API indexing and local-search path over structured documents, raw text chunks, graph data, and query context builders.", - "source_family": "upstream_docs", - "source_locator": "https://microsoft.github.io/graphrag/" - }, - { - "id": "E8", - "kind": "output_contract", - "summary": "Graphiti/Zep requires Python plus Neo4j or FalkorDB, supports Docker-local FalkorDB, adds episodes or fact triples, and search results include UUID, fact text, valid_at, and invalid_at fields.", - "source_family": "upstream_docs", - "source_locator": "https://help.getzep.com/graphiti/getting-started/quick-start" - }, - { - "id": "E9", - "kind": "boundary", - "summary": "Letta remains a strong core/archival memory reference, but Docker use needs explicit embedding configuration and the current docs steer new Letta Code users away from Docker-first evaluation.", - "source_family": "upstream_docs", - "source_locator": "https://docs.letta.com/guides/docker/" - }, - { - "id": "E10", - "kind": "boundary", - "summary": "LangGraph persistence provides checkpoints, replay, stores, and semantic memory search, but it is an agent-state framework rather than a standalone external memory service adapter.", - "source_family": "upstream_docs", - "source_locator": "https://docs.langchain.com/oss/python/langgraph/persistence" 
- }, - { - "id": "E11", - "kind": "boundary", - "summary": "nanograph documents one CLI, one folder, schema-as-code, no server, no cloud, and no Docker; this makes it a graph-lite DX reference rather than a Docker adapter candidate for this lane.", - "source_family": "upstream_docs", - "source_locator": "https://www.nanograph.io/" - }, - { - "id": "E12", - "kind": "boundary", - "summary": "llm-wiki ships as agent plugins or portable instructions with wiki query, compile, lint, audit, and output workflows; it is a derived knowledge workflow reference, not a service adapter candidate without a contained plugin harness.", - "source_family": "upstream_docs", - "source_locator": "https://github.com/nvk/llm-wiki" - }, - { - "id": "E13", - "kind": "boundary", - "summary": "gbrain has strong compiled-truth, append-only timeline, and source attribution contracts, but this lane did not prove a Docker-local brain repository and database setup path.", - "source_family": "upstream_docs", - "source_locator": "https://raw.githubusercontent.com/garrytan/gbrain/master/docs/guides/compiled-truth.md" - }, - { - "id": "E14", - "kind": "output_contract", - "summary": "graphify can run over a folder, produces graph.html, GRAPH_REPORT.md, graph.json, and cache artifacts, and query output includes node labels, edge types, confidence tags, source files, and source locations.", - "source_family": "upstream_docs", - "source_locator": "https://raw.githubusercontent.com/safishamsi/graphify/v3/README.md" - } - ] - }, - { - "seq": 3, - "type": "project_verdicts_recorded", - "verdicts": [ - { - "project": "RAGFlow", - "verdict": "adapter_candidate", - "supporting_evidence_ids": [ - "E2", - "E3" - ], - "docker_boundary": "Nested Docker service profile or baseline compose service using official RAGFlow Docker Compose, capped to a tiny corpus and CPU mode first.", - "output_contract": "Map RAGFlow reference.chunks fields to real_world_job expected evidence ids.", - "follow_up_title": "[ELF benchmark 
adapter] Implement RAGFlow Docker evidence-smoke adapter", - "follow_up_issue": "XY-885", - "follow_up_url": "https://linear.app/hack-ink/issue/XY-885/elf-benchmark-adapter-implement-ragflow-docker-evidence-smoke-adapter" - }, - { - "project": "LightRAG", - "verdict": "adapter_candidate", - "supporting_evidence_ids": [ - "E4", - "E5" - ], - "docker_boundary": "Docker Compose LightRAG server with explicit LLM, embedding, rerank, and data-volume configuration.", - "output_contract": "Use context-only query modes and file_paths-backed citations for evidence scoring.", - "follow_up_title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", - "follow_up_issue": "XY-886", - "follow_up_url": "https://linear.app/hack-ink/issue/XY-886/elf-benchmark-adapter-implement-lightrag-docker-context-export-adapter" - }, - { - "project": "GraphRAG", - "verdict": "adapter_candidate", - "supporting_evidence_ids": [ - "E6", - "E7" - ], - "docker_boundary": "Cost-bounded Docker Python CLI/API run over a generated tiny corpus with container-local parquet artifacts.", - "output_contract": "Map documents, text_units, communities, and community_reports output tables back to source evidence ids.", - "follow_up_title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", - "follow_up_issue": "XY-887", - "follow_up_url": "https://linear.app/hack-ink/issue/XY-887/elf-benchmark-adapter-implement-graphrag-cost-bounded-docker-adapter" - }, - { - "project": "Graphiti/Zep", - "verdict": "adapter_candidate", - "supporting_evidence_ids": [ - "E8" - ], - "docker_boundary": "Docker-local FalkorDB or Neo4j plus Python SDK runner with provider configuration explicit in benchmark artifacts.", - "output_contract": "Score UUID, fact, valid_at, and invalid_at search output against memory_evolution current/historical evidence.", - "follow_up_title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", - "follow_up_issue": "XY-888", - 
"follow_up_url": "https://linear.app/hack-ink/issue/XY-888/elf-benchmark-adapter-implement-graphitizep-temporal-graph-adapter" - }, - { - "project": "Letta", - "verdict": "research_only", - "supporting_evidence_ids": [ - "E9" - ], - "reason": "Keep as core/archival memory semantics reference; do not create an implementation issue until a supported, contained server path can export archival evidence for scoring." - }, - { - "project": "LangGraph", - "verdict": "research_only", - "supporting_evidence_ids": [ - "E10" - ], - "reason": "Keep as checkpoint/replay regression reference; it is not a standalone external memory adapter candidate in this benchmark lane." - }, - { - "project": "nanograph", - "verdict": "research_only", - "supporting_evidence_ids": [ - "E11" - ], - "reason": "Keep as typed graph DX inspiration; official positioning is no server/no Docker and no real_world_job evidence contract is proven." - }, - { - "project": "llm-wiki", - "verdict": "research_only", - "supporting_evidence_ids": [ - "E12" - ], - "reason": "Keep as derived knowledge-page workflow inspiration; no host-global plugin install may be used as adapter proof." - }, - { - "project": "gbrain", - "verdict": "blocked", - "supporting_evidence_ids": [ - "E13" - ], - "reason": "The evidence contract is strong, but a Docker-local brain repo and database path must be proven before an implementation issue is safe." 
- }, - { - "project": "graphify", - "verdict": "adapter_candidate", - "supporting_evidence_ids": [ - "E14" - ], - "docker_boundary": "Docker-only CLI/materializer run using pip-installed graphifyy over mounted benchmark corpus, with no assistant global hook install.", - "output_contract": "Score graph.json query output and GRAPH_REPORT.md source-file/source-location references against expected evidence.", - "follow_up_title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", - "follow_up_issue": "XY-889", - "follow_up_url": "https://linear.app/hack-ink/issue/XY-889/elf-benchmark-adapter-implement-graphify-docker-graph-report-adapter" - } - ] - }, - { - "seq": 4, - "type": "tradeoffs_recorded", - "tradeoffs": [ - { - "id": "T1", - "summary": "RAGFlow is resource-heavy, but the official Docker and reference chunk output make it an adapter candidate as long as the follow-up starts with a tiny corpus and records resource bounds instead of making a quality claim.", - "supporting_evidence_ids": [ - "E2", - "E3" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T2", - "summary": "LightRAG and GraphRAG can become adapter candidates because both expose bounded ingest/query paths and source mapping, but their first adapter issues must remain cost-bounded.", - "supporting_evidence_ids": [ - "E4", - "E5", - "E6", - "E7" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T3", - "summary": "Graphiti/Zep is a stronger adapter candidate than generic graph-memory references because it can emit temporal facts with validity windows and run against Docker-local graph stores.", - "supporting_evidence_ids": [ - "E8" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T4", - "summary": "Letta, LangGraph, nanograph, and llm-wiki should still inform ELF design, but creating adapter implementation issues now would blur reference workflows with executable memory-service evidence.", - "supporting_evidence_ids": [ - "E9", - "E10", - "E11", - "E12" - 
], - "disconfirming_evidence_ids": [] - }, - { - "id": "T5", - "summary": "gbrain has a good citation and current-truth/timeline contract, but the missing Docker-local brain repo/database setup keeps it blocked rather than adapter_candidate.", - "supporting_evidence_ids": [ - "E13" - ], - "disconfirming_evidence_ids": [] - }, - { - "id": "T6", - "summary": "graphify is an adapter candidate only if implemented as an isolated CLI/materializer over generated corpus artifacts, not as a host-global assistant hook install.", - "supporting_evidence_ids": [ - "E14" - ], - "disconfirming_evidence_ids": [] - } - ] - }, - { - "seq": 5, - "type": "challenge_recorded", - "summary": "The main risk is that adapter_candidate could be read as benchmark evidence. The mitigation is to keep evidence_class=research_gate, keep overall status non-pass, and state that follow-up implementation issues must still run Docker and real_world_job scoring before any live evidence claim.", - "resolved": true - }, - { - "seq": 6, - "type": "finalized_decision_ready", - "confidence": "medium", - "decision": "Create implementation follow-ups only for RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify. Keep Letta, LangGraph, nanograph, and llm-wiki as research_only references. Keep gbrain blocked pending a Docker-local brain repo/database proof. Do not change any research_gate record into live evidence until an adapter executes inside Docker and emits evidence-linked outputs.", - "missing_evidence": [ - "No Docker adapter was implemented or executed in this lane.", - "No host-global install was used as proof.", - "Provider credentials and private corpora remain out of scope." 
- ] - } - ] -} diff --git a/docs/research/derived_knowledge_page_followup.md b/docs/research/derived_knowledge_page_followup.md new file mode 100644 index 00000000..178ae21b --- /dev/null +++ b/docs/research/derived_knowledge_page_followup.md @@ -0,0 +1,109 @@ +--- +type: Research Contract +title: "Derived Knowledge Page Follow-Up" +description: "Research contract for llm-wiki, gbrain, and OKF-style derived knowledge page patterns that are valuable but not fully implemented." +resource: docs/research/derived_knowledge_page_followup.md +status: active +authority: current_state +owner: research +last_verified: 2026-06-18 +tags: + - docs + - research + - llm-wiki + - okf +source_refs: [] +code_refs: + - docs/evidence/external_memory/comparison_external_projects.md + - docs/evidence/external_memory/research_projects_inventory.md + - docs/spec/system_knowledge_pages_v1.md +related: [] +drift_watch: + - docs/spec/system_knowledge_pages_v1.md + - docs/evidence/external_memory/research_projects_inventory.md +--- +# Derived Knowledge Page Follow-Up + +Purpose: Preserve the valuable but not fully implemented llm-wiki and gbrain research +thread as a new OKF research contract. +Read this when: You are designing evidence-to-knowledge pages, lint loops, wiki +navigation, or current-truth timeline views. +Not this document: The normative knowledge-page storage contract or a claim that +ELF already ships llm-wiki/gbrain parity. + +## Question + +How should ELF turn source evidence into rebuildable, cited, lintable project, +entity, concept, issue, and decision pages without letting derived pages replace +authoritative memory? + +## Scope + +In scope: + +- llm-wiki query/save/compile/lint/audit workflows. +- gbrain compiled-truth, timeline, backlink, and primary-home routing patterns. +- OKF and LLM Wiki navigation rules for durable repository docs. +- Citation, stale-source, unsupported-claim, and rebuild checks. 
+ +Out of scope: + +- Treating generated wiki pages as source-of-truth memory. +- Host-global plugin installs as benchmark proof. +- Broad product parity claims against llm-wiki or gbrain. + +## Evidence + +- `docs/spec/system_knowledge_pages_v1.md` already owns the normative derived + knowledge page storage, rebuild, citation, and lint contract. +- `docs/evidence/external_memory/comparison_external_projects.md` records llm-wiki and gbrain + as reference projects for derived knowledge pages and operational knowledge brain + presentation. +- `docs/evidence/external_memory/research_projects_inventory.md` records llm-wiki as + `research_only` and gbrain as `blocked` for adapter purposes. + +## Options + +- Extend `elf.knowledge_page/v1` with additional LLM Wiki navigation and lint + evidence. +- Keep llm-wiki/gbrain as research references until ELF has a contained harness that + produces source-cited pages. +- Drop the thread and rely only on the existing storage spec. + +## Judgment + +Continue research. The value is real because the current spec defines storage and +lint contracts, but the product-level workflow still needs stronger evidence around +page navigation, source repair, unsupported-claim review, and current-truth/timeline +presentation. + +## Challenge + +The main risk is duplicating source memory into a polished wiki and then treating the +wiki as authoritative. The mitigation is to keep pages derived, rebuildable, and +explicitly linted against source refs. + +## Decision + +Not decision-ready for parity claims. Use this contract to route follow-up research +into either `docs/spec/system_knowledge_pages_v1.md` changes or concrete benchmark +evidence. + +## Promotion + +Promote accepted storage, rebuild, citation, and lint requirements to +`docs/spec/system_knowledge_pages_v1.md`. 
Promote comparative or upstream movement +only to `docs/evidence/external_memory/comparison_external_projects.md` or +`docs/evidence/external_memory/research_projects_inventory.md`. + +## Drift Impact + +Watch for upstream llm-wiki/gbrain changes that add contained execution, structured +citation output, unsupported-claim lint, or current-truth timeline maintenance that +ELF can reproduce without host-global state. + +## Citations + +- `docs/spec/system_knowledge_pages_v1.md` +- `docs/evidence/external_memory/comparison_external_projects.md` +- `docs/evidence/external_memory/research_projects_inventory.md` diff --git a/docs/research/dreaming_product_surface_followup.md b/docs/research/dreaming_product_surface_followup.md new file mode 100644 index 00000000..eb84d58a --- /dev/null +++ b/docs/research/dreaming_product_surface_followup.md @@ -0,0 +1,105 @@ +--- +type: Research Contract +title: "Dreaming Product Surface Follow-Up" +description: "Research contract for valuable but not fully implemented dreaming-style memory product surfaces." +resource: docs/research/dreaming_product_surface_followup.md +status: active +authority: current_state +owner: research +last_verified: 2026-06-18 +tags: + - docs + - research + - dreaming + - consolidation +source_refs: [] +code_refs: + - docs/decisions/2026-06-08-agent-memory-selection.md + - docs/spec/system_consolidation_proposals_v1.md + - docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +related: [] +drift_watch: + - docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md + - docs/spec/system_consolidation_proposals_v1.md + - docs/spec/system_memory_summary_v1.md +--- +# Dreaming Product Surface Follow-Up + +Purpose: Preserve the valuable unresolved product research behind dreaming-style +memory consolidation, proactive briefs, scheduled memory, and top-of-mind summaries. 
+Read this when: You are deciding whether a vendor dreaming pattern should become an +ELF service-native workflow or benchmark stage. +Not this document: The accepted decision to keep consolidation derived and reviewable, +or a claim that fixture-backed benchmark stages prove hosted product parity. + +## Question + +Which dreaming-inspired product surfaces should ELF continue researching after the +current specs and fixture-backed benchmark stages? + +## Scope + +In scope: + +- Reviewable derived consolidation over immutable source evidence. +- Memory summary and top-of-mind readback. +- Proactive brief and scheduled memory task workflows. +- Private-corpus, provider-backed, and notification/scheduler blockers. + +Out of scope: + +- Destructive rewriting of authoritative memory. +- Hosted product parity claims from fixture-only evidence. +- Silent background mutation without review/audit. + +## Evidence + +- `docs/decisions/2026-06-08-agent-memory-selection.md` accepts dreaming as a + derived/reviewed pattern, not a core replacement. +- `docs/spec/system_consolidation_proposals_v1.md` owns reviewable derived + consolidation. +- `docs/spec/system_memory_summary_v1.md` owns memory summary shape. +- `docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md` owns the + current staged benchmark evidence and typed product-boundary warnings. + +## Options + +- Move fixture-backed behaviors into service-native readback and operator-visible + workflows. +- Keep provider/private/scheduler claims blocked until explicit operator-owned inputs + exist. +- Treat vendor dreaming as an inspiration source only and stop product research. + +## Judgment + +Continue research. Several fixture-backed stages improved, but service-native product +behavior, private-corpus quality, provider-backed generation, scheduling, and +notification delivery are not proven. + +## Challenge + +The main risk is over-promoting benchmark fixtures into product claims. 
The mitigation +is to keep each stage typed: fixture-backed, live adapter, provider-backed, private +corpus, scheduler, or hosted product. + +## Decision + +Not decision-ready for product parity. Keep this research contract open until a stage +has service-native evidence or is explicitly retired. + +## Promotion + +Promote service-native contracts to specs. Promote benchmark outcomes to +`docs/evidence/benchmarking/`. Promote accepted product decisions to `docs/decisions/`. + +## Drift Impact + +Watch the stage ledger, consolidation proposal spec, memory summary spec, and any +future proactive/scheduled service-native reports for stale fixture-only claims. + +## Citations + +- `docs/decisions/2026-06-08-agent-memory-selection.md` +- `docs/spec/system_consolidation_proposals_v1.md` +- `docs/spec/system_memory_summary_v1.md` +- `docs/evidence/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md` diff --git a/docs/research/graph_rag_adapter_followup.md b/docs/research/graph_rag_adapter_followup.md new file mode 100644 index 00000000..1c5ad489 --- /dev/null +++ b/docs/research/graph_rag_adapter_followup.md @@ -0,0 +1,118 @@ +--- +type: Research Contract +title: "Graph and RAG Adapter Follow-Up" +description: "Research contract for unresolved graph/RAG adapter value after the June 2026 feasibility verdicts." 
+resource: docs/research/graph_rag_adapter_followup.md +status: active +authority: current_state +owner: research +last_verified: 2026-06-18 +tags: + - docs + - research + - graph-rag + - adapter +source_refs: [] +code_refs: + - docs/evidence/external_memory/comparison_external_projects.md + - docs/evidence/external_memory/research_projects_inventory.md + - docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md + - apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +related: [] +drift_watch: + - apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json + - docs/evidence/external_memory/research_projects_inventory.md +--- +# Graph and RAG Adapter Follow-Up + +Purpose: Preserve only the unresolved, valuable research from the retired +`2026-06-10-xy-882-rag-graph-adapter-feasibility` run. +Read this when: You are deciding whether a RAG or graph-memory project has enough +contained evidence to become a scored ELF real-world adapter. +Not this document: A live adapter pass, a broad quality ranking, or a replacement +decision for ELF core memory. + +## Question + +Which graph/RAG systems still deserve further research or implementation proof before +ELF can score them as real-world memory adapters? + +## Scope + +In scope: + +- RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify adapter-candidate follow-up. +- Letta, LangGraph, nanograph, llm-wiki, and gbrain reference-only or blocked value + that should not be promoted into live evidence. +- Docker containment, resource envelope, source-id output, citation output, and + typed non-pass states. + +Out of scope: + +- Host-global installs as proof. +- Provider-backed private corpus claims. +- Any claim that `research_gate` is equivalent to fixture-backed or live evidence. + +## Evidence + +- `docs/evidence/external_memory/research_projects_inventory.md` owns the accepted June 10, + 2026 verdict table. 
+- `docs/evidence/external_memory/comparison_external_projects.md` owns the broader project + comparison and benchmark-dimension map. +- `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` + owns the executable adapter ledger. +- `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md` + owns current scored graph/RAG smoke evidence. + +## Options + +- Promote candidate projects only after Docker execution emits evidence-linked + adapter output. +- Keep reference-only projects as research inputs for specs and UX, not adapter rows. +- Keep blocked projects blocked until contained setup is proven. + +## Judgment + +Continue research. The accepted verdicts remain: + +- `adapter_candidate`: RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, graphify. +- `research_only`: Letta, LangGraph, nanograph, llm-wiki. +- `blocked`: gbrain until a Docker-local brain repository and database path is proven. + +These labels do not imply live adapter quality. + +## Challenge + +The main risk is label drift: `adapter_candidate` can be mistaken for benchmark +evidence. The mitigation is to preserve `research_gate` until a Docker-contained run +emits source IDs, document IDs, file paths, citations, graph facts, or equivalent +evidence handles that `real_world_job` scoring can inspect. + +## Decision + +Not decision-ready for live evidence. Keep the active research contract open until the +next adapter implementation or source-review pass either promotes a concrete report or +retires the candidate. + +## Promotion + +Promote only these outputs: + +- Adapter implementation evidence goes to `docs/evidence/benchmarking/`. +- Schema or scoring-contract changes go to `docs/spec/real_world_agent_memory_benchmark_v1.md`. +- Accepted inventory status changes go to `docs/evidence/external_memory/research_projects_inventory.md`. + +Do not re-create a raw research JSON owner for this lane. 
+ +## Drift Impact + +Watch for upstream changes that alter Docker setup, local resource envelope, source +mapping, citation output, or graph/temporal fact output. Also watch for new ELF adapter +rows that should replace this research contract with benchmark evidence. + +## Citations + +- `docs/evidence/external_memory/comparison_external_projects.md` +- `docs/evidence/external_memory/research_projects_inventory.md` +- `docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md` +- `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` diff --git a/docs/research/index.md b/docs/research/index.md new file mode 100644 index 00000000..878b3474 --- /dev/null +++ b/docs/research/index.md @@ -0,0 +1,22 @@ +# Research Index + +Purpose: Route agents to latent research contracts and evidence candidates in the OKF +and LLM Wiki bundle. +Read this when: You need research provenance, comparison evidence, or a promotion +candidate that is not yet normative. +Not this document: Accepted specs, runbooks, decisions, machine reports, or raw +machine-readable JSON. +Routes to: Active research contracts under `docs/research/` and promotion evidence +under `docs/evidence/`. + +## Concepts + +- `graph_rag_adapter_followup.md`: Unresolved graph/RAG adapter research after the + June 2026 feasibility verdicts. +- `derived_knowledge_page_followup.md`: llm-wiki, gbrain, and OKF-style derived + knowledge page research. +- `dreaming_product_surface_followup.md`: Dreaming-inspired product surface research + that is not yet service-native or product-parity evidence. + +For legacy research JSON disposition, read +`docs/evidence/2026-06-18-research-artifact-disposition.md`. 
diff --git a/docs/guide/agent-setup.md b/docs/runbook/agent-setup.md similarity index 90% rename from docs/guide/agent-setup.md rename to docs/runbook/agent-setup.md index 57257017..13210b1c 100644 --- a/docs/guide/agent-setup.md +++ b/docs/runbook/agent-setup.md @@ -1,12 +1,25 @@ -# Agent Setup Guide +--- +type: Runbook +title: "Agent Setup Runbook" +description: "Help an agent install and run ELF locally with minimal back-and-forth." +resource: docs/runbook/agent-setup.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- +# Agent Setup Runbook Goal: Help an agent install and run ELF locally with minimal back-and-forth. Read this when: You need a practical local setup flow from an existing repository checkout. Inputs: This repository checkout plus Docker Compose or separately managed Postgres/Qdrant, and optional provider credentials. -Depends on: `Makefile.toml`, `docker-compose.yml`, `config/local/elf.docker.toml`, `elf.example.toml`, and `docs/guide/getting_started.md`. +Depends on: `Makefile.toml`, `docker-compose.yml`, `config/local/elf.docker.toml`, `elf.example.toml`, and `docs/runbook/getting_started.md`. Verification: ELF services start, required dependencies are reachable, and the local workflow can continue. -This guide is written for AI agents helping a human operator install and run ELF locally with minimal back-and-forth. +This runbook is written for AI agents helping a human operator install and run ELF locally with minimal back-and-forth. It assumes you have access to this repository checkout. ## What You Are Setting Up @@ -28,7 +41,7 @@ Important: The ELF config has no implicit defaults. All required config fields m ## Minimal Owner Inputs For the checked-in Docker local stack, no owner inputs are required. Use `docker-compose.yml` -and `config/local/elf.docker.toml` from `docs/guide/getting_started.md`. +and `config/local/elf.docker.toml` from `docs/runbook/getting_started.md`. 
For separately managed dependencies or provider-backed development, ask the owner for: diff --git a/docs/guide/agent_skills_cookbook.md b/docs/runbook/agent_skills_cookbook.md similarity index 96% rename from docs/guide/agent_skills_cookbook.md rename to docs/runbook/agent_skills_cookbook.md index ef3238d7..de16ef9e 100644 --- a/docs/guide/agent_skills_cookbook.md +++ b/docs/runbook/agent_skills_cookbook.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Agent Skills Cookbook (MCP-first)" +description: "Provide reference agent-side workflows for using ELF via MCP in a consistent, auditable, facts-first way." +resource: docs/runbook/agent_skills_cookbook.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Agent Skills Cookbook (MCP-first) Goal: Provide reference agent-side workflows for using ELF via MCP in a consistent, auditable, facts-first way. @@ -6,7 +19,7 @@ Inputs: A working ELF deployment or design target plus the relevant ELF service Depends on: `docs/spec/system_elf_memory_service_v2.md` and related MCP-facing specs. Outputs: Reusable workflow patterns that stay within the ELF contract without redefining it. -Scope: This is a guide/playbook. It is non-normative and does not change the ELF system contract. +Scope: This is a runbook/playbook. It is non-normative and does not change the ELF system contract. 
## 0) Contract: MCP vs Skills @@ -393,5 +406,5 @@ Notes: ## 10) Pinned references (internal) - Core contract: `docs/spec/system_elf_memory_service_v2.md` -- Doc Extension v1 design: `docs/plans/2026-02-24-doc-ext-v1-design.md` +- Doc Extension v1 design: `docs/reference/plans/2026-02-24-doc-ext-v1-design.md` - Doc pointer resolver: `docs/spec/system_source_ref_doc_pointer_v1.md` diff --git a/docs/runbook/benchmarking/index.md b/docs/runbook/benchmarking/index.md new file mode 100644 index 00000000..dfe8ea40 --- /dev/null +++ b/docs/runbook/benchmarking/index.md @@ -0,0 +1,15 @@ +# Benchmarking Runbook Index + +Purpose: Route agents to benchmark execution and interpretation procedures. +Read this when: You need to run, extend, publish, or interpret ELF benchmark +workflows. +Not this document: Checked-in report evidence or governing benchmark specs. +Routes to: Benchmarking runbooks under `docs/runbook/benchmarking/`. + +## Concepts + +- `live_baseline_benchmark.md`: Docker-isolated current-HEAD baseline checks against + ELF and external memory projects. +- `real_world_agent_memory_benchmark.md`: operator map for creating, extending, and + interpreting real-world agent memory benchmark jobs. +- `real_world_memory_evolution.md`: memory-evolution fixture runbook. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/runbook/benchmarking/live_baseline_benchmark.md similarity index 97% rename from docs/guide/benchmarking/live_baseline_benchmark.md rename to docs/runbook/benchmarking/live_baseline_benchmark.md index 9d93a2d6..4597e2bc 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/runbook/benchmarking/live_baseline_benchmark.md @@ -1,3 +1,17 @@ +--- +type: Runbook +title: "Live Baseline Benchmark" +description: "Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF." 
+resource: docs/runbook/benchmarking/live_baseline_benchmark.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook + - benchmarking +--- # Live Baseline Benchmark Goal: Run Docker-isolated, current-HEAD baseline checks against ELF and the external memory projects compared with ELF. @@ -10,9 +24,9 @@ Verification: `cargo make baseline-live-docker` writes `tmp/live-baseline/live-b ## Scope -This guide is for benchmark evidence, not for operating a personal production ELF service. For +This runbook is for benchmark evidence, not for operating a personal production ELF service. For single-user Docker Compose production start, stop, health, backup, restore, Qdrant rebuild, -rollback, and cleanup commands, use `docs/guide/single_user_production.md`. +rollback, and cleanup commands, use `docs/runbook/single_user_production.md`. The runner covers ELF plus the six external projects in the README comparison table: @@ -182,7 +196,7 @@ from provider-backed ELF/Qwen3 embedding evidence. ## Checked-In Reports -- `docs/guide/benchmarking/2026-06-09-live-baseline-report.md`: June 9, 2026 +- `docs/evidence/benchmarking/2026-06-09-live-baseline-report.md`: June 9, 2026 production-provider ELF stress run and all-project smoke comparison. ## Run @@ -353,13 +367,13 @@ cargo make baseline-live-report By default the task prints Markdown to stdout. To write a checked-in report: ```sh -ELF_BASELINE_MARKDOWN_REPORT=docs/guide/benchmarking/YYYY-MM-DD-live-baseline-report.md \ +ELF_BASELINE_MARKDOWN_REPORT=docs/evidence/benchmarking/YYYY-MM-DD-live-baseline-report.md \ cargo make baseline-live-report ``` The publisher summarizes one generated aggregate JSON report. For a combined report that compares multiple runs, use the generated Markdown as input evidence and then add -the interpretation manually under `docs/guide/benchmarking/`. +the interpretation manually under `docs/evidence/benchmarking/`. 
## Real-World Job Smoke diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md similarity index 97% rename from docs/guide/benchmarking/real_world_agent_memory_benchmark.md rename to docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index c4e5c141..2e8268e3 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -1,3 +1,17 @@ +--- +type: Runbook +title: "Real-World Agent Memory Benchmark" +description: "Runbook for real-world agent memory benchmark execution and interpretation." +resource: docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook + - benchmarking +--- # Real-World Agent Memory Benchmark Goal: Explain the v1 real-world agent memory benchmark suite and route implementation @@ -7,7 +21,7 @@ or understand why retrieval-only comparisons are insufficient. Inputs: `docs/spec/real_world_agent_memory_benchmark_v1.md`, current live baseline reports, external project comparison docs, and the intended user-job scenario. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`live_baseline_benchmark.md`, and `docs/guide/research/comparison_external_projects.md`. +`live_baseline_benchmark.md`, and `docs/evidence/external_memory/comparison_external_projects.md`. Outputs: Operator-facing suite overview, bias explanation, and implementation routing. ## Governing Spec @@ -17,7 +31,7 @@ The authoritative contract is: - `docs/spec/real_world_agent_memory_benchmark_v1.md` Use the spec for field names, suite ids, report states, scoring rules, and claim -boundaries. This guide is only an operator map. +boundaries. This runbook is only an operator map. 
## Why This Suite Exists @@ -389,7 +403,7 @@ dropped-candidate visibility, trace completeness, repair-action clarity, and any encoded UX gaps. Checked-in evidence snapshot: -`docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md`. +`docs/evidence/benchmarking/2026-06-09-operator-debugging-ux-report.md`. The same `real-world-memory` target also includes the current consolidation fixtures under the same fixture root. diff --git a/docs/guide/benchmarking/real_world_memory_evolution.md b/docs/runbook/benchmarking/real_world_memory_evolution.md similarity index 85% rename from docs/guide/benchmarking/real_world_memory_evolution.md rename to docs/runbook/benchmarking/real_world_memory_evolution.md index af578a15..d8c21d22 100644 --- a/docs/guide/benchmarking/real_world_memory_evolution.md +++ b/docs/runbook/benchmarking/real_world_memory_evolution.md @@ -1,3 +1,17 @@ +--- +type: Runbook +title: "Real-World Memory Evolution Benchmark" +description: "Run and interpret the checked-in memory evolution real-world job fixtures." +resource: docs/runbook/benchmarking/real_world_memory_evolution.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook + - benchmarking +--- # Real-World Memory Evolution Benchmark Goal: Run and interpret the checked-in memory evolution real-world job fixtures. @@ -6,8 +20,8 @@ conflicts, corrected memories, and temporal relation validity. Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`, `apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, -`docs/guide/benchmarking/real_world_agent_memory_benchmark.md`, and -`docs/guide/research/comparison_external_projects.md`. +`docs/runbook/benchmarking/real_world_agent_memory_benchmark.md`, and +`docs/evidence/external_memory/comparison_external_projects.md`. 
Outputs: `tmp/real-world-memory/evolution-report.json` and `tmp/real-world-memory/evolution-report.md`. diff --git a/docs/guide/competitive_parity_testing.md b/docs/runbook/competitive_parity_testing.md similarity index 88% rename from docs/guide/competitive_parity_testing.md rename to docs/runbook/competitive_parity_testing.md index 328bdd91..57c3c294 100644 --- a/docs/guide/competitive_parity_testing.md +++ b/docs/runbook/competitive_parity_testing.md @@ -1,9 +1,22 @@ +--- +type: Runbook +title: "Competitive Parity Testing" +description: "Run the Docker-only parity gate that decides whether ELF has enough evidence to be considered against external memory systems." +resource: docs/runbook/competitive_parity_testing.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Competitive Parity Testing Goal: Run the Docker-only parity gate that decides whether ELF has enough evidence to be considered against external memory systems. Read this when: You need to prove ELF meets the minimum adoption bar instead of relying on architecture claims. Preconditions: Docker and Docker Compose are available on the host. -Depends on: `docs/spec/system_competitive_parity_gate_v1.md`, `docs/guide/research/agentmemory_adapter.md`, and `Makefile.toml`. +Depends on: `docs/spec/system_competitive_parity_gate_v1.md`, `docs/evidence/external_memory/agentmemory_adapter.md`, and `Makefile.toml`. Verification: `cargo make parity-docker` exits successfully and writes `tmp/parity/competitive-parity-report.json` with `verdict = "pass"`. ## Run diff --git a/docs/runbook/development/index.md b/docs/runbook/development/index.md new file mode 100644 index 00000000..e0f11157 --- /dev/null +++ b/docs/runbook/development/index.md @@ -0,0 +1,11 @@ +# Development Runbook Index + +Purpose: Route agents to repository-development procedures. +Read this when: You need a repeatable development workflow that is not product +operation. 
+Not this document: Product specs, benchmark reports, or saved plan artifacts. +Routes to: Development runbooks under `docs/runbook/development/`. + +## Concepts + +- `issue_labeling.md`: Linear issue-labeling workflow guidance. diff --git a/docs/guide/development/issue_labeling.md b/docs/runbook/development/issue_labeling.md similarity index 92% rename from docs/guide/development/issue_labeling.md rename to docs/runbook/development/issue_labeling.md index cbf18466..5a5c9336 100644 --- a/docs/guide/development/issue_labeling.md +++ b/docs/runbook/development/issue_labeling.md @@ -1,3 +1,17 @@ +--- +type: Runbook +title: "Issue Labeling" +description: "Standardize how Linear issues are labeled in this repository." +resource: docs/runbook/development/issue_labeling.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook + - development +--- # Issue Labeling Goal: Standardize how Linear issues are labeled in this repository. @@ -6,7 +20,7 @@ Inputs: The current Linear workspace labels plus the repository's issue taxonomy Depends on: Existing label groups and the repository's development workflow. Verification: Labels remain consistent, searchable, and aligned with the documented taxonomy. -This guide standardizes how Linear issues are labeled in this repository. +This runbook standardizes how Linear issues are labeled in this repository. Tracker policy: diff --git a/docs/guide/evaluation.md b/docs/runbook/evaluation.md similarity index 95% rename from docs/guide/evaluation.md rename to docs/runbook/evaluation.md index 39441ab9..5bc50a33 100644 --- a/docs/guide/evaluation.md +++ b/docs/runbook/evaluation.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Retrieval Evaluation" +description: "Provide a repeatable way to measure memory retrieval quality and prevent regressions." 
+resource: docs/runbook/evaluation.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Retrieval Evaluation Goal: Provide a repeatable way to measure memory retrieval quality and prevent regressions. @@ -13,20 +26,20 @@ Use the `elf-eval` app to run an evaluation against a dataset of queries and exp Example: ```bash -cargo run -p elf-eval -- -c ./elf.toml --dataset ./docs/guide/eval-sample.json +cargo run -p elf-eval -- -c ./elf.toml --dataset ./apps/elf-eval/fixtures/evaluation/eval-sample.json ``` Search-mode selection: ```bash # Run the evaluation using the quick_find (faster) search mode. -cargo run -p elf-eval -- -c ./elf.toml --dataset ./docs/guide/eval-sample.json --search-mode quick_find +cargo run -p elf-eval -- -c ./elf.toml --dataset ./apps/elf-eval/fixtures/evaluation/eval-sample.json --search-mode quick_find # Compare two configs while forcing different modes per side (A vs B). cargo run -p elf-eval -- \ -c ./elf.a.toml \ --config-b ./elf.b.toml \ - --dataset ./docs/guide/eval-sample.json \ + --dataset ./apps/elf-eval/fixtures/evaluation/eval-sample.json \ --search-mode planned_search \ --search-mode-b quick_find ``` @@ -102,7 +115,7 @@ The command prints a JSON report containing summary metrics and per-query detail - `--search-mode planned_search` (planning-enabled path; useful when you need query plans and staged trajectory metadata) - When running a config comparison with `--config-b`, you can set `--search-mode-b` to override the mode for the B side. - To compare against sanitized agentmemory session fixtures without running an agentmemory server, use - `docs/guide/research/agentmemory_adapter.md`. + `docs/evidence/external_memory/agentmemory_adapter.md`. - The dataset should avoid secrets and sensitive data. - To persist traces for later replay without running `elf-worker`, set `search.explain.write_mode = "inline"` in the config used by `elf-eval`. 
diff --git a/docs/guide/research/external_memory_pattern_radar.md b/docs/runbook/external_memory_pattern_radar.md similarity index 82% rename from docs/guide/research/external_memory_pattern_radar.md rename to docs/runbook/external_memory_pattern_radar.md index 06638e2a..9ee39c86 100644 --- a/docs/guide/research/external_memory_pattern_radar.md +++ b/docs/runbook/external_memory_pattern_radar.md @@ -1,16 +1,29 @@ +--- +type: Runbook +title: "External Memory Pattern Radar" +description: "Runbook for the weekly external memory pattern radar workflow." +resource: docs/runbook/external_memory_pattern_radar.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # External Memory Pattern Radar Goal: Run ELF's weekly external memory pattern radar and preserve no-issue, rejection, or issue-ready outcomes for future comparison reports. Read this when: You are refreshing upstream memory/RAG/agent-continuity watch state or deciding whether a watched upstream pattern deserves an ELF follow-up issue. -Inputs: `docs/research/external_memory_pattern_radar/cursor.json`, GitHub repository +Inputs: `apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json`, GitHub repository metadata, current ELF research docs, and Linear duplicate-search readback when creating issues. Depends on: `docs/spec/external_memory_pattern_radar_v1.md`, -`docs/guide/research/comparison_external_projects.md`, and -`docs/guide/research/research_projects_inventory.md`. -Outputs: Updated cursor JSON plus `docs/research/external_memory_pattern_radar/latest.md`. +`docs/evidence/external_memory/comparison_external_projects.md`, and +`docs/evidence/external_memory/research_projects_inventory.md`. +Outputs: Updated cursor JSON plus `docs/evidence/external_memory_pattern_radar_latest.md`. 
## Scope diff --git a/docs/guide/getting_started.md b/docs/runbook/getting_started.md similarity index 88% rename from docs/guide/getting_started.md rename to docs/runbook/getting_started.md index f5ede104..06fe67af 100644 --- a/docs/guide/getting_started.md +++ b/docs/runbook/getting_started.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Getting Started" +description: "Provide the canonical setup and local run flow for ELF." +resource: docs/runbook/getting_started.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Getting Started Goal: Provide the canonical setup and local run flow for ELF. @@ -116,7 +129,7 @@ curl -fsS -X POST http://127.0.0.1:51892/v2/notes/ingest \ "importance": 0.7, "confidence": 0.9, "ttl_days": 14, - "source_ref": {"schema": "local_smoke/v1", "ref": {"command": "docs/guide/getting_started.md"}} + "source_ref": {"schema": "local_smoke/v1", "ref": {"command": "docs/runbook/getting_started.md"}} } ] }' @@ -130,7 +143,7 @@ Use `elf-eval` with your dataset. cargo run -p elf-eval -- -c elf.toml -i path/to/eval.json ``` -For dataset format and metric details, see `docs/guide/evaluation.md`. +For dataset format and metric details, see `docs/runbook/evaluation.md`. ## 7. Run local checks @@ -165,10 +178,10 @@ Notes: - Stop local dependencies with `docker compose -f docker-compose.yml down`. Add `-v` only when you intentionally want to delete the local development volumes. 
-## Related guides +## Related Runbooks -- Evaluation: `docs/guide/evaluation.md` -- Integration testing: `docs/guide/integration-testing.md` -- Single-user production: `docs/guide/single_user_production.md` -- Test taxonomy: `docs/guide/testing.md` -- Agent setup: `docs/guide/agent-setup.md` +- Evaluation: `docs/runbook/evaluation.md` +- Integration testing: `docs/runbook/integration-testing.md` +- Single-user production: `docs/runbook/single_user_production.md` +- Test taxonomy: `docs/runbook/testing.md` +- Agent setup: `docs/runbook/agent-setup.md` diff --git a/docs/runbook/index.md b/docs/runbook/index.md new file mode 100644 index 00000000..dd8dc72a --- /dev/null +++ b/docs/runbook/index.md @@ -0,0 +1,24 @@ +# Runbook Index + +Purpose: Route agents to procedural runbooks in the strict OKF lane. +Read this when: You need an operator sequence, validation flow, migration step, or +repeatable maintenance procedure. +Not this document: Normative specs, research contracts, proof reports, or external +comparison evidence. +Routes to: Runbook concepts under `docs/runbook/`. + +## Concepts + +- `getting_started.md`: canonical setup and local run flow. +- `single_user_production.md`: single-user production operation, backup, restore, + Qdrant rebuild, rollback, and cleanup. +- `agent-setup.md`: agent-oriented local installation flow. +- `evaluation.md`: retrieval evaluation commands and interpretation flow. +- `integration-testing.md`: integration and E2E test workflow. +- `testing.md`: test names, scopes, and matching commands. +- `observability.md`: logging and metrics operation notes. +- `agent_skills_cookbook.md`: MCP-first agent workflow patterns. +- `competitive_parity_testing.md`: Docker-only parity gate operation. +- `external_memory_pattern_radar.md`: weekly upstream memory-pattern radar workflow. +- `benchmarking/index.md`: benchmark runbooks and suite interpretation. +- `development/index.md`: repository-development runbooks. 
diff --git a/docs/guide/integration-testing.md b/docs/runbook/integration-testing.md similarity index 95% rename from docs/guide/integration-testing.md rename to docs/runbook/integration-testing.md index 336715f9..16125abe 100644 --- a/docs/guide/integration-testing.md +++ b/docs/runbook/integration-testing.md @@ -1,12 +1,25 @@ +--- +type: Runbook +title: "Integration Testing (Memory Retrieval)" +description: "Provide a repeatable E2E test for memory ingestion, indexing, and retrieval." +resource: docs/runbook/integration-testing.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Integration Testing (Memory Retrieval) Goal: Provide a repeatable E2E test for memory ingestion, indexing, and retrieval. Read this when: You need to validate retrieval behavior after changing ingestion, ranking, or storage logic. Inputs: External Postgres and Qdrant services plus the repository test commands. -Depends on: `docs/guide/testing.md` and `Makefile.toml`. +Depends on: `docs/runbook/testing.md` and `Makefile.toml`. Verification: The integration or E2E commands complete without regressions. -Name: This flow is the E2E test in `docs/guide/testing.md`. +Name: This flow is the E2E test in `docs/runbook/testing.md`. ## When to use diff --git a/docs/guide/observability.md b/docs/runbook/observability.md similarity index 90% rename from docs/guide/observability.md rename to docs/runbook/observability.md index d0bfccfb..76046efb 100644 --- a/docs/guide/observability.md +++ b/docs/runbook/observability.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Observability and Correlation (MCP + Admin API)" +description: "Provide a practical traceability workflow for agents and operators." 
+resource: docs/runbook/observability.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Observability and Correlation (MCP + Admin API) Goal: Provide a practical traceability workflow for agents and operators. diff --git a/docs/guide/single_user_production.md b/docs/runbook/single_user_production.md similarity index 97% rename from docs/guide/single_user_production.md rename to docs/runbook/single_user_production.md index 914b0fe7..4a961463 100644 --- a/docs/guide/single_user_production.md +++ b/docs/runbook/single_user_production.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Single-User Production Runbook" +description: "Runbook for operating one local ELF instance with Docker Compose managed services." +resource: docs/runbook/single_user_production.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Single-User Production Runbook Goal: Operate one local ELF instance with Docker Compose managed Postgres and Qdrant, @@ -7,7 +20,7 @@ restore, migration, and Qdrant rebuild behavior. Preconditions: Docker Compose, this repository checkout, a Rust toolchain for building ELF binaries, and provider credentials for production embeddings/rerank/extraction. Depends on: `docker-compose.yml`, `elf.example.toml`, `docs/spec/system_elf_memory_service_v2.md`, -`docs/guide/getting_started.md`, and `docs/guide/integration-testing.md`. +`docs/runbook/getting_started.md`, and `docs/runbook/integration-testing.md`. Verification: Health succeeds, a note can be ingested and found, Postgres backup restores notes, Qdrant search state can be rebuilt from Postgres, and the clean-volume proof path below can run without host-global service installs. @@ -674,7 +687,7 @@ target/debug/elf diagnostics qdrant-rebuild --pretty ``` For batch backfill and benchmark reports, use the wrappers documented in -`docs/guide/benchmarking/live_baseline_benchmark.md`. 
Those wrappers delegate to the checked-in +`docs/runbook/benchmarking/live_baseline_benchmark.md`. Those wrappers delegate to the checked-in `cargo make` tasks and keep benchmark artifacts under `tmp/live-baseline/`. ## 11. Failure And Secret Rules @@ -691,8 +704,8 @@ For batch backfill and benchmark reports, use the wrappers documented in - Never commit `.env`, `elf.production.toml`, backups, dumps, API keys, bearer tokens, or provider credentials. -## Related Guides +## Related Runbooks -- Local bootstrap: `docs/guide/getting_started.md` -- Integration testing: `docs/guide/integration-testing.md` +- Local bootstrap: `docs/runbook/getting_started.md` +- Integration testing: `docs/runbook/integration-testing.md` - System contract: `docs/spec/system_elf_memory_service_v2.md` diff --git a/docs/guide/testing.md b/docs/runbook/testing.md similarity index 85% rename from docs/guide/testing.md rename to docs/runbook/testing.md index 480a8c61..1df3b361 100644 --- a/docs/guide/testing.md +++ b/docs/runbook/testing.md @@ -1,3 +1,16 @@ +--- +type: Runbook +title: "Test Names and Scope" +description: "Provide consistent names for test categories and the commands that run them." +resource: docs/runbook/testing.md +status: active +authority: procedural +owner: runbook +last_verified: 2026-06-18 +tags: + - docs + - runbook +--- # Test Names and Scope Goal: Provide consistent names for test categories and the commands that run them. diff --git a/docs/spec/external_memory_pattern_radar_v1.md b/docs/spec/external_memory_pattern_radar_v1.md index ccde7b34..00c2162a 100644 --- a/docs/spec/external_memory_pattern_radar_v1.md +++ b/docs/spec/external_memory_pattern_radar_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "External Memory Pattern Radar v1" +description: "Normative contract for external memory pattern radar cursors, runs, and issue decisions." 
+resource: docs/spec/external_memory_pattern_radar_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/external_memory_pattern_radar_v1.md +--- # External Memory Pattern Radar v1 Purpose: Define the durable cursor, run, and issue-decision contract for ELF's external @@ -21,8 +39,8 @@ The radar is a decision-support workflow. It is not an adoption workflow. Canonical checked-in paths: -- Cursor: `docs/research/external_memory_pattern_radar/cursor.json` -- Latest prose summary: `docs/research/external_memory_pattern_radar/latest.md` +- Cursor: `apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json` +- Latest prose summary: `docs/evidence/external_memory_pattern_radar_latest.md` Temporary dry-run outputs may be written under `tmp/external-memory-pattern-radar/`. diff --git a/docs/spec/index.md b/docs/spec/index.md index 86c90cd8..2dde84ef 100644 --- a/docs/spec/index.md +++ b/docs/spec/index.md @@ -13,13 +13,13 @@ Question this index answers: "what must remain true?" - You need an invariant, contract, schema, enum, state model, interface, or required behavior. - You are deciding whether code or data is correct. -- A guide says "see the governing spec" and you need the authoritative source. +- A runbook says "see the governing spec" and you need the authoritative source. ## Do not use this index when - You need step-by-step instructions, maintenance actions, migrations, or incident response. -- You need a planning-tool artifact or a saved execution plan under `docs/plans/`. +- You need a planning-tool artifact or a saved execution plan under `docs/reference/plans/`. - You want rationale only, without an authoritative contract. ## What belongs in `docs/spec/` @@ -31,23 +31,23 @@ Question this index answers: "what must remain true?" 
## Documents -- `system_elf_memory_service_v2.md`: Core ELF memory service contract, API semantics, - and storage invariants. -- `system_consolidation_proposals_v1.md`: Reviewable derived consolidation run and - proposal contract over immutable source evidence. -- `system_memory_summary_v1.md`: Reviewable current/background/stale/superseded/ - tombstoned/derived memory summary and source-trace contract. -- `system_knowledge_pages_v1.md`: Derived project/entity/concept/issue/decision page - storage, rebuild, citation, and stale-source lint contract. -- `system_competitive_parity_gate_v1.md`: Docker-only adoption gate that decides - whether ELF meets or exceeds selected external memory-system baselines. -- `production_corpus_manifest_v1.md`: Sanitized/private coding-agent production - corpus manifest schema for adoption benchmark runs. -- `real_world_agent_memory_benchmark_v1.md`: Real-world agent memory benchmark job - schema, suite taxonomy, scoring dimensions, and report state semantics. -- `external_memory_pattern_radar_v1.md`: Weekly external memory pattern radar cursor, - run, decision, and issue-creation boundary schema. - +- `external_memory_pattern_radar_v1.md`: External Memory Pattern Radar v1. +- `production_corpus_manifest_v1.md`: Production Corpus Manifest v1. +- `real_world_agent_memory_benchmark_v1.md`: Real-World Agent Memory Benchmark v1. +- `system_competitive_parity_gate_v1.md`: Competitive Parity Gate v1 Specification. +- `system_consolidation_proposals_v1.md`: Consolidation Proposals v1 Specification. +- `system_doc_chunking_profiles_v1.md`: System: `doc_chunking_profiles/v1` for `docs_put`. +- `system_doc_extension_v1_filters.md`: System: Document Extension v1 Filter and Payload Contract. +- `system_doc_extension_v1_trajectory.md`: System: Doc Extension v1 Retrieval Trajectory (`doc_retrieval_trajectory/v1`). +- `system_doc_source_ref_v1.md`: System: `doc_source_ref/v1` for `docs_put`. 
+- `system_elf_memory_service_v2.md`: ELF Memory Service v2.0 Specification. +- `system_graph_memory_postgres_v1.md`: Graph Memory Postgres v1.0 Specification. +- `system_knowledge_pages_v1.md`: Derived Knowledge Pages v1 Specification. +- `system_memory_summary_v1.md`: Reviewable Memory Summary v1 Specification. +- `system_provenance_mapping_v1.md`: System: Note Provenance Mapping (v1). +- `system_search_filter_expr_v1.md`: System: Search Filter Expression Contract v1. +- `system_source_ref_doc_pointer_v1.md`: System: `source_ref` Doc Pointer Resolver (v1). +- `system_version_registry.md`: System Version Registry. ## Spec document contract Start each spec with a compact routing header: @@ -64,7 +64,7 @@ Then keep the body explicit: - Separate facts from rationale. - Include canonical names exactly as code or data uses them. - Include a small example when it removes ambiguity. -- Link to related guides instead of embedding procedures. +- Link to related runbooks instead of embedding procedures. ## Structure policy @@ -73,4 +73,4 @@ Then keep the body explicit: ambiguity. - Do not require fixed filename prefixes up front. - Choose names for topic clarity and retrieval quality, not visual uniformity. -- If a guide depends on a spec, the guide links back to the governing spec. +- If a runbook depends on a spec, the runbook links back to the governing spec. diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md index 36347823..e341265d 100644 --- a/docs/spec/production_corpus_manifest_v1.md +++ b/docs/spec/production_corpus_manifest_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Production Corpus Manifest v1" +description: "Normative contract for sanitized and private coding-agent production corpus manifests." 
+resource: docs/spec/production_corpus_manifest_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/production_corpus_manifest_v1.md +--- # Production Corpus Manifest v1 Purpose: Define the sanitized/private coding-agent production corpus manifest used by @@ -98,7 +116,7 @@ evidence ID. It must not silently fall back to the checked-in synthetic corpus. } ``` -## Related Guides +## Related Runbooks -- `docs/guide/benchmarking/live_baseline_benchmark.md`: run commands, private fixture +- `docs/runbook/benchmarking/live_baseline_benchmark.md`: run commands, private fixture placement, and report publication. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index b371e9a5..2cac3834 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Real-World Agent Memory Benchmark v1" +description: "Normative contract for real-world agent memory benchmark jobs and reports." 
+resource: docs/spec/real_world_agent_memory_benchmark_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/real_world_agent_memory_benchmark_v1.md +--- # Real-World Agent Memory Benchmark v1 Purpose: Define the v1 benchmark contract for evaluating agent memory systems through diff --git a/docs/spec/system_competitive_parity_gate_v1.md b/docs/spec/system_competitive_parity_gate_v1.md index 7c130f7f..36085afe 100644 --- a/docs/spec/system_competitive_parity_gate_v1.md +++ b/docs/spec/system_competitive_parity_gate_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Competitive Parity Gate v1 Specification" +description: "Define the adoption gate ELF must pass before it can be treated as production-eligible memory infrastructure." +resource: docs/spec/system_competitive_parity_gate_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_competitive_parity_gate_v1.md +--- # Competitive Parity Gate v1 Specification Purpose: Define the adoption gate ELF must pass before it can be treated as production-eligible memory infrastructure. 
@@ -8,9 +26,9 @@ Defines: `elf.competitive_parity_gate/v1` dimensions, Docker isolation rules, ba Related inputs: -- `docs/research/2026-06-08-agent-memory-selection.json` -- `docs/guide/research/comparison_external_projects.md` -- `docs/guide/research/agentmemory_adapter.md` +- `docs/decisions/2026-06-08-agent-memory-selection.md` +- `docs/evidence/external_memory/comparison_external_projects.md` +- `docs/evidence/external_memory/agentmemory_adapter.md` - `docs/spec/system_elf_memory_service_v2.md` - `docs/spec/system_consolidation_proposals_v1.md` diff --git a/docs/spec/system_consolidation_proposals_v1.md b/docs/spec/system_consolidation_proposals_v1.md index 35f2f95a..65c3629b 100644 --- a/docs/spec/system_consolidation_proposals_v1.md +++ b/docs/spec/system_consolidation_proposals_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Consolidation Proposals v1 Specification" +description: "Define the reviewable consolidation run and proposal contract for derived memory output." +resource: docs/spec/system_consolidation_proposals_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_consolidation_proposals_v1.md +--- # Consolidation Proposals v1 Specification Purpose: Define the reviewable consolidation run and proposal contract for derived memory output. 
@@ -8,8 +26,8 @@ Defines: `elf.consolidation/v1` runs, proposals, source snapshots, lineage, revi Related inputs: -- `docs/research/2026-06-08-agent-memory-selection.json` -- `docs/guide/research/comparison_external_projects.md` +- `docs/decisions/2026-06-08-agent-memory-selection.md` +- `docs/evidence/external_memory/comparison_external_projects.md` - `docs/spec/system_elf_memory_service_v2.md` ## Core Rule diff --git a/docs/spec/system_doc_chunking_profiles_v1.md b/docs/spec/system_doc_chunking_profiles_v1.md index 20ad1fd8..f6042c2b 100644 --- a/docs/spec/system_doc_chunking_profiles_v1.md +++ b/docs/spec/system_doc_chunking_profiles_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: `doc_chunking_profiles/v1` for `docs_put`" +description: "Define token-based chunking profiles used by Doc Extension v1 ingestion." +resource: docs/spec/system_doc_chunking_profiles_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_doc_chunking_profiles_v1.md +--- # System: `doc_chunking_profiles/v1` for `docs_put` Purpose: Define token-based chunking profiles used by Doc Extension v1 ingestion. diff --git a/docs/spec/system_doc_extension_v1_filters.md b/docs/spec/system_doc_extension_v1_filters.md index 3046881c..a2aa17c3 100644 --- a/docs/spec/system_doc_extension_v1_filters.md +++ b/docs/spec/system_doc_extension_v1_filters.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: Document Extension v1 Filter and Payload Contract" +description: "Normative contract for Doc Extension v1 search filters and payloads." 
+resource: docs/spec/system_doc_extension_v1_filters.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_doc_extension_v1_filters.md +--- # System: Document Extension v1 Filter and Payload Contract Purpose: Define the `docs_search_filters/v1` filter contract for diff --git a/docs/spec/system_doc_extension_v1_trajectory.md b/docs/spec/system_doc_extension_v1_trajectory.md index e13e542e..3c59d5bb 100644 --- a/docs/spec/system_doc_extension_v1_trajectory.md +++ b/docs/spec/system_doc_extension_v1_trajectory.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: Doc Extension v1 Retrieval Trajectory (`doc_retrieval_trajectory/v1`)" +description: "Normative contract for Doc Extension v1 retrieval trajectory traces." +resource: docs/spec/system_doc_extension_v1_trajectory.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_doc_extension_v1_trajectory.md +--- # System: Doc Extension v1 Retrieval Trajectory (`doc_retrieval_trajectory/v1`) Purpose: Define the optional, response-only stage traces for Doc Extension v1 retrieval diff --git a/docs/spec/system_doc_source_ref_v1.md b/docs/spec/system_doc_source_ref_v1.md index c11d4f4f..a695b40f 100644 --- a/docs/spec/system_doc_source_ref_v1.md +++ b/docs/spec/system_doc_source_ref_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: `doc_source_ref/v1` for `docs_put`" +description: "Normative contract for source_ref values accepted by docs_put." 
+resource: docs/spec/system_doc_source_ref_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_doc_source_ref_v1.md +--- # System: `doc_source_ref/v1` for `docs_put` Purpose: Define a minimal, versioned `source_ref` convention for docs ingested diff --git a/docs/spec/system_elf_memory_service_v2.md b/docs/spec/system_elf_memory_service_v2.md index b33588e9..82fddaf3 100644 --- a/docs/spec/system_elf_memory_service_v2.md +++ b/docs/spec/system_elf_memory_service_v2.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "ELF Memory Service v2.0 Specification" +description: "Define the ELF Memory Service v2.0 contract, invariants, and storage model." +resource: docs/spec/system_elf_memory_service_v2.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_elf_memory_service_v2.md +--- # ELF Memory Service v2.0 Specification Purpose: Define the ELF Memory Service v2.0 contract, invariants, and storage model. diff --git a/docs/spec/system_graph_memory_postgres_v1.md b/docs/spec/system_graph_memory_postgres_v1.md index 92012ae0..70610304 100644 --- a/docs/spec/system_graph_memory_postgres_v1.md +++ b/docs/spec/system_graph_memory_postgres_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Graph Memory Postgres v1.0 Specification" +description: "Define the canonical entity/fact temporal memory schema and invariants for PostgreSQL-backed graph memory." 
+resource: docs/spec/system_graph_memory_postgres_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_graph_memory_postgres_v1.md +--- # Graph Memory Postgres v1.0 Specification Purpose: Define the canonical entity/fact temporal memory schema and invariants for PostgreSQL-backed graph memory. diff --git a/docs/spec/system_knowledge_pages_v1.md b/docs/spec/system_knowledge_pages_v1.md index a30336f9..146ee3ab 100644 --- a/docs/spec/system_knowledge_pages_v1.md +++ b/docs/spec/system_knowledge_pages_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Derived Knowledge Pages v1 Specification" +description: "Define derived knowledge page storage, rebuild, citation, and lint contracts." +resource: docs/spec/system_knowledge_pages_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_knowledge_pages_v1.md +--- # Derived Knowledge Pages v1 Specification Purpose: Define derived knowledge page storage, rebuild, citation, and lint contracts. diff --git a/docs/spec/system_memory_summary_v1.md b/docs/spec/system_memory_summary_v1.md index 0db2fe57..3cb99235 100644 --- a/docs/spec/system_memory_summary_v1.md +++ b/docs/spec/system_memory_summary_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "Reviewable Memory Summary v1 Specification" +description: "Define the reviewable memory summary and source-trace contract." +resource: docs/spec/system_memory_summary_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_memory_summary_v1.md +--- # Reviewable Memory Summary v1 Specification Purpose: Define the reviewable memory summary and source-trace contract. 
diff --git a/docs/spec/system_provenance_mapping_v1.md b/docs/spec/system_provenance_mapping_v1.md index fdffaf11..6abdd12b 100644 --- a/docs/spec/system_provenance_mapping_v1.md +++ b/docs/spec/system_provenance_mapping_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: Note Provenance Mapping (v1)" +description: "Define the provenance bundle contract used by admin operations and traceability workflows." +resource: docs/spec/system_provenance_mapping_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_provenance_mapping_v1.md +--- # System: Note Provenance Mapping (v1) Purpose: Define the provenance bundle contract used by admin operations and traceability workflows. diff --git a/docs/spec/system_search_filter_expr_v1.md b/docs/spec/system_search_filter_expr_v1.md index 55635e73..7976c1e0 100644 --- a/docs/spec/system_search_filter_expr_v1.md +++ b/docs/spec/system_search_filter_expr_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: Search Filter Expression Contract v1" +description: "Define the structured filter payload used by search endpoints via `search_filter_expr/v1`." +resource: docs/spec/system_search_filter_expr_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_search_filter_expr_v1.md +--- # System: Search Filter Expression Contract v1 Purpose: Define the structured filter payload used by search endpoints via `search_filter_expr/v1`. 
diff --git a/docs/spec/system_source_ref_doc_pointer_v1.md b/docs/spec/system_source_ref_doc_pointer_v1.md index ae83154d..c76be322 100644 --- a/docs/spec/system_source_ref_doc_pointer_v1.md +++ b/docs/spec/system_source_ref_doc_pointer_v1.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System: `source_ref` Doc Pointer Resolver (v1)" +description: "Define a concrete, versioned `source_ref` schema for document pointers so agents can reliably hydrate long-form evidence after a note is retrieved." +resource: docs/spec/system_source_ref_doc_pointer_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_source_ref_doc_pointer_v1.md +--- # System: `source_ref` Doc Pointer Resolver (v1) Purpose: Define a concrete, versioned `source_ref` schema for document pointers so agents can reliably hydrate long-form evidence after a note is retrieved. diff --git a/docs/spec/system_version_registry.md b/docs/spec/system_version_registry.md index efe338af..d2f9fc0b 100644 --- a/docs/spec/system_version_registry.md +++ b/docs/spec/system_version_registry.md @@ -1,3 +1,21 @@ +--- +type: Spec +title: "System Version Registry" +description: "Provide a single registry for versioned identifiers used across ELF." +resource: docs/spec/system_version_registry.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-18 +tags: + - docs + - spec +source_refs: [] +code_refs: [] +related: [] +drift_watch: + - docs/spec/system_version_registry.md +--- # System Version Registry Purpose: Provide a single registry for versioned identifiers used across ELF. 
diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh index 38ef83ff..8532ccff 100755 --- a/scripts/live-baseline-report-to-md.sh +++ b/scripts/live-baseline-report-to-md.sh @@ -33,7 +33,7 @@ render_report() { "Goal: Publish a Markdown summary for one generated live baseline aggregate report.", "Read this when: You need a durable, reviewable summary of a live baseline JSON report.", ("Inputs: `" + $report_path + "`."), - "Depends on: `scripts/live-baseline-benchmark.sh` and `docs/guide/benchmarking/live_baseline_benchmark.md`.", + "Depends on: `scripts/live-baseline-benchmark.sh` and `docs/runbook/benchmarking/live_baseline_benchmark.md`.", "Verification: Compare this Markdown summary with the source JSON before committing.", "", "## Summary",