From ab9311fb05a413c30b15e0f54000bafd30b424bf Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Sat, 20 Jun 2026 03:23:27 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Add live knowledge-page rebuild lint benchmark","authority":"XY-935"} --- Makefile.toml | 9 ++ README.md | 21 +++- ...1-competitor-strength-adoption-report.json | 5 +- .../tests/real_world_job_benchmark.rs | 57 ++++++++- ...-11-competitor-strength-adoption-report.md | 4 +- ...live-knowledge-page-rebuild-lint-report.md | 108 ++++++++++++++++++ docs/evidence/benchmarking/index.md | 1 + docs/log.md | 8 ++ .../benchmarking/live_baseline_benchmark.md | 20 ++++ .../real_world_agent_memory_benchmark.md | 23 ++++ scripts/real-world-docker.sh | 6 + scripts/real-world-knowledge-live-adapter.sh | 80 +++++++++++++ 12 files changed, 331 insertions(+), 11 deletions(-) create mode 100644 docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md create mode 100755 scripts/real-world-knowledge-live-adapter.sh diff --git a/Makefile.toml b/Makefile.toml index 17fa3b7e..967d9f02 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -43,6 +43,7 @@ # | real-world-memory-knowledge-report | command | | # | real-world-memory-live-adapters | command | | # | real-world-memory-live-consolidation | command | | +# | real-world-memory-live-knowledge | command | | # | real-world-memory-proactive-brief | composite | | # | real-world-memory-proactive-brief-json | command | | # | real-world-memory-proactive-brief-report | command | | @@ -650,6 +651,14 @@ args = [ "memory-live-consolidation", ] +[tasks.real-world-memory-live-knowledge] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-knowledge", +] + [tasks.real-world-memory-proactive-brief] workspace = false dependencies = [ diff --git a/README.md b/README.md index 81b99066..da46d724 100644 --- a/README.md +++ b/README.md @@ -207,6 +207,15 @@ provider-backed ELF evidence was required. This improves local Dreaming runtime authority and auditability, but it does not prove Pulse, ChatGPT Tasks, Claude Dreams, hosted managed-memory, or private-corpus parity. +- Live knowledge-page rebuild/lint after XY-935: the June 20 follow-up adds + `cargo make real-world-memory-live-knowledge`, a Docker-contained ELF service + materialization command for `knowledge_compilation`. The slice runs + `ElfService::knowledge_page_rebuild`, `knowledge_page_lint`, and + `knowledge_pages_search` before scoring citation coverage, stale-source lint, + unsupported-section flags, rebuild metadata, backlinks, and source-of-truth + boundaries. This upgrades ELF's own knowledge-page evidence from fixture-only to + service-native proof, but it does not claim llm-wiki, gbrain, GraphRAG, RAGFlow, + LightRAG, or graphify parity without comparable contained adapter outputs. - Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs `cargo make baseline-production-private-addendum` with a simulated/public-proxy production corpus manifest approved for this stage. The run records 12 documents, @@ -433,12 +442,12 @@ Detailed comparison, mechanism-level analysis, and source map: - [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md) - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) -Latest real-world benchmark report: June 19, 2026. Latest external research refresh: -June 11, 2026; June 19 adds the XY-930 operator-approved public-proxy production -addendum and service-native Dreaming readback after the qmd debug-ergonomics Dreaming -retest, the June 17 competitor-strength closeout, and the June 16 temporal -reconciliation, live consolidation self-check, proactive-brief, and scheduled-memory -scoring evidence. +Latest real-world benchmark report: June 20, 2026. Latest external research refresh: +June 11, 2026; June 20 adds the Live Knowledge-Page Rebuild/Lint Report - June 20, 2026 +after the June 19 XY-930 operator-approved public-proxy production addendum and +service-native Dreaming readback, the qmd debug-ergonomics Dreaming retest, the +June 17 competitor-strength closeout, and the June 16 temporal reconciliation, +live consolidation self-check, proactive-brief, and scheduled-memory scoring evidence. ## Documentation diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json index 01f0831e..5bac6e40 100644 --- a/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json @@ -270,13 +270,14 @@ "blocked", "not_encoded" ], - "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references.", + "measured_claim": "ELF fixture knowledge pages pass, and XY-935 adds a Docker-contained ELF service-native rebuild/lint/search command for the checked-in knowledge pack. The XY-929 graph/RAG representative slice still scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references, so broad external knowledge-product comparison remains unproven.", "command_artifacts": [ "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md", + "docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md", "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md" ], "follow_up_issues": [ - "XY-926", + "XY-935", "XY-929" ], "caveat": "GraphRAG, graphify, llm-wiki, and gbrain remain references until contained citation, graph-report, and lint jobs produce passable evidence-linked output." diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 0075265c..97eada0e 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -2339,6 +2339,61 @@ fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result< Ok(()) } +#[test] +fn live_knowledge_page_rebuild_lint_has_dedicated_docker_task() -> Result<()> { + let workspace = workspace_root()?; + let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + let live_script = + fs::read_to_string(workspace.join("scripts/real-world-knowledge-live-adapter.sh"))?; + let live_adapter = + fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?; + let benchmark_runbook = fs::read_to_string( + workspace + .join("docs") + .join("runbook") + .join("benchmarking") + .join("real_world_agent_memory_benchmark.md"), + )?; + let live_runbook = fs::read_to_string( + workspace + .join("docs") + .join("runbook") + .join("benchmarking") + .join("live_baseline_benchmark.md"), + )?; + let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?; + let readme = fs::read_to_string(readme_path()?)?; + + assert!(makefile.contains("[tasks.real-world-memory-live-knowledge]")); + assert!(makefile.contains("scripts/real-world-docker.sh")); + assert!(makefile.contains("memory-live-knowledge")); + assert!(docker_script.contains("memory-live-knowledge)")); + assert!(docker_script.contains("-e ELF_KNOWLEDGE_LIVE_REPORT_DIR")); + assert!(docker_script.contains("-e ELF_KNOWLEDGE_LIVE_FIXTURES")); + assert!(docker_script.contains("scripts/real-world-knowledge-live-adapter.sh")); + assert!(live_script.contains("elf.real_world_knowledge_live_adapter_sweep/v1")); + assert!(live_script.contains("apps/elf-eval/fixtures/real_world_memory/knowledge")); + assert!(live_script.contains("tmp/real-world-memory/live-knowledge")); + assert!(live_script.contains("real-world-memory-live-knowledge")); + assert!(live_script.contains("ElfService knowledge_page_rebuild")); + assert!(live_script.contains("knowledge_page_lint")); + assert!(live_script.contains("knowledge_pages_search")); + assert!(live_script.contains("pages remain derived benchmark artifacts")); + assert!(live_adapter.contains("fn materialize_elf_knowledge(")); + assert!(live_adapter.contains("KnowledgePageRebuildRequest")); + assert!(live_adapter.contains("KnowledgePageLintRequest")); + assert!(live_adapter.contains("KnowledgePageSearchRequest")); + assert!(benchmark_runbook.contains("Current live knowledge-page rebuild/lint increment")); + assert!(benchmark_runbook.contains("cargo make real-world-memory-live-knowledge")); + assert!(benchmark_runbook.contains("tmp/real-world-memory/live-knowledge/summary.json")); + assert!(live_runbook.contains("cargo make real-world-memory-live-knowledge")); + assert!(benchmarking_index.contains("2026-06-20-live-knowledge-page-rebuild-lint-report.md")); + assert!(readme.contains("Live Knowledge-Page Rebuild/Lint Report - June 20, 2026")); + + Ok(()) +} + fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { let suites = array_at(adapter, "/suites")?; let capabilities = array_at(adapter, "/capabilities")?; @@ -3199,7 +3254,7 @@ fn assert_qmd_debug_retest_markdown_and_indexes( benchmarking_index.contains("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md") ); assert!(readme.contains("qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026")); - assert!(readme.contains("Latest real-world benchmark report: June 19, 2026")); + assert!(readme.contains("Latest real-world benchmark report: June 20, 2026")); assert!(readme.contains("keeps the qmd edge unchanged")); } diff --git a/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md index 14007b4e..12322bdf 100644 --- a/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md +++ b/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -130,7 +130,7 @@ results, or lifecycle failures into one aggregate leaderboard. | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 | | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 | | Consolidation/proposal review | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF fixture consolidation passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors. | XY-934 | -| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 | +| Knowledge page compilation | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, and XY-935 adds a Docker-contained ELF service-native rebuild/lint/search command for the checked-in knowledge pack. The XY-929 graph/RAG representative slice still scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references, so broad external knowledge-product comparison remains unproven. | XY-935, XY-929 | | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 | | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 | | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 | @@ -148,7 +148,7 @@ results, or lifecycle failures into one aggregate leaderboard. | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. | | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. | | XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. | -| XY-926 | P1 | Partial live suites encoded | ELF live knowledge-page scoring is encoded; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. | +| XY-926/XY-935 | P1 | ELF live knowledge self-check encoded | ELF live knowledge-page scoring is encoded through a dedicated XY-935 rebuild/lint/search command; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. | | XY-934 | P1 | ELF live self-check encoded | Live consolidation proposal scoring is encoded for ELF with lineage, confidence/usefulness, unsupported-claim flags, and review-action audit; direct competitor runners remain untested or product-reference only. | | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. | | XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. | diff --git a/docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md b/docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md new file mode 100644 index 00000000..202659bd --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md @@ -0,0 +1,108 @@ +--- +type: Evidence +title: "Live Knowledge-Page Rebuild/Lint Report - June 20, 2026" +description: "Checked-in benchmark evidence record: Live Knowledge-Page Rebuild/Lint Report - June 20, 2026." +resource: docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md +status: active +authority: current_state +owner: evidence +last_verified: 2026-06-20 +tags: + - docs + - evidence + - benchmarking +--- +# Live Knowledge-Page Rebuild/Lint Report - June 20, 2026 + +Goal: Close XY-935 by moving ELF knowledge-page rebuild/lint scoring from fixture-only +evidence into a Docker-contained service materialization command. +Read this when: You need to know whether ELF has service-native evidence for +derived knowledge pages, citation coverage, stale-source lint, unsupported sections, +rebuild metadata, backlinks, and page search. +Inputs: `cargo make real-world-memory-knowledge`, +`cargo make real-world-memory-live-knowledge`, +`apps/elf-eval/fixtures/real_world_memory/knowledge/`, and +`apps/elf-eval/src/bin/real_world_live_adapter.rs`. +Outputs: A narrow live knowledge-page benchmark command and typed comparison +boundaries for wiki, graph, and RAG-style knowledge systems. + +## Executive Judgment + +ELF now has a dedicated service-native knowledge-page rebuild/lint benchmark command. +The command materializes the checked-in `knowledge_compilation` jobs through +`ElfService::knowledge_page_rebuild`, `knowledge_page_lint`, and +`knowledge_pages_search`, then scores the generated real-world job fixtures. + +This improves ELF's own knowledge-page authority from fixture-only page artifacts to +service-backed rebuild/lint/search evidence. It does not prove parity or superiority +against llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, or graphify. Those comparisons +remain valid only when a contained adapter emits comparable page sections, source ids, +citation mappings, lint findings, and typed benchmark statuses. + +## Command Evidence + +| Command | Expected result | Artifact | +| --- | --- | --- | +| `cargo make real-world-memory-knowledge` | Fixture knowledge page gate passes. | `tmp/real-world-memory/knowledge-report.json` | +| `cargo make real-world-memory-live-knowledge` | Docker-contained ELF service materialization and scored report pass for the encoded knowledge fixture pack. | `tmp/real-world-memory/live-knowledge/summary.json` | + +## Live Materialization Contract + +`cargo make real-world-memory-live-knowledge` publishes: + +| Artifact | Purpose | +| --- | --- | +| `tmp/real-world-memory/live-knowledge/elf-materialization.json` | Records live adapter materialization, generated fixtures, per-job evidence ids, and service-path metadata. | +| `tmp/real-world-memory/live-knowledge/elf-report.json` | Scores generated jobs with normal real-world job benchmark status and knowledge metrics. | +| `tmp/real-world-memory/live-knowledge/elf-report.md` | Human-readable report for citation coverage, stale lint, rebuild determinism, backlinks, and unsupported sections. | +| `tmp/real-world-memory/live-knowledge/summary.json` | Aggregates materialization and report summaries under `elf.real_world_knowledge_live_adapter_sweep/v1`. | + +The command is intentionally Docker-scoped. Host execution is refused unless +`ELF_KNOWLEDGE_LIVE_ALLOW_HOST=1` is set for an explicit local diagnostic run. + +## Scored Dimensions + +| Dimension | Evidence requirement | +| --- | --- | +| Citation coverage | Page sections cite source evidence or timeline events, or are explicitly flagged unsupported. | +| Stale-source lint | Stale source updates after rebuild produce lint findings instead of silently rewriting truth. | +| Unsupported sections | Unsupported summaries remain visible as unsupported, not hidden claims. | +| Rebuild metadata | First and second rebuild hashes, deterministic status, and allowed variance remain explicit. | +| Backlinks and search | Page artifacts expose backlinks, and `knowledge_pages_search` returns the materialized page surface. | +| Source-of-truth boundary | Knowledge pages remain derived benchmark artifacts and do not replace Memory Notes or source records. | + +## Comparison Boundary + +| Compared target | Current position | Why | +| --- | --- | --- | +| llm-wiki | `product_reference` | Query-save/lint and wiki maintenance are useful reference patterns, but no contained llm-wiki adapter emits comparable scored pages here. | +| gbrain | `product_reference` | Timeline and compiled-truth page shape remain references until a contained runner emits source-linked pages and lint output. | +| GraphRAG/RAGFlow/LightRAG | `blocked_or_incomplete_reference` | Graph/RAG outputs need document/text-unit/source-id mappings before they can be scored as knowledge pages. | +| graphify | `wrong_result_reference` | Existing representative graphify evidence remains typed `wrong_result`; stale-source lint and unsupported-summary handling are not passing. | +| qmd | `not_encoded` | qmd live adapter retrieves evidence-linked answers but does not generate derived knowledge pages. | + +## Follow-Up Queue + +| Follow-up | Reason | +| --- | --- | +| XY-1019 | Productize Knowledge Workspace pages with rebuild diffs, citation lint, unsupported-claim warnings, and previous-version diffs. | +| XY-1020 | Add graph-lite temporal facts and source-backed reports after knowledge pages remain derived and citation-checked. | +| Graph/RAG contained adapters | Promote external comparison only when adapters emit comparable source ids, page sections, citation mappings, and lint findings. | + +## Claims Allowed + +- ELF has a dedicated Docker-contained service-native knowledge-page rebuild/lint + command for the checked-in `knowledge_compilation` fixture pack. +- The command exercises `knowledge_page_rebuild`, `knowledge_page_lint`, and + `knowledge_pages_search` before scoring. +- ELF's own knowledge-page evidence is stronger than fixture-only proof for this + narrow slice. + +## Claims Not Allowed + +- Do not claim ELF beats llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, or graphify on + broad knowledge products from this command alone. +- Do not treat generated knowledge pages as authoritative storage. +- Do not mark Knowledge Workspace productization complete; XY-1019 still owns page + version diffs, product-quality rebuild metadata, broader page types, and recall + integration. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 6c92eaf3..b8533592 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -43,3 +43,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-19-operator-approved-public-proxy-production-private-addendum.md`: Operator-Approved Public-Proxy Production-Private Addendum - June 19, 2026; closes the current XY-930 proxy/simulated-corpus stage with 8/8 query pass, 0 wrong_result, and explicit boundaries that this is not real private-corpus or provider-backed proof. - `2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md`: qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026; confirms qmd's default top-k/replay edge is unchanged while ELF keeps the narrow operator-debug trace/stage visibility wins. - `2026-06-19-service-native-dreaming-readback-report.md`: Service-Native Dreaming Readback Report - June 19, 2026; materializes memory summary, proactive brief, and scheduled-memory derived outputs through `ElfService` readback with 9 pass, 0 wrong_result, and 2 typed XY-930 blockers. +- `2026-06-20-live-knowledge-page-rebuild-lint-report.md`: Live Knowledge-Page Rebuild/Lint Report - June 20, 2026; adds a Docker-contained ELF service-native knowledge-page materialization command while preserving llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, and graphify as separate comparison targets until they emit comparable scored page artifacts. diff --git a/docs/log.md b/docs/log.md index 8bc421b7..88fc55aa 100644 --- a/docs/log.md +++ b/docs/log.md @@ -59,3 +59,11 @@ logs. snapshot for XY-930, recording `baseline-production-private-addendum` as 8/8 pass on the simulated/public-proxy corpus while preserving real private-corpus and provider-backed production quality as unproven. + +## 2026-06-20 + +- Added the live knowledge-page rebuild/lint report for XY-935, plus + `cargo make real-world-memory-live-knowledge`, proving the checked-in knowledge + fixture pack can be materialized through `ElfService` rebuild, lint, and page + search before scoring while keeping external wiki/graph/RAG product comparisons + separate. diff --git a/docs/runbook/benchmarking/live_baseline_benchmark.md b/docs/runbook/benchmarking/live_baseline_benchmark.md index 4597e2bc..83fe2f32 100644 --- a/docs/runbook/benchmarking/live_baseline_benchmark.md +++ b/docs/runbook/benchmarking/live_baseline_benchmark.md @@ -519,6 +519,26 @@ citation coverage, stale-claim linting, rebuild determinism, backlink coverage, usefulness, and explicitly flagged unsupported summaries. Generated pages are benchmark artifacts, not source-truth replacements. +To run the Docker-contained live knowledge-page rebuild/lint materialization: + +```sh +cargo make real-world-memory-live-knowledge +``` + +Artifacts: + +```text +tmp/real-world-memory/live-knowledge/elf-materialization.json +tmp/real-world-memory/live-knowledge/elf-report.json +tmp/real-world-memory/live-knowledge/elf-report.md +tmp/real-world-memory/live-knowledge/summary.json +``` + +This command materializes the same knowledge fixture pack through +`ElfService::knowledge_page_rebuild`, `knowledge_page_lint`, and +`knowledge_pages_search` inside the baseline Docker runner before publishing the +scored report. It is an ELF service self-check, not a direct competitor win. + ## Clean Up ```sh diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index 2e8268e3..f9b0dc90 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -461,6 +461,29 @@ be explicitly flagged unsupported. The report publishes citation coverage, stale detection, rebuild determinism, aggregate backlink counts and page coverage, page usefulness, unsupported summary count, and untraced section count. +Current live knowledge-page rebuild/lint increment: + +```sh +cargo make real-world-memory-live-knowledge +``` + +Artifacts: + +```text +tmp/real-world-memory/live-knowledge/elf-materialization.json +tmp/real-world-memory/live-knowledge/elf-report.json +tmp/real-world-memory/live-knowledge/elf-report.md +tmp/real-world-memory/live-knowledge/summary.json +``` + +The live increment runs inside the Docker baseline runner and materializes the +knowledge fixtures through `ElfService::knowledge_page_rebuild`, +`knowledge_page_lint`, and `knowledge_pages_search` before scoring them with the +real-world job benchmark. It proves ELF service-native rebuild/lint/search behavior +for the checked-in `knowledge_compilation` pack. It does not claim llm-wiki, gbrain, +GraphRAG, RAGFlow, LightRAG, or graphify parity unless those projects emit comparable +page sections, source ids, citation mappings, lint findings, and typed statuses. + Current checked-in production-ops increment: ```sh diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh index ee7e9685..163c4d1f 100755 --- a/scripts/real-world-docker.sh +++ b/scripts/real-world-docker.sh @@ -22,6 +22,12 @@ memory-live-consolidation) -e ELF_CONSOLIDATION_LIVE_FIXTURES \ baseline-runner bash scripts/real-world-consolidation-live-adapter.sh ;; +memory-live-knowledge) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_KNOWLEDGE_LIVE_REPORT_DIR \ + -e ELF_KNOWLEDGE_LIVE_FIXTURES \ + baseline-runner bash scripts/real-world-knowledge-live-adapter.sh + ;; memory-service-native-dreaming) docker compose -f docker-compose.baseline.yml run --build --rm \ -e ELF_DREAMING_SERVICE_NATIVE_REPORT_DIR \ diff --git a/scripts/real-world-knowledge-live-adapter.sh b/scripts/real-world-knowledge-live-adapter.sh new file mode 100755 index 00000000..f34ca6e7 --- /dev/null +++ b/scripts/real-world-knowledge-live-adapter.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_KNOWLEDGE_LIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-knowledge}" +FIXTURE_DIR="${ELF_KNOWLEDGE_LIVE_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory/knowledge}" + +if [[ ! -f "/.dockerenv" && "${ELF_KNOWLEDGE_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live knowledge adapter outside Docker. Use cargo make real-world-memory-live-knowledge." >&2 + exit 1 +fi + +for cmd in bash cargo jq; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live knowledge runner." >&2 + exit 1 + fi +done + +mkdir -p "${REPORT_DIR}" +rm -rf "${REPORT_DIR:?}/elf-fixtures" \ + "${REPORT_DIR:?}/elf-materialization.json" \ + "${REPORT_DIR:?}/elf-report.json" \ + "${REPORT_DIR:?}/elf-report.md" \ + "${REPORT_DIR:?}/summary.json" + +cd "${ROOT_DIR}" + +cargo run -p elf-eval --bin real_world_live_adapter -- elf \ + --fixtures "${FIXTURE_DIR}" \ + --out-fixtures "${REPORT_DIR}/elf-fixtures" \ + --evidence-out "${REPORT_DIR}/elf-materialization.json" \ + --config config/local/elf.docker.toml + +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures "${REPORT_DIR}/elf-fixtures" \ + --out "${REPORT_DIR}/elf-report.json" \ + --run-id real-world-memory-live-knowledge \ + --adapter-id elf_live_real_world \ + --adapter-name "ELF live knowledge-page service adapter" \ + --adapter-behavior live_real_world_adapter \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Materialized by real_world_live_adapter through ElfService knowledge_page_rebuild, knowledge_page_lint, and knowledge_pages_search across the encoded knowledge_compilation fixture pack; pages remain derived benchmark artifacts, not authoritative storage." + +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report "${REPORT_DIR}/elf-report.json" \ + --out "${REPORT_DIR}/elf-report.md" + +jq -n \ + --slurpfile materialization "${REPORT_DIR}/elf-materialization.json" \ + --slurpfile report "${REPORT_DIR}/elf-report.json" \ + '{ + schema: "elf.real_world_knowledge_live_adapter_sweep/v1", + generated_at: (now | todateiso8601), + fixture_dir: (env.ELF_KNOWLEDGE_LIVE_FIXTURES // "apps/elf-eval/fixtures/real_world_memory/knowledge"), + artifact_dir: (env.ELF_KNOWLEDGE_LIVE_REPORT_DIR // "tmp/real-world-memory/live-knowledge"), + adapter: { + adapter_id: "elf_live_real_world", + evidence_class: "live_real_world", + materialization: $materialization[0], + report: { + json: "tmp/real-world-memory/live-knowledge/elf-report.json", + markdown: "tmp/real-world-memory/live-knowledge/elf-report.md", + summary: $report[0].summary, + suites: $report[0].suites + } + }, + comparison_boundary: { + baseline: "fixture-backed knowledge_compilation pages plus graph/RAG representative typed non-pass coverage", + judgment_rule: "improved only when service-native rebuild/lint/search materialization preserves citations, stale-source lint, unsupported-section flags, rebuild metadata, and source-of-truth boundaries", + competitor_boundary: "llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, and graphify remain separate comparison targets unless a contained adapter emits comparable source ids, page sections, citation mappings, lint findings, and typed statuses" + } + }' >"${REPORT_DIR}/summary.json" + +echo "Live knowledge reports:" +echo " ${REPORT_DIR}/elf-materialization.json" +echo " ${REPORT_DIR}/elf-report.json" +echo " ${REPORT_DIR}/elf-report.md" +echo " ${REPORT_DIR}/summary.json"