hack-ink · yvette-carlisle · Jun 19, 2026 · Jun 19, 2026
diff --git a/Makefile.toml b/Makefile.toml
@@ -43,6 +43,7 @@
 # | real-world-memory-knowledge-report         | command   |     |
 # | real-world-memory-live-adapters            | command   |     |
 # | real-world-memory-live-consolidation       | command   |     |
+# | real-world-memory-live-knowledge           | command   |     |
 # | real-world-memory-proactive-brief          | composite |     |
 # | real-world-memory-proactive-brief-json     | command   |     |
 # | real-world-memory-proactive-brief-report   | command   |     |
@@ -650,6 +651,14 @@ args = [
 	"memory-live-consolidation",
 ]
 
+[tasks.real-world-memory-live-knowledge]
+workspace = false
+command = "bash"
+args = [
+	"scripts/real-world-docker.sh",
+	"memory-live-knowledge",
+]
+
 [tasks.real-world-memory-proactive-brief]
 workspace = false
 dependencies = [

diff --git a/README.md b/README.md
@@ -207,6 +207,15 @@ provider-backed ELF evidence was required.
   This improves local Dreaming runtime authority and auditability, but it does not
   prove Pulse, ChatGPT Tasks, Claude Dreams, hosted managed-memory, or private-corpus
   parity.
+- Live knowledge-page rebuild/lint after XY-935: the June 20 follow-up adds
+  `cargo make real-world-memory-live-knowledge`, a Docker-contained ELF service
+  materialization command for `knowledge_compilation`. The slice runs
+  `ElfService::knowledge_page_rebuild`, `knowledge_page_lint`, and
+  `knowledge_pages_search` before scoring citation coverage, stale-source lint,
+  unsupported-section flags, rebuild metadata, backlinks, and source-of-truth
+  boundaries. This upgrades ELF's own knowledge-page evidence from fixture-only to
+  service-native proof, but it does not claim llm-wiki, gbrain, GraphRAG, RAGFlow,
+  LightRAG, or graphify parity without comparable contained adapter outputs.
 - Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs
   `cargo make baseline-production-private-addendum` with a simulated/public-proxy
   production corpus manifest approved for this stage. The run records 12 documents,
@@ -433,12 +442,12 @@ Detailed comparison, mechanism-level analysis, and source map:
 - [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md)
 - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md)
 
-Latest real-world benchmark report: June 19, 2026. Latest external research refresh:
-June 11, 2026; June 19 adds the XY-930 operator-approved public-proxy production
-addendum and service-native Dreaming readback after the qmd debug-ergonomics Dreaming
-retest, the June 17 competitor-strength closeout, and the June 16 temporal
-reconciliation, live consolidation self-check, proactive-brief, and scheduled-memory
-scoring evidence.
+Latest real-world benchmark report: June 20, 2026. Latest external research refresh:
+June 11, 2026; June 20 adds the Live Knowledge-Page Rebuild/Lint Report - June 20, 2026
+after the June 19 XY-930 operator-approved public-proxy production addendum and
+service-native Dreaming readback, the qmd debug-ergonomics Dreaming retest, the
+June 17 competitor-strength closeout, and the June 16 temporal reconciliation,
+live consolidation self-check, proactive-brief, and scheduled-memory scoring evidence.
 
 ## Documentation
 

diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-11-competitor-strength-adoption-report.json
@@ -270,13 +270,14 @@
         "blocked",
         "not_encoded"
       ],
-      "measured_claim": "ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references.",
+      "measured_claim": "ELF fixture knowledge pages pass, and XY-935 adds a Docker-contained ELF service-native rebuild/lint/search command for the checked-in knowledge pack. The XY-929 graph/RAG representative slice still scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references, so broad external knowledge-product comparison remains unproven.",
       "command_artifacts": [
         "docs/evidence/benchmarking/2026-06-11-measurement-coverage-audit.md",
+        "docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md",
         "docs/evidence/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md"
       ],
       "follow_up_issues": [
-        "XY-926",
+        "XY-935",
         "XY-929"
       ],
       "caveat": "GraphRAG, graphify, llm-wiki, and gbrain remain references until contained citation, graph-report, and lint jobs produce passable evidence-linked output."

diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -2339,6 +2339,61 @@ fn live_consolidation_report_preserves_reviewable_output_boundaries() -> Result<
 	Ok(())
 }
 
+#[test]
+fn live_knowledge_page_rebuild_lint_has_dedicated_docker_task() -> Result<()> {
+	let workspace = workspace_root()?;
+	let makefile = fs::read_to_string(workspace.join("Makefile.toml"))?;
+	let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?;
+	let live_script =
+		fs::read_to_string(workspace.join("scripts/real-world-knowledge-live-adapter.sh"))?;
+	let live_adapter =
+		fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_live_adapter.rs"))?;
+	let benchmark_runbook = fs::read_to_string(
+		workspace
+			.join("docs")
+			.join("runbook")
+			.join("benchmarking")
+			.join("real_world_agent_memory_benchmark.md"),
+	)?;
+	let live_runbook = fs::read_to_string(
+		workspace
+			.join("docs")
+			.join("runbook")
+			.join("benchmarking")
+			.join("live_baseline_benchmark.md"),
+	)?;
+	let benchmarking_index = fs::read_to_string(benchmarking_index_path()?)?;
+	let readme = fs::read_to_string(readme_path()?)?;
+
+	assert!(makefile.contains("[tasks.real-world-memory-live-knowledge]"));
+	assert!(makefile.contains("scripts/real-world-docker.sh"));
+	assert!(makefile.contains("memory-live-knowledge"));
+	assert!(docker_script.contains("memory-live-knowledge)"));
+	assert!(docker_script.contains("-e ELF_KNOWLEDGE_LIVE_REPORT_DIR"));
+	assert!(docker_script.contains("-e ELF_KNOWLEDGE_LIVE_FIXTURES"));
+	assert!(docker_script.contains("scripts/real-world-knowledge-live-adapter.sh"));
+	assert!(live_script.contains("elf.real_world_knowledge_live_adapter_sweep/v1"));
+	assert!(live_script.contains("apps/elf-eval/fixtures/real_world_memory/knowledge"));
+	assert!(live_script.contains("tmp/real-world-memory/live-knowledge"));
+	assert!(live_script.contains("real-world-memory-live-knowledge"));
+	assert!(live_script.contains("ElfService knowledge_page_rebuild"));
+	assert!(live_script.contains("knowledge_page_lint"));
+	assert!(live_script.contains("knowledge_pages_search"));
+	assert!(live_script.contains("pages remain derived benchmark artifacts"));
+	assert!(live_adapter.contains("fn materialize_elf_knowledge("));
+	assert!(live_adapter.contains("KnowledgePageRebuildRequest"));
+	assert!(live_adapter.contains("KnowledgePageLintRequest"));
+	assert!(live_adapter.contains("KnowledgePageSearchRequest"));
+	assert!(benchmark_runbook.contains("Current live knowledge-page rebuild/lint increment"));
+	assert!(benchmark_runbook.contains("cargo make real-world-memory-live-knowledge"));
+	assert!(benchmark_runbook.contains("tmp/real-world-memory/live-knowledge/summary.json"));
+	assert!(live_runbook.contains("cargo make real-world-memory-live-knowledge"));
+	assert!(benchmarking_index.contains("2026-06-20-live-knowledge-page-rebuild-lint-report.md"));
+	assert!(readme.contains("Live Knowledge-Page Rebuild/Lint Report - June 20, 2026"));
+
+	Ok(())
+}
+
 fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> {
 	let suites = array_at(adapter, "/suites")?;
 	let capabilities = array_at(adapter, "/capabilities")?;
@@ -3199,7 +3254,7 @@ fn assert_qmd_debug_retest_markdown_and_indexes(
 		benchmarking_index.contains("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md")
 	);
 	assert!(readme.contains("qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026"));
-	assert!(readme.contains("Latest real-world benchmark report: June 19, 2026"));
+	assert!(readme.contains("Latest real-world benchmark report: June 20, 2026"));
 	assert!(readme.contains("keeps the qmd edge unchanged"));
 }
 

diff --git a/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md
@@ -130,7 +130,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | Retrieval quality and local debug UX | `loss` | `live_baseline_only`, `research_gate`, `wrong_result`, `not_encoded` | The XY-923 trace/replay report scores qmd stronger on immediate top-10 candidate artifacts and short CLI replay commands. ELF keeps useful service trace/admin replay surfaces, and expansion, fusion, rerank-on, and candidate-drop diagnostics remain untested. | XY-923 |
 | Memory evolution and temporal history | `loss` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `wrong_result`, `blocked` | ELF fixture memory evolution passes, but live ELF passes only delete/TTL and reports five wrong_result jobs where current-vs-historical state is not reconciled. The mem0 local OSS preference-correction history scenario is now measured and is also an ELF loss. | XY-905 |
 | Consolidation/proposal review | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `research_gate`, `not_encoded` | ELF fixture consolidation passes and XY-934 adds live service-backed proposal materialization, lineage, confidence/usefulness, unsupported-claim flags, and apply/defer/discard audit evidence. Managed dreaming and Always-On Memory Agent patterns remain product references, not direct live competitors. | XY-934 |
-| Knowledge page compilation | `not_tested` | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, but live knowledge compilation is not encoded. The XY-929 graph/RAG representative slice scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references. | XY-926, XY-929 |
+| Knowledge page compilation | `not_tested` for direct competitors; ELF self-check passes | `fixture_backed`, `live_real_world`, `wrong_result`, `research_gate`, `blocked`, `not_encoded` | ELF fixture knowledge pages pass, and XY-935 adds a Docker-contained ELF service-native rebuild/lint/search command for the checked-in knowledge pack. The XY-929 graph/RAG representative slice still scores graphify as wrong_result and keeps GraphRAG, llm-wiki, and gbrain as blocked or not_tested references, so broad external knowledge-product comparison remains unproven. | XY-935, XY-929 |
 | Operator debugging/viewer UX | `win` | `fixture_backed`, `live_real_world`, `blocked`, `not_encoded` | ELF now has a narrow live operator-debug win over qmd on trace hydration, candidate-drop visibility, and selected-but-not-narrated evidence. ELF ties qmd on replay-command availability and repair-action clarity. XY-925 adds claude-mem progressive-disclosure and retrieval-repair prompt coverage, but claude-mem viewer/operator workflows and OpenMemory UI/export remain blocked, so this is not a broad viewer-product superiority claim. | XY-926 |
 | Capture/write policy and redaction | `not_tested` | `fixture_backed`, `live_real_world`, `live_baseline_only`, `blocked`, `not_encoded` | ELF live capture/write-policy self-check jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. qmd remains `not_encoded`; agentmemory and claude-mem hook-capture comparisons remain `blocked` until Docker-contained hook observations and write-policy/viewer readback artifacts exist, so no broad capture-hook superiority claim is allowed. | XY-933, XY-925 |
 | Production ops, restore, backfill, and rebuild | `win` | `live_baseline_only`, `blocked` | ELF has the strongest measured local production-operation story: provider synthetic, stress, resumable backfill, backup/restore, and Qdrant rebuild evidence. | XY-930 |
@@ -148,7 +148,7 @@ results, or lifecycle failures into one aggregate leaderboard.
 | XY-923 | P0 | Backlog | qmd trace-level replay and wrong-result diagnostics. |
 | XY-924/XY-931 | P0 | Encoded local OSS history; UI/export setup blocker measured | mem0/OpenMemory local OSS history and SDK export-style readback are measured; OpenMemory UI/export has a blocked export-helper setup probe and still needs a dedicated compose/import path before any product-UX comparison. |
 | XY-925 | P1 | Fixture slice encoded; runtime paths still blocked | First-generation OSS prompt coverage and typed blockers are recorded for agentmemory, memsearch, and claude-mem; durable agentmemory hooks and claude-mem viewer/operator runs still need runtime adapters. |
-| XY-926 | P1 | Partial live suites encoded | ELF live knowledge-page scoring is encoded; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. |
+| XY-926/XY-935 | P1 | ELF live knowledge self-check encoded | ELF live knowledge-page scoring is encoded through a dedicated XY-935 rebuild/lint/search command; broader knowledge-page external comparisons and broad operator-debugging remain dependent on contained llm-wiki/gbrain/GraphRAG/OpenMemory/claude-mem runners. Consolidation is split to XY-934. |
 | XY-934 | P1 | ELF live self-check encoded | Live consolidation proposal scoring is encoded for ELF with lineage, confidence/usefulness, unsupported-claim flags, and review-action audit; direct competitor runners remain untested or product-reference only. |
 | XY-933 | P1 | Live ELF self-check encoded | Capture/write-policy redaction, exclusion, source-id, evidence-binding, and no-leak scoring for ELF; durable agentmemory/claude-mem capture-hook comparison remains blocked. |
 | XY-927 | P1 | Fixture encoded; Letta export blocked | ELF core-vs-archival fixture coverage is encoded; a contained Letta export/readback adapter remains future work before win/tie/loss claims. |

diff --git a/docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md b/docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md
@@ -0,0 +1,108 @@
+---
+type: Evidence
+title: "Live Knowledge-Page Rebuild/Lint Report - June 20, 2026"
+description: "Checked-in benchmark evidence record: Live Knowledge-Page Rebuild/Lint Report - June 20, 2026."
+resource: docs/evidence/benchmarking/2026-06-20-live-knowledge-page-rebuild-lint-report.md
+status: active
+authority: current_state
+owner: evidence
+last_verified: 2026-06-20
+tags:
+  - docs
+  - evidence
+  - benchmarking
+---
+# Live Knowledge-Page Rebuild/Lint Report - June 20, 2026
+
+Goal: Close XY-935 by moving ELF knowledge-page rebuild/lint scoring from fixture-only
+evidence into a Docker-contained service materialization command.
+Read this when: You need to know whether ELF has service-native evidence for
+derived knowledge pages, citation coverage, stale-source lint, unsupported sections,
+rebuild metadata, backlinks, and page search.
+Inputs: `cargo make real-world-memory-knowledge`,
+`cargo make real-world-memory-live-knowledge`,
+`apps/elf-eval/fixtures/real_world_memory/knowledge/`, and
+`apps/elf-eval/src/bin/real_world_live_adapter.rs`.
+Outputs: A narrow live knowledge-page benchmark command and typed comparison
+boundaries for wiki, graph, and RAG-style knowledge systems.
+
+## Executive Judgment
+
+ELF now has a dedicated service-native knowledge-page rebuild/lint benchmark command.
+The command materializes the checked-in `knowledge_compilation` jobs through
+`ElfService::knowledge_page_rebuild`, `knowledge_page_lint`, and
+`knowledge_pages_search`, then scores the generated real-world job fixtures.
+
+This improves ELF's own knowledge-page authority from fixture-only page artifacts to
+service-backed rebuild/lint/search evidence. It does not prove parity or superiority
+against llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, or graphify. Those comparisons
+remain valid only when a contained adapter emits comparable page sections, source ids,
+citation mappings, lint findings, and typed benchmark statuses.
+
+## Command Evidence
+
+| Command | Expected result | Artifact |
+| --- | --- | --- |
+| `cargo make real-world-memory-knowledge` | Fixture knowledge page gate passes. | `tmp/real-world-memory/knowledge-report.json` |
+| `cargo make real-world-memory-live-knowledge` | Docker-contained ELF service materialization and scored report pass for the encoded knowledge fixture pack. | `tmp/real-world-memory/live-knowledge/summary.json` |
+
+## Live Materialization Contract
+
+`cargo make real-world-memory-live-knowledge` publishes:
+
+| Artifact | Purpose |
+| --- | --- |
+| `tmp/real-world-memory/live-knowledge/elf-materialization.json` | Records live adapter materialization, generated fixtures, per-job evidence ids, and service-path metadata. |
+| `tmp/real-world-memory/live-knowledge/elf-report.json` | Scores generated jobs with normal real-world job benchmark status and knowledge metrics. |
+| `tmp/real-world-memory/live-knowledge/elf-report.md` | Human-readable report for citation coverage, stale lint, rebuild determinism, backlinks, and unsupported sections. |
+| `tmp/real-world-memory/live-knowledge/summary.json` | Aggregates materialization and report summaries under `elf.real_world_knowledge_live_adapter_sweep/v1`. |
+
+The command is intentionally Docker-scoped. Host execution is refused unless
+`ELF_KNOWLEDGE_LIVE_ALLOW_HOST=1` is set for an explicit local diagnostic run.
+
+## Scored Dimensions
+
+| Dimension | Evidence requirement |
+| --- | --- |
+| Citation coverage | Page sections cite source evidence or timeline events, or are explicitly flagged unsupported. |
+| Stale-source lint | Source updates that land after a rebuild produce stale-source lint findings instead of silently rewriting truth. |
+| Unsupported sections | Unsupported summaries remain visible as unsupported, not hidden claims. |
+| Rebuild metadata | First and second rebuild hashes, deterministic status, and allowed variance remain explicit. |
+| Backlinks and search | Page artifacts expose backlinks, and `knowledge_pages_search` returns the materialized page surface. |
+| Source-of-truth boundary | Knowledge pages remain derived benchmark artifacts and do not replace Memory Notes or source records. |
+
+## Comparison Boundary
+
+| Compared target | Current position | Why |
+| --- | --- | --- |
+| llm-wiki | `product_reference` | Query-save/lint and wiki maintenance are useful reference patterns, but no contained llm-wiki adapter emits comparable scored pages here. |
+| gbrain | `product_reference` | Timeline and compiled-truth page shape remain references until a contained runner emits source-linked pages and lint output. |
+| GraphRAG/RAGFlow/LightRAG | `blocked_or_incomplete_reference` | Graph/RAG outputs need document/text-unit/source-id mappings before they can be scored as knowledge pages. |
+| graphify | `wrong_result_reference` | Existing representative graphify evidence remains typed `wrong_result`; stale-source lint and unsupported-summary handling are not passing. |
+| qmd | `not_encoded` | qmd live adapter retrieves evidence-linked answers but does not generate derived knowledge pages. |
+
+## Follow-Up Queue
+
+| Follow-up | Reason |
+| --- | --- |
+| XY-1019 | Productize Knowledge Workspace pages with rebuild diffs, citation lint, unsupported-claim warnings, and previous-version diffs. |
+| XY-1020 | Add graph-lite temporal facts and source-backed reports once knowledge pages are confirmed to remain derived and citation-checked. |
+| Graph/RAG contained adapters | Promote external comparison only when adapters emit comparable source ids, page sections, citation mappings, and lint findings. |
+
+## Claims Allowed
+
+- ELF has a dedicated Docker-contained service-native knowledge-page rebuild/lint
+  command for the checked-in `knowledge_compilation` fixture pack.
+- The command exercises `knowledge_page_rebuild`, `knowledge_page_lint`, and
+  `knowledge_pages_search` before scoring.
+- ELF's own knowledge-page evidence is stronger than fixture-only proof for this
+  narrow slice.
+
+## Claims Not Allowed
+
+- Do not claim ELF beats llm-wiki, gbrain, GraphRAG, RAGFlow, LightRAG, or graphify on
+  broad knowledge products from this command alone.
+- Do not treat generated knowledge pages as authoritative storage.
+- Do not mark Knowledge Workspace productization complete; XY-1019 still owns page
+  version diffs, product-quality rebuild metadata, broader page types, and recall
+  integration.