diff --git a/Makefile.toml b/Makefile.toml index 04068ebb..7513eb0d 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -424,6 +424,9 @@ args = [ # | real-world-memory-proactive-brief | composite | | # | real-world-memory-proactive-brief-json | command | | # | real-world-memory-proactive-brief-report | command | | +# | real-world-memory-scheduled | composite | | +# | real-world-memory-scheduled-json | command | | +# | real-world-memory-scheduled-report | command | | # | real-world-memory-live-consolidation | command | | # | real-world-job-operator-ux | composite | | # | real-world-job-operator-ux-json | command | | @@ -935,6 +938,55 @@ args = [ "tmp/real-world-memory/proactive-brief/report.md", ] +[tasks.real-world-memory-scheduled] +workspace = false +dependencies = [ + "real-world-memory-scheduled-report", +] + +[tasks.real-world-memory-scheduled-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/scheduled_memory", + "--out", + "tmp/real-world-memory/scheduled/report.json", + "--run-id", + "real-world-memory-scheduled", + "--adapter-id", + "fixture_scheduled_memory", + "--adapter-name", + "ELF scheduled memory fixture", +] + +[tasks.real-world-memory-scheduled-report] +workspace = false +dependencies = [ + "real-world-memory-scheduled-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/scheduled/report.json", + "--out", + "tmp/real-world-memory/scheduled/report.md", +] + [tasks.real-world-memory-live-consolidation] workspace = false command = "bash" diff --git a/README.md b/README.md index f52c4bc3..13de0803 100644 --- a/README.md +++ b/README.md @@ -152,12 +152,14 @@ provider-backed ELF evidence was required. 
its pinned Docker local embedding path and is reported as `wrong_result` when same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval coverage remain typed non-pass states. -- Real-world agent memory aggregate after XY-953: 55 fixture-backed - jobs across 15 suites, 49 pass, 0 incomplete, 6 blocked, 0 wrong-result, +- Real-world agent memory aggregate after XY-954: 60 fixture-backed + jobs across 16 suites, 53 pass, 0 incomplete, 7 blocked, 0 wrong-result, 0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are production-ops operator boundaries plus blocked OpenViking staged trajectory, hierarchy selection, recursive/context expansion measurement gates, and the - private-corpus refresh blocker tied to XY-930, not hidden benchmark wins. The + private-corpus refresh and private/provider scheduler blockers tied to XY-930, not hidden benchmark wins. The + `scheduled_memory` suite contributes four passing source-linked scheduled task + readbacks plus one typed private/provider scheduler blocker tied to XY-930. The `core_archival_memory` suite passes 6 fixture jobs for core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery; it does not create an ELF-over-Letta claim. 
The @@ -272,6 +274,7 @@ Detailed evidence and interpretation: - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/guide/single_user_production.md) - Benchmark contract: @@ -354,6 +357,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md) - [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md) - [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md) +- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md) - [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md) @@ -364,8 +368,8 @@ Detailed comparison, mechanism-level analysis, and source map: - [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json) Latest real-world benchmark report: June 16, 2026. 
Latest external research refresh: -June 11, 2026; June 16 adds live temporal reconciliation and live consolidation -self-check evidence. +June 11, 2026; June 16 adds live temporal reconciliation, live consolidation +self-check evidence, and fixture-backed scheduled-memory task scoring. ## Documentation diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index e1802f44..afd789bc 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 55 jobs across 15 suites: 49 pass, 0 incomplete, 6 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. 
The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -96,6 +96,11 @@ "status": "blocked", "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." 
+ }, { "suite_id": "knowledge_compilation", "status": "pass", diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json new file mode 100644 index 00000000..6a9b01f3 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/knowledge_page_refresh_suggestion.json @@ -0,0 +1,304 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "suite": "scheduled_memory", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "kind": "fact", + "text": "Knowledge-page lint finding: the project ELF benchmark suite page references the old scheduled-memory blocked state after the scheduled_memory fixture suite was added.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-page-stale-finding" + }, + "locator": { + "quote": "old scheduled-memory blocked state" + } + }, + "created_at": "2026-06-17T00:22:00Z" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "kind": "constraint", + "text": "Current knowledge-page refresh rule: scheduled tasks may suggest a reviewable rebuild, but they must not silently rewrite authoritative source notes.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-reviewable-refresh" + }, + "locator": { + "quote": "must not silently rewrite authoritative source notes" + } + }, + "created_at": "2026-06-17T00:24:00Z" + }, + { + "evidence_id": 
"scheduled-knowledge-silent-rewrite-trap", + "kind": "note", + "text": "Stale claim: a scheduled knowledge refresh may rewrite authoritative source notes automatically after lint finds a stale page.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "knowledge_page_refresh_suggestion", + "evidence_id": "scheduled-knowledge-silent-rewrite-trap" + } + }, + "created_at": "2026-06-16T18:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled knowledge-page refresh suggestion: suggest a reviewable rebuild because lint found the old scheduled-memory blocked state, and do not silently rewrite source notes.", + "claims": [ + { + "claim_id": "scheduled_knowledge_refresh_suggested", + "text": "A reviewable knowledge-page rebuild should be suggested because the page still references the old scheduled-memory blocked state.", + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-knowledge-refresh-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:45:00Z", + "scheduled_for": "2026-06-17T00:42:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "knowledge_page_refresh_suggestion", + "outputs": [ + { + "output_id": "scheduled-suggest-reviewable-knowledge-rebuild", + "output_kind": "knowledge_page_refresh_suggestion", + "text": "Suggest a reviewable knowledge-page rebuild for the stale scheduled-memory blocked-state reference; do not rewrite source notes silently.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + 
"scheduled-knowledge-reviewable-refresh" + ], + "freshness": { + "status": "current", + "observed_at": "2026-06-17T00:24:00Z", + "valid_from": "2026-06-17T00:22:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-17T00:45:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_REVIEWABLE_KNOWLEDGE_REBUILD", + "reason": "The lint finding is current and the refresh rule requires reviewable derived output instead of source mutation." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "status": "current", + "reason": "Current stale-page lint finding." + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "status": "current", + "reason": "Current refresh boundary." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "scheduled-knowledge-silent-rewrite-trap", + "status": "stale", + "reason": "Silent authoritative source-note rewrites are not allowed." 
+ } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-knowledge-refresh-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:42:00Z", + "completed_at": "2026-06-17T00:45:00Z", + "output_ref": "scheduled-suggest-reviewable-knowledge-rebuild", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read current lint finding and refresh boundary.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + }, + { + "stage_name": "mutation_guard", + "summary": "Rejected silent authoritative source-note rewrite.", + "evidence_refs": ["scheduled-knowledge-silent-rewrite-trap"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded reviewable knowledge-page refresh suggestion.", + "evidence_refs": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-knowledge-lint-recorded", + "ts": "2026-06-17T00:22:00Z", + "actor": "knowledge_lint_fixture", + "action": "recorded_stale_page_finding", + "evidence_ids": ["scheduled-knowledge-page-stale-finding"], + "summary": "The stale scheduled-memory blocked-state page reference was recorded." + }, + { + "event_id": "scheduled-knowledge-refresh-output-recorded", + "ts": "2026-06-17T00:45:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "summary": "The scheduled task recorded a reviewable knowledge-page refresh suggestion." 
+ } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled knowledge-page refresh suggestion task.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "record_execution_trace", + "do_not_mutate_source_notes_silently" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_knowledge_refresh_suggested", + "text": "A reviewable knowledge-page rebuild should be suggested because the page still references the old scheduled-memory blocked state." + } + ], + "must_not_include": [ + "scheduled knowledge refresh may rewrite authoritative source notes automatically" + ], + "evidence_links": { + "scheduled_knowledge_refresh_suggested": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite", + "quote": "old scheduled-memory blocked state" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite", + "quote": "must not silently rewrite authoritative source notes" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-knowledge-silent-rewrite-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-knowledge-silent-rewrite-trap"], + "failure_if_used": true + } + ], + "scheduled_memory": { + "required_task_kinds": ["knowledge_page_refresh_suggestion"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Output suggests the reviewable knowledge-page rebuild." 
+ }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Output cites lint finding and refresh boundary evidence." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Silent source-note rewrite trap is not selected." + }, + "source_immutability": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Scheduled refresh suggestion leaves source mutation count at zero." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback", + "source mutation count must remain zero" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No reviewable rebuild boundary is available."], + "fallback_action": "defer_knowledge_refresh" + }, + "tags": ["synthetic", "scheduled_memory", "knowledge_page_refresh_suggestion", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json new file mode 100644 index 00000000..54461f9d --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/private_provider_scheduler_blocked.json @@ -0,0 +1,129 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "suite": "scheduled_memory", + "title": "Block private/provider scheduled tasks without operator inputs", + "corpus": { + "corpus_id": "real-world-memory-scheduled-private-provider-2026-06-17", + "profile": "private_sanitized", + "items": [ + { + "evidence_id": "scheduled-private-provider-missing-inputs", + "kind": "blocker", + "text": "Private/provider scheduled task blocker: no operator-owned private production corpus 
manifest, provider credentials, or hosted scheduler configuration is available to this fixture.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_provider_scheduler_blocked", + "evidence_id": "scheduled-private-provider-missing-inputs" + }, + "locator": { + "quote": "no operator-owned private production corpus manifest" + } + }, + "created_at": "2026-06-17T00:50:00Z" + }, + { + "evidence_id": "scheduled-private-provider-xy930-boundary", + "kind": "issue", + "text": "XY-930 owns private-corpus and credentialed production gates when operator inputs exist; fixture-only scheduled-memory output must not claim private/provider scheduler quality.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "private_provider_scheduler_blocked", + "evidence_id": "scheduled-private-provider-xy930-boundary" + }, + "locator": { + "quote": "XY-930 owns private-corpus and credentialed production gates" + } + }, + "created_at": "2026-06-17T00:51:00Z" + } + ] + }, + "timeline": [ + { + "event_id": "scheduled-private-provider-blocked-recorded", + "ts": "2026-06-17T00:51:00Z", + "actor": "scheduler_fixture", + "action": "recorded_private_provider_blocker", + "evidence_ids": [ + "scheduled-private-provider-missing-inputs", + "scheduled-private-provider-xy930-boundary" + ], + "summary": "Private/provider scheduled task scoring remains blocked until operator-owned inputs exist." 
+ } + ], + "prompt": { + "role": "system", + "content": "Run private/provider scheduled memory tasks when operator inputs exist.", + "job_mode": "scheduled_memory", + "constraints": [ + "do_not_require_private_corpus", + "block_without_operator_inputs", + "tie_blocker_to_xy930" + ] + }, + "expected_answer": { + "must_include": [], + "must_not_include": [ + "private/provider scheduled task quality is proven", + "hosted scheduler production behavior passed", + "private production corpus scheduled refresh passed" + ], + "evidence_links": {}, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": true + }, + "required_evidence": [], + "negative_traps": [], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The fixture records private/provider scheduled tasks as blocked." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "The blocker points to missing operator inputs and XY-930." + }, + "uncertainty_handling": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "No private/provider scheduler pass is claimed." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The blocked state remains typed and visible." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No operator-owned private production corpus manifest is available.", + "No hosted scheduler configuration is available." 
+ ], + "fallback_action": "block_private_provider_scheduler_until_xy930_inputs_exist" + }, + "encoding": { + "status": "blocked", + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "follow_up": { + "title": "XY-930 private/provider scheduled-memory input gate", + "reason": "Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist." + } + }, + "tags": ["private_sanitized", "scheduled_memory", "private_provider_scheduler", "xy930_blocked"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json new file mode 100644 index 00000000..2efd9140 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_decision_audit.json @@ -0,0 +1,283 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-stale-decision-audit-001", + "suite": "scheduled_memory", + "title": "Audit a stale project decision during a scheduled task", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "kind": "decision", + "text": "Historical decision: scheduled-memory readiness stays blocked and should only run cargo make real-world-memory-consolidation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "scheduled-old-consolidation-only-decision" + }, + "locator": { + "quote": "only run cargo make real-world-memory-consolidation" + } + }, + "created_at": "2026-06-16T05:00:00Z" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "kind": "decision", + "text": "Current decision: scheduled-memory readiness 
must use the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_decision_audit", + "evidence_id": "scheduled-current-direct-suite-decision" + }, + "locator": { + "quote": "direct real-world-memory-scheduled fixture suite" + } + }, + "created_at": "2026-06-17T00:20:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled stale decision audit: the consolidation-only readiness decision is superseded by the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "claims": [ + { + "claim_id": "scheduled_decision_superseded", + "text": "The consolidation-only scheduled readiness decision is superseded by the direct scheduled-memory fixture suite.", + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-stale-decision-audit-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:40:00Z", + "scheduled_for": "2026-06-17T00:37:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "stale_decision_audit", + "outputs": [ + { + "output_id": "scheduled-defer-consolidation-only-decision", + "output_kind": "stale_decision_audit", + "text": "Defer the consolidation-only scheduled readiness decision; the current gate is the direct scheduled-memory fixture suite plus aggregate regression guard.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + 
"scheduled-current-direct-suite-decision" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:20:00Z", + "valid_from": "2026-06-16T05:00:00Z", + "valid_to": "2026-06-17T00:20:00Z", + "last_confirmed_at": "2026-06-17T00:40:00Z", + "superseded_by": ["scheduled-current-direct-suite-decision"], + "tombstone_refs": [] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_SUPERSEDED_DECISION", + "reason": "The old consolidation-only decision is retained as history and is not the current scheduled-memory readiness gate." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-current-direct-suite-decision", + "status": "current", + "reason": "Current direct scheduled-memory readiness gate." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "status": "superseded", + "reason": "Replaced by the direct scheduled-memory fixture suite.", + "superseded_by": "scheduled-current-direct-suite-decision" + } + ], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-stale-decision-audit-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:37:00Z", + "completed_at": "2026-06-17T00:40:00Z", + "output_ref": "scheduled-defer-consolidation-only-decision", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read historical and current scheduled-readiness decisions.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + { + "stage_name": "supersession_check", + "summary": "Classified the consolidation-only decision as superseded.", + "evidence_refs": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + { + "stage_name": 
"output_readback", + "summary": "Recorded scheduled stale-decision output for review.", + "evidence_refs": ["scheduled-current-direct-suite-decision"] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-direct-suite-decision-recorded", + "ts": "2026-06-17T00:20:00Z", + "actor": "agent", + "action": "recorded_current_decision", + "evidence_ids": ["scheduled-current-direct-suite-decision"], + "summary": "The direct scheduled-memory fixture suite became the current readiness gate." + }, + { + "event_id": "scheduled-decision-audit-output-recorded", + "ts": "2026-06-17T00:40:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ], + "summary": "The stale decision audit was recorded with supersession evidence." + } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled stale decision audit.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_superseded_decisions", + "record_execution_trace", + "do_not_use_old_decision_as_current" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_decision_superseded", + "text": "The consolidation-only scheduled readiness decision is superseded by the direct scheduled-memory fixture suite." 
+ } + ], + "must_not_include": ["scheduled-memory readiness stays blocked and should only run cargo make real-world-memory-consolidation"], + "evidence_links": { + "scheduled_decision_superseded": [ + "scheduled-old-consolidation-only-decision", + "scheduled-current-direct-suite-decision" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite", + "quote": "only run cargo make real-world-memory-consolidation" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite", + "quote": "direct real-world-memory-scheduled fixture suite" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-consolidation-only-current-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-old-consolidation-only-decision"], + "failure_if_used": false + } + ], + "scheduled_memory": { + "required_task_kinds": ["stale_decision_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit identifies the superseded decision and current replacement." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit cites both old and new decision evidence." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The old decision is not presented as current." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Supersession markers are present." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current scheduled-memory decision is available."], + "fallback_action": "defer_superseded_decision" + }, + "tags": ["synthetic", "scheduled_memory", "stale_decision_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json new file mode 100644 index 00000000..99005250 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/stale_preference_plan_audit.json @@ -0,0 +1,412 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-stale-preference-plan-audit-001", + "suite": "scheduled_memory", + "title": "Audit stale preferences and plans during a scheduled task", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-stale-old-plan", + "kind": "plan", + "text": "Old scheduled plan: publish the scheduled-memory report by reusing proactive-brief fixtures and skipping execution trace readback.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-stale-old-plan" + }, + "locator": { + "quote": "skipping execution trace readback" + } + }, + "created_at": "2026-06-16T09:00:00Z" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "kind": "tombstone", + "text": "TTL invalidation: the old scheduled-memory report plan expired at 2026-06-17T00:00:00Z and must not be recommended as current work.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { 
+ "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-stale-plan-expired" + }, + "locator": { + "quote": "expired at 2026-06-17T00:00:00Z" + } + }, + "created_at": "2026-06-17T00:00:00Z" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "kind": "plan", + "text": "Current scheduled plan: scheduled-memory tasks must record execution trace/readback and source-linked output before the lane is validation-ready.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-current-trace-plan" + }, + "locator": { + "quote": "record execution trace/readback" + } + }, + "created_at": "2026-06-17T00:15:00Z" + }, + { + "evidence_id": "scheduled-old-silent-mutation-preference", + "kind": "preference", + "text": "Historical preference: scheduled audits may silently rewrite stale plans after detecting them.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-old-silent-mutation-preference" + } + }, + "created_at": "2026-06-16T09:10:00Z" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "kind": "preference", + "text": "Current preference: scheduled audits should produce reviewable derived output and must not mutate source notes silently.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "stale_preference_plan_audit", + "evidence_id": "scheduled-current-reviewable-preference" + }, + "locator": { + "quote": "must not mutate source notes silently" + } + }, + "created_at": "2026-06-17T00:18:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Scheduled stale preference/plan audit: the old report plan is expired, the silent-mutation preference is historical, and the current path 
requires trace/readback plus reviewable derived output.", + "claims": [ + { + "claim_id": "scheduled_stale_plan_expired", + "text": "The old scheduled-memory report plan is expired and superseded by the trace/readback requirement.", + "evidence_ids": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "confidence": "high" + }, + { + "claim_id": "scheduled_silent_mutation_rejected", + "text": "Scheduled audits must not mutate source notes silently; they should produce reviewable derived output.", + "evidence_ids": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "confidence": "high" + } + ], + "evidence_ids": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-stale-plan-audit-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:35:00Z", + "scheduled_for": "2026-06-17T00:32:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "stale_preference_plan_audit", + "outputs": [ + { + "output_id": "scheduled-defer-expired-report-plan", + "output_kind": "stale_preference_plan_audit", + "text": "Defer the old scheduled-memory report plan because it expired; use the current trace/readback requirement instead.", + "evidence_refs": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:15:00Z", + "valid_from": "2026-06-16T09:00:00Z", + "valid_to": "2026-06-17T00:00:00Z", + "last_confirmed_at": "2026-06-17T00:35:00Z", + "superseded_by": ["scheduled-current-trace-plan"], + "tombstone_refs": 
["scheduled-stale-plan-expired"] + }, + "action": { + "decision": "defer", + "reason_code": "DEFER_EXPIRED_PLAN", + "reason": "The old plan is retained as history and must not be recommended as current work." + }, + "unsupported_claim_flags": [] + }, + { + "output_id": "scheduled-reject-silent-source-mutation", + "output_kind": "stale_preference_plan_audit", + "text": "Reject silent source-note mutation during scheduled audits and keep the audit output reviewable.", + "evidence_refs": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ], + "freshness": { + "status": "superseded", + "observed_at": "2026-06-17T00:18:00Z", + "valid_from": "2026-06-16T09:10:00Z", + "valid_to": "2026-06-17T00:18:00Z", + "last_confirmed_at": "2026-06-17T00:35:00Z", + "superseded_by": ["scheduled-current-reviewable-preference"], + "tombstone_refs": [] + }, + "action": { + "decision": "reject", + "reason_code": "REJECT_SILENT_SOURCE_MUTATION", + "reason": "The current preference requires reviewable derived output rather than silent source rewrites." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-current-trace-plan", + "status": "current", + "reason": "Current trace/readback requirement." + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "status": "current", + "reason": "Current reviewable-output boundary." 
+ } + ], + "dropped_source_refs": [], + "stale_source_refs": [], + "superseded_source_refs": [ + { + "evidence_id": "scheduled-stale-old-plan", + "status": "superseded", + "reason": "Replaced by current trace/readback requirement.", + "superseded_by": "scheduled-current-trace-plan" + }, + { + "evidence_id": "scheduled-old-silent-mutation-preference", + "status": "superseded", + "reason": "Replaced by current reviewable-output preference.", + "superseded_by": "scheduled-current-reviewable-preference" + } + ], + "tombstone_source_refs": [ + { + "evidence_id": "scheduled-stale-plan-expired", + "status": "tombstoned", + "reason": "TTL invalidation for the old report plan." + } + ], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-stale-plan-audit-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:32:00Z", + "completed_at": "2026-06-17T00:35:00Z", + "output_ref": "scheduled-defer-expired-report-plan", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read old and current plan/preference sources.", + "evidence_refs": [ + "scheduled-stale-old-plan", + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ] + }, + { + "stage_name": "ttl_filter", + "summary": "Detected TTL invalidation before action selection.", + "evidence_refs": ["scheduled-stale-plan-expired"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded reviewable audit output without source mutation.", + "evidence_refs": [ + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-stale-plan-expired", + "ts": "2026-06-17T00:00:00Z", + "actor": "gc_fixture", + "action": 
"recorded_ttl_invalidation", + "evidence_ids": ["scheduled-stale-plan-expired"], + "summary": "The old scheduled-memory plan expired before the scheduled audit ran." + }, + { + "event_id": "scheduled-stale-audit-output-recorded", + "ts": "2026-06-17T00:35:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": [ + "scheduled-current-trace-plan", + "scheduled-current-reviewable-preference" + ], + "summary": "The stale preference/plan audit was recorded as reviewable output." + } + ], + "prompt": { + "role": "system", + "content": "Run the scheduled stale preference and plan audit.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "do_not_recommend_expired_plans", + "do_not_mutate_source_notes_silently" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_stale_plan_expired", + "text": "The old scheduled-memory report plan is expired and superseded by the trace/readback requirement." + }, + { + "claim_id": "scheduled_silent_mutation_rejected", + "text": "Scheduled audits must not mutate source notes silently; they should produce reviewable derived output." 
+ } + ], + "must_not_include": [ + "publish the scheduled-memory report by reusing proactive-brief fixtures", + "scheduled audits may silently rewrite stale plans" + ], + "evidence_links": { + "scheduled_stale_plan_expired": [ + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired", + "scheduled-current-trace-plan" + ], + "scheduled_silent_mutation_rejected": [ + "scheduled-old-silent-mutation-preference", + "scheduled-current-reviewable-preference" + ] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "skipping execution trace readback" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "expired at 2026-06-17T00:00:00Z" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite", + "quote": "record execution trace/readback" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": "scheduled_silent_mutation_rejected", + "requirement": "cite", + "quote": "must not mutate source notes silently" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-stale-plan-current-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-stale-old-plan"], + "failure_if_used": false + } + ], + "scheduled_memory": { + "required_task_kinds": ["stale_preference_plan_audit"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Audit identifies the expired plan and rejected silent-mutation preference." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Audit cites old, current, and invalidation evidence." 
+ }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Execution trace includes output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The expired plan is not treated as current." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Supersession and tombstone markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "scheduled task output lacks execution trace readback", + "source mutation count must remain zero" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No current replacement plan is available."], + "fallback_action": "defer_expired_plan" + }, + "tags": ["synthetic", "scheduled_memory", "stale_preference_plan_audit", "fixture_backed"] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json new file mode 100644 index 00000000..ad8fa2ac --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/scheduled_memory/weekly_project_status_summary.json @@ -0,0 +1,299 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "scheduled-weekly-project-status-summary-001", + "suite": "scheduled_memory", + "title": "Run a weekly project status summary from current memory", + "corpus": { + "corpus_id": "real-world-memory-scheduled-2026-06-17", + "profile": "synthetic", + "items": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "kind": "decision", + "text": "Current scheduled-memory gate: run cargo make real-world-memory-scheduled and targeted real_world_job_benchmark tests before any validation-ready claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": 
"scheduled-weekly-current-gate" + }, + "locator": { + "quote": "real-world-memory-scheduled" + } + }, + "created_at": "2026-06-17T00:10:00Z" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "kind": "plan", + "text": "Current ledger action: update the XY-951 scheduled-memory-task readiness stage with the scheduled benchmark delta and regression analysis.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": "scheduled-weekly-ledger-update" + }, + "locator": { + "quote": "XY-951 scheduled-memory-task readiness stage" + } + }, + "created_at": "2026-06-17T00:12:00Z" + }, + { + "evidence_id": "scheduled-weekly-hosted-parity-trap", + "kind": "note", + "text": "Stale claim: fixture-backed scheduled memory proves parity with ChatGPT Tasks, Pulse, and managed background products.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "weekly_project_status_summary", + "evidence_id": "scheduled-weekly-hosted-parity-trap" + } + }, + "created_at": "2026-06-16T20:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_scheduled_memory", + "answer": { + "content": "Weekly scheduled summary: run cargo make real-world-memory-scheduled, update the XY-951 scheduled-memory-task readiness ledger, and do not claim hosted scheduled-product parity from fixture evidence.", + "claims": [ + { + "claim_id": "scheduled_weekly_gate", + "text": "The scheduled-memory validation gate is cargo make real-world-memory-scheduled plus targeted real_world_job_benchmark tests.", + "evidence_ids": ["scheduled-weekly-current-gate"], + "confidence": "high" + }, + { + "claim_id": "scheduled_weekly_ledger", + "text": "The XY-951 scheduled-memory-task readiness stage needs the scheduled benchmark delta and regression analysis.", + "evidence_ids": ["scheduled-weekly-ledger-update"], + "confidence": "high" + } + ], + 
"evidence_ids": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"], + "scheduled_tasks": [ + { + "task_run_id": "scheduled-weekly-status-2026-06-17", + "contract_schema": "elf.scheduled_memory_task/v1", + "generated_at": "2026-06-17T00:30:00Z", + "scheduled_for": "2026-06-17T00:25:00Z", + "tenant_id": "fixture-tenant", + "project_id": "elf", + "agent_id": "xy-954-fixture-agent", + "read_profile": "private_plus_project", + "task_kind": "weekly_project_status_summary", + "outputs": [ + { + "output_id": "weekly-summary-validation-ready-next-step", + "output_kind": "weekly_project_status_summary", + "text": "Run the scheduled-memory fixture command, update the XY-951 scheduled-memory-task readiness stage, and keep hosted scheduler parity out of the claim.", + "evidence_refs": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "freshness": { + "status": "current", + "observed_at": "2026-06-17T00:12:00Z", + "valid_from": "2026-06-17T00:10:00Z", + "valid_to": null, + "last_confirmed_at": "2026-06-17T00:30:00Z", + "superseded_by": [], + "tombstone_refs": [] + }, + "action": { + "decision": "recommend", + "reason_code": "RECOMMEND_CURRENT_SCHEDULED_GATE", + "reason": "Both selected source refs are current project-memory items and the hosted parity trap was dropped." + }, + "unsupported_claim_flags": [] + } + ], + "source_trace": { + "selected_source_refs": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "status": "current", + "reason": "Current scheduled-memory validation command." + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "status": "current", + "reason": "Current ledger update requirement." + } + ], + "dropped_source_refs": [], + "stale_source_refs": [ + { + "evidence_id": "scheduled-weekly-hosted-parity-trap", + "status": "stale", + "reason": "Fixture evidence cannot prove hosted scheduled-product parity." 
+ } + ], + "superseded_source_refs": [], + "tombstone_source_refs": [], + "unsupported_claim_flags": [] + }, + "execution_trace": { + "trace_id": "trace-scheduled-weekly-status-2026-06-17", + "trigger_kind": "fixture_schedule", + "status": "completed", + "started_at": "2026-06-17T00:25:00Z", + "completed_at": "2026-06-17T00:30:00Z", + "output_ref": "weekly-summary-validation-ready-next-step", + "stages": [ + { + "stage_name": "memory_read", + "summary": "Read current validation and ledger sources.", + "evidence_refs": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"] + }, + { + "stage_name": "stale_filter", + "summary": "Dropped hosted parity trap before output.", + "evidence_refs": ["scheduled-weekly-hosted-parity-trap"] + }, + { + "stage_name": "output_readback", + "summary": "Recorded source-linked scheduled output for review.", + "evidence_refs": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"] + } + ] + }, + "source_mutations": [], + "unsupported_claim_flags": [] + } + ], + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "scheduled-weekly-run-created", + "ts": "2026-06-17T00:25:00Z", + "actor": "scheduler_fixture", + "action": "started_scheduled_task", + "evidence_ids": ["scheduled-weekly-current-gate"], + "summary": "The weekly scheduled task started from current project memory." + }, + { + "event_id": "scheduled-weekly-output-recorded", + "ts": "2026-06-17T00:30:00Z", + "actor": "scheduler_fixture", + "action": "recorded_source_linked_output", + "evidence_ids": ["scheduled-weekly-current-gate", "scheduled-weekly-ledger-update"], + "summary": "The scheduled output was recorded with readback trace and source refs." 
+ } + ], + "prompt": { + "role": "system", + "content": "Run the weekly project status summary scheduled task.", + "job_mode": "scheduled_memory", + "constraints": [ + "cite_evidence", + "mark_currentness", + "record_execution_trace", + "do_not_claim_hosted_scheduler_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "scheduled_weekly_gate", + "text": "The scheduled-memory validation gate is cargo make real-world-memory-scheduled plus targeted real_world_job_benchmark tests." + }, + { + "claim_id": "scheduled_weekly_ledger", + "text": "The XY-951 scheduled-memory-task readiness stage needs the scheduled benchmark delta and regression analysis." + } + ], + "must_not_include": [ + "fixture-backed scheduled memory proves parity with ChatGPT Tasks", + "fixture-backed scheduled memory proves parity with Pulse", + "fixture-backed scheduled memory proves parity with managed background products" + ], + "evidence_links": { + "scheduled_weekly_gate": ["scheduled-weekly-current-gate"], + "scheduled_weekly_ledger": ["scheduled-weekly-ledger-update"] + }, + "answer_type": "scheduled_memory_task", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite", + "quote": "real-world-memory-scheduled" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite", + "quote": "XY-951 scheduled-memory-task readiness stage" + } + ], + "negative_traps": [ + { + "trap_id": "scheduled-weekly-hosted-parity-trap", + "type": "stale_fact", + "evidence_ids": ["scheduled-weekly-hosted-parity-trap"], + "failure_if_used": true + } + ], + "scheduled_memory": { + "required_task_kinds": ["weekly_project_status_summary"] + }, + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": 
"Scheduled output names the current scheduled-memory command and ledger update." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Every scheduled output carries source evidence refs." + }, + "trace_readback": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "The task run records execution trace and output readback." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "The hosted parity trap is not selected as current evidence." + }, + "lifecycle_behavior": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Freshness and currentness markers are present." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "scheduled task output lacks execution trace readback" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": ["No hosted scheduler parity claim is supported by this fixture."], + "fallback_action": "defer_hosted_scheduler_claim" + }, + "tags": ["synthetic", "scheduled_memory", "weekly_project_status_summary", "fixture_backed"] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index d93398c7..eae9659f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -51,6 +51,7 @@ const SUITES: &[&str] = &[ "consolidation", "memory_summary", "proactive_brief", + "scheduled_memory", "knowledge_compilation", "operator_debugging_ux", "capture_integration", @@ -152,6 +153,7 @@ struct RealWorldJob { memory_evolution: Option, memory_summary: Option, proactive_brief: Option, + scheduled_memory: Option, } #[derive(Debug, Deserialize)] @@ -371,6 +373,12 @@ struct ProactiveBriefExpectation { required_suggestion_kinds: Vec, } +#[derive(Debug, Deserialize)] +struct ScheduledMemoryExpectation { + #[serde(default)] 
+ required_task_kinds: Vec, +} + #[derive(Debug, Deserialize)] struct ScoringRubric { #[serde(default)] @@ -415,6 +423,8 @@ struct ProducedAnswer { memory_summaries: Vec, #[serde(default)] proactive_briefs: Vec, + #[serde(default)] + scheduled_tasks: Vec, #[serde(skip_serializing_if = "Option::is_none")] latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -600,6 +610,61 @@ struct ProactiveSuggestionAction { reason: String, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryTaskArtifact { + task_run_id: String, + contract_schema: String, + generated_at: String, + scheduled_for: String, + tenant_id: String, + project_id: String, + agent_id: String, + read_profile: String, + task_kind: String, + #[serde(default)] + outputs: Vec, + source_trace: MemorySummarySourceTrace, + #[serde(skip_serializing_if = "Option::is_none")] + execution_trace: Option, + #[serde(default)] + source_mutations: Vec, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryOutput { + output_id: String, + output_kind: String, + text: String, + #[serde(default)] + evidence_refs: Vec, + freshness: MemorySummaryFreshness, + action: ProactiveSuggestionAction, + #[serde(default)] + unsupported_claim_flags: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryExecutionTrace { + trace_id: String, + trigger_kind: String, + status: String, + started_at: String, + completed_at: String, + output_ref: String, + #[serde(default)] + stages: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ScheduledMemoryTraceStage { + stage_name: String, + summary: String, + #[serde(default)] + evidence_refs: Vec, +} + #[derive(Clone, Debug, Deserialize)] struct ConsolidationFixture { #[serde(default)] @@ -1083,6 +1148,8 @@ struct ReportSummary { #[serde(skip_serializing_if = "Option::is_none")] proactive_brief: Option, #[serde(skip_serializing_if = 
"Option::is_none")] + scheduled_memory: Option, + #[serde(skip_serializing_if = "Option::is_none")] knowledge: Option, } @@ -1164,6 +1231,38 @@ struct ProactiveBriefSummaryReport { source_trace_tombstone_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScheduledMemorySummaryReport { + job_count: usize, + task_run_count: usize, + output_count: usize, + required_task_kind_count: usize, + covered_required_task_kind_count: usize, + missing_required_task_kind_count: usize, + evidence_ref_required_count: usize, + evidence_ref_output_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + trace_required_count: usize, + trace_complete_count: usize, + trace_coverage: f64, + source_mutation_count: usize, + current_output_count: usize, + non_current_output_count: usize, + invalid_current_output_count: usize, + untraced_output_count: usize, + unsupported_current_output_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct KnowledgeSummary { job_count: usize, @@ -1242,6 +1341,8 @@ struct JobReport { memory_summary: Option, #[serde(skip_serializing_if = "Option::is_none")] proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] + scheduled_memory: Option, trap_ids_used: Vec, dimension_scores: Vec, reason: String, @@ -1435,6 +1536,37 @@ struct ProactiveBriefJobMetrics { source_trace_tombstone_count: usize, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScheduledMemoryJobMetrics { + task_run_count: usize, + output_count: usize, + required_task_kind_count: usize, + covered_required_task_kind_count: usize, + missing_required_task_kind_count: 
usize, + evidence_ref_required_count: usize, + evidence_ref_output_count: usize, + evidence_ref_coverage: f64, + freshness_marker_count: usize, + freshness_coverage: f64, + action_rationale_count: usize, + action_rationale_coverage: f64, + trace_required_count: usize, + trace_complete_count: usize, + trace_coverage: f64, + source_mutation_count: usize, + current_output_count: usize, + non_current_output_count: usize, + invalid_current_output_count: usize, + untraced_output_count: usize, + unsupported_current_output_count: usize, + tombstone_violation_count: usize, + source_trace_selected_count: usize, + source_trace_dropped_count: usize, + source_trace_stale_count: usize, + source_trace_superseded_count: usize, + source_trace_tombstone_count: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] struct EvolutionSummary { stale_answer_count: usize, @@ -1502,6 +1634,7 @@ struct JobScoring { consolidation: Option, memory_summary: Option, proactive_brief: Option, + scheduled_memory: Option, } #[derive(Debug, Default)] @@ -1537,6 +1670,14 @@ struct FailureCounts { proactive_brief_missing_kinds: usize, proactive_brief_unsupported_current_suggestions: usize, proactive_brief_tombstone_violations: usize, + scheduled_memory_invalid_current_outputs: usize, + scheduled_memory_untraced_outputs: usize, + scheduled_memory_missing_freshness: usize, + scheduled_memory_missing_action_rationale: usize, + scheduled_memory_missing_task_kinds: usize, + scheduled_memory_unsupported_current_outputs: usize, + scheduled_memory_tombstone_violations: usize, + scheduled_memory_missing_trace: usize, untraced_page_sections: usize, missed_stale_findings: usize, rebuild_failures: usize, @@ -1666,6 +1807,7 @@ fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { validate_memory_evolution(job, path)?; validate_memory_summary_expectation(job, path)?; validate_proactive_brief_expectation(job, path)?; + validate_scheduled_memory_expectation(job, path)?; 
validate_trace_explainability(job, path)?; Ok(()) @@ -1948,6 +2090,9 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { for brief in &adapter_response.answer.proactive_briefs { validate_proactive_brief_artifact(brief, path, &evidence_ids)?; } + for task in &adapter_response.answer.scheduled_tasks { + validate_scheduled_memory_artifact(task, path, &evidence_ids)?; + } if job.suite == "memory_summary" && adapter_response.answer.memory_summaries.is_empty() @@ -1967,6 +2112,15 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { path.display() )); } + if job.suite == "scheduled_memory" + && adapter_response.answer.scheduled_tasks.is_empty() + && job.encoding.status.is_none() + { + return Err(eyre::eyre!( + "{} scheduled_memory jobs must provide adapter_response.answer.scheduled_tasks.", + path.display() + )); + } Ok(()) } @@ -2281,6 +2435,179 @@ fn validate_proactive_suggestion( Ok(()) } +fn validate_scheduled_memory_artifact( + task: &ScheduledMemoryTaskArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if task.task_run_id.trim().is_empty() + || task.contract_schema != "elf.scheduled_memory_task/v1" + || task.generated_at.trim().is_empty() + || task.scheduled_for.trim().is_empty() + || task.tenant_id.trim().is_empty() + || task.project_id.trim().is_empty() + || task.agent_id.trim().is_empty() + || task.read_profile.trim().is_empty() + || task.task_kind.trim().is_empty() + || task.outputs.is_empty() + { + return Err(eyre::eyre!("{} has an incomplete scheduled memory task.", path.display())); + } + if !is_scheduled_task_kind(task.task_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled task kind {}.", + path.display(), + task.task_kind + )); + } + + validate_optional_rfc3339(&task.generated_at, path, task.task_run_id.as_str())?; + validate_optional_rfc3339(&task.scheduled_for, path, task.task_run_id.as_str())?; + + for output in &task.outputs { + 
validate_scheduled_memory_output(output, path, evidence_ids)?; + } + for mutation in &task.source_mutations { + if !mutation.is_object() { + return Err(eyre::eyre!( + "{} scheduled memory source mutations must be JSON objects.", + path.display() + )); + } + } + for flag in &task.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} scheduled memory unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_memory_summary_source_trace(&task.source_trace, path, evidence_ids)?; + + if let Some(trace) = &task.execution_trace { + validate_scheduled_memory_trace(trace, path, evidence_ids)?; + } + + Ok(()) +} + +fn validate_scheduled_memory_output( + output: &ScheduledMemoryOutput, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if output.output_id.trim().is_empty() + || output.output_kind.trim().is_empty() + || output.text.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete scheduled memory output.", path.display())); + } + if !is_scheduled_task_kind(output.output_kind.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output kind {}.", + path.display(), + output.output_kind + )); + } + if !is_memory_summary_freshness_status(output.freshness.status.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output freshness status {}.", + path.display(), + output.freshness.status + )); + } + if !is_proactive_action_decision(output.action.decision.as_str()) { + return Err(eyre::eyre!( + "{} has unknown scheduled output action decision {}.", + path.display(), + output.action.decision + )); + } + if output.action.reason_code.trim().is_empty() || output.action.reason.trim().is_empty() { + return Err(eyre::eyre!( + "{} has incomplete scheduled output action rationale.", + path.display() + )); + } + + for evidence_id in &output.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for evidence_id in &output.freshness.tombstone_refs { + 
ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + for flag in &output.unsupported_claim_flags { + if !flag.is_object() { + return Err(eyre::eyre!( + "{} scheduled output unsupported-claim flags must be JSON objects.", + path.display() + )); + } + } + + validate_optional_summary_time( + path, + output.freshness.observed_at.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.valid_from.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.valid_to.as_deref(), + output.output_id.as_str(), + )?; + validate_optional_summary_time( + path, + output.freshness.last_confirmed_at.as_deref(), + output.output_id.as_str(), + )?; + + Ok(()) +} + +fn validate_scheduled_memory_trace( + trace: &ScheduledMemoryExecutionTrace, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if trace.trace_id.trim().is_empty() + || trace.trigger_kind.trim().is_empty() + || trace.status.trim().is_empty() + || trace.started_at.trim().is_empty() + || trace.completed_at.trim().is_empty() + || trace.output_ref.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete scheduled memory execution trace.", + path.display() + )); + } + + validate_optional_rfc3339(&trace.started_at, path, trace.trace_id.as_str())?; + validate_optional_rfc3339(&trace.completed_at, path, trace.trace_id.as_str())?; + + for stage in &trace.stages { + if stage.stage_name.trim().is_empty() || stage.summary.trim().is_empty() { + return Err(eyre::eyre!( + "{} has an incomplete scheduled memory trace stage.", + path.display() + )); + } + + for evidence_id in &stage.evidence_refs { + ensure_known_evidence(path, evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + fn validate_optional_summary_time(path: &Path, value: Option<&str>, id: &str) -> Result<()> { if let Some(value) = value { validate_optional_rfc3339(value, path, id)?; @@ -2327,6 +2654,17 @@ fn is_proactive_suggestion_kind(kind: 
&str) -> bool { ) } +fn is_scheduled_task_kind(kind: &str) -> bool { + matches!( + kind, + "weekly_project_status_summary" + | "stale_preference_plan_audit" + | "stale_decision_audit" + | "knowledge_page_refresh_suggestion" + | "private_provider_scheduler" + ) +} + fn is_proactive_action_decision(decision: &str) -> bool { matches!(decision, "recommend" | "defer" | "reject") } @@ -2558,6 +2896,31 @@ fn validate_proactive_brief_expectation(job: &RealWorldJob, path: &Path) -> Resu Ok(()) } +fn validate_scheduled_memory_expectation(job: &RealWorldJob, path: &Path) -> Result<()> { + let Some(scheduled) = &job.scheduled_memory else { + if job.suite == "scheduled_memory" && job.encoding.status.is_none() { + return Err(eyre::eyre!( + "{} scheduled_memory jobs must provide scheduled_memory expectations.", + path.display() + )); + } + + return Ok(()); + }; + + for kind in &scheduled.required_task_kinds { + if !is_scheduled_task_kind(kind.as_str()) { + return Err(eyre::eyre!( + "{} scheduled_memory expectation references unknown task kind {}.", + path.display(), + kind + )); + } + } + + Ok(()) +} + fn validate_evolution_conflict( path: &Path, evidence_ids: &BTreeSet, @@ -2824,11 +3187,13 @@ fn score_job(job: &RealWorldJob) -> JobScoring { let knowledge = knowledge_metrics(job, answer); let memory_summary = memory_summary_metrics(job, answer); let proactive_brief = proactive_brief_metrics(job, answer); + let scheduled_memory = scheduled_memory_metrics(job, answer); let mut unsupported_claims = unsupported_claims(job, answer); unsupported_claims.extend(unsupported_page_claims(answer)); unsupported_claims.extend(unsupported_memory_summary_claims(job, answer)); unsupported_claims.extend(unsupported_proactive_suggestions(job, answer)); + unsupported_claims.extend(unsupported_scheduled_outputs(job, answer)); let operator_counts = operator_debug_failure_counts(job); let latency_violations = latency_violations(job, answer); @@ -2869,6 +3234,7 @@ fn score_job(job: &RealWorldJob) -> 
JobScoring { apply_memory_summary_failure_counts(&mut counts, memory_summary.as_ref()); apply_proactive_brief_failure_counts(&mut counts, proactive_brief.as_ref()); + apply_scheduled_memory_failure_counts(&mut counts, scheduled_memory.as_ref()); let dimension_scores = dimension_scores(job, &counts); let normalized_score = normalized_score(&dimension_scores); @@ -2902,6 +3268,7 @@ fn score_job(job: &RealWorldJob) -> JobScoring { consolidation, memory_summary, proactive_brief, + scheduled_memory, } } @@ -2943,6 +3310,28 @@ fn apply_proactive_brief_failure_counts( counts.proactive_brief_tombstone_violations = metrics.tombstone_violation_count; } +fn apply_scheduled_memory_failure_counts( + counts: &mut FailureCounts, + metrics: Option<&ScheduledMemoryJobMetrics>, +) { + let Some(metrics) = metrics else { + return; + }; + + counts.scheduled_memory_invalid_current_outputs = metrics.invalid_current_output_count; + counts.scheduled_memory_untraced_outputs = metrics.untraced_output_count; + counts.scheduled_memory_missing_freshness = + metrics.output_count.saturating_sub(metrics.freshness_marker_count); + counts.scheduled_memory_missing_action_rationale = + metrics.output_count.saturating_sub(metrics.action_rationale_count); + counts.scheduled_memory_missing_task_kinds = metrics.missing_required_task_kind_count; + counts.scheduled_memory_unsupported_current_outputs = metrics.unsupported_current_output_count; + counts.scheduled_memory_tombstone_violations = metrics.tombstone_violation_count; + counts.scheduled_memory_missing_trace = + metrics.trace_required_count.saturating_sub(metrics.trace_complete_count); + counts.source_mutations += metrics.source_mutation_count; +} + fn score_declared_job( job: &RealWorldJob, status: TypedStatus, @@ -2968,6 +3357,7 @@ fn score_declared_job( consolidation, memory_summary: None, proactive_brief: None, + scheduled_memory: None, } } @@ -2998,6 +3388,14 @@ fn wrong_result_count(counts: &FailureCounts) -> usize { + 
counts.proactive_brief_missing_kinds + counts.proactive_brief_unsupported_current_suggestions + counts.proactive_brief_tombstone_violations + + counts.scheduled_memory_invalid_current_outputs + + counts.scheduled_memory_untraced_outputs + + counts.scheduled_memory_missing_freshness + + counts.scheduled_memory_missing_action_rationale + + counts.scheduled_memory_missing_task_kinds + + counts.scheduled_memory_unsupported_current_outputs + + counts.scheduled_memory_tombstone_violations + + counts.scheduled_memory_missing_trace + counts.untraced_page_sections + counts.missed_stale_findings + counts.rebuild_failures @@ -3053,6 +3451,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { pages: Vec::new(), memory_summaries: Vec::new(), proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), latency_ms: None, cost: None, trace_explainability: None, @@ -3070,6 +3469,11 @@ fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { evidence.extend(suggestion.evidence_refs.iter().cloned()); } } + for task in &answer.scheduled_tasks { + for output in &task.outputs { + evidence.extend(output.evidence_refs.iter().cloned()); + } + } evidence } @@ -3948,6 +4352,210 @@ fn proactive_unsupported_claim_report( } } +fn scheduled_memory_metrics( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Option { + if answer.scheduled_tasks.is_empty() { + return None; + } + + let mut metrics = ScheduledMemoryJobMetrics { + task_run_count: answer.scheduled_tasks.len(), + required_task_kind_count: job + .scheduled_memory + .as_ref() + .map_or(0, |scheduled| scheduled.required_task_kinds.len()), + ..ScheduledMemoryJobMetrics::default() + }; + let mut task_kinds = BTreeSet::new(); + + for task in &answer.scheduled_tasks { + accumulate_scheduled_memory_metrics(task, &mut metrics, &mut task_kinds); + } + + let covered_required_task_kind_count = job.scheduled_memory.as_ref().map_or(0, |scheduled| { + scheduled.required_task_kinds.iter().filter(|kind| 
task_kinds.contains(*kind)).count() + }); + + metrics.covered_required_task_kind_count = covered_required_task_kind_count; + metrics.missing_required_task_kind_count = + metrics.required_task_kind_count.saturating_sub(covered_required_task_kind_count); + metrics.evidence_ref_coverage = + ratio(metrics.evidence_ref_output_count, metrics.evidence_ref_required_count); + metrics.freshness_coverage = ratio(metrics.freshness_marker_count, metrics.output_count); + metrics.action_rationale_coverage = ratio(metrics.action_rationale_count, metrics.output_count); + metrics.trace_coverage = ratio(metrics.trace_complete_count, metrics.trace_required_count); + + Some(metrics) +} + +fn accumulate_scheduled_memory_metrics( + task: &ScheduledMemoryTaskArtifact, + metrics: &mut ScheduledMemoryJobMetrics, + task_kinds: &mut BTreeSet, +) { + metrics.source_trace_selected_count += task.source_trace.selected_source_refs.len(); + metrics.source_trace_dropped_count += task.source_trace.dropped_source_refs.len(); + metrics.source_trace_stale_count += task.source_trace.stale_source_refs.len(); + metrics.source_trace_superseded_count += task.source_trace.superseded_source_refs.len(); + metrics.source_trace_tombstone_count += task.source_trace.tombstone_source_refs.len(); + metrics.trace_required_count += 1; + metrics.source_mutation_count += task.source_mutations.len() + + task.source_mutations.iter().map(forbidden_diff_key_count).sum::(); + + task_kinds.insert(task.task_kind.clone()); + + if scheduled_trace_is_complete(task.execution_trace.as_ref()) { + metrics.trace_complete_count += 1; + } + + let non_current_refs = memory_summary_non_current_trace_refs(&task.source_trace); + let tombstone_refs = proactive_tombstone_trace_refs(&task.source_trace); + + for output in &task.outputs { + metrics.output_count += 1; + metrics.evidence_ref_required_count += 1; + + if output.evidence_refs.is_empty() { + metrics.untraced_output_count += 1; + } else { + metrics.evidence_ref_output_count += 1; + } + 
if scheduled_output_has_freshness(output) { + metrics.freshness_marker_count += 1; + } + if scheduled_output_has_action_rationale(output) { + metrics.action_rationale_count += 1; + } + if output.freshness.status == "current" { + metrics.current_output_count += 1; + } else { + metrics.non_current_output_count += 1; + } + if scheduled_output_is_invalid_current(output, &non_current_refs) { + metrics.invalid_current_output_count += 1; + } + if scheduled_output_is_unsupported_current(output) { + metrics.unsupported_current_output_count += 1; + } + if scheduled_output_is_tombstone_violation(output, &tombstone_refs) { + metrics.tombstone_violation_count += 1; + } + } +} + +fn scheduled_trace_is_complete(trace: Option<&ScheduledMemoryExecutionTrace>) -> bool { + let Some(trace) = trace else { + return false; + }; + + trace.status == "completed" + && !trace.trace_id.trim().is_empty() + && !trace.output_ref.trim().is_empty() + && !trace.stages.is_empty() + && trace + .stages + .iter() + .any(|stage| stage.stage_name == "output_readback" && !stage.evidence_refs.is_empty()) +} + +fn scheduled_output_has_freshness(output: &ScheduledMemoryOutput) -> bool { + if output.freshness.status.trim().is_empty() { + return false; + } + + match output.freshness.status.as_str() { + "superseded" => !output.freshness.superseded_by.is_empty(), + "tombstoned" => !output.freshness.tombstone_refs.is_empty(), + _ => true, + } +} + +fn scheduled_output_has_action_rationale(output: &ScheduledMemoryOutput) -> bool { + !output.action.decision.trim().is_empty() + && !output.action.reason_code.trim().is_empty() + && !output.action.reason.trim().is_empty() +} + +fn scheduled_output_is_invalid_current( + output: &ScheduledMemoryOutput, + non_current_refs: &BTreeSet<&str>, +) -> bool { + output.freshness.status == "current" + && (!output.freshness.superseded_by.is_empty() + || !output.freshness.tombstone_refs.is_empty() + || output + .evidence_refs + .iter() + .any(|evidence_id| 
non_current_refs.contains(evidence_id.as_str()))) +} + +fn scheduled_output_is_unsupported_current(output: &ScheduledMemoryOutput) -> bool { + !output.unsupported_claim_flags.is_empty() + && (output.action.decision == "recommend" || output.freshness.status == "current") +} + +fn scheduled_output_is_tombstone_violation( + output: &ScheduledMemoryOutput, + tombstone_refs: &BTreeSet<&str>, +) -> bool { + output.freshness.status == "current" + && (!output.freshness.tombstone_refs.is_empty() + || output + .evidence_refs + .iter() + .any(|evidence_id| tombstone_refs.contains(evidence_id.as_str()))) +} + +fn unsupported_scheduled_outputs( + job: &RealWorldJob, + answer: &ProducedAnswer, +) -> Vec { + answer + .scheduled_tasks + .iter() + .flat_map(|task| { + task.outputs.iter().filter_map(|output| { + if output.evidence_refs.is_empty() { + return Some(scheduled_unsupported_claim_report( + job, + task, + output, + "scheduled task output has no evidence refs", + )); + } + if scheduled_output_is_unsupported_current(output) { + return Some(scheduled_unsupported_claim_report( + job, + task, + output, + "unsupported scheduled task claim is still recommended or marked current", + )); + } + + None + }) + }) + .collect() +} + +fn scheduled_unsupported_claim_report( + job: &RealWorldJob, + task: &ScheduledMemoryTaskArtifact, + output: &ScheduledMemoryOutput, + reason: &str, +) -> UnsupportedClaimReport { + UnsupportedClaimReport { + suite_id: job.suite.clone(), + job_id: job.job_id.clone(), + claim_id: Some(format!("{}:{}", task.task_run_id, output.output_id)), + claim_text: bounded_text(output.text.as_str(), 240), + reason: reason.to_string(), + evidence_ids: output.evidence_refs.clone(), + } +} + fn hard_fail_hits( job: &RealWorldJob, unsupported_claims: &[UnsupportedClaimReport], @@ -4027,6 +4635,11 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.proactive_brief_missing_kinds > 0 || 
counts.proactive_brief_unsupported_current_suggestions > 0 || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_missing_task_kinds > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 + || counts.scheduled_memory_missing_trace > 0 || counts.page_usefulness_failures > 0, "evidence_grounding" => counts.missing_evidence > 0 @@ -4034,17 +4647,22 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.lineage_failures > 0 || counts.memory_summary_untraced_entries > 0 || counts.proactive_brief_untraced_suggestions > 0 + || counts.scheduled_memory_untraced_outputs > 0 + || counts.scheduled_memory_missing_trace > 0 || counts.untraced_page_sections > 0, "trap_avoidance" => counts.trap_uses > 0 || counts.memory_summary_invalid_current_entries > 0 || counts.proactive_brief_invalid_current_suggestions > 0 || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 || counts.missed_stale_findings > 0, "uncertainty_handling" => counts.unsupported_claims > 0 || counts.memory_summary_unsupported_current_entries > 0 - || counts.proactive_brief_unsupported_current_suggestions > 0, + || counts.proactive_brief_unsupported_current_suggestions > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0, "lifecycle_behavior" => counts.stale_answers > 0 || counts.conflict_detection_missing > 0 @@ -4059,6 +4677,12 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.proactive_brief_missing_action_rationale > 0 || counts.proactive_brief_unsupported_current_suggestions > 0 || counts.proactive_brief_tombstone_violations > 0 + || counts.scheduled_memory_invalid_current_outputs > 0 + || counts.scheduled_memory_missing_freshness > 0 + || 
counts.scheduled_memory_missing_action_rationale > 0 + || counts.scheduled_memory_unsupported_current_outputs > 0 + || counts.scheduled_memory_tombstone_violations > 0 + || counts.scheduled_memory_missing_trace > 0 || counts.rebuild_failures > 0, "source_immutability" => counts.source_mutations > 0, "proposal_usefulness" => counts.proposal_usefulness_failures > 0, @@ -4069,7 +4693,9 @@ fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) || counts.unsupported_claims > 0 || counts.operator_debug_missing > 0 || counts.operator_debug_raw_sql > 0 - || counts.operator_debug_trace_gaps > 0, + || counts.operator_debug_trace_gaps > 0 + || counts.scheduled_memory_missing_trace > 0, + "trace_readback" => counts.scheduled_memory_missing_trace > 0, "latency_resource" => counts.latency_violations > 0, "personalization_fit" | "ownership_correctness" => counts.missing_claims > 0 || counts.unsupported_claims > 0, @@ -4177,6 +4803,21 @@ fn wrong_result_signal_count(counts: &FailureCounts) -> usize { + counts.memory_summary_missing_rationale + counts.memory_summary_missing_categories + counts.memory_summary_unsupported_current_entries + + counts.proactive_brief_invalid_current_suggestions + + counts.proactive_brief_untraced_suggestions + + counts.proactive_brief_missing_freshness + + counts.proactive_brief_missing_action_rationale + + counts.proactive_brief_missing_kinds + + counts.proactive_brief_unsupported_current_suggestions + + counts.proactive_brief_tombstone_violations + + counts.scheduled_memory_invalid_current_outputs + + counts.scheduled_memory_untraced_outputs + + counts.scheduled_memory_missing_freshness + + counts.scheduled_memory_missing_action_rationale + + counts.scheduled_memory_missing_task_kinds + + counts.scheduled_memory_unsupported_current_outputs + + counts.scheduled_memory_tombstone_violations + + counts.scheduled_memory_missing_trace + counts.untraced_page_sections + counts.missed_stale_findings + counts.rebuild_failures @@ -4231,6 
+4872,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { knowledge: scoring.knowledge, memory_summary: scoring.memory_summary, proactive_brief: scoring.proactive_brief, + scheduled_memory: scoring.scheduled_memory, trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -4734,6 +5376,7 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { consolidation: consolidation_summary(jobs), memory_summary: memory_summary_summary(jobs), proactive_brief: proactive_brief_summary(jobs), + scheduled_memory: scheduled_memory_summary(jobs), knowledge: knowledge_summary(jobs), ..ReportSummary::default() }; @@ -5037,6 +5680,106 @@ fn proactive_brief_summary(jobs: &[JobReport]) -> Option Option { + let scheduled_jobs = + jobs.iter().filter_map(|job| job.scheduled_memory.as_ref()).collect::>(); + + if scheduled_jobs.is_empty() { + return None; + } + + let job_count = scheduled_jobs.len(); + let output_count = scheduled_jobs.iter().map(|metrics| metrics.output_count).sum::(); + let evidence_ref_required_count = + scheduled_jobs.iter().map(|metrics| metrics.evidence_ref_required_count).sum(); + let evidence_ref_output_count = + scheduled_jobs.iter().map(|metrics| metrics.evidence_ref_output_count).sum(); + let freshness_marker_count = + scheduled_jobs.iter().map(|metrics| metrics.freshness_marker_count).sum(); + let action_rationale_count = + scheduled_jobs.iter().map(|metrics| metrics.action_rationale_count).sum(); + let trace_required_count = + scheduled_jobs.iter().map(|metrics| metrics.trace_required_count).sum(); + let trace_complete_count = + scheduled_jobs.iter().map(|metrics| metrics.trace_complete_count).sum(); + + Some(ScheduledMemorySummaryReport { + job_count, + task_run_count: scheduled_jobs.iter().map(|metrics| metrics.task_run_count).sum(), + output_count, + required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.required_task_kind_count) + 
.sum(), + covered_required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.covered_required_task_kind_count) + .sum(), + missing_required_task_kind_count: scheduled_jobs + .iter() + .map(|metrics| metrics.missing_required_task_kind_count) + .sum(), + evidence_ref_required_count, + evidence_ref_output_count, + evidence_ref_coverage: ratio(evidence_ref_output_count, evidence_ref_required_count), + freshness_marker_count, + freshness_coverage: ratio(freshness_marker_count, output_count), + action_rationale_count, + action_rationale_coverage: ratio(action_rationale_count, output_count), + trace_required_count, + trace_complete_count, + trace_coverage: ratio(trace_complete_count, trace_required_count), + source_mutation_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_mutation_count) + .sum(), + current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.current_output_count) + .sum(), + non_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.non_current_output_count) + .sum(), + invalid_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.invalid_current_output_count) + .sum(), + untraced_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.untraced_output_count) + .sum(), + unsupported_current_output_count: scheduled_jobs + .iter() + .map(|metrics| metrics.unsupported_current_output_count) + .sum(), + tombstone_violation_count: scheduled_jobs + .iter() + .map(|metrics| metrics.tombstone_violation_count) + .sum(), + source_trace_selected_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_selected_count) + .sum(), + source_trace_dropped_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_dropped_count) + .sum(), + source_trace_stale_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_stale_count) + .sum(), + source_trace_superseded_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_superseded_count) + 
.sum(), + source_trace_tombstone_count: scheduled_jobs + .iter() + .map(|metrics| metrics.source_trace_tombstone_count) + .sum(), + }) +} + fn knowledge_summary(jobs: &[JobReport]) -> Option { let knowledge_jobs = jobs.iter().filter_map(|job| job.knowledge.as_ref()).collect::>(); @@ -5749,6 +6492,7 @@ fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String { render_markdown_consolidation(&mut out, report); render_markdown_memory_summary(&mut out, report); render_markdown_proactive_brief(&mut out, report); + render_markdown_scheduled_memory(&mut out, report); render_markdown_knowledge(&mut out, report); render_markdown_unsupported_claims(&mut out, report); render_markdown_follow_ups(&mut out, report); @@ -6119,6 +6863,32 @@ fn render_markdown_optional_summary_metrics(out: &mut String, summary: &ReportSu proactive.rejected_count, proactive.deferred_count )); } + if let Some(scheduled) = &summary.scheduled_memory { + out.push_str(&format!( + "- Scheduled memory outputs: `{}` across `{}` task run(s)\n", + scheduled.output_count, scheduled.task_run_count + )); + out.push_str(&format!( + "- Scheduled memory evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + scheduled.evidence_ref_output_count, + scheduled.evidence_ref_required_count, + scheduled.evidence_ref_coverage + )); + out.push_str(&format!( + "- Scheduled memory freshness/action/trace coverage: `{:.3}` / `{:.3}` / `{:.3}`\n", + scheduled.freshness_coverage, + scheduled.action_rationale_coverage, + scheduled.trace_coverage + )); + out.push_str(&format!( + "- Scheduled memory stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + scheduled.invalid_current_output_count, scheduled.tombstone_violation_count + )); + out.push_str(&format!( + "- Scheduled memory source mutations: `{}`\n", + scheduled.source_mutation_count + )); + } } fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { @@ -6633,6 +7403,47 @@ fn render_markdown_proactive_brief(out: 
&mut String, report: &RealWorldReport) { out.push('\n'); } +fn render_markdown_scheduled_memory(out: &mut String, report: &RealWorldReport) { + let scheduled_jobs = + report.jobs.iter().filter(|job| job.scheduled_memory.is_some()).collect::>(); + + if scheduled_jobs.is_empty() { + return; + } + + out.push_str("## Scheduled Memory Metrics\n\n"); + out.push_str("| Job | Task Runs | Outputs | Kinds | Evidence Coverage | Freshness | Action Rationale | Trace Coverage | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Source Mutations |\n"); + out.push_str( + "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |\n", + ); + + for job in scheduled_jobs { + let Some(metrics) = &job.scheduled_memory else { + continue; + }; + + out.push_str(&format!( + "| {} | {} | {} | `{}/{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{:.3}` | {} | {} | {} | {} | {} |\n", + md_cell(job.job_id.as_str()), + metrics.task_run_count, + metrics.output_count, + metrics.covered_required_task_kind_count, + metrics.required_task_kind_count, + metrics.evidence_ref_coverage, + metrics.freshness_coverage, + metrics.action_rationale_coverage, + metrics.trace_coverage, + metrics.invalid_current_output_count, + metrics.untraced_output_count, + metrics.unsupported_current_output_count, + metrics.tombstone_violation_count, + metrics.source_mutation_count + )); + } + + out.push('\n'); +} + fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) { out.push_str("## Unsupported Claims\n\n"); @@ -6705,6 +7516,7 @@ fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) { out.push_str("For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. 
Flagged unsupported summaries are counted separately from hidden unsupported claims.\n\n"); out.push_str("For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported.\n\n"); out.push_str("For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations.\n\n"); + out.push_str("For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output.\n\n"); out.push_str("## Suites With `not_encoded` Status\n\n"); if report.not_encoded_suites.is_empty() { diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 37e99898..ff9d3c6f 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -64,6 +64,10 @@ fn proactive_brief_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("proactive_brief") } +fn scheduled_memory_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("scheduled_memory") +} + fn knowledge_fixture_dir() -> PathBuf { real_world_memory_fixture_dir().join("knowledge") } @@ -705,7 +709,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(22) + Some(23) ); assert_eq!( 
report @@ -1026,17 +1030,19 @@ fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("55 jobs across 15 suites") - && evidence.contains("49 pass") - && evidence.contains("6 blocked") + evidence.contains("60 jobs across 16 suites") + && evidence.contains("53 pass") + && evidence.contains("7 blocked") && evidence.contains("core_archival_memory") && evidence.contains("memory_summary") && evidence.contains("proactive_brief") + && evidence.contains("scheduled_memory") && evidence.contains("context_trajectory") })); let suites = array_at(adapter, "/suites")?; let core_archival = find_by_field(suites, "/suite_id", "core_archival_memory")?; + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); @@ -1045,6 +1051,11 @@ fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { && evidence.contains("project-decision recovery") && evidence.contains("archival note search") })); + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!(scheduled.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("4 passing source-linked task readbacks") + && evidence.contains("private/provider scheduler blocker") + })); assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); assert!( adapter @@ -2236,7 +2247,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = 
run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(55)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); Ok(()) } @@ -4120,8 +4131,18 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - let scheduled = find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; - assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(scheduled.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(scheduled.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!( + scheduled.pointer("/post_stage_counts/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + scheduled.pointer("/post_stage_counts/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); let retest = find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; @@ -4139,10 +4160,11 @@ fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) - "memory_summary_top_of_mind_behavior" )?); assert!(array_contains_str(ledger, "/summary/improved", "proactive_brief_readiness")?); + assert!(array_contains_str(ledger, "/summary/improved", "scheduled_memory_task_readiness")?); assert!(array_at(ledger, "/summary/regressed")?.is_empty()); assert!(array_contains_str(ledger, "/summary/unchanged", "deletion_ttl_tombstone_behavior")?); assert!(array_contains_str(ledger, "/summary/unchanged", "final_competitor_retest_status")?); - assert!(array_contains_str(ledger, "/summary/blocked", "scheduled_memory_task_readiness")?); + assert!(array_at(ledger, "/summary/blocked")?.is_empty()); assert!(array_at(ledger, 
"/summary/not_tested")?.is_empty()); assert_dreaming_memory_summary_stage(stages)?; @@ -4225,9 +4247,14 @@ fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { ); assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); assert!(markdown.contains("XY-953 adds a direct `proactive_brief` suite")); + assert!(markdown.contains("XY-954 adds a direct `scheduled_memory` suite")); assert!(markdown.contains( "Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity" )); + assert!( + markdown + .contains("Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks") + ); assert!(markdown.contains("`regressed`: none")); assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); @@ -4739,6 +4766,248 @@ fn proactive_brief_fixture_fails_tombstone_ttl_violations() -> Result<()> { Ok(()) } +#[test] +fn scheduled_memory_fixtures_score_task_trace_gate() -> Result<()> { + let report = run_json_report_from(scheduled_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + 
assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + let suites = array_at(&report, "/suites")?; + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = array_at(&report, "/jobs")?; + let weekly = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + let private = + find_by_field(jobs, "/job_id", "scheduled-private-provider-scheduler-blocked-001")?; + + assert_eq!(weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} + +#[test] +fn scheduled_memory_markdown_renders_trace_metrics() -> Result<()> { + let report = run_json_report_from(scheduled_memory_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-scheduled-memory-test-{}", process::id())); + + 
fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("scheduled-memory-report.json"); + let markdown_path = temp_dir.join("scheduled-memory-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Scheduled Memory Metrics")); + assert!(markdown.contains("scheduled-weekly-project-status-summary-001")); + assert!(markdown.contains("Scheduled memory evidence-ref coverage")); + assert!(markdown.contains("Trace Coverage")); + assert!(markdown.contains("Source Mutations")); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_missing_execution_trace() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0] + .as_object_mut() + .ok_or_else(|| eyre::eyre!("missing scheduled task object"))? 
+ .remove("execution_trace"); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-missing-trace-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("missing_trace.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/trace_complete_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_untraced_outputs() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-untraced-output-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("untraced_output.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/scheduled_memory/untraced_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_superseded_sources_presented_current() -> Result<()> { + let fixture_path = 
scheduled_memory_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + serde_json::json!(["scheduled-old-consolidation-only-decision"]); + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-superseded-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("superseded_current.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "scheduled-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/invalid_current_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +#[test] +fn scheduled_memory_fixture_fails_source_mutation() -> Result<()> { + let fixture_path = scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["source_mutations"] = serde_json::json!([ + { + "table": "memory_notes", + "op": "update", + "note_id": "scheduled-weekly-current-gate" + } + ]); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-source-mutation-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("source_mutation.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", 
"scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("lifecycle_fail")); + assert_eq!( + job.pointer("/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/lifecycle_fail").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + #[test] fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; @@ -4898,12 +5167,12 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(55)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(15)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(49)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(60)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(16)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(53)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); @@ -4943,11 +5212,11 @@ fn assert_root_aggregate_summary(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(123) + Some(133) ); assert_eq!( 
report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(123) + Some(133) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -4989,6 +5258,7 @@ fn assert_root_aggregate_summary(report: &Value) { assert_root_knowledge_summary(report); assert_root_proactive_brief_summary(report); + assert_root_scheduled_memory_summary(report); } fn assert_root_proactive_brief_summary(report: &Value) { @@ -5028,6 +5298,51 @@ fn assert_root_proactive_brief_summary(report: &Value) { ); } +fn assert_root_scheduled_memory_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); +} + fn assert_root_aggregate_suites(report: &Value) -> Result<()> { let suites = array_at(report, "/suites")?; @@ -5081,6 +5396,11 @@ fn assert_root_aggregate_suites(report: &Value) -> Result<()> { 
assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + let scheduled = find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + let context_trajectory = find_by_field(suites, "/suite_id", "context_trajectory")?; assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); @@ -5101,6 +5421,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; let core_fallback = find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; let stale_core = find_by_field(jobs, "/job_id", "core-archival-stale-core-detection-001")?; + let scheduled_weekly = + find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; assert_eq!(rebuild.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); assert_eq!( @@ -5123,6 +5445,11 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { ); assert_eq!(core_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(stale_core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(scheduled_weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + scheduled_weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md index c893db22..4f960804 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md 
+++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md @@ -88,7 +88,8 @@ results, or lifecycle failures into one aggregate leaderboard. | Command or run | Artifact | Supported claim | | --- | --- | --- | -| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952 and XY-953 fixture updates | ELF fixture aggregate covers 55 jobs across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs, 1 passing `memory_summary` source-trace job, and 4 passing `proactive_brief` suggestion jobs plus 1 private-corpus blocker. | +| `cargo make real-world-memory` | `2026-06-11-measurement-coverage-audit.md` plus XY-952, XY-953, and XY-954 fixture updates | ELF fixture aggregate covers 60 jobs across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates, including 6 passing `core_archival_memory` jobs, 1 passing `memory_summary` source-trace job, 4 passing `proactive_brief` suggestion jobs plus 1 private-corpus blocker, and 4 passing `scheduled_memory` task-readback jobs plus 1 private/provider scheduler blocker. | +| `cargo make real-world-memory-scheduled` | `tmp/real-world-memory/scheduled/report.json` and `2026-06-16-scheduled-memory-task-scoring-report.md` | The scheduled-memory fixture scores weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and private/provider scheduler blocker scenarios with evidence refs, freshness/currentness markers, action rationale, execution trace/readback, source-mutation guards, and stale/tombstone guards; this is fixture-backed contract evidence, not hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus parity. 
| | `cargo make real-world-memory-summary` | `tmp/real-world-memory/memory-summary/report.json` | The memory summary fixture scores reviewable top-of-mind, background, stale, superseded, tombstoned, and derived project-profile entries with source refs, freshness metadata, rationale, and unsupported-claim flags; this is fixture-backed contract evidence, not managed-memory parity. | | `cargo make real-world-memory-proactive-brief` | `tmp/real-world-memory/proactive-brief/report.json` and `2026-06-16-proactive-brief-scoring-report.md` | The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity. | | `cargo make real-world-memory-core-archival` | `tmp/real-world-memory/core-archival/report.json` | ELF core-block behavior is scored separately from archival note search for attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. | diff --git a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md index 80b7620e..c48bdcf2 100644 --- a/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md +++ b/docs/guide/benchmarking/2026-06-11-competitor-strength-evidence-matrix.md @@ -31,15 +31,18 @@ Current boundary: live pass. The fresh ELF sweep produced 40 jobs with 22 pass, 5 wrong_result, 0 incomplete, 2 blocked, and 11 not_encoded; the fresh qmd sweep produced 17 pass, 6 wrong_result, 0 incomplete, 2 blocked, and 15 not_encoded. 
-- ELF fixture evidence is strong: `cargo make real-world-memory` reports 55 jobs - across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or +- ELF fixture evidence is strong: `cargo make real-world-memory` reports 60 jobs + across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates. The `core_archival_memory` suite contributes 6 fixture-only passes for ELF core-block behavior; it does not create an ELF-over-Letta claim. The `memory_summary` suite contributes one fixture-backed source-trace pass; it does not create managed-memory parity evidence. The `proactive_brief` suite contributes four fixture-backed source-linked suggestion passes and one private-corpus blocker; it does not create Pulse or hosted - managed-memory parity. This proves the fixture contract, not live-service parity. + managed-memory parity. The `scheduled_memory` suite contributes four fixture-backed + scheduled task readbacks plus one private/provider scheduler blocker; it does not + create hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed + private-corpus parity. This proves the fixture contract, not live-service parity. - qmd is the strongest measured local retrieval-debug comparison, but the current evidence still separates its same-corpus/live-retrieval strengths from the full-suite live non-pass sweep. diff --git a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md index 0835990f..9d1f9f7b 100644 --- a/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md +++ b/docs/guide/benchmarking/2026-06-16-dreaming-readiness-stage-ledger.md @@ -8,8 +8,8 @@ report shape required before claiming the stage improved. 
Inputs: `docs/research/2026-06-16-dreaming-readiness-stage-ledger.json`, the June 11 competitor-strength, temporal-history, and iteration-direction reports, the XY-905 June 16 live temporal reconciliation report, the consolidation proposal spec, the -memory summary spec, the XY-953 proactive brief scoring report, and the checked-in -real-world fixture suites. +memory summary spec, the XY-953 proactive brief scoring report, the XY-954 scheduled +memory task scoring report, and the checked-in real-world fixture suites. Outputs: A stage-by-stage ledger that downstream issues can update with `improved`, `regressed`, `unchanged`, `blocked`, or `not_tested` judgments. @@ -22,12 +22,12 @@ and now includes the XY-905 post-stage result for live temporal reconciliation. Current stage status: - `improved`: current-vs-historical correctness, preference evolution, reviewable - consolidation, memory-summary/top-of-mind fixture readback, and proactive brief - fixture scoring. + consolidation, memory-summary/top-of-mind fixture readback, proactive brief fixture + scoring, and scheduled-memory task fixture scoring. - `regressed`: none. - `unchanged`: deletion/TTL/tombstone behavior and the final competitor retest baseline. -- `blocked`: scheduled-memory-task readiness. +- `blocked`: none. - `not_tested`: none. The known live `memory_evolution` loss is now repaired for the encoded ELF live @@ -53,6 +53,13 @@ brief, stale decision audit, stale plan/preference warning, and private-corpus r blocker scenarios. It does not prove OpenAI Pulse parity, hosted managed-memory parity, background scheduling, or private-corpus production quality. +Scheduled-memory task readiness is improved only at the fixture-backed benchmark +level: XY-954 adds a direct `scheduled_memory` suite with weekly project status +summary, stale preference/plan audit, stale decision audit, knowledge-page refresh +suggestion, and private/provider scheduler blocker scenarios. 
It does not prove a +hosted scheduler, ChatGPT Tasks parity, Pulse parity, notification delivery, +provider-backed private-corpus quality, or silent source mutation safety. + ## Ledger Rules - Every downstream Dreaming or competitor-improvement stage must write a post-stage @@ -79,7 +86,7 @@ parity, background scheduling, or private-corpus production quality. | Reviewable consolidation | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Keep Dreaming output derived and reviewable, and add direct competitor/reference runners only when they emit comparable source ids, confidence, unsupported-claim flags, and review audit artifacts. | | Memory summary and top-of-mind behavior | `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival` | `cargo make real-world-memory-summary`; `cargo make real-world-memory-knowledge`; `cargo make real-world-memory-core-archival`; `cargo make real-world-memory-live-adapters` | `pass=8`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=9`, `wrong_result=0`, `blocked=0`, `not_tested=0`, `not_encoded=0` | `improved` | Move from fixture-backed summary/source-trace readback into service-native admin readback and later live top-of-mind behavior; do not turn hidden summaries into authoritative memory. 
| | Proactive brief readiness | `cargo make real-world-first-generation-oss`; `cargo make real-world-job-operator-ux` | `cargo make real-world-memory-proactive-brief`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=0`, `not_tested=1`, `not_encoded=1` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/rationale coverage `1.000`; invalid-current and tombstone violations `0` | `improved` | Move from fixture-backed proactive brief scoring into service-native generated brief readback and later live adapter materialization; keep scheduling and private-corpus refresh behind owned lanes and operator inputs. | -| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-consolidation`; `cargo make real-world-memory-live-adapters` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | not run by XY-905 | `blocked` | Scheduled runs are future work; start with queued derived proposal runs and keep operator review mandatory. | +| Scheduled memory task readiness | `cargo make real-world-memory-consolidation` | `cargo make real-world-memory-scheduled`; `cargo make real-world-memory`; `cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1` | `pass=0`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0` | `pass=4`, `wrong_result=0`, `blocked=1`, `not_tested=0`, `not_encoded=0`; evidence-ref/freshness/action/trace coverage `1.000`; invalid-current, unsupported-current, tombstone, and source-mutation violations `0` | `improved` | Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs. 
| | Final competitor retest status | `cargo make real-world-memory-live-adapters`; `cargo make real-world-first-generation-oss`; `cargo make real-world-memory-graph-rag`; `cargo make openmemory-ui-export-readback`; `cargo make baseline-production-private-addendum` when operator input exists | Same commands; private/provider commands may remain typed blocked under XY-930 | `pass=22`, `wrong_result=5`, `blocked=2`, `not_tested=11`, `not_encoded=11` | partial XY-905 evidence: ELF live adapter `pass=40`, `wrong_result=0`, `blocked=5`, `not_encoded=10` | `unchanged` | Rerun the broader competitor matrix after each optimization; the XY-905 live adapter improvement does not replace private/provider or external competitor gates. | ## Evidence Anchors @@ -92,7 +99,7 @@ parity, background scheduling, or private-corpus production quality. | Reviewable consolidation | `docs/spec/system_consolidation_proposals_v1.md`; `apps/elf-eval/fixtures/real_world_memory/consolidation/`; `docs/guide/benchmarking/2026-06-16-live-consolidation-proposal-scoring-report.md`; `docs/research/2026-06-16-live-consolidation-proposal-scoring-report.json` | | Memory summary and top-of-mind behavior | `docs/spec/system_memory_summary_v1.md`; `apps/elf-eval/fixtures/real_world_memory/memory_summary/`; `apps/elf-eval/fixtures/real_world_memory/knowledge/`; `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/`; `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | | Proactive brief readiness | `docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md`; `docs/research/2026-06-16-proactive-brief-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/proactive_brief/`; `docs/research/2026-06-08-agent-memory-selection.json`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | -| Scheduled memory task readiness | `docs/spec/system_consolidation_proposals_v1.md`; 
`docs/research/2026-06-08-agent-memory-selection.json` | +| Scheduled memory task readiness | `docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md`; `docs/research/2026-06-16-scheduled-memory-task-scoring-report.json`; `apps/elf-eval/fixtures/real_world_memory/scheduled_memory/`; `docs/research/2026-06-08-agent-memory-selection.json` | | Final competitor retest status | `docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md`; `docs/research/2026-06-11-competitor-strength-adoption-report.json`; `docs/guide/benchmarking/2026-06-11-graph-rag-scored-smoke-adapter-report.md`; `docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md` | ## Report Shape For Downstream Issues @@ -127,6 +134,9 @@ Allowed: - The current ledger records the XY-953 fixture-backed proactive brief scoring improvement with source refs, freshness/currentness markers, reject/defer rationale, and typed private-corpus blocking. +- The current ledger records the XY-954 fixture-backed scheduled-memory scoring + improvement with source refs, freshness/currentness markers, action rationale, + completed trace readback, zero source mutations, and typed private/provider blocking. - Fixture-backed knowledge and core/archival jobs can be used as regression guards for report shape. - Reviewable consolidation now has ELF live service-backed proposal scoring evidence, @@ -135,11 +145,14 @@ Allowed: Not allowed: - Do not claim this ledger proves preference history against mem0/OpenMemory, - live top-of-mind behavior, live proactive brief behavior, scheduled tasks, + live top-of-mind behavior, live proactive brief behavior, hosted scheduled tasks, private-corpus gates, hosted memory, broad consolidation superiority, or competitor adapters. - Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity or hosted managed-memory parity. 
+- Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks, Pulse, + hosted scheduler, notification, provider-backed private-corpus, or silent-mutation + parity. - Do not claim ELF has full-suite live real-world pass evidence. - Do not claim private-corpus or provider-backed production quality without the operator-owned inputs required by XY-930. diff --git a/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md new file mode 100644 index 00000000..7907c225 --- /dev/null +++ b/docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md @@ -0,0 +1,400 @@ +# Real-World Job Benchmark Report + +Goal: Publish a Markdown summary for one generated real_world_job benchmark report. +Read this when: You need a durable smoke report for real-world agent memory job fixtures. +Inputs: `tmp/real-world-memory/scheduled/report.json`. +Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Verification: Compare this Markdown summary with the source JSON before committing. 
+ +## Summary + +- Run ID: `real-world-memory-scheduled` +- Generated at: `2026-06-16T16:29:13.720856Z` +- Runner version: `0.2.0-7f08eb504271123fa861e24e6e6861227682acda-aarch64-apple-darwin` +- Corpus profile: `mixed` +- Adapter: `fixture_scheduled_memory` (offline_fixture_response) +- Jobs: `5` +- Suites with encoded jobs: `1` +- Suites with `not_encoded` status: `15` +- Status summary: `4` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `1` blocked, `0` not_encoded, `0` unsupported_claim +- Unsupported claim count: `0` +- Wrong-result count: `0` +- Stale-answer count: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` +- History readback encoded: `0` +- Evidence coverage: `10/10` (`1.000`) +- Source-ref coverage: `10/10` (`1.000`) +- Quote coverage: `10/10` (`1.000`) +- Stale retrieval count: `0` +- Scope correctness: `0/0` (`0.000`), violations `0` +- Redaction leak count: `0` +- Qdrant rebuild cases: `0` encoded, `0` pass +- Expected evidence recall: `1.000` (10/10) +- Irrelevant context ratio: `0.000` (0 irrelevant) +- Trace explainability: `0` job(s), `0` wrong-result stage attribution(s) +- Consolidation source mutation count: `0` +- Mean score: `0.800` +- Mean latency: `2.000 ms` +- Cost: `0.000 USD` +- Operator-debug jobs: `0` +- Raw SQL needed: `0` +- Trace-incomplete debug jobs: `0` +- Operator UX gaps: `0` +- Scheduled memory outputs: `5` across `4` task run(s) +- Scheduled memory evidence-ref coverage: `5/5` (`1.000`) +- Scheduled memory freshness/action/trace coverage: `1.000` / `1.000` / `1.000` +- Scheduled memory stale/currentness violations: `0` invalid current, `0` tombstone violation(s) +- Scheduled memory source mutations: `0` +- Private corpus redaction: `publish evidence ids and bounded score summaries only; do not publish private text` + +## External Adapter Coverage + +This section is manifest-backed. 
It records external adapter coverage and blockers, but it does not convert live-baseline retrieval results into real-world suite wins. + +- Manifest: `real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store` +- Docker default: `true` via `docker-compose.baseline.yml`; artifact dir `tmp/live-baseline/` +- Adapter records: `23` total, `16` external project(s), `23` Docker-default, `0` requiring host-global installs +- Evidence classes: `1` fixture-backed, `6` live-baseline-only, `5` live real-world, `11` research-gate +- Overall statuses: `blocked=7, wrong_result=6, lifecycle_fail=1, pass=4, not_encoded=5` +- Capability coverage statuses: `real=8, mocked=1, unsupported=6, blocked=22, wrong_result=10, pass=30, not_encoded=26` +- Real-world suite statuses: `blocked=23, wrong_result=7, pass=27, not_encoded=38` +- Scenario coverage statuses: `unsupported=3, blocked=12, incomplete=1, wrong_result=6, lifecycle_fail=1, pass=23, not_encoded=11` +- ELF scenario positions: `wins=10, ties=11, loses=1, untested=35` +- Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=17, blocked=13, non_goal=5` + +| Project | Adapter | Evidence Class | Overall | Setup | Run | Result | Docker | Suites | Evidence | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ELF | `elf_real_world_memory_fixture` | `fixture_backed` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`project_decisions`: `pass`
`retrieval`: `pass`
`memory_evolution`: `pass`
`consolidation`: `pass`
`memory_summary`: `pass`
`proactive_brief`: `blocked`
`scheduled_memory`: `blocked`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`core_archival_memory`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory`
result: `tmp/real-world-memory/real-world-memory-report.md` | +| ELF | `elf_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `pass`
`knowledge_compilation`: `pass`
`operator_debugging_ux`: `pass`
`capture_integration`: `pass`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/elf-report.md` | +| qmd | `qmd_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `retrieval`: `not_encoded`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker`
result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| qmd | `qmd_live_real_world` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `trust_source_of_truth`: `pass`
`work_resume`: `pass`
`retrieval`: `pass`
`project_decisions`: `pass`
`memory_evolution`: `wrong_result`
`consolidation`: `not_encoded`
`knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `wrong_result`
`capture_integration`: `not_encoded`
`production_ops`: `blocked`
`personalization`: `pass`
`core_archival_memory`: `not_encoded`
`context_trajectory`: `blocked` | setup: `cargo make real-world-memory-live-adapters`
result: `tmp/real-world-memory/live-adapters/qmd-report.md` | +| ELF | `elf_operator_debug_live` | `live_real_world` | `pass` | `pass` | `pass` | `pass` | `true` | `operator_debugging_ux`: `pass` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/elf-report.md` | +| qmd | `qmd_operator_debug_live` | `live_real_world` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `operator_debugging_ux`: `wrong_result` | setup: `cargo make real-world-job-operator-ux-live-adapters`
result: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.md` | +| agentmemory | `agentmemory_live_baseline` | `live_baseline_only` | `lifecycle_fail` | `pass` | `lifecycle_fail` | `lifecycle_fail` | `true` | `work_resume`: `blocked`
`capture_integration`: `blocked`
`memory_evolution`: `blocked` | setup: `ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| mem0/OpenMemory | `mem0_openmemory_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `memory_evolution`: `not_encoded`
`personalization`: `not_encoded`
`operator_debugging_ux`: `blocked` | setup: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| memsearch | `memsearch_live_baseline` | `live_baseline_only` | `pass` | `pass` | `pass` | `pass` | `true` | `trust_source_of_truth`: `not_encoded`
`retrieval`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| OpenViking | `openviking_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `retrieval`: `wrong_result`
`work_resume`: `not_encoded`
`context_trajectory`: `blocked` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/guide/benchmarking/live_baseline_benchmark.md` | +| claude-mem | `claude_mem_live_baseline` | `live_baseline_only` | `wrong_result` | `pass` | `wrong_result` | `wrong_result` | `true` | `work_resume`: `not_encoded`
`operator_debugging_ux`: `blocked`
`capture_integration`: `blocked` | setup: `ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker`
result: `tmp/live-baseline/live-baseline-report.json` | +| qmd | `qmd_deep_profile_gate` | `research_gate` | `not_encoded` | `pass` | `not_encoded` | `not_encoded` | `true` | `retrieval`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | +| OpenViking | `openviking_deep_profile_gate` | `research_gate` | `blocked` | `pass` | `blocked` | `blocked` | `true` | `retrieval`: `wrong_result`
`context_trajectory`: `blocked`
`operator_debugging_ux`: `not_encoded` | setup: `ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker`
result: `docs/research/2026-06-11-qmd-openviking-strength-profile-report.json` | +| RAGFlow | `ragflow_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`knowledge_compilation`: `not_encoded`
`production_ops`: `blocked` | setup: `cargo make ragflow-docker-smoke`
result: `tmp/real-world-memory/ragflow-smoke/ragflow-report.json` | +| LightRAG | `lightrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `retrieval`: `blocked`
`memory_evolution`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `cargo make lightrag-docker-context-smoke`
result: `tmp/real-world-memory/lightrag-context/lightrag-report.json` | +| GraphRAG | `graphrag_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `knowledge_compilation`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded`
`memory_evolution`: `not_encoded` | setup: `cargo make graphrag-docker-smoke`
result: `tmp/real-world-memory/graphrag-smoke/graphrag-report.json` | +| Graphiti/Zep | `graphiti_zep_research_gate` | `research_gate` | `blocked` | `blocked` | `blocked` | `blocked` | `true` | `memory_evolution`: `blocked`
`retrieval`: `not_encoded`
`production_ops`: `not_encoded` | setup: `cargo make graphiti-zep-docker-temporal-smoke`
result: `tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json` | +| Letta | `letta_research_gate` | `research_gate` | `blocked` | `blocked` | `not_encoded` | `not_encoded` | `true` | `personalization`: `not_encoded`
`project_decisions`: `not_encoded`
`work_resume`: `not_encoded`
`core_archival_memory`: `blocked` | setup: `Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored.`
result: `No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed.` | +| LangGraph | `langgraph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `production_ops`: `not_encoded`
`work_resume`: `not_encoded` | setup: `LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter.`
result: `No production-ops or resume suite result is claimed.` | +| nanograph | `nanograph_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `memory_evolution`: `not_encoded`
`retrieval`: `not_encoded` | setup: `nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented.`
result: `No graph temporal or retrieval-debug result is claimed.` | +| llm-wiki | `llm_wiki_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`work_resume`: `not_encoded` | setup: `llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented.`
result: `No knowledge page citation or lint result is claimed.` | +| gbrain | `gbrain_research_gate` | `research_gate` | `not_encoded` | `not_encoded` | `not_encoded` | `not_encoded` | `true` | `knowledge_compilation`: `not_encoded`
`operator_debugging_ux`: `not_encoded` | setup: `gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented.`
result: `No knowledge-synthesis or operator-continuity result is claimed.` | +| graphify | `graphify_docker_smoke` | `live_real_world` | `wrong_result` | `pass` | `pass` | `wrong_result` | `true` | `knowledge_compilation`: `wrong_result`
`retrieval`: `blocked`
`work_resume`: `not_encoded` | setup: `cargo make graphify-docker-graph-report-smoke`
result: `tmp/real-world-memory/graphify-smoke/graphify-report.json` | + +### Adapter Capability Details + +| Adapter | Capability | Status | Evidence | +| --- | --- | --- | --- | +| `elf_real_world_memory_fixture` | real_world_job_fixture_scoring | `real` | The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output. | +| `elf_real_world_memory_fixture` | live_external_adapter_execution | `not_encoded` | The ELF fixture response path does not exercise an external memory project runtime. | +| `elf_real_world_memory_fixture` | docker_isolated_baseline | `pass` | ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence. | +| `elf_live_real_world` | real_world_job_adapter | `pass` | The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring. | +| `elf_live_real_world` | service_runtime_execution | `real` | The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker. | +| `elf_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `elf_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass. | +| `elf_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `elf_live_real_world` | typed_failure_reporting | `pass` | Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades. 
| +| `qmd_live_baseline` | same_corpus_retrieval | `pass` | qmd has an encoded Docker same-corpus retrieval adapter. | +| `qmd_live_baseline` | update_delete_cold_start | `pass` | qmd lifecycle smoke checks are encoded in the live-baseline runner. | +| `qmd_live_baseline` | real_world_job_adapter | `not_encoded` | This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep. | +| `qmd_live_real_world` | real_world_job_adapter | `pass` | qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts. | +| `qmd_live_real_world` | local_cli_retrieval | `real` | The adapter uses qmd collection add, update, embed -f, and query --json inside Docker. | +| `qmd_live_real_world` | targeted_live_pass | `pass` | The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions. | +| `qmd_live_real_world` | full_suite_live_sweep | `wrong_result` | The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked. | +| `qmd_live_real_world` | full_suite_live_pass | `wrong_result` | No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes. | +| `qmd_live_real_world` | typed_failure_reporting | `pass` | qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts. | +| `elf_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures. 
| +| `elf_operator_debug_live` | trace_hydration_metadata | `pass` | Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true. | +| `elf_operator_debug_live` | replay_command_metadata | `pass` | Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required. | +| `elf_operator_debug_live` | candidate_drop_visibility | `pass` | The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection. | +| `elf_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This ELF live slice does not launch OpenMemory or claude-mem UI flows. | +| `qmd_operator_debug_live` | operator_debug_real_world_job_adapter | `pass` | The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures. | +| `qmd_operator_debug_live` | local_replay_command_metadata | `pass` | Generated operator_debug records include qmd query replay commands tied to per-job collections. | +| `qmd_operator_debug_live` | trace_hydration_metadata | `wrong_result` | Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration. | +| `qmd_operator_debug_live` | candidate_drop_visibility | `wrong_result` | qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact. | +| `qmd_operator_debug_live` | openmemory_or_claude_mem_ui_runner | `not_encoded` | This qmd live slice does not launch OpenMemory or claude-mem UI flows. | +| `agentmemory_live_baseline` | same_corpus_retrieval | `pass` | The current adapter can run mem::remember and mem::search against the shared corpus. 
| +| `agentmemory_live_baseline` | adapter_storage | `mocked` | The current adapter uses a process-local StateKV Map and in-memory index. | +| `agentmemory_live_baseline` | durable_cold_start | `blocked` | A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored. | +| `agentmemory_live_baseline` | durable_work_resume_capture_path | `blocked` | XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring. | +| `agentmemory_live_baseline` | write_policy_hook_capture | `blocked` | Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks. | +| `agentmemory_live_baseline` | real_world_job_adapter | `blocked` | XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists. | +| `mem0_openmemory_live_baseline` | local_storage | `real` | The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker. | +| `mem0_openmemory_live_baseline` | same_corpus_retrieval | `pass` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks. | +| `mem0_openmemory_live_baseline` | local_lifecycle_update_delete_reload | `pass` | The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing. 
| +| `mem0_openmemory_live_baseline` | preference_correction_history | `pass` | The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction. | +| `mem0_openmemory_live_baseline` | entity_scoped_personalization | `pass` | The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference. | +| `mem0_openmemory_live_baseline` | local_get_all_export_readback | `pass` | The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope. | +| `mem0_openmemory_live_baseline` | deletion_audit_history | `pass` | The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory. | +| `mem0_openmemory_live_baseline` | openmemory_ui_readback | `blocked` | XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence. | +| `mem0_openmemory_live_baseline` | hosted_managed_memory_claims | `unsupported` | Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record. | +| `mem0_openmemory_live_baseline` | real_world_job_adapter | `not_encoded` | No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring. 
| +| `mem0_openmemory_live_baseline` | optional_graph_memory | `not_encoded` | Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim. | +| `memsearch_live_baseline` | canonical_markdown_store | `real` | memsearch is tracked as a Markdown-first source-of-truth reference. | +| `memsearch_live_baseline` | same_corpus_retrieval | `pass` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks. | +| `memsearch_live_baseline` | reindex_update_delete_reload | `pass` | The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing. | +| `memsearch_live_baseline` | real_world_job_adapter | `not_encoded` | XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring. | +| `memsearch_live_baseline` | markdown_source_store_prompt_jobs | `pass` | The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary. | +| `openviking_live_baseline` | local_embed_setup | `pass` | Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run. | +| `openviking_live_baseline` | same_corpus_retrieval | `wrong_result` | OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query. 
| +| `openviking_live_baseline` | context_trajectory | `blocked` | OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized. | +| `openviking_live_baseline` | real_world_job_adapter | `not_encoded` | No OpenViking adapter currently executes real_world_job prompts and answer scoring. | +| `claude_mem_live_baseline` | same_corpus_retrieval | `wrong_result` | The current Docker adapter did not prove correct same-corpus retrieval. | +| `claude_mem_live_baseline` | durable_storage | `real` | The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search. | +| `claude_mem_live_baseline` | repository_lifecycle | `real` | The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks. | +| `claude_mem_live_baseline` | repository_progressive_disclosure | `real` | The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path. | +| `claude_mem_live_baseline` | progressive_disclosure_real_world_job | `pass` | XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately. | +| `claude_mem_live_baseline` | retrieval_repair_artifact | `wrong_result` | The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. 
| +| `claude_mem_live_baseline` | hook_capture_viewer_workflow | `blocked` | The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus. | +| `qmd_deep_profile_gate` | stress_profile_retrieval_debug | `not_encoded` | The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result. | +| `qmd_deep_profile_gate` | real_world_job_adapter | `not_encoded` | The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run. | +| `qmd_deep_profile_gate` | host_global_install_boundary | `unsupported` | Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs. | +| `openviking_deep_profile_gate` | docker_local_embed_setup | `pass` | The local embedding setup is pinned and reaches import/runtime in Docker. | +| `openviking_deep_profile_gate` | hierarchical_context_trajectory | `blocked` | Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts. | +| `openviking_deep_profile_gate` | host_global_install_boundary | `unsupported` | The adapter pack must not ask operators to install OpenViking dependencies globally on the host. | +| `ragflow_research_gate` | adapter_candidate_verdict | `not_encoded` | XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded. | +| `ragflow_research_gate` | docker_service_setup | `blocked` | The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior. 
| +| `ragflow_research_gate` | real_world_job_adapter | `blocked` | One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution. | +| `ragflow_research_gate` | quality_or_scale_claim | `not_encoded` | The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking. | +| `lightrag_research_gate` | docker_service_setup | `blocked` | The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs. | +| `lightrag_research_gate` | retrieved_context_export | `blocked` | The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable. | +| `lightrag_research_gate` | real_world_job_adapter | `blocked` | The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids. | +| `lightrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims. | +| `graphrag_research_gate` | indexing_resource_envelope | `blocked` | The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries. | +| `graphrag_research_gate` | source_citation_mapping | `blocked` | The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available. | +| `graphrag_research_gate` | real_world_job_adapter | `blocked` | The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids. 
| +| `graphrag_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing. | +| `graphiti_zep_research_gate` | temporal_graph_memory | `blocked` | The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output. | +| `graphiti_zep_research_gate` | docker_graph_store_setup | `blocked` | The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used. | +| `graphiti_zep_research_gate` | real_world_job_adapter | `blocked` | The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids. | +| `graphiti_zep_research_gate` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance. | +| `letta_research_gate` | core_archival_memory | `blocked` | ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids. | +| `letta_research_gate` | docker_embedding_configuration | `blocked` | Docker setup requires explicit embedding configuration before archival retrieval can be tested. | +| `letta_research_gate` | real_world_job_adapter | `not_encoded` | No Letta materializer or scorer mapping exists. | +| `langgraph_research_gate` | checkpoint_replay_regression | `not_encoded` | Replay/fork behavior needs an agent graph harness before scoring. 
| +| `langgraph_research_gate` | standalone_memory_backend | `unsupported` | LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend. | +| `langgraph_research_gate` | real_world_job_adapter | `not_encoded` | No LangGraph benchmark materializer exists. | +| `nanograph_research_gate` | typed_graph_schema | `not_encoded` | Schema-as-code and typed query ergonomics need a benchmark harness. | +| `nanograph_research_gate` | memory_backend_comparison | `unsupported` | nanograph is a graph database reference, not a complete agent memory service. | +| `nanograph_research_gate` | real_world_job_adapter | `not_encoded` | No nanograph materializer exists. | +| `llm_wiki_research_gate` | knowledge_page_compilation | `not_encoded` | Wiki generation and citation lint are not executed by the runner. | +| `llm_wiki_research_gate` | live_service_runtime | `unsupported` | llm-wiki is a plugin/workflow reference rather than a service adapter. | +| `llm_wiki_research_gate` | real_world_job_adapter | `not_encoded` | No page materializer or scorer mapping exists. | +| `gbrain_research_gate` | compiled_truth_timeline | `not_encoded` | Compiled truth plus timeline output is a reference pattern but not scored. | +| `gbrain_research_gate` | postgres_backed_brain_repo | `blocked` | A Docker-local brain repo and Postgres setup path must be proven before execution. | +| `gbrain_research_gate` | real_world_job_adapter | `not_encoded` | No gbrain materializer exists. | +| `graphify_docker_smoke` | docker_cli_boundary | `pass` | The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks. | +| `graphify_docker_smoke` | graph_report_generation | `pass` | The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size. 
| +| `graphify_docker_smoke` | real_world_job_adapter | `wrong_result` | The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass. | +| `graphify_docker_smoke` | multimodal_code_graph | `not_encoded` | Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke. | +| `graphify_docker_smoke` | quality_or_scale_claim | `not_encoded` | The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior. | + +### Adapter Scenario Judgments + +| Adapter | Scenario | Suite | Status | Outcome | Evidence | +| --- | --- | --- | --- | --- | --- | +| `elf_live_real_world` | `live_capture_write_policy` | `capture_integration` | `pass` | `tie` | ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_consolidation_proposal_review` | `consolidation` | `pass` | `tie` | ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `live_knowledge_page_rebuild_lint` | `knowledge_compilation` | `pass` | `tie` | ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. This is an ELF service self-check, not a broad knowledge-product win.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_live_real_world` | `full_sweep_operator_debug` | `operator_debugging_ux` | `pass` | `win` | ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.
command: `cargo make real-world-memory-live-adapters`
artifact: `tmp/real-world-memory/live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `pass` | `win` | ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `elf_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. These are comparable replay-command availability artifacts, not equivalent UI quality claims.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `pass` | `win` | ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json` | +| `elf_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `elf_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `pass` | `win` | The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/elf-report.json` | +| `qmd_operator_debug_live` | `operator_debug_trace_hydration` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_replay_command` | `operator_debugging_ux` | `pass` | `tie` | qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/summary.json` | +| `qmd_operator_debug_live` | `operator_debug_candidate_drop_visibility` | `operator_debugging_ux` | `wrong_result` | `win` | qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json` | +| `qmd_operator_debug_live` | `operator_debug_repair_action_clarity` | `operator_debugging_ux` | `pass` | `tie` | qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `qmd_operator_debug_live` | `operator_debug_selected_but_not_narrated` | `operator_debugging_ux` | `wrong_result` | `win` | qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.
command: `cargo make real-world-job-operator-ux-live-adapters`
artifact: `tmp/real-world-job/operator-ux-live-adapters/qmd-report.json` | +| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `durable_update_reload_lifecycle` | `memory_evolution` | `lifecycle_fail` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `agentmemory_live_baseline` | `work_resume_capture_continuity` | `work_resume` | `blocked` | `blocked` | agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.
command: `cargo make real-world-first-generation-oss`
artifact: `tmp/real-world-memory/first-generation-oss/report.json` | +| `agentmemory_live_baseline` | `durable_work_resume_local_path` | `work_resume` | `blocked` | `blocked` | The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `agentmemory_live_baseline` | `capture_write_policy_hooks` | `capture_integration` | `blocked` | `blocked` | agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json` | +| `mem0_openmemory_live_baseline` | `basic_local_lifecycle` | `memory_evolution` | `pass` | `tie` | Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `mem0_openmemory_live_baseline` | `preference_correction_history` | `personalization` | `pass` | `loss` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `entity_scoped_personalization` | `personalization` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md` | +| `mem0_openmemory_live_baseline` | `delete_audit_readback` | `memory_evolution` | `pass` | `tie` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.
command: `mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters`
artifact: `mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md` | +| `mem0_openmemory_live_baseline` | `local_get_all_export_readback` | `operator_debugging_ux` | `pass` | `not_tested` | Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.
command: `ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker`
artifact: `tmp/live-baseline/mem0-checks.json` | +| `mem0_openmemory_live_baseline` | `openmemory_ui_export_readback` | `operator_debugging_ux` | `blocked` | `blocked` | The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.
command: `cargo make openmemory-ui-export-readback`
artifact: `tmp/live-baseline/mem0-openmemory-ui-export.json` | +| `mem0_openmemory_live_baseline` | `hosted_platform_export` | `operator_debugging_ux` | `unsupported` | `non_goal` | Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `mem0_openmemory_live_baseline` | `optional_graph_memory` | `memory_evolution` | `not_encoded` | `non_goal` | Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `memsearch_live_baseline` | `canonical_markdown_reindex_reload` | `trust_source_of_truth` | `pass` | `not_tested` | Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `markdown_source_store_rebuild_reload_prompt` | `trust_source_of_truth` | `pass` | `not_tested` | XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json` | +| `memsearch_live_baseline` | `markdown_retrieval_debug_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json` | +| `memsearch_live_baseline` | `ttl_expiry_lifecycle` | `memory_evolution` | `unsupported` | `non_goal` | The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `memsearch_live_baseline` | `real_world_prompt_adapter` | `retrieval` | `not_encoded` | `not_tested` | No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `claude_mem_live_baseline` | `same_corpus_retrieval` | `retrieval` | `wrong_result` | `win` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `retrieval_repair_artifact_path` | `retrieval` | `wrong_result` | `win` | XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json` | +| `claude_mem_live_baseline` | `repository_lifecycle_reload` | `memory_evolution` | `pass` | `tie` | Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_detail_hydration` | `operator_debugging_ux` | `pass` | `not_tested` | claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.
command: `ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker`
artifact: `tmp/live-baseline/live-baseline-report.json` | +| `claude_mem_live_baseline` | `progressive_disclosure_prompt` | `operator_debugging_ux` | `pass` | `not_tested` | XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json` | +| `claude_mem_live_baseline` | `hook_capture_viewer_workflow` | `capture_integration` | `blocked` | `blocked` | The Docker baseline uses repository classes only. claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `claude_mem_live_baseline` | `viewer_operator_workflow` | `operator_debugging_ux` | `blocked` | `blocked` | A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.
command: `cargo make real-world-first-generation-oss`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json` | +| `ragflow_research_gate` | `reference_chunk_citation_mapping` | `retrieval` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json` | +| `ragflow_research_gate` | `private_or_large_corpus_ragflow_quality` | `retrieval` | `not_encoded` | `non_goal` | Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `lightrag_research_gate` | `context_source_reference_mapping` | `retrieval` | `incomplete` | `blocked` | XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json` | +| `lightrag_research_gate` | `graph_rag_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphrag_research_gate` | `output_table_citation_mapping` | `knowledge_compilation` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json` | +| `graphrag_research_gate` | `graph_summary_synthesis_quality` | `knowledge_compilation` | `not_encoded` | `not_tested` | GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphiti_zep_research_gate` | `temporal_validity_window_mapping` | `memory_evolution` | `blocked` | `blocked` | XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json` | +| `graphiti_zep_research_gate` | `hosted_zep_temporal_memory` | `memory_evolution` | `unsupported` | `non_goal` | Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `letta_research_gate` | `core_block_attachment_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. Letta has no comparable exported core block attachment evidence.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json` | +| `letta_research_gate` | `core_block_scope_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json` | +| `letta_research_gate` | `core_block_provenance_readback` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json` | +| `letta_research_gate` | `stale_core_detection` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json` | +| `letta_research_gate` | `archival_fallback_readback` | `core_archival_memory` | `blocked` | `blocked` | ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json` | +| `letta_research_gate` | `core_archival_project_decision_recovery` | `core_archival_memory` | `not_encoded` | `not_tested` | ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.
artifact: `apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json` | +| `llm_wiki_research_gate` | `wiki_page_citation_lint` | `knowledge_compilation` | `not_encoded` | `not_tested` | llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `gbrain_research_gate` | `compiled_truth_timeline_export` | `knowledge_compilation` | `blocked` | `blocked` | gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | +| `graphify_docker_smoke` | `graph_report_navigation_lint` | `knowledge_compilation` | `wrong_result` | `not_tested` | XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. This remains graphify non-pass evidence, not an ELF victory claim.
command: `cargo make real-world-memory-graph-rag`
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json` | +| `graphify_docker_smoke` | `broad_graph_navigation_quality` | `retrieval` | `not_encoded` | `not_tested` | Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.
artifact: `apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json` | + +### Adapter Execution Metadata + +| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth | +| --- | --- | --- | --- | --- | --- | --- | +| `openviking_live_baseline` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs.
[llama-cpp-python CPU wheel index](https://abetlen.github.io/llama-cpp-python/whl/cpu): Official prebuilt CPU wheel index used by the Docker-local embedding pin. | Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find. | docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required. | Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality. | Use the default pinned CPU wheel path first. Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform. Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result. | not recorded | +| `qmd_deep_profile_gate` | [qmd repository](https://github.com/tobi/qmd): Official qmd source for local hybrid search, CLI setup, and query behavior. | Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles. | docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes. | CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims. | Run qmd stress profile in Docker and publish the artifact path. Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims. | D2 reviewed; deep profile not encoded | +| `openviking_deep_profile_gate` | [OpenViking repository](https://github.com/volcengine/OpenViking/): Official source for OpenViking local context database, resource, and retrieval APIs. 
| Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring. | docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker. | Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time. | Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first. Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform. Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs. | D2 reviewed; local embedding setup pinned; blocked fixtures encoded | +| `ragflow_research_gate` | [RAGFlow repository](https://github.com/infiniflow/ragflow): Official source for RAGFlow service code and Docker Compose setup.
[RAGFlow docs](https://ragflow.io/docs/): Official deployment and setup documentation.
[RAGFlow HTTP API reference](https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md): Official reference for OpenAI-compatible responses with reference chunks and document metadata. | Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API. | Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs. | Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring. | Run cargo make ragflow-docker-smoke first to produce a typed preflight artifact. Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1. Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids. | D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output | +| `lightrag_research_gate` | [LightRAG repository](https://github.com/HKUDS/LightRAG): Official source for LightRAG server, Docker, and retrieval modes.
[LightRAG Docker docs](https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md): Official Docker deployment reference.
[LightRAG API server docs](https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md): Official query-mode and context-output reference.
[LightRAG core programming docs](https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md): Official source-id and file-path citation reference. | Run cargo make lightrag-docker-context-smoke for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export. | docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes. | The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts. | Run cargo make lightrag-docker-context-smoke first; a missing API must remain a typed incomplete artifact, not a pass claim. Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile. Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids. | D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output | +| `graphrag_research_gate` | [GraphRAG repository](https://github.com/microsoft/graphrag): Official Microsoft GraphRAG source and setup reference.
[GraphRAG docs](https://microsoft.github.io/graphrag/): Official documentation for indexing and querying.
[GraphRAG input docs](https://microsoft.github.io/graphrag/index/inputs/): Official input format and document metadata reference.
[GraphRAG output tables](https://microsoft.github.io/graphrag/index/outputs/): Official output schema with document, text unit, community, and relationship identifiers.
[GraphRAG local search docs](https://microsoft.github.io/graphrag/query/local_search/): Official local-search context and graph traversal reference. | Run cargo make graphrag-docker-smoke for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt. | docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke. | The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries. | Run cargo make graphrag-docker-smoke first; missing provider configuration must remain a typed blocked artifact, not a pass claim.; Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.; Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs. | D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output | +| `graphiti_zep_research_gate` | [Graphiti repository](https://github.com/getzep/graphiti): Official open-source temporal context graph engine.
[Zep Graphiti overview](https://www.getzep.com/platform/graphiti/): Official product documentation for temporal context graph behavior.
[Graphiti quick start](https://help.getzep.com/graphiti/getting-started/quick-start): Official setup, episode ingest, and search output reference.
[Graphiti FalkorDB configuration](https://help.getzep.com/graphiti/configuration/falkor-db-configuration): Official Docker-local FalkorDB setup reference.
[Graphiti fact triples](https://help.getzep.com/graphiti/working-with-data/adding-fact-triples): Official manual fact-triple ingest contract. | Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt. | docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke. | Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring. | Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.; Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.; Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass. | D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output | +| `letta_research_gate` | [Letta repository](https://github.com/letta-ai/letta): Official source for Letta stateful agents and memory.
[Letta Docker docs](https://docs.letta.com/guides/docker/): Official Docker deployment guide and embedding configuration boundary. | Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON. | Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency. | Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact. | Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.; Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.; Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids. | D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists | +| `langgraph_research_gate` | [LangGraph persistence docs](https://docs.langchain.com/oss/python/langgraph/persistence): Official documentation for checkpoints, replay, fork, and persistence behavior. | Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring. | Docker-only Python harness with checkpoint store under the artifact directory. | Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims. | Encode one replay/fork failure recovery job.; Keep LangGraph classified as replay reference unless memory retrieval is actually exercised. 
| D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded | +| `nanograph_research_gate` | [nanograph repository](https://github.com/nanograph/nanograph): Official source for on-device typed property graph behavior. | Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts. | Docker-only CLI run with graph folder under benchmark artifacts. | Light local graph runtime expected; record binary build/install time and graph artifact size. | Define a minimal schema for memory_evolution facts.; Score typed query output only if it cites fixture evidence IDs. | D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded | +| `llm_wiki_research_gate` | [llm-wiki repository](https://github.com/nvk/llm-wiki): Official source for the LLM Wiki plugin and knowledge-base workflow. | Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts. | Docker-only plugin or fixture materializer; no user-global Codex plugin install. | LLM generation cost depends on page build; record provider boundary and generated artifact size. | Prototype a fixture-only page build with explicit citations.; Do not score until generated sections can be mapped to evidence IDs. | D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded | +| `gbrain_research_gate` | [gbrain repository](https://github.com/garrytan/gbrain): Official source for brain repo and retrieval workflow.
[compiled truth guide](https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md): Official guide for compiled truth plus timeline behavior. | Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence. | Docker-only repository and database state with no operator-owned brain repo. | Postgres-backed sync and embedding choices must be explicit; record DB size and import time. | Prototype a tiny brain repo with one current-truth page and timeline.; Score only if compiled truth cites the source timeline evidence. | D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven | +| `graphify_docker_smoke` | [graphify repository](https://github.com/safishamsi/graphify): Official source for graphify graph extraction and query workflow.
[graphify README](https://github.com/safishamsi/graphify/blob/v3/README.md): Official CLI, output artifact, query, and source-location contract. | Run cargo make graphify-docker-graph-report-smoke to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks. | docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke. | Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior. | Run cargo make graphify-docker-graph-report-smoke first; setup/runtime failures must remain typed artifacts, not pass claims.; Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.; Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids. | D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result | + +## Capture And Integration Coverage + +The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims. + +| Class | Behaviors | +| --- | --- | +| real | - | +| fixture-backed | - | +| mocked | - | +| blocked | - | +| not encoded | No capture/integration behavior was declared by encoded fixtures. 
| + +## Suites + +| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | History Readback | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_summary | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| proactive_brief | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| scheduled_memory | `blocked` | 5 | `0.800` | `1.000` | `0.000` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | At least one encoded job is blocked. | +| knowledge_compilation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. 
| +| operator_debugging_ux | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| capture_integration | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| personalization | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| core_archival_memory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| context_trajectory | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | + +## Jobs + +| Suite | Job | Status | Answer Type | Caveat Required | Refusal Required | Unknown Allowed | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- | +| scheduled_memory | scheduled-knowledge-page-refresh-suggestion-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-knowledge-page-stale-finding, scheduled-knowledge-reviewable-refresh` | `scheduled-knowledge-page-stale-finding, scheduled-knowledge-reviewable-refresh` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-private-provider-scheduler-blocked-001 | `blocked` | `scheduled_memory_task` | `true` | `true` | `true` | `0.000` | `1.000` | `0.000` | 
`` | `` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `-` | `-` | +| scheduled_memory | scheduled-stale-decision-audit-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-old-consolidation-only-decision, scheduled-current-direct-suite-decision` | `scheduled-current-direct-suite-decision, scheduled-old-consolidation-only-decision` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-stale-preference-plan-audit-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-stale-old-plan, scheduled-stale-plan-expired, scheduled-current-trace-plan, scheduled-current-reviewable-preference` | `scheduled-current-reviewable-preference, scheduled-current-trace-plan, scheduled-old-silent-mutation-preference, scheduled-stale-old-plan, scheduled-stale-plan-expired` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | +| scheduled_memory | scheduled-weekly-project-status-summary-001 | `pass` | `scheduled_memory_task` | `false` | `false` | `true` | `1.000` | `1.000` | `0.000` | `scheduled-weekly-current-gate, scheduled-weekly-ledger-update` | `scheduled-weekly-current-gate, scheduled-weekly-ledger-update` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.000 ms` | `0.000 USD` | + +## Operator Debugging UX + +No encoded job reported operator debugging evidence. 
+ +## Memory Evolution + +- Stale answers: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` + +- History readback encoded: `0` + +| Suite | Job | Current Evidence | Historical Evidence | Tombstone/Invalidation | Selected Current | Selected Historical | Selected Rationale | Selected Tombstone/Invalidation | Selected But Not Narrated | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | History Readback | Follow-up | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | --- | + +## Trace Explainability + +No encoded job reported trace explainability metadata. + +## Scheduled Memory Metrics + +| Job | Task Runs | Outputs | Kinds | Evidence Coverage | Freshness | Action Rationale | Trace Coverage | Invalid Current | Untraced | Unsupported Current | Tombstone Violations | Source Mutations | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| scheduled-knowledge-page-refresh-suggestion-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-stale-decision-audit-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-stale-preference-plan-audit-001 | 1 | 2 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | +| scheduled-weekly-project-status-summary-001 | 1 | 1 | `1/1` | `1.000` | `1.000` | `1.000` | `1.000` | 0 | 0 | 0 | 0 | 0 | + +## Unsupported Claims + +No unsupported claims were produced by encoded jobs. + +## Follow-Ups + +| Suite | Job | Follow-up | Reason | +| --- | --- | --- | --- | +| scheduled_memory | scheduled-private-provider-scheduler-blocked-001 | XY-930 private/provider scheduled-memory input gate | Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist. 
| + +## Result Semantics + +This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms. +It is a real-world job fixture report, not a Docker live-baseline report. +Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. + +The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs. + +- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. +- `wrong_result`: a job completed but missed required answer or evidence expectations. +- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. +- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed. + +For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims. + +For `memory_summary` jobs, summary artifacts are derived review surfaces. Top-of-mind entries must be current, included or downgraded entries must carry source refs, and derived project-profile entries must either cite sources or be explicitly flagged as unsupported. + +For `proactive_brief` jobs, brief artifacts are fixture-scored derived outputs, not scheduled UI behavior. Every suggestion must carry evidence refs, freshness/currentness metadata, and an action rationale; stale, superseded, or tombstoned sources must not be presented as current recommendations. 
+ +For `scheduled_memory` jobs, task artifacts are deterministic fixture-scored stand-ins for asynchronous work. Every output must carry evidence refs, freshness/currentness metadata, action rationale, and execution trace/readback evidence; scheduled tasks must not mutate source notes silently or claim hosted scheduler/private-provider parity from fixture-only output. + +## Suites With `not_encoded` Status + +- `trust_source_of_truth` +- `work_resume` +- `project_decisions` +- `retrieval` +- `memory_evolution` +- `consolidation` +- `memory_summary` +- `proactive_brief` +- `knowledge_compilation` +- `operator_debugging_ux` +- `capture_integration` +- `production_ops` +- `personalization` +- `core_archival_memory` +- `context_trajectory` diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 9c8449f0..56de3357 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -123,6 +123,10 @@ cleanup, use `docs/guide/single_user_production.md`. project brief scoring report with source refs, freshness/currentness markers, reject/defer rationale, stale/tombstone guards, and the private-corpus blocker tied to XY-930. +- `2026-06-16-scheduled-memory-task-scoring-report.md`: XY-954 fixture-backed + scheduled-memory task scoring report with source refs, freshness/currentness + markers, action rationale, execution trace/readback, source-mutation guards, and + the private/provider scheduler blocker tied to XY-930. 
- `2026-06-16-live-temporal-reconciliation-report.md`: XY-905 live temporal reconciliation follow-up showing ELF live `memory_evolution` moving from `pass=1`, `wrong_result=5` to `pass=6`, `wrong_result=0`, with trace/readback diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 84640e02..969dc125 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -229,17 +229,21 @@ research gates. Its `external_adapters` report section distinguishes: - `research_gate`: checked-in source/setup/runtime/resource/retry metadata for a future adapter path, not fixture-backed or live execution evidence. -Current fixture state: `cargo make real-world-memory` covers 55 jobs across 15 suites, -with 49 pass and 6 blocked. The `core_archival_memory` suite contributes six passing +Current fixture state: `cargo make real-world-memory` covers 60 jobs across 16 suites, +with 53 pass and 7 blocked. The `core_archival_memory` suite contributes six passing fixture jobs for core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery. The `memory_summary` suite contributes one passing fixture-backed source-trace job for reviewable current, background, stale, superseded, tombstoned, and derived project-profile entries. The `proactive_brief` suite contributes four passing source-linked proactive suggestions and one typed private-corpus refresh blocker tied to XY-930. The blocked jobs are -production-ops operator boundaries, the private-corpus refresh blocker, plus the -XY-928 OpenViking `context_trajectory` gates for staged retrieval, hierarchy -selection, and recursive context expansion. 
+production-ops operator boundaries, the private-corpus refresh blocker, the +private/provider scheduler blocker, plus the XY-928 OpenViking `context_trajectory` +gates for staged retrieval, hierarchy selection, and recursive context expansion. +The `scheduled_memory` suite contributes four passing source-linked scheduled task +readbacks plus one typed private/provider scheduler blocker tied to XY-930; it is not +hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus +parity evidence. Current live-adapter state: the `elf_live_real_world` and `qmd_live_real_world` adapters run a full checked-in suite sweep through `cargo make real-world-memory-live-adapters`. Each adapter diff --git a/docs/research/2026-06-11-competitor-strength-adoption-report.json b/docs/research/2026-06-11-competitor-strength-adoption-report.json index 83e8d854..cfe2f5ca 100644 --- a/docs/research/2026-06-11-competitor-strength-adoption-report.json +++ b/docs/research/2026-06-11-competitor-strength-adoption-report.json @@ -40,7 +40,7 @@ { "command": "cargo make real-world-memory", "artifact": "docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md", - "claim": "ELF fixture aggregate covers 55 jobs across 15 suites with 49 pass and 6 blocked production-ops, private-corpus, or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs, 1 passing memory_summary source-trace job, and 4 passing proactive_brief suggestion jobs plus 1 private-corpus blocker." + "claim": "ELF fixture aggregate covers 60 jobs across 16 suites with 53 pass and 7 blocked production-ops, private-corpus, private/provider scheduler, or OpenViking context-trajectory measurement gates, including 6 passing core_archival_memory jobs, 1 passing memory_summary source-trace job, 4 passing proactive_brief suggestion jobs plus 1 private-corpus blocker, and 4 passing scheduled_memory task-readback jobs plus 1 private/provider scheduler blocker." 
}, { "command": "cargo make real-world-memory-summary", @@ -52,6 +52,11 @@ "artifact": "tmp/real-world-memory/proactive-brief/report.json", "claim": "The proactive brief fixture scores daily project brief, resume-work brief, stale decision audit, stale plan/preference warning, and private-corpus refresh blocker scenarios with evidence refs, freshness/currentness markers, action rationale, and stale/tombstone guards; this is fixture-backed contract evidence, not Pulse or hosted managed-memory parity." }, + { + "command": "cargo make real-world-memory-scheduled", + "artifact": "tmp/real-world-memory/scheduled/report.json", + "claim": "The scheduled-memory fixture scores weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and private/provider scheduler blocker scenarios with evidence refs, freshness/currentness markers, action rationale, execution trace/readback, source-mutation guards, and stale/tombstone guards; this is fixture-backed contract evidence, not hosted scheduler, ChatGPT Tasks, Pulse, notification, or provider-backed private-corpus parity." 
+ }, { "command": "cargo make real-world-memory-core-archival", "artifact": "tmp/real-world-memory/core-archival/report.json", diff --git a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json index 1737c065..ea5d1bcf 100644 --- a/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json +++ b/docs/research/2026-06-16-dreaming-readiness-stage-ledger.json @@ -4,7 +4,7 @@ "authority": "XY-951", "created_at": "2026-06-16T00:00:00Z", "purpose": "Define the benchmark evidence gate that every Dreaming-inspired ELF optimization stage must update before claiming completion.", - "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, XY-952 fixture-backed memory summary/source-trace contract, and XY-953 fixture-backed proactive brief scoring on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", + "source_evidence_cutoff": "Checked-in benchmark and research evidence through the XY-905 live temporal reconciliation run, XY-934 live consolidation proposal scoring run, XY-952 fixture-backed memory summary/source-trace contract, XY-953 fixture-backed proactive brief scoring, and XY-954 fixture-backed scheduled-memory task scoring on 2026-06-16; no private-corpus or provider-backed production pass is claimed by this ledger.", "typed_status_terms": [ "pass", "wrong_result", @@ -38,7 +38,8 @@ "Private-corpus and provider-backed production gates remain typed blocked unless the operator supplies explicit inputs; those blockers are tracked under XY-930.", "The XY-905 post-stage live memory_evolution result is a narrow temporal reconciliation improvement only; it must not be converted into private-corpus, hosted memory, or broad competitor superiority claims.", "The XY-934 live consolidation result is a narrow ELF self-check only; it must not be converted 
into broad managed dreaming, Always-On Memory Agent, qmd, agentmemory, or llm-wiki superiority claims without comparable contained runners.", - "The XY-953 proactive brief result is fixture-backed benchmark-shape evidence only; it must not be converted into OpenAI Pulse, hosted managed-memory, scheduler, or private-corpus parity claims." + "The XY-953 proactive brief result is fixture-backed benchmark-shape evidence only; it must not be converted into OpenAI Pulse, hosted managed-memory, scheduler, or private-corpus parity claims.", + "The XY-954 scheduled-memory result is fixture-backed benchmark-shape evidence only; it must not be converted into hosted scheduler, ChatGPT Tasks, Pulse, provider-backed private-corpus, notification, or silent source-mutation claims." ], "summary": { "improved": [ @@ -46,16 +47,15 @@ "preference_evolution", "reviewable_consolidation", "memory_summary_top_of_mind_behavior", - "proactive_brief_readiness" + "proactive_brief_readiness", + "scheduled_memory_task_readiness" ], "regressed": [], "unchanged": [ "deletion_ttl_tombstone_behavior", "final_competitor_retest_status" ], - "blocked": [ - "scheduled_memory_task_readiness" - ], + "blocked": [], "not_tested": [] }, "stage_gates": [ @@ -418,8 +418,8 @@ { "stage_id": "scheduled_memory_task_readiness", "stage_name": "Scheduled memory task readiness", - "dependent_issue": "XY-926", - "evidence_class": "blocked", + "dependent_issue": "XY-954", + "evidence_class": "fixture_backed", "baseline_commands": [ { "command": "cargo make real-world-memory-consolidation", @@ -429,15 +429,22 @@ ], "post_stage_commands": [ { - "command": "cargo make real-world-memory-consolidation", - "required_artifact": "tmp/real-world-memory/consolidation/report.json" + "command": "cargo make real-world-memory-scheduled", + "required_artifact": "tmp/real-world-memory/scheduled/report.json" }, { - "command": "cargo make real-world-memory-live-adapters", - "required_artifact": "tmp/real-world-memory/live-adapters/" + 
"command": "cargo make real-world-memory", + "required_artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo test -p elf-eval --test real_world_job_benchmark scheduled_memory -- --test-threads=1", + "required_artifact": "target/debug/deps/real_world_job_benchmark-*" } ], "evidence_files": [ + "apps/elf-eval/fixtures/real_world_memory/scheduled_memory/", + "docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md", + "docs/research/2026-06-16-scheduled-memory-task-scoring-report.json", "docs/spec/system_consolidation_proposals_v1.md", "docs/research/2026-06-08-agent-memory-selection.json" ], @@ -449,10 +456,28 @@ "not_encoded": 0 }, "baseline_basis": "The consolidation spec permits fixture and manual job_kind only; scheduled is explicitly future work and no scheduled-memory-task benchmark is encoded.", - "comparison_judgment": "blocked", - "regression_rule": "Adding scheduled tasks without reviewable output, immutable source snapshots, and explicit operator review is a regression.", - "improvement_rule": "An improvement requires a scheduled-task fixture or live report that keeps task output reviewable and records provider/private boundaries as typed blockers.", - "next_optimization_direction": "Model scheduled tasks as queued derived proposal runs first; do not allow a scheduler to mutate authoritative memory silently." 
+ "post_stage_counts": { + "pass": 4, + "wrong_result": 0, + "blocked": 1, + "not_tested": 0, + "not_encoded": 0, + "task_runs": 4, + "outputs": 5, + "evidence_ref_coverage": 1.0, + "freshness_coverage": 1.0, + "action_rationale_coverage": 1.0, + "trace_coverage": 1.0, + "invalid_current_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_mutation_count": 0 + }, + "post_stage_basis": "XY-954 adds five scheduled_memory fixture jobs: weekly project status summary, stale preference/plan audit, stale decision audit, knowledge-page refresh suggestion, and a typed private/provider scheduler blocker tied to XY-930. The four runnable jobs pass with five evidence-linked outputs, freshness/currentness metadata, action rationale, completed execution trace readback, stale/superseded/tombstone source traces, and zero source mutations.", + "comparison_judgment": "improved", + "regression_rule": "A scheduled-memory task that omits source refs, freshness/currentness markers, execution trace/readback, reviewable action rationale, or silently mutates source memory is a regression.", + "improvement_rule": "An improvement requires direct scheduled-memory fixture or live adapter evidence with source refs, freshness/currentness markers, execution trace/readback, source immutability, and typed blockers for unavailable private/provider scheduler prerequisites.", + "next_optimization_direction": "Move from fixture-backed scheduled task scoring into service-native queued task materialization and operator-visible readback; keep hosted/private/provider scheduler gates behind XY-930 inputs." 
}, { "stage_id": "final_competitor_retest_status", diff --git a/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json new file mode 100644 index 00000000..612802ff --- /dev/null +++ b/docs/research/2026-06-16-scheduled-memory-task-scoring-report.json @@ -0,0 +1,4107 @@ +{ + "schema": "elf.real_world_job_report/v1", + "run_id": "real-world-memory-scheduled", + "generated_at": "2026-06-16T16:29:13.720856Z", + "runner_version": "0.2.0-7f08eb504271123fa861e24e6e6861227682acda-aarch64-apple-darwin", + "corpus_profile": "mixed", + "adapter": { + "adapter_id": "fixture_scheduled_memory", + "name": "ELF scheduled memory fixture", + "behavior": "offline_fixture_response", + "storage": "not_encoded", + "runtime": "not_encoded", + "notes": "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter." + }, + "external_adapters": { + "schema": "elf.real_world_external_adapter_report/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." 
+ ] + }, + "summary": { + "adapter_count": 23, + "external_project_count": 16, + "docker_default_count": 23, + "host_global_install_required_count": 0, + "fixture_backed_count": 1, + "live_baseline_only_count": 6, + "live_real_world_count": 5, + "research_gate_count": 11, + "overall_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 7, + "incomplete": 0, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 4, + "not_encoded": 5 + }, + "capability_status_counts": { + "real": 8, + "mocked": 1, + "unsupported": 6, + "blocked": 22, + "incomplete": 0, + "wrong_result": 10, + "lifecycle_fail": 0, + "pass": 30, + "not_encoded": 26 + }, + "suite_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 23, + "incomplete": 0, + "wrong_result": 7, + "lifecycle_fail": 0, + "pass": 27, + "not_encoded": 38 + }, + "scenario_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 3, + "blocked": 12, + "incomplete": 1, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 23, + "not_encoded": 11 + }, + "scenario_position_counts": { + "wins": 10, + "ties": 11, + "loses": 1, + "untested": 35 + }, + "scenario_outcome_counts": { + "win": 10, + "tie": 11, + "loss": 1, + "not_tested": 17, + "blocked": 13, + "non_goal": 5 + } + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 
unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." + }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." 
+ }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are production-ops operator boundaries plus OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." 
+ ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." 
+ }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." 
+ }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defer/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. 
This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." 
+ ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. 
This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." 
+ ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." 
+ }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. 
These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + 
"status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": 
"pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." 
+ } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": 
"tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." 
+ ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." 
+ }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. 
This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. 
XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. 
The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "guide", + "ref": "docs/guide/research/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. 
XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." 
+ }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/guide/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. 
Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." 
+ ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. 
ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. 
The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." 
+ ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/guide/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. 
The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." 
+ } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." 
+ }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." 
+ } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. 
This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." 
+ ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." 
+ ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/research/2026-06-11-qmd-openviking-strength-profile-report.json" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." 
+ }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." 
+ ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make ragflow-docker-smoke", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make ragflow-docker-smoke", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. 
The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." 
+ } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make ragflow-docker-smoke first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." 
+ } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make lightrag-docker-context-smoke", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make lightrag-docker-context-smoke", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." + }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. 
The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make lightrag-docker-context-smoke", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." 
+ }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." + } + ], + "setup_path": "Run cargo make lightrag-docker-context-smoke for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make lightrag-docker-context-smoke first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. 
XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." + } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make graphrag-docker-smoke", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make graphrag-docker-smoke", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. 
The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make graphrag-docker-smoke", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." + }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." 
+ }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make graphrag-docker-smoke for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make graphrag-docker-smoke first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." + ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." 
+ ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make graphiti-zep-docker-temporal-smoke", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make graphiti-zep-docker-temporal-smoke", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. 
No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." 
+ } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make graphiti-zep-docker-temporal-smoke", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." 
+ }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." + } + ], + "setup_path": "Run cargo make graphiti-zep-docker-temporal-smoke for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make graphiti-zep-docker-temporal-smoke first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." 
+ ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." + } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract is a Docker-only benchmark-created agent export that must return core block JSON, archival search readback, and source ids before any scenario claim is scored." + }, + "run": { + "status": "not_encoded", + "evidence": "No Letta materializer currently creates the benchmark agent, imports the ELF core_archival_memory fixture corpus, or exports comparable core and archival evidence." + }, + "result": { + "status": "not_encoded", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision result is claimed." 
+ }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs now score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." + }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No Letta materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "evidence": "Archival memory decision retrieval is not encoded for Letta." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture coverage exists, but Letta has no contained export/readback artifact for the same core-vs-archival jobs." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. 
Letta has no comparable exported core block attachment evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_attachment.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains unscored without a contained export of agent, block, and visibility metadata.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_scope.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains not_tested until exported core memory includes stable source ids and audit-equivalent events.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/core_block_provenance.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. 
Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/stale_core_detection.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/archival_fallback.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. Letta project-decision recovery remains not_tested until the contained export/readback contract exists.", + "artifact": "apps/elf-eval/fixtures/real_world_memory/core_archival_memory/project_decision_recovery.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/letta-ai/letta", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker/", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta repository", + "url": "https://github.com/letta-ai/letta", + "evidence": "Official source for Letta stateful agents and memory." + }, + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker/", + "evidence": "Official Docker deployment guide and embedding configuration boundary." 
+ } + ], + "setup_path": "Use a Docker-only Letta server or CLI flow that creates a benchmark-owned agent, loads the checked-in core_archival_memory fixture corpus, writes core memory and archival memory with fixture source ids, then exports core block JSON plus archival search/readback JSON.", + "runtime_boundary": "Docker-only Letta server or CLI flow with benchmark-created agents, benchmark-owned storage, no host-global state, and no unstated hosted service dependency.", + "resource_expectation": "Embedding model, agent server state, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Create a tiny Docker agent with core memory and archival memory loaded from the ELF core_archival_memory fixtures.", + "Export core block readback, archival search results, source ids, and any audit-equivalent metadata as JSON before scoring.", + "Score core-versus-archival scenarios only after source evidence can be exported and mapped to the fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selects the contained export/readback contract, but the Letta adapter remains blocked until that artifact exists" + }, + "notes": [] + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." 
+ }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." + }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." 
+ } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." 
+ }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." 
+ } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." 
+ ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + }, + "notes": [] + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make graphify-docker-graph-report-smoke", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make graphify-docker-graph-report-smoke", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. 
This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make graphify-docker-graph-report-smoke", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." 
+ } + ], + "setup_path": "Run cargo make graphify-docker-graph-report-smoke to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make graphify-docker-graph-report-smoke first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." 
+ } + } + ] + }, + "capture_integration": { + "real": [], + "fixture_backed": [], + "mocked": [], + "blocked": [], + "not_encoded": [ + "No capture/integration behavior was declared by encoded fixtures." + ], + "notes": [] + }, + "summary": { + "job_count": 5, + "encoded_suite_count": 1, + "pass": 4, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 1, + "not_encoded": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_total": 10, + "expected_evidence_matched": 10, + "expected_evidence_recall": 1.0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "wrong_result_stage_attribution_count": 0, + "mean_score": 0.8, + "mean_latency_ms": 2.0, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "evidence_required_count": 10, + "evidence_covered_count": 10, + "evidence_coverage": 1.0, + "source_ref_required_count": 10, + "source_ref_covered_count": 10, + "source_ref_coverage": 1.0, + "quote_required_count": 10, + "quote_covered_count": 10, + "quote_coverage": 1.0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_correctness": 0.0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case_count": 0, + "qdrant_rebuild_pass_count": 0, + "operator_debug_job_count": 0, + "raw_sql_needed_count": 0, + "trace_incomplete_count": 0, + "operator_ux_gap_count": 0, + "consolidation": { + "proposal_count": 0, + "proposal_usefulness": null, + "lineage_completeness": null, + "review_action_correctness": null, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gap_count": 0 + }, + "scheduled_memory": { + "job_count": 4, 
+ "task_run_count": 4, + "output_count": 5, + "required_task_kind_count": 4, + "covered_required_task_kind_count": 4, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 5, + "evidence_ref_output_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "trace_required_count": 4, + "trace_complete_count": 4, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 2, + "non_current_output_count": 3, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 3, + "source_trace_tombstone_count": 1 + } + }, + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "project_decisions", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "memory_summary", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "proactive_brief", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." + }, + { + "suite_id": "context_trajectory", + "status": "not_encoded", + "encoded_job_count": 0, + "score_mean": null, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": null, + "irrelevant_context_ratio": null, + "trace_explainability_count": 0, + "reason": "No checked-in real_world_job fixture is encoded for this suite." 
+ } + ], + "jobs": [ + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled knowledge-page refresh suggestion: suggest a reviewable rebuild because lint found the old scheduled-memory blocked state, and do not silently rewrite source notes.", + "produced_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + 
"freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "Block private/provider scheduled tasks without operator inputs", + "status": "blocked", + "answer_type": "scheduled_memory_task", + "requires_caveat": true, + "requires_refusal": true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + 
"produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": null, + "cost": null, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-decision-audit-001", + "title": "Audit a stale project decision during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + 
"evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale decision audit: the consolidation-only readiness decision is superseded by the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "produced_evidence": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 1, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + 
"source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-preference-plan-audit-001", + "title": "Audit stale preferences and plans during a scheduled task", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": 
"scheduled_silent_mutation_rejected", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale preference/plan audit: the old report plan is expired, the silent-mutation preference is historical, and the current path requires trace/readback plus reviewable derived output.", + "produced_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 2, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_output_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + "non_current_output_count": 2, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + 
"source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-weekly-project-status-summary-001", + "title": "Run a weekly project status summary from current memory", + "status": "pass", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite" + } + ], + "produced_answer": "Weekly scheduled summary: run cargo make real-world-memory-scheduled, update the XY-951 scheduled-memory-task readiness ledger, and do not claim hosted scheduled-product parity from fixture evidence.", + "produced_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" 
+ ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + 
}, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + } + ], + "unsupported_claims": [], + "not_encoded_suites": [ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "memory_evolution", + "consolidation", + "memory_summary", + "proactive_brief", + "knowledge_compilation", + "operator_debugging_ux", + "capture_integration", + "production_ops", + "personalization", + "core_archival_memory", + "context_trajectory" + ], + "private_corpus_redaction": { + "policy": "publish evidence ids and bounded score summaries only; do not publish private text", + "private_fixture_count": 1 + }, + "evolution": { + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0 + }, + "follow_ups": [ + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "XY-930 private/provider scheduled-memory input gate", + "reason": "Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist." + } + ] +} \ No newline at end of file