Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,9 @@ args = [
# | real-world-memory-proactive-brief | composite | |
# | real-world-memory-proactive-brief-json | command | |
# | real-world-memory-proactive-brief-report | command | |
# | real-world-memory-scheduled | composite | |
# | real-world-memory-scheduled-json | command | |
# | real-world-memory-scheduled-report | command | |
# | real-world-memory-live-consolidation | command | |
# | real-world-job-operator-ux | composite | |
# | real-world-job-operator-ux-json | command | |
Expand Down Expand Up @@ -935,6 +938,55 @@ args = [
"tmp/real-world-memory/proactive-brief/report.md",
]

# Composite entry point for the scheduled-memory suite: it declares no command
# of its own and only pulls in the report task through `dependencies`.
[tasks.real-world-memory-scheduled]
workspace = false
dependencies = ["real-world-memory-scheduled-report"]

# Runs the `real_world_job_benchmark` binary from the `elf-eval` package with
# its `run` subcommand over the scheduled_memory fixture directory and writes
# the result to tmp/real-world-memory/scheduled/report.json.
# Arguments are grouped one flag/value pair per line; order is unchanged.
[tasks.real-world-memory-scheduled-json]
workspace = false
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    # Everything after "--" is handed to the benchmark binary itself.
    "--",
    "run",
    "--fixtures", "apps/elf-eval/fixtures/real_world_memory/scheduled_memory",
    "--out", "tmp/real-world-memory/scheduled/report.json",
    "--run-id", "real-world-memory-scheduled",
    "--adapter-id", "fixture_scheduled_memory",
    "--adapter-name", "ELF scheduled memory fixture",
]

# Depends on the JSON task, then invokes the same benchmark binary with its
# `publish` subcommand, passing the JSON report as `--report` and
# tmp/real-world-memory/scheduled/report.md as `--out`.
# NOTE(review): presumably renders the JSON report to Markdown — confirm
# against the `publish` implementation.
[tasks.real-world-memory-scheduled-report]
workspace = false
dependencies = ["real-world-memory-scheduled-json"]
command = "cargo"
args = [
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    # Everything after "--" is handed to the benchmark binary itself.
    "--",
    "publish",
    "--report", "tmp/real-world-memory/scheduled/report.json",
    "--out", "tmp/real-world-memory/scheduled/report.md",
]

[tasks.real-world-memory-live-consolidation]
workspace = false
command = "bash"
Expand Down
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,14 @@ provider-backed ELF evidence was required.
its pinned Docker local embedding path and is reported as `wrong_result` when
same-corpus evidence terms are missed; claude-mem and OpenViking non-retrieval
coverage remain typed non-pass states.
- Real-world agent memory aggregate after XY-953: 55 fixture-backed
jobs across 15 suites, 49 pass, 0 incomplete, 6 blocked, 0 wrong-result,
- Real-world agent memory aggregate after XY-954: 60 fixture-backed
jobs across 16 suites, 53 pass, 0 incomplete, 7 blocked, 0 wrong-result,
0 not-encoded, and 0 unsupported-claim results. The remaining non-pass jobs are
production-ops operator boundaries plus blocked OpenViking staged trajectory,
hierarchy selection, recursive/context expansion measurement gates, and the
private-corpus refresh blocker tied to XY-930, not hidden benchmark wins. The
private-corpus/private-provider scheduler blockers tied to XY-930, not hidden benchmark wins. The
`scheduled_memory` suite contributes four passing source-linked scheduled task
readbacks plus one typed private/provider scheduler blocker tied to XY-930. The
`core_archival_memory` suite passes 6 fixture jobs for core block attachment, scope,
provenance, stale-core detection, archival fallback, and project-decision recovery;
it does not create an ELF-over-Letta claim. The
Expand Down Expand Up @@ -272,6 +274,7 @@ Detailed evidence and interpretation:
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Single-User Production Runbook](docs/guide/single_user_production.md)
- Benchmark contract:
Expand Down Expand Up @@ -354,6 +357,7 @@ Detailed comparison, mechanism-level analysis, and source map:
- [First-Generation OSS Continuity and Source-Store Report - June 11, 2026](docs/guide/benchmarking/2026-06-11-first-generation-oss-continuity-source-store-report.md)
- [Live Temporal Reconciliation Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-live-temporal-reconciliation-report.md)
- [Proactive Brief Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-proactive-brief-scoring-report.md)
- [Scheduled Memory Task Scoring Report - June 16, 2026](docs/guide/benchmarking/2026-06-16-scheduled-memory-task-scoring-report.md)
- [Live Baseline Benchmark Runbook](docs/guide/benchmarking/live_baseline_benchmark.md)
- [Real-World Agent Memory Benchmark](docs/guide/benchmarking/real_world_agent_memory_benchmark.md)
- [External Memory Improvement Plan](docs/guide/research/external_memory_improvement_plan.md)
Expand All @@ -364,8 +368,8 @@ Detailed comparison, mechanism-level analysis, and source map:
- [RAG/Graph Adapter Feasibility Research Run](docs/research/2026-06-10-xy-882-rag-graph-adapter-feasibility.json)

Latest real-world benchmark report: June 16, 2026. Latest external research refresh:
June 11, 2026; June 16 adds live temporal reconciliation and live consolidation
self-check evidence.
June 11, 2026; June 16 adds live temporal reconciliation, live consolidation
self-check evidence, and fixture-backed scheduled-memory task scoring.

## Documentation

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
"run": {
"status": "blocked",
"evidence": "The current fixture set reports 55 jobs across 15 suites: 49 pass, 0 incomplete, 6 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.",
"evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.",
"command": "cargo make real-world-memory",
"artifact": "tmp/real-world-memory/real-world-memory-report.json"
},
Expand Down Expand Up @@ -96,6 +96,11 @@
"status": "blocked",
"evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930."
},
{
"suite_id": "scheduled_memory",
"status": "blocked",
"evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930."
},
{
"suite_id": "knowledge_compilation",
"status": "pass",
Expand Down
Loading