From 51b55d7eee9acd3b89a984dc02c1e165387392b1 Mon Sep 17 00:00:00 2001
From: Alisha Kawaguchi <alisha@entire.io>
Date: Mon, 9 Mar 2026 12:20:56 -0700
Subject: [PATCH 1/4] Register local plugin marketplace for e2e and
 agent-integration plugins

Creates a local marketplace at .claude/plugins/ wrapping both the e2e and
agent-integration plugins, and registers it via extraKnownMarketplaces in
project settings so plugin subcommands (/e2e:debug, /agent-integration:research,
etc.) are discoverable by Claude Code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 13d3f0b6de12
---
 .../plugins/.claude-plugin/marketplace.json    | 18 ++++++++++++++++++
 .claude/settings.json                          | 12 ++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 .claude/plugins/.claude-plugin/marketplace.json

diff --git a/.claude/plugins/.claude-plugin/marketplace.json b/.claude/plugins/.claude-plugin/marketplace.json
new file mode 100644
index 000000000..8ae67e6ff
--- /dev/null
+++ b/.claude/plugins/.claude-plugin/marketplace.json
@@ -0,0 +1,18 @@
+{
+  "name": "entire-dev-tools",
+  "owner": {
+    "name": "Entire Team"
+  },
+  "plugins": [
+    {
+      "name": "e2e",
+      "source": "./e2e",
+      "description": "E2E test triage, debugging, and fix implementation toolkit"
+    },
+    {
+      "name": "agent-integration",
+      "source": "./agent-integration",
+      "description": "Multi-phase toolkit for integrating a new AI coding agent with the Entire CLI"
+    }
+  ]
+}
diff --git a/.claude/settings.json b/.claude/settings.json
index f7491410e..5bdb33ebb 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -1,4 +1,16 @@
 {
+  "extraKnownMarketplaces": {
+    "entire-dev-tools": {
+      "source": {
+        "source": "directory",
+        "path": "./.claude/plugins"
+      }
+    }
+  },
+  "enabledPlugins": {
+    "e2e@entire-dev-tools": true,
+    "agent-integration@entire-dev-tools": true
+  },
   "hooks": {
     "SessionStart": [
       {

From 994602b6d108bb253243c640354a32d6c8da4f48 Mon Sep 17 00:00:00 2001
From: Alisha Kawaguchi <alisha@entire.io>
Date: Mon, 9 Mar 2026 12:33:48 -0700
Subject: [PATCH 2/4] Add e2e plugin, skills, and update agent-integration
 skills

Adds the e2e plugin (commands for debug, implement, triage-ci), e2e skills,
and updates agent-integration skills. These complement the marketplace
registration from the previous commit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 8d3b93e7ecd7
---
 .../plugins/e2e/.claude-plugin/plugin.json    |   5 +
 .claude/plugins/e2e/README.md                 |  15 ++
 .claude/plugins/e2e/commands/debug.md         |   7 +
 .claude/plugins/e2e/commands/implement.md     |   7 +
 .claude/plugins/e2e/commands/triage-ci.md     |   7 +
 .claude/skills/agent-integration/SKILL.md     |   2 +-
 .../skills/agent-integration/implementer.md   |  20 +-
 .../skills/agent-integration/test-writer.md   |   2 +-
 .claude/skills/e2e/SKILL.md                   |  32 ++++
 .claude/skills/e2e/debug.md                   |  88 +++++++++
 .claude/skills/e2e/implement.md               | 113 +++++++++++
 .claude/skills/e2e/triage-ci.md               | 176 ++++++++++++++++++
 12 files changed, 462 insertions(+), 12 deletions(-)
 create mode 100644 .claude/plugins/e2e/.claude-plugin/plugin.json
 create mode 100644 .claude/plugins/e2e/README.md
 create mode 100644 .claude/plugins/e2e/commands/debug.md
 create mode 100644 .claude/plugins/e2e/commands/implement.md
 create mode 100644 .claude/plugins/e2e/commands/triage-ci.md
 create mode 100644 .claude/skills/e2e/SKILL.md
 create mode 100644 .claude/skills/e2e/debug.md
 create mode 100644 .claude/skills/e2e/implement.md
 create mode 100644 .claude/skills/e2e/triage-ci.md

diff --git a/.claude/plugins/e2e/.claude-plugin/plugin.json b/.claude/plugins/e2e/.claude-plugin/plugin.json
new file mode 100644
index 000000000..7dd6d24b5
--- /dev/null
+++ b/.claude/plugins/e2e/.claude-plugin/plugin.json
@@ -0,0 +1,5 @@
+{
+  "name": "e2e",
+  "description": "E2E test triage, debugging, and fix implementation toolkit",
+  "version": "1.0.0"
+}
diff --git a/.claude/plugins/e2e/README.md b/.claude/plugins/e2e/README.md
new file mode 100644
index 000000000..a69d4d144
--- /dev/null
+++ b/.claude/plugins/e2e/README.md
@@ -0,0 +1,15 @@
+# E2E Plugin
+
+Local plugin providing individual commands for E2E test triage and debugging.
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `/e2e:triage-ci` | Triage failures via local reruns or CI artifacts, classify flaky vs real-bug, present findings report |
+| `/e2e:debug` | Deep-dive artifact analysis for root cause diagnosis |
+| `/e2e:implement` | Apply fixes from triage/debug findings, verify with E2E tests |
+
+## Related
+
+- Orchestrator skill: `.claude/skills/e2e/SKILL.md` (`/e2e` — runs triage-ci then implement)
diff --git a/.claude/plugins/e2e/commands/debug.md b/.claude/plugins/e2e/commands/debug.md
new file mode 100644
index 000000000..609955117
--- /dev/null
+++ b/.claude/plugins/e2e/commands/debug.md
@@ -0,0 +1,7 @@
+---
+description: "Deep-dive artifact analysis for diagnosing E2E test failures"
+---
+
+# Debug Command
+
+Read and follow the full procedure from `.claude/skills/e2e/debug.md`.
diff --git a/.claude/plugins/e2e/commands/implement.md b/.claude/plugins/e2e/commands/implement.md
new file mode 100644
index 000000000..84cf00a41
--- /dev/null
+++ b/.claude/plugins/e2e/commands/implement.md
@@ -0,0 +1,7 @@
+---
+description: "Apply fixes from triage/debug findings, verify with scoped E2E tests"
+---
+
+# Implement Command
+
+Read and follow the full procedure from `.claude/skills/e2e/implement.md`.
diff --git a/.claude/plugins/e2e/commands/triage-ci.md b/.claude/plugins/e2e/commands/triage-ci.md
new file mode 100644
index 000000000..253f89633
--- /dev/null
+++ b/.claude/plugins/e2e/commands/triage-ci.md
@@ -0,0 +1,7 @@
+---
+description: "Run failing E2E tests locally, classify flaky vs real-bug, present findings report"
+---
+
+# Triage CI Command
+
+Read and follow the full procedure from `.claude/skills/e2e/triage-ci.md`.
diff --git a/.claude/skills/agent-integration/SKILL.md b/.claude/skills/agent-integration/SKILL.md
index 1c099efd7..d2b990c7b 100644
--- a/.claude/skills/agent-integration/SKILL.md
+++ b/.claude/skills/agent-integration/SKILL.md
@@ -52,7 +52,7 @@ This skill enforces strict E2E-first test-driven development. The rules:
 3. **Unit tests are written last.** After all E2E tiers pass (Step 14), you write unit tests using real data collected from E2E runs as golden fixtures.
 4. **If you didn't watch it fail, you don't know if it tests the right thing.** Never write a test you haven't seen fail first.
 5. **Minimum viable fix.** At each E2E failure, implement only the code needed to fix that failure. Don't anticipate future tiers.
-6. **`/debug-e2e` is your debugger.** When an E2E test fails, use the artifact directory with `/debug-e2e` before guessing at fixes.
+6. **`/e2e:debug` is your debugger.** When an E2E test fails, use the artifact directory with `/e2e:debug` before guessing at fixes.
 
 ## Pipeline
 
diff --git a/.claude/skills/agent-integration/implementer.md b/.claude/skills/agent-integration/implementer.md
index 70bbbb191..a4d781515 100644
--- a/.claude/skills/agent-integration/implementer.md
+++ b/.claude/skills/agent-integration/implementer.md
@@ -13,7 +13,7 @@ Build the agent Go package using strict E2E-first TDD. Unit tests are written ON
 1. **E2E tests are the spec.** The existing `ForEachAgent` test scenarios define "working". You implement until they pass.
 2. **Watch it fail first.** Every E2E tier starts by running the test and observing the failure. If you haven't seen the failure, you don't understand what needs fixing.
 3. **Minimum viable fix.** At each failure, implement only the code needed to make that specific assertion pass. Don't anticipate future tiers.
-4. **`/debug-e2e` is your debugger.** When an E2E test fails, use the artifact directory with `/debug-e2e` before guessing at fixes.
+4. **`/e2e:debug` is your debugger.** When an E2E test fails, use the artifact directory with `/e2e:debug` before guessing at fixes.
 5. **No unit tests during Steps 4-13.** Unit tests are written in Step 14 after all E2E tiers pass, using real data from E2E runs as golden fixtures.
 6. **Format and lint, don't unit test.** Between E2E tiers, run `mise run fmt && mise run lint` to keep code clean. Any earlier `mise run test` invocations (e.g., in Step 3) are strictly compile-only sanity checks — no `mise run test` between E2E tiers (Steps 4-13).
 7. **If you didn't watch it fail, you don't know if it tests the right thing.**
@@ -83,7 +83,7 @@ This test requires no agent prompts — it only exercises hooks, so it's the fas
 
 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestHumanOnlyChangesAndCommits`
 2. **Watch it fail** — read the failure output carefully
-3. If there are artifact dirs, use `/debug-e2e {artifact-dir}` to understand what happened
+3. If there are artifact dirs, use `/e2e:debug {artifact-dir}` to understand what happened
 4. Implement the minimum code to fix the first failure
 5. Repeat until the test passes
 
@@ -105,7 +105,7 @@ The foundational test. This exercises the full agent lifecycle: start session 
 
 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestSingleSessionManualCommit`
 2. **Watch it fail** — read the failure output carefully
-3. Use `/debug-e2e {artifact-dir}` to understand what happened
+3. Use `/e2e:debug {artifact-dir}` to understand what happened
 4. Implement the minimum code to fix the first failure
 5. Repeat until the test passes
 
@@ -127,7 +127,7 @@ Validates transcript quality: JSONL validity, content hash correctness, prompt e
 
 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestCheckpointMetadataDeepValidation`
 2. **Watch it fail** — this test often exposes subtle transcript formatting bugs
-3. Use `/debug-e2e {artifact-dir}` on any failures
+3. Use `/e2e:debug {artifact-dir}` on any failures
 4. Fix and repeat
 
 Run: `mise run fmt && mise run lint`
@@ -146,7 +146,7 @@ Agent creates files and commits them within a single prompt turn. Tests the in-t
 **Cycle:**
 
 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestSingleSessionAgentCommitInTurn`
-2. **Watch it fail** — use `/debug-e2e {artifact-dir}` on failures
+2. **Watch it fail** — use `/e2e:debug {artifact-dir}` on failures
 3. Fix and repeat — if the agent doesn't support committing, skip this test
 
 Run: `mise run fmt && mise run lint`
@@ -164,7 +164,7 @@ Run these tests to validate multi-session behavior:
 **Cycle (for each test):**
 
 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestMultiSessionManualCommit`
-2. **Watch it fail** — use `/debug-e2e {artifact-dir}` on failures
+2. **Watch it fail** — use `/e2e:debug {artifact-dir}` on failures
 3. Fix and repeat
 4. Move to next test
 
@@ -183,7 +183,7 @@ Run these tests for file operation correctness:
 - `TestDeletedFilesCommitDeletion` — Agent deletes a file, user commits the deletion
 - `TestMixedNewAndModifiedFiles` — Agent both creates and modifies files
 
-**Cycle:** Same as above — run each test, **watch it fail**, use `/debug-e2e` on failures, fix, repeat.
+**Cycle:** Same as above — run each test, **watch it fail**, use `/e2e:debug` on failures, fix, repeat.
 
 Run: `mise run fmt && mise run lint`
 
@@ -215,7 +215,7 @@ Run these if the agent supports interactive multi-step sessions:
 - `TestRewindAfterCommit` — Rewind to a checkpoint after committing
 - `TestRewindMultipleFiles` — Rewind with multiple files changed
 
-**Cycle:** Same pattern — run, **watch it fail**, `/debug-e2e` on failures, fix, repeat.
+**Cycle:** Same pattern — run, **watch it fail**, `/e2e:debug` on failures, fix, repeat.
 
 Run: `mise run fmt && mise run lint`
 
@@ -256,7 +256,7 @@ mise run test:e2e --agent $AGENT_SLUG TestFailingTestName
 
 If a test passes when run individually but fails in the full suite, it's a flaky failure — not a real error. Only investigate failures that reproduce consistently when run in isolation.
 
-Fix any real failures before proceeding — the same cycle applies: read the failure, use `/debug-e2e {artifact-dir}`, implement the minimum fix, re-run.
+Fix any real failures before proceeding — the same cycle applies: read the failure, use `/e2e:debug {artifact-dir}`, implement the minimum fix, re-run.
 
 All E2E tests must pass before writing unit tests.
 
@@ -321,7 +321,7 @@ At every E2E failure, follow this protocol:
 
 1. **Read the test output** — the assertion message often tells you exactly what's wrong
 2. **Find the artifact directory** — E2E tests save artifacts (logs, transcripts, git state) to a temp dir printed in the output
-3. **Run `/debug-e2e {artifact-dir}`** — this skill analyzes artifacts and diagnoses the root cause
+3. **Run `/e2e:debug {artifact-dir}`** — this skill analyzes artifacts and diagnoses the root cause
 4. **Implement the minimum fix** — don't over-engineer; fix only what the test demands
 5. **Re-run the failing test** — not the whole suite, just the one test
 
diff --git a/.claude/skills/agent-integration/test-writer.md b/.claude/skills/agent-integration/test-writer.md
index 042561362..cb0ed4716 100644
--- a/.claude/skills/agent-integration/test-writer.md
+++ b/.claude/skills/agent-integration/test-writer.md
@@ -199,7 +199,7 @@ Use `/commit` to commit all files.
 - **Interactive tests**: Use `s.StartSession`, `s.Send`, `s.WaitFor` — tmux pane is auto-captured in artifacts
 - **Run commands**: `mise run test:e2e --agent ${slug} TestName` — see `e2e/README.md` for all options
 - **E2E tests are run during the implement phase**: This phase only creates the runner. The implement phase runs E2E tests at each tier to drive development.
-- **Debugging failures**: If tests fail during the implement phase, use `/debug-e2e` with the artifact directory to diagnose CLI-level issues (hooks, checkpoints, session phases, attribution)
+- **Debugging failures**: If tests fail during the implement phase, use `/e2e:debug` with the artifact directory to diagnose CLI-level issues (hooks, checkpoints, session phases, attribution)
 
 ## Output
 
diff --git a/.claude/skills/e2e/SKILL.md b/.claude/skills/e2e/SKILL.md
new file mode 100644
index 000000000..049336526
--- /dev/null
+++ b/.claude/skills/e2e/SKILL.md
@@ -0,0 +1,32 @@
+---
+name: e2e
+description: >
+  Orchestrate E2E test triage and fix implementation: runs triage-ci then implement sequentially.
+  Accepts test names, --agent, artifact path, or CI run reference.
+  For individual phases, use /e2e:triage-ci, /e2e:debug, or /e2e:implement.
+  Use when the user says "triage e2e", "fix e2e failures", or wants the full triage-to-fix pipeline.
+---
+
+# E2E Triage & Fix — Full Pipeline
+
+Run triage-ci then implement sequentially. Parameters are collected once and reused across both phases.
+
+## Parameters
+
+The user provides one or more of:
+- **Test name(s)** -- e.g., `TestInteractiveMultiStep`
+- **`--agent <agent>`** -- optional, defaults to all agents that previously failed
+- **A local artifact path** -- skip straight to analysis of existing artifacts
+- **CI run reference** -- `latest`, a run ID, or a run URL
+
+## Phase 1: Triage CI
+
+Read and follow the full procedure from `.claude/skills/e2e/triage-ci.md`.
+
+This produces a findings report with a classification (flaky, flaky (test-bug), or real-bug) for each test+agent pair.
+
+## Phase 2: Implement Fixes
+
+Read and follow the full procedure from `.claude/skills/e2e/implement.md`.
+
+Uses the findings from Phase 1 (already in conversation context) to propose, apply, and verify fixes.
diff --git a/.claude/skills/e2e/debug.md b/.claude/skills/e2e/debug.md
new file mode 100644
index 000000000..94f814130
--- /dev/null
+++ b/.claude/skills/e2e/debug.md
@@ -0,0 +1,88 @@
+# Debug Entire CLI via E2E Artifacts
+
+Diagnose Entire CLI bugs using captured artifacts from the E2E test suite. Artifacts are written to `e2e/artifacts/` locally or downloaded from CI via GitHub Actions.
+
+## Inputs
+
+The user provides either:
+- **A test run directory:** `e2e/artifacts/{timestamp}/` -- triage all failures
+- **A specific test directory:** `e2e/artifacts/{timestamp}/{TestName}-{agent}/` -- debug one test
+
+## Artifact Layout
+
+```
+e2e/artifacts/{timestamp}/
+├── report.nocolor.txt          # Pass/fail/skip summary with error lines
+├── test-events.json            # Raw Go test events (NDJSON)
+├── entire-version.txt          # CLI version under test
+└── {TestName}-{agent}/
+    ├── PASS or FAIL            # Status marker
+    ├── console.log             # Full operation transcript
+    ├── git-log.txt             # git log --decorate --graph --all
+    ├── git-tree.txt            # ls-tree HEAD + checkpoint branch
+    ├── entire-logs/entire.log  # CLI structured JSON logs
+    ├── checkpoint-metadata/    # Checkpoint + session metadata
+    └── repo -> /tmp/...        # Symlink to preserved repo (E2E_KEEP_REPOS=1 only)
+```
+
+## Preserved Repo
+
+When the test run was executed with `E2E_KEEP_REPOS=1`, each test's artifact directory contains a `repo` symlink pointing to the preserved temporary git repository. This is the actual repo the test operated on -- you can inspect it directly.
+
+**Navigate via the symlink** (e.g., `{artifact-dir}/repo/`) rather than resolving the `/tmp/...` path. The symlink lives inside the artifact directory so permissions and paths stay consistent.
+
+The preserved repo contains:
+- Full git history with all branches (main, `entire/checkpoints/v1`)
+- The `.entire/` directory with CLI state, config, and raw logs
+- The `.claude/` directory (if Claude Code was the agent)
+- All files the agent created or modified, in their final state
+
+This is the most powerful debugging tool -- you can run `git log`, `git diff`, `git show`, inspect `.entire/` internals, and see exactly what the CLI left behind.
+
+## Debugging Workflow
+
+### 1. Triage (if given a run directory)
+
+Read `report.nocolor.txt` to identify failures and their error messages. Each entry shows the test name, agent, duration, and failure output with file:line references.
+
+### 2. Read console.log (most important)
+
+Full transcript of every operation:
+- `> claude -p "..." ...` -- agent prompts with stdout/stderr
+- `> git add/commit/...` -- git commands
+- `> send: ...` -- interactive session inputs
+
+This tells you what happened chronologically.
+
+### 3. Read test source code
+
+Use the file:line from the report to find the test in `e2e/tests/`. Understand what the test expected to happen vs what console.log shows actually happened.
+
+### 4. Diagnose the CLI behavior
+
+Cross-reference console.log (what happened) with the test (what should have happened). Focus on CLI-level issues:
+
+| Symptom | CLI Investigation |
+|---------|-------------------|
+| Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors |
+| Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info |
+| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) |
+| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering |
+| Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data |
+| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior |
+
+### 5. Deep dive files
+
+- **entire-logs/entire.log**: Structured JSON logs -- hook lifecycle, session phases (`active` -> `idle` -> `ended`), warnings, errors. Key fields: `component`, `hook`, `strategy`, `session_id`.
+- **git-log.txt**: Commit graph showing main branch, `entire/checkpoints/v1`, checkpoint initialization.
+- **git-tree.txt**: Files at HEAD vs checkpoint branch (separated by `--- entire/checkpoints/v1 ---`).
+- **checkpoint-metadata/**: `metadata.json` has `checkpoint_id`, `strategy`, `files_touched`, `token_usage`, and `sessions` array. Session subdirs have per-session details.
+
+### 6. Report findings
+
+Identify whether the issue is in:
+- **CLI hooks** (prepare-commit-msg, commit-msg, post-commit)
+- **Session management** (phase transitions, session tracking)
+- **Checkpoint creation** (branch management, metadata writing)
+- **Attribution** (file tracking, prompt correlation)
+- **Strategy logic** (auto-commit vs manual-commit behavior)
diff --git a/.claude/skills/e2e/implement.md b/.claude/skills/e2e/implement.md
new file mode 100644
index 000000000..342c87213
--- /dev/null
+++ b/.claude/skills/e2e/implement.md
@@ -0,0 +1,113 @@
+# E2E Implement Fixes
+
+Apply fixes for E2E test failures, verify with scoped E2E tests.
+
+> **IMPORTANT: Running real E2E tests is a HARD REQUIREMENT of this procedure.**
+> Every fix MUST be verified with real E2E tests before the summary step.
+> Canary tests use the Vogon fake agent and cannot catch agent-specific issues.
+> Do NOT skip E2E verification unless the user explicitly declines due to cost.
+
+## Inputs
+
+This procedure accepts findings from one of:
+- **`/e2e:triage-ci` output** -- findings report already in conversation context
+- **`/e2e:debug` output** -- root cause analysis already in conversation context
+- **Standalone description** -- user describes known failure and desired fix
+
+## Step 1: Identify Fixes
+
+From the findings in context, identify actionable fixes:
+
+### For `flaky` failures: describe the proposed fix
+
+For agent-behavior flaky issues, fixes typically modify test prompts. For test-bug flaky issues, fixes target `e2e/` infrastructure code (harness setup, helpers, env propagation).
+
+```
+**Proposed fix:** <description>
+  - File: <path to test file or e2e infrastructure file>
+  - Change: <what will be modified -- e.g., append "Do not ask for confirmation" to prompt, or fix env propagation in NewTmuxSession>
+```
+
+Common flaky fixes:
+- Agent asked for confirmation -> append "Do not ask for confirmation" to prompt
+- Agent wrote to wrong path -> be more explicit about paths in prompt
+- Agent committed when shouldn't -> add "Do not commit" to prompt
+- Checkpoint wait timeout -> increase timeout argument
+- Agent timeout (signal: killed) -> increase per-test timeout, simplify prompt
+- Auth/env not propagated -> fix test harness env setup in `e2e/` code
+- Test helper bug (wrong assertion, bad glob) -> fix test helper in `e2e/`
+- tmux session setup issue -> fix `NewTmuxSession` or session config in `e2e/`
+
+### For `real-bug` failures: describe root cause analysis
+
+```
+**Root cause analysis:**
+  - Component: <hooks | session | checkpoint | strategy | agent>
+  - Suspected location: <file:function>
+  - Description: <what's wrong and why>
+  - Proposed fix: <what code change would address it>
+```
+
+## Step 2: Ask the User
+
+Prompt the user:
+
+> **Should I fix these?**
+> - [list of tests with classifications and proposed fixes]
+> - You can select all, specific tests, or skip.
+
+Wait for user response before proceeding.
+
+## Step 3: Apply Fixes
+
+For **flaky** fixes the user approved:
+1. Apply fixes directly in the working tree (no branch creation)
+2. Run static checks and canary tests:
+   ```bash
+   mise run fmt && mise run lint
+   mise run test:e2e:canary   # Must pass
+   ```
+3. **Run real E2E tests to verify the fix.** Scope depends on what was changed:
+   - **Agent-specific fix** (e.g., `e2e/agents/cursor_cli.go`, one agent's config/trust/env): run the full suite for that agent only:
+     ```bash
+     mise run test:e2e --agent <agent>
+     ```
+   - **Shared test infra fix** (e.g., `e2e/agents/agent.go`, `e2e/testutil/`, `TmuxSession`, test helpers): run the full suite for all agents that failed, since the fix could affect any of them:
+     ```bash
+     mise run test:e2e --agent <agent1>
+     mise run test:e2e --agent <agent2>
+     # ... for each agent that had failures
+     ```
+   - **Test prompt fix** (e.g., changed wording in a specific test): run that test across all agents that failed it:
+     ```bash
+     mise run test:e2e --agent <agent> <TestName>
+     ```
+   **This step is MANDATORY** -- canary tests use the Vogon fake agent and cannot verify agent-specific behavior (trust dialogs, env propagation, config directories, etc.).
+4. If any step fails, investigate and adjust. Report what happened to the user.
+
+For **real-bug** fixes the user approved:
+1. Apply the fix directly in the working tree (no branch creation)
+2. Run static checks and unit tests:
+   ```bash
+   mise run fmt && mise run lint
+   mise run test        # Unit tests
+   mise run test:e2e:canary  # Canary tests
+   ```
+3. **Run real E2E tests to verify the fix (MANDATORY).** Same scoping rules as flaky fixes above:
+   - **Agent-specific change** -> full suite for that agent
+   - **Shared CLI/infra change** -> full suite for all agents that failed
+   - **Narrow change** (single test affected) -> just that test across affected agents
+4. Report results to the user.
+
+**GATE: Do NOT proceed to the summary until real E2E tests have been run and results reported for every fix applied above.** If E2E tests were not run, go back and run them now.
+
+## Step 4: Summary
+
+Print a summary table:
+```
+| Test | Agent(s) | Classification | Action Taken |
+|------|----------|----------------|--------------|
+| TestFoo | claude-code | flaky | Fixed in working tree |
+| TestBar | all agents | real-bug | Fix applied, tests passing |
+| TestBaz | opencode | flaky | Skipped (user declined) |
+```
diff --git a/.claude/skills/e2e/triage-ci.md b/.claude/skills/e2e/triage-ci.md
new file mode 100644
index 000000000..19f36b546
--- /dev/null
+++ b/.claude/skills/e2e/triage-ci.md
@@ -0,0 +1,176 @@
+# E2E Triage CI
+
+Triage E2E test failures with **re-run verification**. Re-runs failing tests locally (or analyzes downloaded CI artifacts) to distinguish flaky failures from real bugs, including deep analysis of the artifacts and the relevant code. Produces a findings report with classifications -- does NOT apply fixes.
+
+---
+
+## Step L1: Parse User Input
+
+The user provides one or more of:
+- **Test name(s)** -- e.g., `TestInteractiveMultiStep`
+- **`--agent <agent>`** -- optional, defaults to all agents that previously failed
+- **A local artifact path** -- skip straight to analysis of existing artifacts
+- **CI run reference** -- triggers artifact download instead of local re-runs:
+  - `latest CI run` / `latest` -- most recent failed E2E run on main
+  - A GitHub Actions run ID (numeric, e.g., `12345678`)
+  - A GitHub Actions run URL
+
+**CI artifact download:** When a CI run reference is provided, download artifacts using:
+
+```bash
+scripts/download-e2e-artifacts.sh <latest | RUN_ID | RUN_URL>
+```
+
+The script outputs the absolute artifact path as its **last line of stdout** -- capture that and use it as the artifact path for analysis. After downloading, **skip Steps L2-L5** (local re-runs) and go straight to **Shared Analysis** (Step 1), since we're analyzing CI artifacts, not running tests locally.
+
+**Cost warning:** Real E2E tests consume API tokens. Before running, confirm with the user unless they provided specific test names (implying intent to run).
+
+## Step L2: First Run
+
+```bash
+mise run test:e2e --agent <agent> <TestName>
+```
+
+Capture the artifact directory from the `artifacts: <path>` output line.
+
+## Step L3: Re-run on Failure
+
+If the test **passes** on first run: report as passing, done for this test.
+
+If the test **fails**: run a **second time** with the same parameters.
+
+## Step L4: Tiebreaker (if needed)
+
+If results are **split** (1 pass, 1 fail): run a **third time** as tiebreaker.
+
+## Step L5: Collect Results
+
+For each test+agent pair, record: `(test, agent, run_1_result, run_2_result, [run_3_result])`
+
+Proceed to **Shared Analysis** (Step 1 below).
+
+---
+
+## Shared Analysis & Classification
+
+### Step 1: Analyze Each Failure
+
+For each failure, follow the **Debugging Workflow** in `.claude/skills/e2e/debug.md` (steps 2-5: console.log -> test source -> entire.log -> deep dive). Collect:
+- What the agent actually did (from console.log)
+- What was expected (from test source)
+- CLI-level errors or anomalies (from entire.log)
+- Repo/checkpoint state (from git-log.txt, git-tree.txt, checkpoint-metadata/)
+
+### Step 2: Classify Each Failure
+
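+Use **re-run results as the primary signal** when local re-runs were performed (Steps L2-L5), supplemented by artifact analysis. When only downloaded CI artifacts are available, classify from the artifact evidence alone.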
+
+#### Re-run signals (strongest):
+
+| Original | Re-run 1 | Re-run 2 | Classification |
+|----------|----------|----------|----------------|
+| FAIL | FAIL (same error) | FAIL (same error) | **real-bug** OR **flaky (test-bug)** -- see below |
+| FAIL | PASS | PASS | **flaky** |
+| FAIL | PASS | FAIL | **flaky** (non-deterministic) |
+| FAIL | FAIL | PASS | **flaky** (non-deterministic) |
+| FAIL | FAIL (different error) | FAIL (different error) | **needs deeper analysis** -- examine artifacts |
+
+**Important: Consistent failures can still be `flaky` (test-bug).** When all re-runs fail, check *where* the root cause is:
+- Root cause in `cmd/entire/cli/` -> **real-bug** (product code is broken)
+- Root cause in `e2e/` (test infra, test helpers, tmux setup, env propagation) -> **flaky (test-bug)** -- the CLI works fine, the test is broken
+
+#### Strong `real-bug` signals (root cause must be in `cmd/entire/cli/`, not `e2e/`):
+
+- `entire.log` contains `"level":"ERROR"` or panic/stack traces from CLI code
+- Checkpoint metadata structurally corrupt (malformed JSON, missing `checkpoint_id`/`strategy`)
+- Session state file missing or malformed when expected
+- Hooks did not fire at all (no `hook invoked` log entries)
+- Shadow/metadata branch has wrong tree structure
+- Same test fails across 3+ agents with same non-timeout symptom
+- Error references CLI code (panic in `cmd/entire/cli/`)
+
+**Key question:** Is the bug in `cmd/entire/cli/` (product code) or in `e2e/` (test code)? Only the former is a `real-bug`.
+
+#### Strong `flaky` signals (unless overridden by real-bug):
+
+**Agent behavior (non-deterministic):**
+- `signal: killed` (timeout)
+- `context deadline exceeded` or `WaitForCheckpoint.*exceeded deadline`
+- Agent asked for confirmation instead of acting
+- Agent created file at wrong path / wrong name
+- Agent produced no output
+- Agent committed when it shouldn't have (or vice versa)
+- Duration near timeout limit
+
+**Test-bug (consistent failure, but root cause is in `e2e/` not `cmd/entire/cli/`):**
+- Agent "Not logged in" / auth errors -> test env setup doesn't propagate auth credentials
+- Env vars not propagated to agent session -> tmux/test harness bug
+- Error references test code (`e2e/`) not CLI code (`cmd/entire/cli/`)
+- Test helper logic errors (wrong assertions, bad globs, incorrect expected values)
+- Consistent failure BUT root cause traced to `e2e/` code, not `cmd/entire/cli/`
+- Test setup/teardown issues (missing git config, temp dir cleanup, port conflicts)
+
+#### Ambiguous cases:
+
+Read `entire.log` carefully:
+- If hooks fired correctly and metadata is valid -> lean **flaky**
+- If hooks fired but produced wrong results -> lean **real-bug**
+
+### Step 3: Cross-Agent Correlation
+
+Before reporting, check correlations using re-run data:
+- Same test fails for 3+ agents, all re-runs also fail -> strong **real-bug**
+- Same test fails for multiple agents, but re-runs pass -> **flaky** (shared prompt issue)
+- One agent fails consistently, others pass -> agent-specific issue (still **real-bug** if re-runs confirm)
+
+### Step 4: Present Findings Report
+
+For each test+agent pair, print a findings block:
+
+```
+## <TestName> (<agent>) -- <classification>
+
+**Re-run results:** original=FAIL, rerun1=PASS, rerun2=PASS
+**Evidence:**
+- <1-2 sentence summary of what went wrong>
+- <key artifact evidence: entire.log excerpt, console.log excerpt, etc.>
+```
+
+For `flaky` failures, include proposed fix description:
+
+```
+**Proposed fix:** <description>
+  - File: <path to test file or e2e infrastructure file>
+  - Change: <what will be modified>
+```
+
+Common flaky fixes:
+- Agent asked for confirmation -> append "Do not ask for confirmation" to prompt
+- Agent wrote to wrong path -> be more explicit about paths in prompt
+- Agent committed when shouldn't -> add "Do not commit" to prompt
+- Checkpoint wait timeout -> increase timeout argument
+- Agent timeout (signal: killed) -> increase per-test timeout, simplify prompt
+- Auth/env not propagated -> fix test harness env setup in `e2e/` code
+- Test helper bug (wrong assertion, bad glob) -> fix test helper in `e2e/`
+- tmux session setup issue -> fix `NewTmuxSession` or session config in `e2e/`
+
+For `real-bug` failures, include root cause analysis:
+
+```
+**Root cause analysis:**
+  - Component: <hooks | session | checkpoint | strategy | agent>
+  - Suspected location: <file:function>
+  - Description: <what's wrong and why>
+  - Proposed fix: <what code change would address it>
+```
+
+### Step 5: Summary
+
+Print a summary table (classification only, no "Action Taken"):
+```
+| Test | Agent(s) | Re-runs | Classification |
+|------|----------|---------|----------------|
+| TestFoo | claude-code | FAIL/PASS/PASS | flaky |
+| TestBar | all agents | FAIL/FAIL/FAIL | real-bug |
+| TestBaz | opencode | FAIL/PASS/FAIL | flaky (non-deterministic) |
+```

From 1692c8ee7df34c59276973684f0d481032ddc19b Mon Sep 17 00:00:00 2001
From: Alisha Kawaguchi <alisha@entire.io>
Date: Mon, 9 Mar 2026 12:42:00 -0700
Subject: [PATCH 3/4] Fix inconsistent log paths and improve triage-ci
 description

- Standardize on `entire-logs/entire.log` in debug.md diagnostic table
- Update triage-ci command description to mention CI artifact support

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 2046d09ffe0c
---
 .claude/plugins/e2e/commands/triage-ci.md | 2 +-
 .claude/skills/e2e/debug.md               | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.claude/plugins/e2e/commands/triage-ci.md b/.claude/plugins/e2e/commands/triage-ci.md
index 253f89633..085dafa6d 100644
--- a/.claude/plugins/e2e/commands/triage-ci.md
+++ b/.claude/plugins/e2e/commands/triage-ci.md
@@ -1,5 +1,5 @@
 ---
-description: "Run failing E2E tests locally, classify flaky vs real-bug, present findings report"
+description: "Triage E2E failures via local reruns or CI artifacts, classify flaky vs real-bug, present findings report"
 ---
 
 # Triage CI Command
diff --git a/.claude/skills/e2e/debug.md b/.claude/skills/e2e/debug.md
index 94f814130..567192ecc 100644
--- a/.claude/skills/e2e/debug.md
+++ b/.claude/skills/e2e/debug.md
@@ -66,10 +66,10 @@ Cross-reference console.log (what happened) with the test (what should have happ
 |---------|-------------------|
 | Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors |
 | Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info |
-| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) |
-| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering |
+| Hooks didn't fire | Check `entire-logs/entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) |
+| Stash/unstash problems | Check `entire-logs/entire.log` for stash-related log lines, `git-log.txt` for commit ordering |
 | Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data |
-| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior |
+| Strategy mismatch | Check `entire-logs/entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior |
 
 ### 5. Deep dive files
 

From 34ffb23bae8e7df26bd6a84ae2c5b0844c3acc30 Mon Sep 17 00:00:00 2001
From: Alisha Kawaguchi <alisha@entire.io>
Date: Mon, 9 Mar 2026 12:55:52 -0700
Subject: [PATCH 4/4] Add e2e artifact download script and remove debug-e2e
 skill

- Add scripts/download-e2e-artifacts.sh for downloading CI artifacts
- Remove .claude/skills/debug-e2e/ (consolidated into /e2e:debug command)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Entire-Checkpoint: 15ac9dc10e03
---
 .claude/skills/debug-e2e/SKILL.md |  93 -------------------------
 scripts/download-e2e-artifacts.sh | 111 ++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 93 deletions(-)
 delete mode 100644 .claude/skills/debug-e2e/SKILL.md
 create mode 100755 scripts/download-e2e-artifacts.sh

diff --git a/.claude/skills/debug-e2e/SKILL.md b/.claude/skills/debug-e2e/SKILL.md
deleted file mode 100644
index 3c407768a..000000000
--- a/.claude/skills/debug-e2e/SKILL.md
+++ /dev/null
@@ -1,93 +0,0 @@
----
-name: debug-e2e
-description: Use when investigating E2E test failures from artifacts to diagnose bugs in the Entire CLI, or when pointed at an artifact path for root cause analysis
----
-
-# Debug Entire CLI via E2E Artifacts
-
-Diagnose Entire CLI bugs using captured artifacts from the E2E test suite. Artifacts are written to `e2e/artifacts/` locally or downloaded from CI via GitHub Actions.
-
-## Inputs
-
-The user provides either:
-- **A test run directory:** `e2e/artifacts/{timestamp}/` — triage all failures
-- **A specific test directory:** `e2e/artifacts/{timestamp}/{TestName}-{agent}/` — debug one test
-
-## Artifact Layout
-
-```
-e2e/artifacts/{timestamp}/
-├── report.nocolor.txt          # Pass/fail/skip summary with error lines
-├── test-events.json            # Raw Go test events (NDJSON)
-├── entire-version.txt          # CLI version under test
-└── {TestName}-{agent}/
-    ├── PASS or FAIL            # Status marker
-    ├── console.log             # Full operation transcript
-    ├── git-log.txt             # git log --decorate --graph --all
-    ├── git-tree.txt            # ls-tree HEAD + checkpoint branch
-    ├── entire-logs/entire.log  # CLI structured JSON logs
-    ├── checkpoint-metadata/    # Checkpoint + session metadata
-    └── repo -> /tmp/...        # Symlink to preserved repo (E2E_KEEP_REPOS=1 only)
-```
-
-## Preserved Repo
-
-When the test run was executed with `E2E_KEEP_REPOS=1`, each test's artifact directory contains a `repo` symlink pointing to the preserved temporary git repository. This is the actual repo the test operated on — you can inspect it directly.
-
-**Navigate via the symlink** (e.g., `{artifact-dir}/repo/`) rather than resolving the `/tmp/...` path. The symlink lives inside the artifact directory so permissions and paths stay consistent.
-
-The preserved repo contains:
-- Full git history with all branches (main, `entire/checkpoints/v1`)
-- The `.entire/` directory with CLI state, config, and raw logs
-- The `.claude/` directory (if Claude Code was the agent)
-- All files the agent created or modified, in their final state
-
-This is the most powerful debugging tool — you can run `git log`, `git diff`, `git show`, inspect `.entire/` internals, and see exactly what the CLI left behind.
-
-## Debugging Workflow
-
-### 1. Triage (if given a run directory)
-
-Read `report.nocolor.txt` to identify failures and their error messages. Each entry shows the test name, agent, duration, and failure output with file:line references.
-
-### 2. Read console.log (most important)
-
-Full transcript of every operation:
-- `> claude -p "..." ...` — agent prompts with stdout/stderr
-- `> git add/commit/...` — git commands
-- `> send: ...` — interactive session inputs
-
-This tells you what happened chronologically.
-
-### 3. Read test source code
-
-Use the file:line from the report to find the test in `e2e/tests/`. Understand what the test expected to happen vs what console.log shows actually happened.
-
-### 4. Diagnose the CLI behavior
-
-Cross-reference console.log (what happened) with the test (what should have happened). Focus on CLI-level issues:
-
-| Symptom | CLI Investigation |
-|---------|-------------------|
-| Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors |
-| Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info |
-| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) |
-| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering |
-| Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data |
-| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior |
-
-### 5. Deep dive files
-
-- **entire-logs/entire.log**: Structured JSON logs — hook lifecycle, session phases (`active` → `idle` → `ended`), warnings, errors. Key fields: `component`, `hook`, `strategy`, `session_id`.
-- **git-log.txt**: Commit graph showing main branch, `entire/checkpoints/v1`, checkpoint initialization.
-- **git-tree.txt**: Files at HEAD vs checkpoint branch (separated by `--- entire/checkpoints/v1 ---`).
-- **checkpoint-metadata/**: `metadata.json` has `checkpoint_id`, `strategy`, `files_touched`, `token_usage`, and `sessions` array. Session subdirs have per-session details.
-
-### 6. Report findings
-
-Identify whether the issue is in:
-- **CLI hooks** (prepare-commit-msg, commit-msg, post-commit)
-- **Session management** (phase transitions, session tracking)
-- **Checkpoint creation** (branch management, metadata writing)
-- **Attribution** (file tracking, prompt correlation)
-- **Strategy logic** (auto-commit vs manual-commit behavior)
diff --git a/scripts/download-e2e-artifacts.sh b/scripts/download-e2e-artifacts.sh
new file mode 100755
index 000000000..b2663c906
--- /dev/null
+++ b/scripts/download-e2e-artifacts.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+#
+# Download E2E test artifacts from GitHub Actions.
+#
+# Usage: scripts/download-e2e-artifacts.sh [RUN_ID | RUN_URL | "latest"]
+#   RUN_ID:  numeric GitHub Actions run ID
+#   RUN_URL: full URL like https://github.com/entireio/cli/actions/runs/12345
+#   "latest": most recent failed E2E run on main
+#
+# Outputs the absolute path of the download directory as the last line of stdout.
+# All diagnostic messages go to stderr.
+
+set -e -u -o pipefail
+
+log() { echo "$@" 1>&2; }                  # write a diagnostic line to stderr
+die() { echo "ERROR: $1" 1>&2; exit 1; }   # report a fatal error on stderr, then exit 1
+
+# --- Validate prerequisites: the GitHub CLI must be installed and authenticated ---
+
+if ! command -v gh >/dev/null 2>&1; then die "'gh' CLI is not installed. Install from https://cli.github.com/"; fi
+if ! gh auth status >/dev/null 2>&1; then die "'gh' is not authenticated. Run 'gh auth login' first."; fi
+
+# --- Parse input ---
+# Accepts "latest", an Actions run URL, or a numeric run ID; leaves the numeric ID in run_id.
+input="${1:-}"
+[ -z "$input" ] && die "Usage: $0 [RUN_ID | RUN_URL | \"latest\"]"
+
+run_id=""
+case "$input" in
+  latest)
+    log "Finding most recent failed E2E run on main..."
+    # A failing gh must not abort under 'set -e' before the empty check below can report it.
+    run_id=$(gh run list -w e2e.yml --branch main --status=failure -L1 --json databaseId -q '.[0].databaseId' 2>/dev/null) || run_id=""
+    [ -z "$run_id" ] && die "No failed E2E runs found."
+    log "Found run: $run_id"
+    ;;
+  http*)
+    # Extract run ID from URL: https://github.com/<owner>/<repo>/actions/runs/<id>
+    # (a bash regex, unlike a grep pipeline, cannot trip pipefail when the URL has no run ID)
+    if [[ "$input" =~ /runs/([0-9]+) ]]; then run_id="${BASH_REMATCH[1]}"; fi
+    [ -z "$run_id" ] && die "Could not extract run ID from URL: $input"
+    log "Extracted run ID: $run_id"
+    ;;
+  *[!0-9]*)
+    die "Invalid input: '$input'. Provide a numeric run ID, a GitHub Actions URL, or 'latest'."
+    ;;
+  *)
+    run_id="$input"
+    ;;
+esac
+
+# --- Fetch run metadata (the run URL is required; the commit SHA is best-effort) ---
+
+log "Fetching run metadata..."
+if ! run_url=$(gh run view "$run_id" --json url -q '.url' 2>/dev/null); then die "Run $run_id not found."; fi
+if ! commit=$(gh run view "$run_id" --json headSha -q '.headSha' 2>/dev/null); then commit="unknown"; fi
+
+log "Run URL: $run_url"
+log "Commit:  $commit"
+
+# --- Download artifacts ---
+# stdout is reserved for the final path line, so gh's own output is routed to stderr.
+dest="e2e/artifacts/ci-${run_id}"
+
+# Reuse a previous non-empty download; a failed download removes $dest so a retry is not mistaken for a cache hit.
+if [ -d "$dest" ] && [ "$(ls -A "$dest" 2>/dev/null)" ]; then
+  log "Artifacts already downloaded at $dest/, skipping download."
+else
+  mkdir -p "$dest"
+  log "Downloading artifacts to $dest/ ..."
+  gh run download "$run_id" --dir "$dest" >&2 || { rm -rf -- "${dest:?}"; die "Failed to download artifacts. They may have expired (retention: 7 days)."; }
+fi
+
+# --- Restructure: flatten e2e-artifacts-<agent>/ wrapper dirs ---
+
+cd "$dest"
+for wrapper in e2e-artifacts-*/; do
+  [ -d "$wrapper" ] || continue   # an unmatched glob stays literal; skip it
+  agent="${wrapper#e2e-artifacts-}"
+  agent="${agent%/}"
+  # Move contents up: e2e-artifacts-claude-code/* -> claude-code/
+  if [ -d "$agent" ]; then
+    # Agent dir already exists (shouldn't happen): merge (dotfiles included), then drop the wrapper so it is not listed as an agent.
+    cp -R "${wrapper%/}/." "$agent"/ && rm -rf -- "$wrapper"
+  else
+    mv "$wrapper" "$agent"
+  fi
+done
+cd - >/dev/null
+
+# --- Write run metadata ---
+# Comma-separated names of the directories in $dest; globbing (not ls) so an empty result is "" rather than a pipefail abort.
+agents_found=$(cd "$dest" && for d in */; do if [ -d "$d" ]; then printf '%s,' "${d%/}"; fi; done)
+agents_found="${agents_found%,}"
+cat > "$dest/.run-info.json" <<EOF
+{
+  "run_id": "$run_id",
+  "run_url": "$run_url",
+  "commit": "$commit",
+  "downloaded_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
+  "agents": "$(echo "$agents_found" | sed 's/"/\\"/g')"
+}
+EOF
+
+{
+  echo ""
+  echo "Downloaded artifacts for: $agents_found"
+  echo "Run info: $dest/.run-info.json"
+  echo ""
+} >&2
+# Final stdout line: the absolute download path, for callers to capture.
+(cd "$dest" && pwd)