From 51b55d7eee9acd3b89a984dc02c1e165387392b1 Mon Sep 17 00:00:00 2001 From: Alisha Kawaguchi Date: Mon, 9 Mar 2026 12:20:56 -0700 Subject: [PATCH 1/4] Register local plugin marketplace for e2e and agent-integration plugins Creates a local marketplace at .claude/plugins/ wrapping both the e2e and agent-integration plugins, and registers it via extraKnownMarketplaces in project settings so plugin subcommands (/e2e:debug, /agent-integration:research, etc.) are discoverable by Claude Code. Co-Authored-By: Claude Opus 4.6 Entire-Checkpoint: 13d3f0b6de12 --- .../plugins/.claude-plugin/marketplace.json | 18 ++++++++++++++++++ .claude/settings.json | 12 ++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 .claude/plugins/.claude-plugin/marketplace.json diff --git a/.claude/plugins/.claude-plugin/marketplace.json b/.claude/plugins/.claude-plugin/marketplace.json new file mode 100644 index 000000000..8ae67e6ff --- /dev/null +++ b/.claude/plugins/.claude-plugin/marketplace.json @@ -0,0 +1,18 @@ +{ + "name": "entire-dev-tools", + "owner": { + "name": "Entire Team" + }, + "plugins": [ + { + "name": "e2e", + "source": "./e2e", + "description": "E2E test triage, debugging, and fix implementation toolkit" + }, + { + "name": "agent-integration", + "source": "./agent-integration", + "description": "Multi-phase toolkit for integrating a new AI coding agent with the Entire CLI" + } + ] +} diff --git a/.claude/settings.json b/.claude/settings.json index f7491410e..5bdb33ebb 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,4 +1,16 @@ { + "extraKnownMarketplaces": { + "entire-dev-tools": { + "source": { + "source": "directory", + "path": "./.claude/plugins" + } + } + }, + "enabledPlugins": { + "e2e@entire-dev-tools": true, + "agent-integration@entire-dev-tools": true + }, "hooks": { "SessionStart": [ { From 994602b6d108bb253243c640354a32d6c8da4f48 Mon Sep 17 00:00:00 2001 From: Alisha Kawaguchi Date: Mon, 9 Mar 2026 12:33:48 -0700 Subject: [PATCH 2/4] 
Add e2e plugin, skills, and update agent-integration skills Adds the e2e plugin (commands for debug, implement, triage-ci), e2e skills, and updates agent-integration skills. These complement the marketplace registration from the previous commit. Co-Authored-By: Claude Opus 4.6 Entire-Checkpoint: 8d3b93e7ecd7 --- .../plugins/e2e/.claude-plugin/plugin.json | 5 + .claude/plugins/e2e/README.md | 15 ++ .claude/plugins/e2e/commands/debug.md | 7 + .claude/plugins/e2e/commands/implement.md | 7 + .claude/plugins/e2e/commands/triage-ci.md | 7 + .claude/skills/agent-integration/SKILL.md | 2 +- .../skills/agent-integration/implementer.md | 20 +- .../skills/agent-integration/test-writer.md | 2 +- .claude/skills/e2e/SKILL.md | 32 ++++ .claude/skills/e2e/debug.md | 88 +++++++++ .claude/skills/e2e/implement.md | 113 +++++++++++ .claude/skills/e2e/triage-ci.md | 176 ++++++++++++++++++ 12 files changed, 462 insertions(+), 12 deletions(-) create mode 100644 .claude/plugins/e2e/.claude-plugin/plugin.json create mode 100644 .claude/plugins/e2e/README.md create mode 100644 .claude/plugins/e2e/commands/debug.md create mode 100644 .claude/plugins/e2e/commands/implement.md create mode 100644 .claude/plugins/e2e/commands/triage-ci.md create mode 100644 .claude/skills/e2e/SKILL.md create mode 100644 .claude/skills/e2e/debug.md create mode 100644 .claude/skills/e2e/implement.md create mode 100644 .claude/skills/e2e/triage-ci.md diff --git a/.claude/plugins/e2e/.claude-plugin/plugin.json b/.claude/plugins/e2e/.claude-plugin/plugin.json new file mode 100644 index 000000000..7dd6d24b5 --- /dev/null +++ b/.claude/plugins/e2e/.claude-plugin/plugin.json @@ -0,0 +1,5 @@ +{ + "name": "e2e", + "description": "E2E test triage, debugging, and fix implementation toolkit", + "version": "1.0.0" +} diff --git a/.claude/plugins/e2e/README.md b/.claude/plugins/e2e/README.md new file mode 100644 index 000000000..a69d4d144 --- /dev/null +++ b/.claude/plugins/e2e/README.md @@ -0,0 +1,15 @@ +# E2E Plugin + +Local 
plugin providing individual commands for E2E test triage and debugging. + +## Commands + +| Command | Description | +|---------|-------------| +| `/e2e:triage-ci` | Run failing tests locally, classify flaky vs real-bug, present findings report | +| `/e2e:debug` | Deep-dive artifact analysis for root cause diagnosis | +| `/e2e:implement` | Apply fixes from triage/debug findings, verify with E2E tests | + +## Related + +- Orchestrator skill: `.claude/skills/e2e/SKILL.md` (`/e2e` — runs triage-ci then implement) diff --git a/.claude/plugins/e2e/commands/debug.md b/.claude/plugins/e2e/commands/debug.md new file mode 100644 index 000000000..609955117 --- /dev/null +++ b/.claude/plugins/e2e/commands/debug.md @@ -0,0 +1,7 @@ +--- +description: "Deep-dive artifact analysis for diagnosing E2E test failures" +--- + +# Debug Command + +Read and follow the full procedure from `.claude/skills/e2e/debug.md`. diff --git a/.claude/plugins/e2e/commands/implement.md b/.claude/plugins/e2e/commands/implement.md new file mode 100644 index 000000000..84cf00a41 --- /dev/null +++ b/.claude/plugins/e2e/commands/implement.md @@ -0,0 +1,7 @@ +--- +description: "Apply fixes from triage/debug findings, verify with scoped E2E tests" +--- + +# Implement Command + +Read and follow the full procedure from `.claude/skills/e2e/implement.md`. diff --git a/.claude/plugins/e2e/commands/triage-ci.md b/.claude/plugins/e2e/commands/triage-ci.md new file mode 100644 index 000000000..253f89633 --- /dev/null +++ b/.claude/plugins/e2e/commands/triage-ci.md @@ -0,0 +1,7 @@ +--- +description: "Run failing E2E tests locally, classify flaky vs real-bug, present findings report" +--- + +# Triage CI Command + +Read and follow the full procedure from `.claude/skills/e2e/triage-ci.md`. 
diff --git a/.claude/skills/agent-integration/SKILL.md b/.claude/skills/agent-integration/SKILL.md index 1c099efd7..d2b990c7b 100644 --- a/.claude/skills/agent-integration/SKILL.md +++ b/.claude/skills/agent-integration/SKILL.md @@ -52,7 +52,7 @@ This skill enforces strict E2E-first test-driven development. The rules: 3. **Unit tests are written last.** After all E2E tiers pass (Step 14), you write unit tests using real data collected from E2E runs as golden fixtures. 4. **If you didn't watch it fail, you don't know if it tests the right thing.** Never write a test you haven't seen fail first. 5. **Minimum viable fix.** At each E2E failure, implement only the code needed to fix that failure. Don't anticipate future tiers. -6. **`/debug-e2e` is your debugger.** When an E2E test fails, use the artifact directory with `/debug-e2e` before guessing at fixes. +6. **`/e2e:debug` is your debugger.** When an E2E test fails, use the artifact directory with `/e2e:debug` before guessing at fixes. ## Pipeline diff --git a/.claude/skills/agent-integration/implementer.md b/.claude/skills/agent-integration/implementer.md index 70bbbb191..a4d781515 100644 --- a/.claude/skills/agent-integration/implementer.md +++ b/.claude/skills/agent-integration/implementer.md @@ -13,7 +13,7 @@ Build the agent Go package using strict E2E-first TDD. Unit tests are written ON 1. **E2E tests are the spec.** The existing `ForEachAgent` test scenarios define "working". You implement until they pass. 2. **Watch it fail first.** Every E2E tier starts by running the test and observing the failure. If you haven't seen the failure, you don't understand what needs fixing. 3. **Minimum viable fix.** At each failure, implement only the code needed to make that specific assertion pass. Don't anticipate future tiers. -4. **`/debug-e2e` is your debugger.** When an E2E test fails, use the artifact directory with `/debug-e2e` before guessing at fixes. +4. 
**`/e2e:debug` is your debugger.** When an E2E test fails, use the artifact directory with `/e2e:debug` before guessing at fixes. 5. **No unit tests during Steps 4-13.** Unit tests are written in Step 14 after all E2E tiers pass, using real data from E2E runs as golden fixtures. 6. **Format and lint, don't unit test.** Between E2E tiers, run `mise run fmt && mise run lint` to keep code clean. Any earlier `mise run test` invocations (e.g., in Step 3) are strictly compile-only sanity checks — no `mise run test` between E2E tiers (Steps 4-13). 7. **If you didn't watch it fail, you don't know if it tests the right thing.** @@ -83,7 +83,7 @@ This test requires no agent prompts — it only exercises hooks, so it's the fas 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestHumanOnlyChangesAndCommits` 2. **Watch it fail** — read the failure output carefully -3. If there are artifact dirs, use `/debug-e2e {artifact-dir}` to understand what happened +3. If there are artifact dirs, use `/e2e:debug {artifact-dir}` to understand what happened 4. Implement the minimum code to fix the first failure 5. Repeat until the test passes @@ -105,7 +105,7 @@ The foundational test. This exercises the full agent lifecycle: start session 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestSingleSessionManualCommit` 2. **Watch it fail** — read the failure output carefully -3. Use `/debug-e2e {artifact-dir}` to understand what happened +3. Use `/e2e:debug {artifact-dir}` to understand what happened 4. Implement the minimum code to fix the first failure 5. Repeat until the test passes @@ -127,7 +127,7 @@ Validates transcript quality: JSONL validity, content hash correctness, prompt e 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestCheckpointMetadataDeepValidation` 2. **Watch it fail** — this test often exposes subtle transcript formatting bugs -3. Use `/debug-e2e {artifact-dir}` on any failures +3. Use `/e2e:debug {artifact-dir}` on any failures 4. 
Fix and repeat Run: `mise run fmt && mise run lint` @@ -146,7 +146,7 @@ Agent creates files and commits them within a single prompt turn. Tests the in-t **Cycle:** 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestSingleSessionAgentCommitInTurn` -2. **Watch it fail** — use `/debug-e2e {artifact-dir}` on failures +2. **Watch it fail** — use `/e2e:debug {artifact-dir}` on failures 3. Fix and repeat — if the agent doesn't support committing, skip this test Run: `mise run fmt && mise run lint` @@ -164,7 +164,7 @@ Run these tests to validate multi-session behavior: **Cycle (for each test):** 1. Run: `mise run test:e2e --agent $AGENT_SLUG TestMultiSessionManualCommit` -2. **Watch it fail** — use `/debug-e2e {artifact-dir}` on failures +2. **Watch it fail** — use `/e2e:debug {artifact-dir}` on failures 3. Fix and repeat 4. Move to next test @@ -183,7 +183,7 @@ Run these tests for file operation correctness: - `TestDeletedFilesCommitDeletion` — Agent deletes a file, user commits the deletion - `TestMixedNewAndModifiedFiles` — Agent both creates and modifies files -**Cycle:** Same as above — run each test, **watch it fail**, use `/debug-e2e` on failures, fix, repeat. +**Cycle:** Same as above — run each test, **watch it fail**, use `/e2e:debug` on failures, fix, repeat. Run: `mise run fmt && mise run lint` @@ -215,7 +215,7 @@ Run these if the agent supports interactive multi-step sessions: - `TestRewindAfterCommit` — Rewind to a checkpoint after committing - `TestRewindMultipleFiles` — Rewind with multiple files changed -**Cycle:** Same pattern — run, **watch it fail**, `/debug-e2e` on failures, fix, repeat. +**Cycle:** Same pattern — run, **watch it fail**, `/e2e:debug` on failures, fix, repeat. Run: `mise run fmt && mise run lint` @@ -256,7 +256,7 @@ mise run test:e2e --agent $AGENT_SLUG TestFailingTestName If a test passes when run individually but fails in the full suite, it's a flaky failure — not a real error. 
Only investigate failures that reproduce consistently when run in isolation. -Fix any real failures before proceeding — the same cycle applies: read the failure, use `/debug-e2e {artifact-dir}`, implement the minimum fix, re-run. +Fix any real failures before proceeding — the same cycle applies: read the failure, use `/e2e:debug {artifact-dir}`, implement the minimum fix, re-run. All E2E tests must pass before writing unit tests. @@ -321,7 +321,7 @@ At every E2E failure, follow this protocol: 1. **Read the test output** — the assertion message often tells you exactly what's wrong 2. **Find the artifact directory** — E2E tests save artifacts (logs, transcripts, git state) to a temp dir printed in the output -3. **Run `/debug-e2e {artifact-dir}`** — this skill analyzes artifacts and diagnoses the root cause +3. **Run `/e2e:debug {artifact-dir}`** — this skill analyzes artifacts and diagnoses the root cause 4. **Implement the minimum fix** — don't over-engineer; fix only what the test demands 5. **Re-run the failing test** — not the whole suite, just the one test diff --git a/.claude/skills/agent-integration/test-writer.md b/.claude/skills/agent-integration/test-writer.md index 042561362..cb0ed4716 100644 --- a/.claude/skills/agent-integration/test-writer.md +++ b/.claude/skills/agent-integration/test-writer.md @@ -199,7 +199,7 @@ Use `/commit` to commit all files. - **Interactive tests**: Use `s.StartSession`, `s.Send`, `s.WaitFor` — tmux pane is auto-captured in artifacts - **Run commands**: `mise run test:e2e --agent ${slug} TestName` — see `e2e/README.md` for all options - **E2E tests are run during the implement phase**: This phase only creates the runner. The implement phase runs E2E tests at each tier to drive development. 
-- **Debugging failures**: If tests fail during the implement phase, use `/debug-e2e` with the artifact directory to diagnose CLI-level issues (hooks, checkpoints, session phases, attribution) +- **Debugging failures**: If tests fail during the implement phase, use `/e2e:debug` with the artifact directory to diagnose CLI-level issues (hooks, checkpoints, session phases, attribution) ## Output diff --git a/.claude/skills/e2e/SKILL.md b/.claude/skills/e2e/SKILL.md new file mode 100644 index 000000000..049336526 --- /dev/null +++ b/.claude/skills/e2e/SKILL.md @@ -0,0 +1,32 @@ +--- +name: e2e +description: > + Orchestrate E2E test triage and fix implementation: runs triage-ci then implement sequentially. + Accepts test names, --agent, artifact path, or CI run reference. + For individual phases, use /e2e:triage-ci, /e2e:debug, or /e2e:implement. + Use when the user says "triage e2e", "fix e2e failures", or wants the full triage-to-fix pipeline. +--- + +# E2E Triage & Fix — Full Pipeline + +Run triage-ci then implement sequentially. Parameters are collected once and reused across both phases. + +## Parameters + +The user provides one or more of: +- **Test name(s)** -- e.g., `TestInteractiveMultiStep` +- **`--agent `** -- optional, defaults to all agents that previously failed +- **A local artifact path** -- skip straight to analysis of existing artifacts +- **CI run reference** -- `latest`, a run ID, or a run URL + +## Phase 1: Triage CI + +Read and follow the full procedure from `.claude/skills/e2e/triage-ci.md`. + +This produces a findings report with classifications (flaky/real-bug/test-bug) for each test+agent pair. + +## Phase 2: Implement Fixes + +Read and follow the full procedure from `.claude/skills/e2e/implement.md`. + +Uses the findings from Phase 1 (already in conversation context) to propose, apply, and verify fixes. 
diff --git a/.claude/skills/e2e/debug.md b/.claude/skills/e2e/debug.md new file mode 100644 index 000000000..94f814130 --- /dev/null +++ b/.claude/skills/e2e/debug.md @@ -0,0 +1,88 @@ +# Debug Entire CLI via E2E Artifacts + +Diagnose Entire CLI bugs using captured artifacts from the E2E test suite. Artifacts are written to `e2e/artifacts/` locally or downloaded from CI via GitHub Actions. + +## Inputs + +The user provides either: +- **A test run directory:** `e2e/artifacts/{timestamp}/` -- triage all failures +- **A specific test directory:** `e2e/artifacts/{timestamp}/{TestName}-{agent}/` -- debug one test + +## Artifact Layout + +``` +e2e/artifacts/{timestamp}/ +├── report.nocolor.txt # Pass/fail/skip summary with error lines +├── test-events.json # Raw Go test events (NDJSON) +├── entire-version.txt # CLI version under test +└── {TestName}-{agent}/ + ├── PASS or FAIL # Status marker + ├── console.log # Full operation transcript + ├── git-log.txt # git log --decorate --graph --all + ├── git-tree.txt # ls-tree HEAD + checkpoint branch + ├── entire-logs/entire.log # CLI structured JSON logs + ├── checkpoint-metadata/ # Checkpoint + session metadata + └── repo -> /tmp/... # Symlink to preserved repo (E2E_KEEP_REPOS=1 only) +``` + +## Preserved Repo + +When the test run was executed with `E2E_KEEP_REPOS=1`, each test's artifact directory contains a `repo` symlink pointing to the preserved temporary git repository. This is the actual repo the test operated on -- you can inspect it directly. + +**Navigate via the symlink** (e.g., `{artifact-dir}/repo/`) rather than resolving the `/tmp/...` path. The symlink lives inside the artifact directory so permissions and paths stay consistent. 
+ +The preserved repo contains: +- Full git history with all branches (main, `entire/checkpoints/v1`) +- The `.entire/` directory with CLI state, config, and raw logs +- The `.claude/` directory (if Claude Code was the agent) +- All files the agent created or modified, in their final state + +This is the most powerful debugging tool -- you can run `git log`, `git diff`, `git show`, inspect `.entire/` internals, and see exactly what the CLI left behind. + +## Debugging Workflow + +### 1. Triage (if given a run directory) + +Read `report.nocolor.txt` to identify failures and their error messages. Each entry shows the test name, agent, duration, and failure output with file:line references. + +### 2. Read console.log (most important) + +Full transcript of every operation: +- `> claude -p "..." ...` -- agent prompts with stdout/stderr +- `> git add/commit/...` -- git commands +- `> send: ...` -- interactive session inputs + +This tells you what happened chronologically. + +### 3. Read test source code + +Use the file:line from the report to find the test in `e2e/tests/`. Understand what the test expected to happen vs what console.log shows actually happened. + +### 4. Diagnose the CLI behavior + +Cross-reference console.log (what happened) with the test (what should have happened). 
Focus on CLI-level issues: + +| Symptom | CLI Investigation | +|---------|-------------------| +| Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors | +| Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info | +| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) | +| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering | +| Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data | +| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior | + +### 5. Deep dive files + +- **entire-logs/entire.log**: Structured JSON logs -- hook lifecycle, session phases (`active` -> `idle` -> `ended`), warnings, errors. Key fields: `component`, `hook`, `strategy`, `session_id`. +- **git-log.txt**: Commit graph showing main branch, `entire/checkpoints/v1`, checkpoint initialization. +- **git-tree.txt**: Files at HEAD vs checkpoint branch (separated by `--- entire/checkpoints/v1 ---`). +- **checkpoint-metadata/**: `metadata.json` has `checkpoint_id`, `strategy`, `files_touched`, `token_usage`, and `sessions` array. Session subdirs have per-session details. + +### 6. 
Report findings + +Identify whether the issue is in: +- **CLI hooks** (prepare-commit-msg, commit-msg, post-commit) +- **Session management** (phase transitions, session tracking) +- **Checkpoint creation** (branch management, metadata writing) +- **Attribution** (file tracking, prompt correlation) +- **Strategy logic** (auto-commit vs manual-commit behavior) diff --git a/.claude/skills/e2e/implement.md b/.claude/skills/e2e/implement.md new file mode 100644 index 000000000..342c87213 --- /dev/null +++ b/.claude/skills/e2e/implement.md @@ -0,0 +1,113 @@ +# E2E Implement Fixes + +Apply fixes for E2E test failures, verify with scoped E2E tests. + +> **IMPORTANT: Running real E2E tests is a HARD REQUIREMENT of this procedure.** +> Every fix MUST be verified with real E2E tests before the summary step. +> Canary tests use the Vogon fake agent and cannot catch agent-specific issues. +> Do NOT skip E2E verification unless the user explicitly declines due to cost. + +## Inputs + +This procedure accepts findings from one of: +- **`/e2e:triage-ci` output** -- findings report already in conversation context +- **`/e2e:debug` output** -- root cause analysis already in conversation context +- **Standalone description** -- user describes known failure and desired fix + +## Step 1: Identify Fixes + +From the findings in context, identify actionable fixes: + +### For `flaky` failures: describe the proposed fix + +For agent-behavior flaky issues, fixes typically modify test prompts. For test-bug flaky issues, fixes target `e2e/` infrastructure code (harness setup, helpers, env propagation). 
+ +``` +**Proposed fix:** + - File: <path to test or harness file> + - Change: <description of the change> +``` + +Common flaky fixes: +- Agent asked for confirmation -> append "Do not ask for confirmation" to prompt +- Agent wrote to wrong path -> be more explicit about paths in prompt +- Agent committed when shouldn't -> add "Do not commit" to prompt +- Checkpoint wait timeout -> increase timeout argument +- Agent timeout (signal: killed) -> increase per-test timeout, simplify prompt +- Auth/env not propagated -> fix test harness env setup in `e2e/` code +- Test helper bug (wrong assertion, bad glob) -> fix test helper in `e2e/` +- tmux session setup issue -> fix `NewTmuxSession` or session config in `e2e/` + +### For `real-bug` failures: describe root cause analysis + +``` +**Root cause analysis:** + - Component: <hooks | session | checkpoint | attribution | strategy> + - Suspected location: <file:function> + - Description: <what is wrong> + - Proposed fix: <what to change> +``` + +## Step 2: Ask the User + +Prompt the user: + +> **Should I fix these?** +> - [list of tests with classifications and proposed fixes] +> - You can select all, specific tests, or skip. + +Wait for user response before proceeding. + +## Step 3: Apply Fixes + +For **flaky** fixes the user approved: +1. Apply fixes directly in the working tree (no branch creation) +2. Run static checks: + ```bash + mise run fmt && mise run lint + mise run test:e2e:canary # Must pass + ``` +3. **Run real E2E tests to verify the fix.** Scope depends on what was changed: + - **Agent-specific fix** (e.g., `e2e/agents/cursor_cli.go`, one agent's config/trust/env): run the full suite for that agent only: + ```bash + mise run test:e2e --agent <agent> + ``` + - **Shared test infra fix** (e.g., `e2e/agents/agent.go`, `e2e/testutil/`, `TmuxSession`, test helpers): run the full suite for all agents that failed, since the fix could affect any of them: + ```bash + mise run test:e2e --agent <agent-1> + mise run test:e2e --agent <agent-2> + # ... 
for each agent that had failures + ``` + - **Test prompt fix** (e.g., changed wording in a specific test): run that test across all agents that failed it: + ```bash + mise run test:e2e --agent <agent> <TestName> + ``` + **This step is MANDATORY** -- canary tests use the Vogon fake agent and cannot verify agent-specific behavior (trust dialogs, env propagation, config directories, etc.). +4. If any step fails, investigate and adjust. Report what happened to the user. + +For **real-bug** fixes the user approved: +1. Apply the fix directly in the working tree (no branch creation) +2. Run static checks and unit tests: + ```bash + mise run fmt && mise run lint + mise run test # Unit tests + mise run test:e2e:canary # Canary tests + ``` +3. **Run real E2E tests to verify the fix (MANDATORY).** Same scoping rules as flaky fixes above: + - **Agent-specific change** -> full suite for that agent + - **Shared CLI/infra change** -> full suite for all agents that failed + - **Narrow change** (single test affected) -> just that test across affected agents +4. Report results to the user. + +**GATE: Do NOT proceed to the summary until real E2E tests have been run and results reported for every fix applied above.** If E2E tests were not run, go back and run them now. + +## Step 4: Summary + +Print a summary table: +``` +| Test | Agent(s) | Classification | Action Taken | +|------|----------|----------------|--------------| +| TestFoo | claude-code | flaky | Fixed in working tree | +| TestBar | all agents | real-bug | Fix applied, tests passing | +| TestBaz | opencode | flaky | Skipped (user declined) | +``` diff --git a/.claude/skills/e2e/triage-ci.md b/.claude/skills/e2e/triage-ci.md new file mode 100644 index 000000000..19f36b546 --- /dev/null +++ b/.claude/skills/e2e/triage-ci.md @@ -0,0 +1,176 @@ +# E2E Triage CI + +Triage E2E test failures with **re-run verification**. Analyzes artifacts and re-runs failing tests locally to distinguish flaky from real bugs. 
Produces a findings report with classifications -- does NOT apply fixes. Does deep analysis of the code. + +--- + +## Step L1: Parse User Input + +The user provides one or more of: +- **Test name(s)** -- e.g., `TestInteractiveMultiStep` +- **`--agent <agent>`** -- optional, defaults to all agents that previously failed +- **A local artifact path** -- skip straight to analysis of existing artifacts +- **CI run reference** -- triggers artifact download instead of local re-runs: + - `latest CI run` / `latest` -- most recent failed E2E run on main + - A GitHub Actions run ID (numeric, e.g., `12345678`) + - A GitHub Actions run URL + +**CI artifact download:** When a CI run reference is provided, download artifacts using: + +```bash +scripts/download-e2e-artifacts.sh <latest | RUN_ID | RUN_URL> +``` + +The script outputs the absolute artifact path as its **last line of stdout** -- capture that and use it as the artifact path for analysis. After downloading, **skip Steps L2-L5** (local re-runs) and go straight to **Shared Analysis** (Step 1), since we're analyzing CI artifacts, not running tests locally. + +**Cost warning:** Real E2E tests consume API tokens. Before running, confirm with the user unless they provided specific test names (implying intent to run). + +## Step L2: First Run + +```bash +mise run test:e2e --agent <agent> <TestName> +``` + +Capture the artifact directory from the `artifacts: <path>` output line. + +## Step L3: Re-run on Failure + +If the test **passes** on first run: report as passing, done for this test. + +If the test **fails**: run a **second time** with the same parameters. + +## Step L4: Tiebreaker (if needed) + +If results are **split** (1 pass, 1 fail): run a **third time** as tiebreaker. + +## Step L5: Collect Results + +For each test+agent pair, record: `(test, agent, run_1_result, run_2_result, [run_3_result])` + +Proceed to **Shared Analysis** (Step 1 below). 
+ +--- + +## Shared Analysis & Classification + +### Step 1: Analyze Each Failure + +For each failure, follow the **Debugging Workflow** in `.claude/skills/e2e/debug.md` (steps 2-5: console.log -> test source -> entire.log -> deep dive). Collect: +- What the agent actually did (from console.log) +- What was expected (from test source) +- CLI-level errors or anomalies (from entire.log) +- Repo/checkpoint state (from git-log.txt, git-tree.txt, checkpoint-metadata/) + +### Step 2: Classify Each Failure + +Use **re-run results as the primary signal**, supplemented by artifact analysis. + +#### Re-run signals (strongest): + +| Original | Re-run 1 | Re-run 2 | Classification | +|----------|----------|----------|----------------| +| FAIL | FAIL (same error) | FAIL (same error) | **real-bug** OR **flaky (test-bug)** -- see below | +| FAIL | PASS | PASS | **flaky** | +| FAIL | PASS | FAIL | **flaky** (non-deterministic) | +| FAIL | FAIL | PASS | **flaky** (non-deterministic) | +| FAIL | FAIL (different error) | FAIL (different error) | **needs deeper analysis** -- examine artifacts | + +**Important: Consistent failures can still be `flaky` (test-bug).** When all re-runs fail, check *where* the root cause is: +- Root cause in `cmd/entire/cli/` -> **real-bug** (product code is broken) +- Root cause in `e2e/` (test infra, test helpers, tmux setup, env propagation) -> **flaky (test-bug)** -- the CLI works fine, the test is broken + +#### Strong `real-bug` signals (root cause must be in `cmd/entire/cli/`, not `e2e/`): + +- `entire.log` contains `"level":"ERROR"` or panic/stack traces from CLI code +- Checkpoint metadata structurally corrupt (malformed JSON, missing `checkpoint_id`/`strategy`) +- Session state file missing or malformed when expected +- Hooks did not fire at all (no `hook invoked` log entries) +- Shadow/metadata branch has wrong tree structure +- Same test fails across 3+ agents with same non-timeout symptom +- Error references CLI code (panic in 
`cmd/entire/cli/`) + +**Key question:** Is the bug in `cmd/entire/cli/` (product code) or in `e2e/` (test code)? Only the former is a `real-bug`. + +#### Strong `flaky` signals (unless overridden by real-bug): + +**Agent behavior (non-deterministic):** +- `signal: killed` (timeout) +- `context deadline exceeded` or `WaitForCheckpoint.*exceeded deadline` +- Agent asked for confirmation instead of acting +- Agent created file at wrong path / wrong name +- Agent produced no output +- Agent committed when it shouldn't have (or vice versa) +- Duration near timeout limit + +**Test-bug (consistent failure, but root cause is in `e2e/` not `cmd/entire/cli/`):** +- Agent "Not logged in" / auth errors -> test env setup doesn't propagate auth credentials +- Env vars not propagated to agent session -> tmux/test harness bug +- Error references test code (`e2e/`) not CLI code (`cmd/entire/cli/`) +- Test helper logic errors (wrong assertions, bad globs, incorrect expected values) +- Consistent failure BUT root cause traced to `e2e/` code, not `cmd/entire/cli/` +- Test setup/teardown issues (missing git config, temp dir cleanup, port conflicts) + +#### Ambiguous cases: + +Read `entire.log` carefully: +- If hooks fired correctly and metadata is valid -> lean **flaky** +- If hooks fired but produced wrong results -> lean **real-bug** + +### Step 3: Cross-Agent Correlation + +Before reporting, check correlations using re-run data: +- Same test fails for 3+ agents, all re-runs also fail -> strong **real-bug** +- Same test fails for multiple agents, but re-runs pass -> **flaky** (shared prompt issue) +- One agent fails consistently, others pass -> agent-specific issue (**real-bug** if re-runs confirm and the root cause is in `cmd/entire/cli/`; **flaky (test-bug)** if the root cause is in `e2e/`) + +### Step 4: Present Findings Report + +For each test+agent pair, print a findings block: + +``` +## <TestName> (<agent>) -- <classification> + +**Re-run results:** original=FAIL, rerun1=PASS, rerun2=PASS +**Evidence:** +- <1-2 sentence summary of what went wrong> +- <key log lines or artifact excerpts supporting the classification> +``` + +For `flaky` failures, include proposed 
fix description: + +``` +**Proposed fix:** + - File: <path to test or harness file> + - Change: <description of the change> +``` + +Common flaky fixes: +- Agent asked for confirmation -> append "Do not ask for confirmation" to prompt +- Agent wrote to wrong path -> be more explicit about paths in prompt +- Agent committed when shouldn't -> add "Do not commit" to prompt +- Checkpoint wait timeout -> increase timeout argument +- Agent timeout (signal: killed) -> increase per-test timeout, simplify prompt +- Auth/env not propagated -> fix test harness env setup in `e2e/` code +- Test helper bug (wrong assertion, bad glob) -> fix test helper in `e2e/` +- tmux session setup issue -> fix `NewTmuxSession` or session config in `e2e/` + +For `real-bug` failures, include root cause analysis: + +``` +**Root cause analysis:** + - Component: <hooks | session | checkpoint | attribution | strategy> + - Suspected location: <file:function> + - Description: <what is wrong> + - Proposed fix: <what to change> +``` + +### Step 5: Summary + +Print a summary table (classification only, no "Action Taken"): +``` +| Test | Agent(s) | Re-runs | Classification | +|------|----------|---------|----------------| +| TestFoo | claude-code | FAIL/PASS/PASS | flaky | +| TestBar | all agents | FAIL/FAIL/FAIL | real-bug | +| TestBaz | opencode | FAIL/PASS/FAIL | flaky (non-deterministic) | +``` From 1692c8ee7df34c59276973684f0d481032ddc19b Mon Sep 17 00:00:00 2001 From: Alisha Kawaguchi Date: Mon, 9 Mar 2026 12:42:00 -0700 Subject: [PATCH 3/4] Fix inconsistent log paths and improve triage-ci description - Standardize on `entire-logs/entire.log` in debug.md diagnostic table - Update triage-ci command description to mention CI artifact support Co-Authored-By: Claude Opus 4.6 Entire-Checkpoint: 2046d09ffe0c --- .claude/plugins/e2e/commands/triage-ci.md | 2 +- .claude/skills/e2e/debug.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.claude/plugins/e2e/commands/triage-ci.md b/.claude/plugins/e2e/commands/triage-ci.md index 253f89633..085dafa6d 100644 --- a/.claude/plugins/e2e/commands/triage-ci.md +++ 
b/.claude/plugins/e2e/commands/triage-ci.md @@ -1,5 +1,5 @@ --- -description: "Run failing E2E tests locally, classify flaky vs real-bug, present findings report" +description: "Triage E2E failures via local reruns or CI artifacts, classify flaky vs real-bug, present findings report" --- # Triage CI Command diff --git a/.claude/skills/e2e/debug.md b/.claude/skills/e2e/debug.md index 94f814130..567192ecc 100644 --- a/.claude/skills/e2e/debug.md +++ b/.claude/skills/e2e/debug.md @@ -66,10 +66,10 @@ Cross-reference console.log (what happened) with the test (what should have happ |---------|-------------------| | Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors | | Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info | -| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) | -| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering | +| Hooks didn't fire | Check `entire-logs/entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) | +| Stash/unstash problems | Check `entire-logs/entire.log` for stash-related log lines, `git-log.txt` for commit ordering | | Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data | -| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior | +| Strategy mismatch | Check `entire-logs/entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior | ### 5. 
Deep dive files From 34ffb23bae8e7df26bd6a84ae2c5b0844c3acc30 Mon Sep 17 00:00:00 2001 From: Alisha Kawaguchi Date: Mon, 9 Mar 2026 12:55:52 -0700 Subject: [PATCH 4/4] Add e2e artifact download script and remove debug-e2e skill - Add scripts/download-e2e-artifacts.sh for downloading CI artifacts - Remove .claude/skills/debug-e2e/ (consolidated into /e2e:debug command) Co-Authored-By: Claude Opus 4.6 Entire-Checkpoint: 15ac9dc10e03 --- .claude/skills/debug-e2e/SKILL.md | 93 ------------------------- scripts/download-e2e-artifacts.sh | 111 ++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 93 deletions(-) delete mode 100644 .claude/skills/debug-e2e/SKILL.md create mode 100755 scripts/download-e2e-artifacts.sh diff --git a/.claude/skills/debug-e2e/SKILL.md b/.claude/skills/debug-e2e/SKILL.md deleted file mode 100644 index 3c407768a..000000000 --- a/.claude/skills/debug-e2e/SKILL.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -name: debug-e2e -description: Use when investigating E2E test failures from artifacts to diagnose bugs in the Entire CLI, or when pointed at an artifact path for root cause analysis ---- - -# Debug Entire CLI via E2E Artifacts - -Diagnose Entire CLI bugs using captured artifacts from the E2E test suite. Artifacts are written to `e2e/artifacts/` locally or downloaded from CI via GitHub Actions. 
- -## Inputs - -The user provides either: -- **A test run directory:** `e2e/artifacts/{timestamp}/` — triage all failures -- **A specific test directory:** `e2e/artifacts/{timestamp}/{TestName}-{agent}/` — debug one test - -## Artifact Layout - -``` -e2e/artifacts/{timestamp}/ -├── report.nocolor.txt # Pass/fail/skip summary with error lines -├── test-events.json # Raw Go test events (NDJSON) -├── entire-version.txt # CLI version under test -└── {TestName}-{agent}/ - ├── PASS or FAIL # Status marker - ├── console.log # Full operation transcript - ├── git-log.txt # git log --decorate --graph --all - ├── git-tree.txt # ls-tree HEAD + checkpoint branch - ├── entire-logs/entire.log # CLI structured JSON logs - ├── checkpoint-metadata/ # Checkpoint + session metadata - └── repo -> /tmp/... # Symlink to preserved repo (E2E_KEEP_REPOS=1 only) -``` - -## Preserved Repo - -When the test run was executed with `E2E_KEEP_REPOS=1`, each test's artifact directory contains a `repo` symlink pointing to the preserved temporary git repository. This is the actual repo the test operated on — you can inspect it directly. - -**Navigate via the symlink** (e.g., `{artifact-dir}/repo/`) rather than resolving the `/tmp/...` path. The symlink lives inside the artifact directory so permissions and paths stay consistent. - -The preserved repo contains: -- Full git history with all branches (main, `entire/checkpoints/v1`) -- The `.entire/` directory with CLI state, config, and raw logs -- The `.claude/` directory (if Claude Code was the agent) -- All files the agent created or modified, in their final state - -This is the most powerful debugging tool — you can run `git log`, `git diff`, `git show`, inspect `.entire/` internals, and see exactly what the CLI left behind. - -## Debugging Workflow - -### 1. Triage (if given a run directory) - -Read `report.nocolor.txt` to identify failures and their error messages. 
Each entry shows the test name, agent, duration, and failure output with file:line references. - -### 2. Read console.log (most important) - -Full transcript of every operation: -- `> claude -p "..." ...` — agent prompts with stdout/stderr -- `> git add/commit/...` — git commands -- `> send: ...` — interactive session inputs - -This tells you what happened chronologically. - -### 3. Read test source code - -Use the file:line from the report to find the test in `e2e/tests/`. Understand what the test expected to happen vs what console.log shows actually happened. - -### 4. Diagnose the CLI behavior - -Cross-reference console.log (what happened) with the test (what should have happened). Focus on CLI-level issues: - -| Symptom | CLI Investigation | -|---------|-------------------| -| Checkpoint not created / timeout | Check `entire-logs/entire.log` for hook invocations, phase transitions, errors | -| Wrong checkpoint content | Check `git-tree.txt` for checkpoint branch files, `checkpoint-metadata/` for session info | -| Hooks didn't fire | Check `entire.log` for missing hook entries (session-start, user-prompt-submit, stop, post-commit) | -| Stash/unstash problems | Check `entire.log` for stash-related log lines, `git-log.txt` for commit ordering | -| Attribution issues | Check `checkpoint-metadata/` for `files_touched`, session metadata for attribution data | -| Strategy mismatch | Check `entire.log` for `strategy` field, verify auto-commit vs manual-commit behavior | - -### 5. Deep dive files - -- **entire-logs/entire.log**: Structured JSON logs — hook lifecycle, session phases (`active` → `idle` → `ended`), warnings, errors. Key fields: `component`, `hook`, `strategy`, `session_id`. -- **git-log.txt**: Commit graph showing main branch, `entire/checkpoints/v1`, checkpoint initialization. -- **git-tree.txt**: Files at HEAD vs checkpoint branch (separated by `--- entire/checkpoints/v1 ---`). 
-- **checkpoint-metadata/**: `metadata.json` has `checkpoint_id`, `strategy`, `files_touched`, `token_usage`, and `sessions` array. Session subdirs have per-session details. - -### 6. Report findings - -Identify whether the issue is in: -- **CLI hooks** (prepare-commit-msg, commit-msg, post-commit) -- **Session management** (phase transitions, session tracking) -- **Checkpoint creation** (branch management, metadata writing) -- **Attribution** (file tracking, prompt correlation) -- **Strategy logic** (auto-commit vs manual-commit behavior) diff --git a/scripts/download-e2e-artifacts.sh b/scripts/download-e2e-artifacts.sh new file mode 100755 index 000000000..b2663c906 --- /dev/null +++ b/scripts/download-e2e-artifacts.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# +# Download E2E test artifacts from GitHub Actions. +# +# Usage: scripts/download-e2e-artifacts.sh [RUN_ID | RUN_URL | "latest"] +# RUN_ID: numeric GitHub Actions run ID +# RUN_URL: full URL like https://github.com/entireio/cli/actions/runs/12345 +# "latest": most recent failed E2E run on main +# +# Outputs the absolute path of the download directory as the last line of stdout. +# All diagnostic messages go to stderr. + +set -euo pipefail + +log() { echo "$@" >&2; } +die() { log "ERROR: $1"; exit 1; } + +# --- Validate prerequisites --- + +command -v gh >/dev/null 2>&1 || die "'gh' CLI is not installed. Install from https://cli.github.com/" +gh auth status >/dev/null 2>&1 || die "'gh' is not authenticated. Run 'gh auth login' first." + +# --- Parse input --- + +input="${1:-}" +[ -z "$input" ] && die "Usage: $0 [RUN_ID | RUN_URL | \"latest\"]" + +run_id="" + +case "$input" in + latest) + log "Finding most recent failed E2E run on main..." + run_id=$(gh run list -w e2e.yml --status=failure -L1 --json databaseId -q '.[0].databaseId' 2>/dev/null) + [ -z "$run_id" ] && die "No failed E2E runs found." 
+ log "Found run: $run_id" + ;; + http*) + # Extract run ID from URL: https://github.com///actions/runs/ + run_id=$(echo "$input" | grep -oE '/runs/[0-9]+' | grep -oE '[0-9]+') + [ -z "$run_id" ] && die "Could not extract run ID from URL: $input" + log "Extracted run ID: $run_id" + ;; + *[!0-9]*) + die "Invalid input: '$input'. Provide a numeric run ID, a GitHub Actions URL, or 'latest'." + ;; + *) + run_id="$input" + ;; +esac + +# --- Fetch run metadata --- + +log "Fetching run metadata..." +run_url=$(gh run view "$run_id" --json url -q '.url' 2>/dev/null) || die "Run $run_id not found." +commit=$(gh run view "$run_id" --json headSha -q '.headSha' 2>/dev/null) || commit="unknown" + +log "Run URL: $run_url" +log "Commit: $commit" + +# --- Download artifacts --- + +dest="e2e/artifacts/ci-${run_id}" + +# If artifacts were already downloaded, skip re-downloading +if [ -d "$dest" ] && [ "$(ls -A "$dest" 2>/dev/null)" ]; then + log "Artifacts already downloaded at $dest/, skipping download." +else + mkdir -p "$dest" + log "Downloading artifacts to $dest/ ..." + gh run download "$run_id" --dir "$dest" 2>&1 >&2 || die "Failed to download artifacts. They may have expired (retention: 7 days)." +fi + +# --- Restructure: flatten e2e-artifacts-/ wrapper dirs --- + +cd "$dest" +for wrapper in e2e-artifacts-*/; do + [ -d "$wrapper" ] || continue + agent="${wrapper#e2e-artifacts-}" + agent="${agent%/}" + # Move contents up: e2e-artifacts-claude-code/* -> claude-code/ + if [ -d "$agent" ]; then + # Agent dir already exists (shouldn't happen, but be safe) + cp -r "$wrapper"/* "$agent"/ 2>/dev/null || true + else + mv "$wrapper" "$agent" + fi +done +cd - >/dev/null + +# --- Write run metadata --- + +agents_found=$(cd "$dest" && ls -d */ 2>/dev/null | tr -d '/' | tr '\n' ', ' | sed 's/,$//') + +cat > "$dest/.run-info.json" <