Skip to content

Commit 6e1e2e5

Browse files
committed
Add live regression canaries for folder-bind and mismatch states
1 parent 72222ec commit 6e1e2e5

File tree

10 files changed

+300
-11
lines changed

10 files changed

+300
-11
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ jobs:
3333
- name: Setup mise
3434
uses: jdx/mise-action@v2
3535

36+
- name: Install sccache
37+
run: brew install sccache
38+
3639
- name: Install dependencies
3740
run: pnpm install --frozen-lockfile
3841

@@ -66,8 +69,8 @@ jobs:
6669
- name: Setup mise
6770
uses: jdx/mise-action@v2
6871

69-
- name: Install qemu
70-
run: brew install qemu
72+
- name: Install runtime deps
73+
run: brew install qemu sccache
7174

7275
- name: Install pi CLI for runtime build
7376
run: npm install -g @mariozechner/pi-coding-agent

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
- [ ] **Testing sanity gate (P0)** — define and ship a real automated regression suite for the liftoff path (Vitest + Rust), keep daily checks fast (`mise run check`) while enforcing live regressions in `mise run check-full`, and stop relying on ad-hoc/manual shell-harness runs for core correctness.
88
- [x] Add a machine-readable `state_snapshot` contract for deterministic assertions (no log-grep testing).
99
- [x] Add first gated Vitest regression: reopen cwd correctness (`/mnt/workdir...`) on folder-bound task reopen.
10-
- [ ] Add remaining gated Vitest regressions: folder-bind continuity (no UI reset), working-folder panel refresh on folder change, and runtime-mismatch badge rules.
10+
- [x] Add remaining gated Vitest regressions: folder-bind continuity (no UI reset), working-folder panel refresh on folder change, and runtime-mismatch badge rules.
1111
- [ ] Add one sequential live journey canary (messages + models + workdir + artifacts + reopen), while keeping focused canaries for isolated invariants.
1212
- [x] Add CI enforcement for both gates (`check` on PRs, `check-full` required before merge).
1313
- [x] Remove low-signal/noise tests and stale assertions (smoke test, legacy session-file assumptions, noisy debug logs).

docs/testing-strategy.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ The regression gate should make these guarantees explicit:
5959
4. **Runtime mismatch badge semantics**
6060
- Badge only appears for real mismatches, not transient boot/reconfigure or legacy sentinel state.
6161

62+
Implemented live canaries (integration):
63+
64+
- `reopen-cwd.integration.test.ts`
65+
- `folder-bind-continuity.integration.test.ts`
66+
- `working-folder-panel-refresh.integration.test.ts`
67+
- `runtime-mismatch-badge.integration.test.ts`
68+
6269
## Live integration strategy (hybrid)
6370

6471
- Keep **one sequential journey canary** that covers a realistic end-to-end flow (messages, models, working folder, artifacts, reopen).

mise.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ run = "mise run test-vite && mise run test-rust"
7373

7474
[tasks.test-regressions]
7575
description = "Run runtime regression integration tests"
76-
run = "pnpm exec vitest run src/lib/__tests__/integration/*.integration.test.ts"
76+
run = "pnpm exec vitest run --minWorkers=1 --maxWorkers=1 src/lib/__tests__/integration/*.integration.test.ts"
7777

7878
[tasks.test-rust]
7979
description = "Run Rust tests"

src/lib/__tests__/integration/folder-bind-continuity.integration.test.ts

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/* @vitest-environment node */
2+
3+
import path from "node:path";
4+
import { afterAll, beforeAll, describe, expect, it } from "vitest";
5+
import { IntegrationHarness } from "./harness";
6+
7+
// Live regression canary: binds a working folder to a task that already has
// at least one message and checks, via state snapshots, that the conversation
// and UI flags are not reset by the bind.
describe.sequential("folder bind continuity regression", () => {
8+
const harness = new IntegrationHarness();
9+
// Timestamped prefix so the tasks created by this run can be located and
// deleted by prefix in beforeAll/afterAll.
const prefix = `regression-folder-bind-${Date.now()}`;
10+
11+
// The folder that gets bound is the directory the test process runs from.
const workingFolder = path.resolve(process.cwd());
12+
13+
beforeAll(async () => {
14+
await harness.start();
15+
await harness.deleteTasksByPrefix(prefix);
16+
}, 240_000);
17+
18+
afterAll(async () => {
19+
// Cleanup is best-effort: a failed delete must not prevent harness.stop().
await harness.deleteTasksByPrefix(prefix).catch(() => undefined);
20+
await harness.stop();
21+
}, 90_000);
22+
23+
it("does not reset conversation state when binding first working folder", async () => {
24+
const title = `${prefix}-task`;
25+
// Second argument is null: the task starts with no working folder.
await harness.createTask(title, null);
26+
27+
const task = await harness.waitForTaskByTitle(title, 30_000);
28+
await harness.setTask(task.id);
29+
await harness.waitForTaskSettled(task.id, 90_000);
30+
31+
// NOTE(review): this predicate checks the same three conditions as
// waitForTaskSettled above (current task, rpcConnected, not taskSwitching);
// the snapshot returned by that call could probably be reused — confirm.
const initial = await harness.waitForSnapshot((snapshot) => {
32+
if (snapshot.task.currentTaskId !== task.id) {
33+
return null;
34+
}
35+
36+
if (!snapshot.runtime.rpcConnected || snapshot.runtime.taskSwitching) {
37+
return null;
38+
}
39+
40+
return snapshot;
41+
}, 60_000);
42+
43+
// Baseline: before any prompt the snapshot reports quick-start as visible.
expect(initial.ui.quickStartVisible).toBe(true);
44+
45+
await harness.prompt("continuity probe");
46+
47+
// Wait until the snapshot for this task reports at least one message.
const withMessage = await harness.waitForSnapshot((snapshot) => {
48+
if (snapshot.task.currentTaskId !== task.id) {
49+
return null;
50+
}
51+
52+
if (snapshot.conversation.messageCount < 1) {
53+
return null;
54+
}
55+
56+
return snapshot;
57+
}, 90_000);
58+
59+
const messageCountBeforeBind = withMessage.conversation.messageCount;
60+
expect(messageCountBeforeBind).toBeGreaterThan(0);
61+
expect(withMessage.ui.quickStartVisible).toBe(false);
62+
63+
// Action under test: bind the first working folder to the task.
await harness.setFolder(workingFolder);
64+
65+
// Wait until the same task is current, the runtime is connected and not
// switching, and the snapshot reports the newly bound folder.
const afterBind = await harness.waitForSnapshot((snapshot) => {
66+
if (snapshot.task.currentTaskId !== task.id) {
67+
return null;
68+
}
69+
70+
if (!snapshot.runtime.rpcConnected || snapshot.runtime.taskSwitching) {
71+
return null;
72+
}
73+
74+
if (snapshot.task.currentWorkingFolder !== workingFolder) {
75+
return null;
76+
}
77+
78+
return snapshot;
79+
}, 120_000);
80+
81+
// Continuity: the message count must not drop below its pre-bind value.
expect(afterBind.conversation.messageCount).toBeGreaterThanOrEqual(messageCountBeforeBind);
82+
// NOTE(review): implied by the assertion above, since messageCountBeforeBind
// was already asserted to be greater than 0.
expect(afterBind.conversation.messageCount).toBeGreaterThan(0);
83+
expect(afterBind.ui.bootScreenVisible).toBe(false);
84+
expect(afterBind.ui.quickStartVisible).toBe(false);
85+
}, 240_000);
86+
});

src/lib/__tests__/integration/harness.ts

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ interface TaskSummary {
2020
workingFolder: string | null;
2121
}
2222

23+
/** Returns true only when the command response is exactly the string "OK". */
function isOkResponse(response: string): boolean {
24+
return response === "OK";
25+
}
26+
2327
export interface StateSnapshot {
2428
schemaVersion: number;
2529
timestamp: string;
@@ -62,6 +66,11 @@ export interface StateSnapshot {
6266
workingFolderRelative: string | null;
6367
mismatchVisible: boolean;
6468
};
69+
panels: {
70+
workingFolderFileRowCount: number;
71+
workingFolderLoadingVisible: boolean;
72+
workingFolderEmptyVisible: boolean;
73+
};
6574
}
6675

6776
function sleep(ms: number): Promise<void> {
@@ -271,12 +280,14 @@ export class IntegrationHarness {
271280
): Promise<T> {
272281
return await waitFor(
273282
async () => {
283+
let snapshot: StateSnapshot;
274284
try {
275-
const snapshot = await this.snapshot();
276-
return predicate(snapshot);
285+
snapshot = await this.snapshot();
277286
} catch {
278287
return null;
279288
}
289+
290+
return predicate(snapshot);
280291
},
281292
timeoutMs,
282293
250,
@@ -291,11 +302,25 @@ export class IntegrationHarness {
291302
workingFolder,
292303
});
293304

294-
if (response !== "OK") {
305+
if (!isOkResponse(response)) {
295306
throw new Error(`create_task failed: ${response}`);
296307
}
297308
}
298309

310+
/**
 * Sends a "prompt" command carrying `message` through sendCommand.
 * @throws Error("prompt failed: <response>") when the response is not "OK".
 */
async prompt(message: string): Promise<void> {
311+
const response = await this.sendCommand({ cmd: "prompt", message });
312+
if (!isOkResponse(response)) {
313+
throw new Error(`prompt failed: ${response}`);
314+
}
315+
}
316+
317+
/**
 * Sends a "set_folder" command carrying `folder` through sendCommand.
 * @throws Error("set_folder failed: <response>") when the response is not "OK".
 */
async setFolder(folder: string): Promise<void> {
318+
const response = await this.sendCommand({ cmd: "set_folder", folder });
319+
if (!isOkResponse(response)) {
320+
throw new Error(`set_folder failed: ${response}`);
321+
}
322+
}
323+
299324
/** Issues the "task_list" command via sendJson and returns the result typed as TaskSummary[]. */
async listTasks(): Promise<TaskSummary[]> {
300325
return await this.sendJson<TaskSummary[]>({ cmd: "task_list" });
301326
}
@@ -319,14 +344,14 @@ export class IntegrationHarness {
319344

320345
/**
 * Sends a "set_task" command for `taskId` through sendCommand.
 * @throws Error("set_task failed: <response>") when the response is not "OK".
 */
async setTask(taskId: string): Promise<void> {
321346
const response = await this.sendCommand({ cmd: "set_task", taskId });
322-
if (response !== "OK") {
347+
if (!isOkResponse(response)) {
323348
throw new Error(`set_task failed: ${response}`);
324349
}
325350
}
326351

327352
/**
 * Sends a "delete_task" command for `taskId` through sendCommand.
 * @throws Error("delete_task failed: <response>") when the response is not "OK".
 */
async deleteTask(taskId: string): Promise<void> {
328353
const response = await this.sendCommand({ cmd: "delete_task", taskId });
329-
if (response !== "OK") {
354+
if (!isOkResponse(response)) {
330355
throw new Error(`delete_task failed: ${response}`);
331356
}
332357
}
@@ -340,6 +365,20 @@ export class IntegrationHarness {
340365
}
341366
}
342367

368+
/**
 * Waits, via waitForSnapshot, for a snapshot in which the runtime reports
 * rpcConnected, is not taskSwitching, and the current task id equals `taskId`.
 * @param taskId id the snapshot's task.currentTaskId must match
 * @param timeoutMs forwarded to waitForSnapshot (default 120_000)
 * @returns the first snapshot satisfying all three conditions
 */
async waitForTaskSettled(taskId: string, timeoutMs = 120_000): Promise<StateSnapshot> {
369+
return await this.waitForSnapshot((snapshot) => {
370+
// Not settled while the RPC link is down or a task switch is in progress.
if (!snapshot.runtime.rpcConnected || snapshot.runtime.taskSwitching) {
371+
return null;
372+
}
373+
374+
// Settled, but on a different task than the one requested.
if (snapshot.task.currentTaskId !== taskId) {
375+
return null;
376+
}
377+
378+
return snapshot;
379+
}, timeoutMs);
380+
}
381+
343382
/** Returns the LOG_PATH value (defined outside this hunk). */
logPath(): string {
344383
return LOG_PATH;
345384
}

src/lib/__tests__/integration/runtime-mismatch-badge.integration.test.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/* @vitest-environment node */
2+
3+
import path from "node:path";
4+
import { afterAll, beforeAll, describe, expect, it } from "vitest";
5+
import { IntegrationHarness } from "./harness";
6+
7+
// Live regression canary: the runtime mismatch flag in the state snapshot must
// stay false before a folder bind, in every transient snapshot observed while
// the bind is in progress, and once the bind has settled.
describe.sequential("runtime mismatch badge semantics regression", () => {
8+
const harness = new IntegrationHarness();
9+
// Timestamped prefix so the tasks created by this run can be located and
// deleted by prefix in beforeAll/afterAll.
const prefix = `regression-mismatch-${Date.now()}`;
10+
11+
// The folder that gets bound is the directory the test process runs from.
const workingFolder = path.resolve(process.cwd());
12+
13+
beforeAll(async () => {
14+
await harness.start();
15+
await harness.deleteTasksByPrefix(prefix);
16+
}, 240_000);
17+
18+
afterAll(async () => {
19+
// Cleanup is best-effort: a failed delete must not prevent harness.stop().
await harness.deleteTasksByPrefix(prefix).catch(() => undefined);
20+
await harness.stop();
21+
}, 90_000);
22+
23+
it("does not show runtime mismatch badge during transient reconfigure states", async () => {
24+
const title = `${prefix}-task`;
25+
// Second argument is null: the task starts with no working folder.
await harness.createTask(title, null);
26+
27+
const task = await harness.waitForTaskByTitle(title, 30_000);
28+
await harness.setTask(task.id);
29+
30+
const settledBeforeBind = await harness.waitForTaskSettled(task.id, 90_000);
31+
expect(settledBeforeBind.runtimeDebug.mismatchVisible).toBe(false);
32+
33+
// Action under test: binding a folder, which is expected to pass through
// transient states before settling.
await harness.setFolder(workingFolder);
34+
35+
const settledAfterBind = await harness.waitForSnapshot((snapshot) => {
36+
if (snapshot.task.currentTaskId !== task.id) {
37+
return null;
38+
}
39+
40+
// A snapshot counts as transient while switching tasks, while the RPC
// link is down, or while the reconfigure banner is showing.
const transient =
41+
snapshot.runtime.taskSwitching ||
42+
!snapshot.runtime.rpcConnected ||
43+
snapshot.ui.reconfigureBannerVisible;
44+
45+
if (transient) {
46+
if (snapshot.runtimeDebug.mismatchVisible) {
47+
// Throwing (rather than returning null) is meant to fail the test on
// the first transient snapshot that shows the badge.
// NOTE(review): only sampled snapshots are checked, so a badge that
// flashes between two samples would go unnoticed.
throw new Error("runtime mismatch badge became visible during transient reconfigure state");
48+
}
49+
return null;
50+
}
51+
52+
// Settled, but the bound folder is not reflected yet: keep waiting.
if (snapshot.task.currentWorkingFolder !== workingFolder) {
53+
return null;
54+
}
55+
56+
return snapshot;
57+
}, 120_000);
58+
59+
expect(settledAfterBind.runtimeDebug.mismatchVisible).toBe(false);
60+
}, 240_000);
61+
});

src/lib/__tests__/integration/working-folder-panel-refresh.integration.test.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/* @vitest-environment node */
2+
3+
import path from "node:path";
4+
import { afterAll, beforeAll, describe, expect, it } from "vitest";
5+
import { IntegrationHarness } from "./harness";
6+
7+
// Live regression canary: after the first working-folder bind, the snapshot's
// working-folder panel must go from zero file rows to at least one.
describe.sequential("working folder panel refresh regression", () => {
8+
const harness = new IntegrationHarness();
9+
// Timestamped prefix so the tasks created by this run can be located and
// deleted by prefix in beforeAll/afterAll.
const prefix = `regression-working-panel-${Date.now()}`;
10+
11+
// The folder that gets bound is the directory the test process runs from.
// NOTE(review): the final assertions assume this directory yields at least
// one file row in the panel — confirm for the CI working directory.
const workingFolder = path.resolve(process.cwd());
12+
13+
beforeAll(async () => {
14+
await harness.start();
15+
await harness.deleteTasksByPrefix(prefix);
16+
}, 240_000);
17+
18+
afterAll(async () => {
19+
// Cleanup is best-effort: a failed delete must not prevent harness.stop().
await harness.deleteTasksByPrefix(prefix).catch(() => undefined);
20+
await harness.stop();
21+
}, 90_000);
22+
23+
// NOTE(review): the title says "immediately", but the wait below allows up to
// 120_000 ms for the rows to appear; no latency bound is asserted.
it("refreshes working-folder file list immediately after first bind", async () => {
24+
const title = `${prefix}-task`;
25+
// Second argument is null: the task starts with no working folder.
await harness.createTask(title, null);
26+
27+
const task = await harness.waitForTaskByTitle(title, 30_000);
28+
await harness.setTask(task.id);
29+
await harness.waitForTaskSettled(task.id, 90_000);
30+
31+
// Baseline snapshot: same settled conditions as above, plus no working
// folder bound yet.
const beforeBind = await harness.waitForSnapshot((snapshot) => {
32+
if (snapshot.task.currentTaskId !== task.id) {
33+
return null;
34+
}
35+
36+
if (!snapshot.runtime.rpcConnected || snapshot.runtime.taskSwitching) {
37+
return null;
38+
}
39+
40+
if (snapshot.task.currentWorkingFolder !== null) {
41+
return null;
42+
}
43+
44+
return snapshot;
45+
}, 60_000);
46+
47+
expect(beforeBind.panels.workingFolderFileRowCount).toBe(0);
48+
49+
// Action under test: bind the first working folder to the task.
await harness.setFolder(workingFolder);
50+
51+
// Wait for a settled snapshot that reports the bound folder and at least
// one file row in the working-folder panel.
const afterBind = await harness.waitForSnapshot((snapshot) => {
52+
if (snapshot.task.currentTaskId !== task.id) {
53+
return null;
54+
}
55+
56+
if (!snapshot.runtime.rpcConnected || snapshot.runtime.taskSwitching) {
57+
return null;
58+
}
59+
60+
if (snapshot.task.currentWorkingFolder !== workingFolder) {
61+
return null;
62+
}
63+
64+
if (snapshot.panels.workingFolderFileRowCount < 1) {
65+
return null;
66+
}
67+
68+
return snapshot;
69+
}, 120_000);
70+
71+
// The row-count check restates the wait predicate; the empty-state check is
// the additional assertion.
expect(afterBind.panels.workingFolderFileRowCount).toBeGreaterThan(0);
72+
expect(afterBind.panels.workingFolderEmptyVisible).toBe(false);
73+
}, 240_000);
74+
});

0 commit comments

Comments
 (0)