Commit de13f7c
feat(context): implement app-aware dictation with specialized prompting
- Add ContextDetector to identify developer tools (VS Code, Xcode, etc.) vs prose apps
- Introduce ContextProfile to supply Whisper with technical glossaries in code contexts
- Update DictationStateMachine to capture context on hotkey press
- Pass context prompts to both Local and Cloud transcription providers
- Update Notch UI to display context-specific icons (terminal vs microphone)
- Real-time context tracking via NSWorkspace app activation observers
1 parent 3097516 commit de13f7c

15 files changed (+285 additions, -36 deletions)
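End to end, the changes form one pipeline: detect the frontmost app at the hotkey boundary, stash the result on the state machine, and use it to bias transcription. A condensed sketch of that flow, using the types from the diffs below — the wiring here is a simplified stand-in for the real AppDelegate code, not the committed implementation:

// Condensed flow - simplified stand-in for the AppDelegate wiring in this commit.
@MainActor
func dictate(using sm: DictationStateMachine, audioURL: URL) async throws -> String {
    // 1. Capture context at the hotkey boundary, from the frontmost app.
    let context = ContextDetector.detect()

    // 2. The state machine stores it and starts recording.
    sm.send(.hotkeyPressed(context: context))

    // ... user speaks; a second hotkey press (with a fresh context) stops recording ...

    // 3. The stored context biases the transcription request.
    return try await TranscriptionCoordinator.shared.transcribe(
        audioURL: audioURL,
        context: sm.currentContext
    )
}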

OpenDictation/App/AppDelegate.swift

Lines changed: 42 additions & 5 deletions
@@ -37,6 +37,9 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSWindowDelegate {
     /// Active transcription task (for cancellation support)
     private var transcriptionTask: Task<Void, Never>?

+    /// App activation observer token (for cleanup)
+    private var appActivationObserver: NSObjectProtocol?
+
     // MARK: - NSApplicationDelegate

     func applicationDidFinishLaunching(_ notification: Notification) {
@@ -72,13 +75,19 @@
     }

     func applicationWillTerminate(_ notification: Notification) {
-        // Remove screen change observer
+        // Remove observers
         NotificationCenter.default.removeObserver(
             self,
             name: NSApplication.didChangeScreenParametersNotification,
             object: nil
         )

+        // Remove app activation observer
+        if let observer = appActivationObserver {
+            NSWorkspace.shared.notificationCenter.removeObserver(observer)
+            appActivationObserver = nil
+        }
+
         // Restore volume if still ducked (safety net)
         audioFeedbackService?.restoreVolume()
         notchPanel?.hide()
@@ -241,6 +250,25 @@
         }
         .store(in: &cancellables)

+        // Observe app activation to update context in real-time during recording
+        // Single source of truth: state machine holds the context, panel displays it
+        appActivationObserver = NSWorkspace.shared.notificationCenter.addObserver(
+            forName: NSWorkspace.didActivateApplicationNotification,
+            object: nil,
+            queue: .main
+        ) { [weak self] _ in
+            guard let self = self,
+                  let sm = self.stateMachine,
+                  sm.state == .recording else { return }
+
+            // Update state machine (source of truth)
+            let context = ContextDetector.detect()
+            sm.currentContext = context
+
+            // Sync to panel display
+            self.notchPanel?.setContext(context)
+        }
+
         // Wire up state machine callbacks
         sm.onShowPanel = { [weak self] in
             guard let self = self else { return }
@@ -274,6 +302,10 @@
             }

             // Show panel if available (non-notch Macs get audio feedback only)
+            // Pass captured context to the panel for icon display
+            if let sm = self.stateMachine {
+                self.notchPanel?.setContext(sm.currentContext)
+            }
             self.notchPanel?.show()
         }

@@ -378,7 +410,9 @@
         }

         do {
-            let text = try await TranscriptionCoordinator.shared.transcribe(audioURL: audioURL)
+            // Capture state machine reference again inside Task if needed, but it's captured outside
+            let context = stateMachine?.currentContext ?? .prose
+            let text = try await TranscriptionCoordinator.shared.transcribe(audioURL: audioURL, context: context)

             // Check if task was cancelled
             guard !Task.isCancelled else { return }
@@ -432,11 +466,14 @@
         hotkeyService?.onHotkeyPressed = { [weak sm] in
             guard let sm = sm else { return }

+            // Capture context NOW (at boundary) - runs on MainActor
+            let context = ContextDetector.detect()
+
             // Toggle behavior: if recording, stop; otherwise start
             if sm.state == .recording {
-                sm.send(.hotkeyPressed) // This stops recording
+                sm.send(.hotkeyPressed(context: context)) // This stops recording
             } else if sm.state == .idle {
-                sm.send(.hotkeyPressed) // This starts recording
+                sm.send(.hotkeyPressed(context: context)) // This starts recording
             }
             // Ignore hotkey in other states (processing, success, etc.)
         }
@@ -457,7 +494,7 @@
         sm.isMockMode = true

         print("[Test] Error: recording → processing → error (mock mode)")
-        sm.send(.hotkeyPressed) // → .recording (no real recording)
+        sm.send(.hotkeyPressed(context: .prose)) // → .recording (no real recording)

         DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak sm] in
             sm?.send(.stopRecording)
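A note on the observer lifecycle above: the block-based addObserver(forName:object:queue:using:) API returns an opaque token rather than registering self, so the token must be stored and removed by hand — that is what the new appActivationObserver property is for. The pattern in isolation (ActivationWatcher is a made-up name for illustration):

import AppKit

final class ActivationWatcher {
    private var token: NSObjectProtocol?

    func start() {
        // The returned token is the only handle to this registration.
        token = NSWorkspace.shared.notificationCenter.addObserver(
            forName: NSWorkspace.didActivateApplicationNotification,
            object: nil,
            queue: .main
        ) { note in
            let app = note.userInfo?[NSWorkspace.applicationUserInfoKey] as? NSRunningApplication
            print("Activated:", app?.bundleIdentifier ?? "unknown")
        }
    }

    func stop() {
        // Without removal, the block (and anything it captures) stays registered.
        if let token { NSWorkspace.shared.notificationCenter.removeObserver(token) }
        token = nil
    }
}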
ContextCategory.swift

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import Foundation
+
+/// Categories for app context with associated SF Symbol icons.
+/// Extensible design system for visual differentiation in the notch UI.
+///
+/// Icon Design Guidelines:
+/// - Use `.fill` variants for visibility on dark backgrounds
+/// - Icons should be recognizable at 14x14pt (matching waveform height)
+/// - Choose base symbols without decorative suffixes (.circle, .square, etc.)
+enum ContextCategory: String, CaseIterable, Equatable {
+    /// Code editors, terminals, IDEs
+    case code
+    /// General prose apps (Notes, TextEdit, etc.)
+    case prose
+    /// Communication apps (Slack, Discord, Messages, etc.)
+    case communication
+    /// Productivity apps (Notion, Obsidian, Linear, etc.)
+    case productivity
+    /// Creative apps (Figma, Photoshop, etc.)
+    case creative
+    /// Web browsers
+    case browser
+    /// Media apps (Spotify, Music, etc.)
+    case media
+    /// Default fallback
+    case general
+
+    /// SF Symbol name for this category.
+    var sfSymbol: String {
+        switch self {
+        case .code:
+            return "terminal.fill"
+        case .prose:
+            return "doc.text.fill"
+        case .communication:
+            return "bubble.left.fill"
+        case .productivity:
+            return "checklist"
+        case .creative:
+            return "paintbrush.fill"
+        case .browser:
+            return "globe"
+        case .media:
+            return "play.fill"
+        case .general:
+            return "microphone.fill"
+        }
+    }
+
+    /// Human-readable display name.
+    var displayName: String {
+        switch self {
+        case .code: return "Code"
+        case .prose: return "Prose"
+        case .communication: return "Communication"
+        case .productivity: return "Productivity"
+        case .creative: return "Creative"
+        case .browser: return "Browser"
+        case .media: return "Media"
+        case .general: return "General"
+        }
+    }
+}
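Downstream, the panel's setContext(_:) presumably resolves one of these symbols into an icon. A sketch of how that rendering might look in AppKit — the view setup is illustrative, not the actual notch panel code:

import AppKit

// Illustrative only - how a panel view might render a category icon.
func makeContextIcon(for category: ContextCategory) -> NSImageView {
    let image = NSImage(
        systemSymbolName: category.sfSymbol,
        accessibilityDescription: category.displayName
    )
    let view = NSImageView(image: image ?? NSImage())
    view.contentTintColor = .white  // .fill variants read well on dark backgrounds
    view.symbolConfiguration = .init(pointSize: 14, weight: .medium) // 14x14pt guideline
    return view
}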
ContextDetector.swift

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+import AppKit
+
+/// Detects context profile from the frontmost application at dictation time.
+/// Follows Law #1: Early Exit - unknown apps default to prose.
+enum ContextDetector {
+
+    /// Bundle identifiers for developer applications.
+    /// Sources: OpenInTerminal, Claude Island, AeroSpace
+    private static let developerBundleIDs: Set<String> = [
+        // IDEs
+        "com.apple.Xcode",
+        "com.microsoft.VSCode",
+        "com.microsoft.VSCodeInsiders",
+        "com.todesktop.230313mzl4w4u92", // Cursor
+        "dev.zed.Zed",
+        "dev.zed.Zed-Preview",
+        "com.exafunction.windsurf",
+        "com.google.antigravity",
+        "ai.opencode.desktop",
+        "com.visualstudio.code.oss", // VSCodium
+        "com.sublimetext.3",
+        "com.sublimetext.4",
+        "com.panic.Nova",
+        "org.vim.MacVim",
+        "com.macromates.TextMate",
+        "com.barebones.bbedit",
+
+        // JetBrains
+        "com.jetbrains.AppCode",
+        "com.jetbrains.CLion",
+        "com.jetbrains.fleet",
+        "com.jetbrains.goland",
+        "com.jetbrains.intellij",
+        "com.jetbrains.PhpStorm",
+        "com.jetbrains.pycharm",
+        "com.jetbrains.rubymine",
+        "com.jetbrains.WebStorm",
+        "com.jetbrains.rider",
+        "com.jetbrains.datagrip",
+
+        // Git & DB Tools
+        "com.github.GitHubClient", // GitHub Desktop
+        "io.beekeeperstudio.desktop",
+
+        // Terminals
+        "com.apple.Terminal",
+        "com.googlecode.iterm2",
+        "net.kovidgoyal.kitty",
+        "com.github.wez.wezterm",
+        "io.alacritty",
+        "dev.warp.Warp-Stable",
+        "com.mitchellh.ghostty",
+        "co.zeit.hyper",
+        "org.tabby"
+    ]
+
+    /// Detects the current context profile based on frontmost application.
+    /// Called at dictation trigger time (Option+Space).
+    ///
+    /// - Returns: `.code` if in a developer app, `.prose` otherwise.
+    static func detect() -> ContextProfile {
+        guard let bundleID = NSWorkspace.shared.frontmostApplication?.bundleIdentifier else {
+            return .prose
+        }
+        return developerBundleIDs.contains(bundleID) ? .code : .prose
+    }
+}
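Callers get a one-line API; any bundle ID not in the set falls through to .prose, so an unlisted editor degrades to default behavior instead of breaking:

// At the hotkey boundary (main thread, per the AppDelegate comment above):
let profile = ContextDetector.detect()
print(profile) // .code in VS Code or iTerm2; .prose in Mail, Safari, or with no frontmost app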
ContextProfile.swift

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import Foundation
+
+/// Represents the transcription context - once parsed, this is the trusted state.
+/// Follows Law #2: Make Illegal States Unrepresentable
+enum ContextProfile: Equatable {
+    case code  // IDE, terminal, code editor
+    case prose // Standard apps (Mail, Slack, Notes, etc.)
+
+    /// The visual category for icon display in the notch UI.
+    var category: ContextCategory {
+        switch self {
+        case .code:
+            return .code
+        case .prose:
+            return .general // Default to general (mic icon) for prose
+        }
+    }
+
+    /// The initial prompt to bias Whisper transcription for this context.
+    /// Returns a natural-language style hint for prose, a technical glossary for code.
+    var whisperPrompt: String? {
+        switch self {
+        case .prose:
+            // Natural language prompt for proper punctuation/capitalization.
+            // Pattern from VoiceInk - a greeting that demonstrates the style.
+            return "Hello, how are you doing today? I hope you're having a great day."
+
+        case .code:
+            // Glossary format per OpenAI Whisper prompting guide.
+            // Whisper matches style/spelling, not instructions.
+            // 224 token limit - glossary packs more terms efficiently.
+            // swiftlint:disable:next line_length
+            return """
+            Glossary: shadcn, MCP, SDK, API, tRPC, RAG, LLM, CLI,
+            git pull, git push, git commit, git merge, git rebase, git stash, git diff,
+            npm, pnpm, bun, yarn, npx, pip, poetry, uv,
+            zod, prisma, drizzle, tanstack, vitest, playwright,
+            NextJS, Vercel, Supabase, Firebase, Cloudflare, Netlify,
+            LangChain, LlamaIndex, OpenAI, Anthropic, Ollama, Gemini, Claude,
+            useEffect, useState, useCallback, useMemo, async await,
+            TypeScript, JavaScript, Python, Swift, Rust, Go,
+            tailwind, vite, remix, astro, nuxt, svelte,
+            kubectl, terraform, docker, nginx, redis, postgres,
+            localhost, env, dotenv, JSON, YAML, GraphQL, REST
+            """
+        }
+    }
+}

OpenDictation/Core/Services/CloudTranscriptionProvider.swift

Lines changed: 14 additions & 4 deletions
@@ -109,10 +109,12 @@ actor CloudTranscriptionProvider: TranscriptionProvider {

     /// Transcribes the audio file at the given URL.
     ///
-    /// - Parameter audioURL: URL to the audio file (wav, m4a, mp3, etc.)
+    /// - Parameters:
+    ///   - audioURL: URL to the audio file (wav, m4a, mp3, etc.)
+    ///   - context: The pre-captured context profile.
     /// - Returns: The transcribed text.
     /// - Throws: `TranscriptionError` if transcription fails.
-    func transcribe(audioURL: URL) async throws -> String {
+    func transcribe(audioURL: URL, context: ContextProfile) async throws -> String {
         // Get API key from Keychain
         guard let apiKey = KeychainService.shared.load(KeychainService.Key.apiKey),
               !apiKey.isEmpty else {
@@ -167,7 +169,8 @@
         let body = buildMultipartBody(
             boundary: boundary,
             audioData: audioData,
-            audioFileName: audioURL.lastPathComponent
+            audioFileName: audioURL.lastPathComponent,
+            context: context
         )

         // Make the request using upload API (better for large files)
@@ -236,7 +239,7 @@

     // MARK: - Multipart Form Data

-    private func buildMultipartBody(boundary: String, audioData: Data, audioFileName: String) -> Data {
+    private func buildMultipartBody(boundary: String, audioData: Data, audioFileName: String, context: ContextProfile) -> Data {
         var body = Data()
         let crlf = "\r\n"

@@ -278,6 +281,13 @@
             appendString("Content-Disposition: form-data; name=\"language\"\(crlf)\(crlf)")
             appendString("\(languageCode)\(crlf)")
         }
+
+        // Prompt field (only if context provides one)
+        if let prompt = context.whisperPrompt {
+            appendString("--\(boundary)\(crlf)")
+            appendString("Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)")
+            appendString("\(prompt)\(crlf)")
+        }

         // Closing boundary
         appendString("--\(boundary)--\(crlf)")
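For reference, the resulting body part can be reproduced in a few lines — this sketch assumes the same CRLF framing the provider uses, with a made-up boundary string:

func promptPart(boundary: String, prompt: String) -> String {
    let crlf = "\r\n"
    return "--\(boundary)\(crlf)"
        + "Content-Disposition: form-data; name=\"prompt\"\(crlf)\(crlf)"
        + prompt + crlf
}

print(promptPart(boundary: "Boundary-ABC", prompt: "Glossary: shadcn, MCP, SDK, ..."))
// --Boundary-ABC
// Content-Disposition: form-data; name="prompt"
//
// Glossary: shadcn, MCP, SDK, ...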

OpenDictation/Core/Services/DictationStateMachine.swift

Lines changed: 17 additions & 4 deletions
@@ -16,7 +16,7 @@ enum DictationState: Equatable {

 /// Events that trigger state transitions.
 enum DictationEvent {
-    case hotkeyPressed
+    case hotkeyPressed(context: ContextProfile)
     case stopRecording
     case transcriptionStarted
     case transcriptionCompleted(text: String)
@@ -47,6 +47,9 @@ final class DictationStateMachine: ObservableObject {

     @Published private(set) var state: DictationState = .idle

+    /// The context profile captured at hotkey press (persists across states)
+    internal(set) var currentContext: ContextProfile = .prose
+
     // MARK: - Mock Mode

     /// When true, state transitions occur without triggering service callbacks.
@@ -83,22 +86,32 @@
         switch (state, event) {

         // MARK: From Idle
-        case (.idle, .hotkeyPressed):
+        case (.idle, .hotkeyPressed(let context)):
+            currentContext = context
             state = .recording
             onShowPanel?()
             if !isMockMode {
                 onStartRecording?()
             }

         // MARK: From Recording
-        case (.recording, .hotkeyPressed),
-             (.recording, .stopRecording):
+        case (.recording, .hotkeyPressed(let context)):
+            // Update context to match where user ENDS (not starts) dictation
+            // This ensures the icon and transcription are in sync
+            currentContext = context
             if !isMockMode {
                 onStopRecording?()
             }
             // Don't transition yet - wait for transcription result or timeout
             // The transition to .processing happens via transcriptionStarted event

+        case (.recording, .stopRecording):
+            // stopRecording doesn't carry context, keep existing
+            if !isMockMode {
+                onStopRecording?()
+            }
+            // Don't transition yet - wait for transcription result or timeout
+
         case (.recording, .transcriptionStarted):
             // Transcription has started, show processing state
             state = .processing
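Mock mode makes the new transitions cheap to exercise. A test-style sketch — it assumes DictationStateMachine has an accessible initializer, and uses only the API shown in the diff:

@MainActor
func exerciseContextTransitions() {
    let sm = DictationStateMachine()
    sm.isMockMode = true

    sm.send(.hotkeyPressed(context: .code))   // idle → recording, context = .code
    assert(sm.state == .recording && sm.currentContext == .code)

    // User switches to Notes mid-dictation, then presses the hotkey again:
    sm.send(.hotkeyPressed(context: .prose))  // still .recording, context now .prose
    assert(sm.state == .recording && sm.currentContext == .prose)

    sm.send(.transcriptionStarted)            // recording → processing
    assert(sm.state == .processing)
}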
