Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cotabby.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
046C133967B32BBF9205EBB1 /* LLMIOFileHandler.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8D610FCA3A97249DCCE7D0B8 /* LLMIOFileHandler.swift */; };
078FDE669437D756678E9AB7 /* SettingsRowLabel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 907549CB913B40C28B953A5D /* SettingsRowLabel.swift */; };
07D046D406411ED85AC5758A /* InputMonitorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = BAC01317B0B68E3C4125E421 /* InputMonitorTests.swift */; };
09F37A67341E57839C3349A8 /* VisualContextSummaryPromptRenderer.swift in Sources */ = {isa = PBXBuildFile; fileRef = C448F8CE0CC6C103C1A63FE5 /* VisualContextSummaryPromptRenderer.swift */; };
0A2DDD946654076675AC0FC6 /* LanguageCatalog.swift in Sources */ = {isa = PBXBuildFile; fileRef = BF4BB93056F291FD24EFAD22 /* LanguageCatalog.swift */; };
0A3443AEE6540F11E5E6BF8F /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3E8E86A14090BC7BD13BA76 /* AppDelegate.swift */; };
0A658BF137DBD0898E40B87F /* AcknowledgementsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */; };
Expand All @@ -36,6 +37,7 @@
15FA56CEF6FB5FF54C2FBA6F /* PermissionAndContextModelTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E7F42112F14026E6253BB865 /* PermissionAndContextModelTests.swift */; };
190C571B3CDFE117F4D15484 /* LlamaPromptRendererTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3009812A35A1CDEF16295AB7 /* LlamaPromptRendererTests.swift */; };
19CB55B62977376E9AE8D428 /* VisualContextStartCoalescer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2F01FAC4F57EB08471521196 /* VisualContextStartCoalescer.swift */; };
1B3FFCB9A979F49BF86EAAD4 /* ScreenshotContextGeneratorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = B2BFD19A159680A495EE02FD /* ScreenshotContextGeneratorTests.swift */; };
1C4A2BAB2CCADF0A70B70AC6 /* LlamaPromptRenderer.swift in Sources */ = {isa = PBXBuildFile; fileRef = B5679E08C9A09065531C37B5 /* LlamaPromptRenderer.swift */; };
1F8CC88AFFE67C08944CF506 /* WindowScreenshotService.swift in Sources */ = {isa = PBXBuildFile; fileRef = 77B0121E7BB173F8A2B0B108 /* WindowScreenshotService.swift */; };
2197B68F1E4D0C3497DAC061 /* LlamaSuggestionEngine.swift in Sources */ = {isa = PBXBuildFile; fileRef = BE04620C905041680116BE80 /* LlamaSuggestionEngine.swift */; };
Expand Down Expand Up @@ -83,6 +85,7 @@
4CAFD8F3444FEDC9ACAFF529 /* LlamaRuntimeModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = A804F4DB6FD9BC8C27B2B65F /* LlamaRuntimeModels.swift */; };
4F369F5284DDCEABF082E59B /* SuggestionAvailabilityEvaluator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3609CC88A5280B3AA40414DF /* SuggestionAvailabilityEvaluator.swift */; };
4F38CE1C2602CF4F41323032 /* PermissionOverlayTrackerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 12DD19BCE610808F1E38702D /* PermissionOverlayTrackerTests.swift */; };
5080B61DA368091E3614BE90 /* VisualContextSummaryPromptRendererTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 05E051F74207D1C9A7D2B991 /* VisualContextSummaryPromptRendererTests.swift */; };
51C069603DA16830868F1628 /* LanguageTagsEditor.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9A7CDA90E128350BFF1A9D66 /* LanguageTagsEditor.swift */; };
52518CF0760DFEE9AF7C786C /* SuggestionEngineRouter.swift in Sources */ = {isa = PBXBuildFile; fileRef = 384FBCF5D7A3A446C5BE2B8D /* SuggestionEngineRouter.swift */; };
53FB56A095BCF0389DAC0A56 /* SuggestionTextColorCodec.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1CE61E74928C221B8BB261C6 /* SuggestionTextColorCodec.swift */; };
Expand Down Expand Up @@ -240,6 +243,7 @@
04D853218B0A77B0CE090828 /* BrowserAppDetectorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BrowserAppDetectorTests.swift; sourceTree = "<group>"; };
04E25414C307A20B6F9F20EC /* FocusSnapshotResolver.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusSnapshotResolver.swift; sourceTree = "<group>"; };
050D929E13BE52E6282B64D2 /* VisualContextStartCoalescerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VisualContextStartCoalescerTests.swift; sourceTree = "<group>"; };
05E051F74207D1C9A7D2B991 /* VisualContextSummaryPromptRendererTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VisualContextSummaryPromptRendererTests.swift; sourceTree = "<group>"; };
07480CE96ED0EBD94817C6B1 /* GeneralPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GeneralPaneView.swift; sourceTree = "<group>"; };
0850B07CCDBA67C756C6EC59 /* ShortcutConflictTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ShortcutConflictTests.swift; sourceTree = "<group>"; };
09FADF683BE7B3558377FA76 /* FocusPollBackoff.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusPollBackoff.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -374,6 +378,7 @@
AD9573F3504CAE6891DF9B7D /* AppUpdateManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppUpdateManager.swift; sourceTree = "<group>"; };
ADBE3E6CC585C1683787C877 /* SuggestionEngineModels.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionEngineModels.swift; sourceTree = "<group>"; };
AF1E065C7FFB697FCEB2FA5C /* CotabbyTestFixtures.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CotabbyTestFixtures.swift; sourceTree = "<group>"; };
B2BFD19A159680A495EE02FD /* ScreenshotContextGeneratorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ScreenshotContextGeneratorTests.swift; sourceTree = "<group>"; };
B2F95847D76893C8A5B504B4 /* SuggestionOverlayStabilityGate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionOverlayStabilityGate.swift; sourceTree = "<group>"; };
B424E2AC97C99D335B0D5751 /* SuggestionTextNormalizer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionTextNormalizer.swift; sourceTree = "<group>"; };
B4B4A2E2DD6733658EC05BD8 /* DownloadFileRescuer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuer.swift; sourceTree = "<group>"; };
Expand All @@ -397,6 +402,7 @@
C1C5DE0F3FF63545000E2453 /* DisplayCoordinateConverterTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DisplayCoordinateConverterTests.swift; sourceTree = "<group>"; };
C375227649689775275AA4B3 /* SuggestionCoordinatorAcceptanceTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionCoordinatorAcceptanceTests.swift; sourceTree = "<group>"; };
C379D77029D6E88C8C1B9AF7 /* emoji.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = emoji.json; sourceTree = "<group>"; };
C448F8CE0CC6C103C1A63FE5 /* VisualContextSummaryPromptRenderer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VisualContextSummaryPromptRenderer.swift; sourceTree = "<group>"; };
C71031E8DB171047318B92FC /* SyntheticReplacePlannerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SyntheticReplacePlannerTests.swift; sourceTree = "<group>"; };
C7B2D34A6F3AC9DFD61350F7 /* CotabbyDebugOptions.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CotabbyDebugOptions.swift; sourceTree = "<group>"; };
CA942A53B7C09D1F4EC57239 /* SuggestionInteractionState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionInteractionState.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -700,6 +706,7 @@
12DD19BCE610808F1E38702D /* PermissionOverlayTrackerTests.swift */,
0D80CC2CCAAFE3F23FB8C37A /* PromptContextSanitizerTests.swift */,
4696A84D17890B154533A08F /* PromptPolicyTests.swift */,
B2BFD19A159680A495EE02FD /* ScreenshotContextGeneratorTests.swift */,
2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */,
0850B07CCDBA67C756C6EC59 /* ShortcutConflictTests.swift */,
C05B0439348261163B37C508 /* SuggestionAvailabilityEvaluatorTests.swift */,
Expand All @@ -713,6 +720,7 @@
43E37A7E835D3BDE6265843C /* TerminalAppDetectorTests.swift */,
FC24FD54860CE6737E65EF65 /* TextDirectionDetectorTests.swift */,
050D929E13BE52E6282B64D2 /* VisualContextStartCoalescerTests.swift */,
05E051F74207D1C9A7D2B991 /* VisualContextSummaryPromptRendererTests.swift */,
1E0513E3B23937B099A3CFF2 /* WordCountFormatterTests.swift */,
);
path = CotabbyTests;
Expand Down Expand Up @@ -849,6 +857,7 @@
7F4C4A7EAF886E0CC945BFEF /* TerminalAppDetector.swift */,
328847A0F494360033366791 /* TextDirectionDetector.swift */,
2F01FAC4F57EB08471521196 /* VisualContextStartCoalescer.swift */,
C448F8CE0CC6C103C1A63FE5 /* VisualContextSummaryPromptRenderer.swift */,
815F2ABAF6AB75DA3AFBBCEF /* WordCountFormatter.swift */,
);
path = Support;
Expand Down Expand Up @@ -1118,6 +1127,7 @@
E9E4CC657771DF9F4C56183C /* VisualContextCoordinator.swift in Sources */,
4190F8A76196B16ED94D0A55 /* VisualContextModels.swift in Sources */,
19CB55B62977376E9AE8D428 /* VisualContextStartCoalescer.swift in Sources */,
09F37A67341E57839C3349A8 /* VisualContextSummaryPromptRenderer.swift in Sources */,
4AC255BE2D0CCC67B8882C7A /* WelcomeCoordinator.swift in Sources */,
344B9BF352C97CFA830853D6 /* WelcomePermissionStepView.swift in Sources */,
286B7022E2A2774275004447 /* WelcomeTemplateStepView.swift in Sources */,
Expand Down Expand Up @@ -1171,6 +1181,7 @@
4F38CE1C2602CF4F41323032 /* PermissionOverlayTrackerTests.swift in Sources */,
934885ACC2DEA20B27F10948 /* PromptContextSanitizerTests.swift in Sources */,
3CF1A4E39F24917DF0470A7D /* PromptPolicyTests.swift in Sources */,
1B3FFCB9A979F49BF86EAAD4 /* ScreenshotContextGeneratorTests.swift in Sources */,
C618C5595DA9C57C806A3E03 /* SettingsAttentionEvaluatorTests.swift in Sources */,
8441299082E6B68F7F88911B /* ShortcutConflictTests.swift in Sources */,
88BCD795A14E1C9308F7BB31 /* SuggestionAvailabilityEvaluatorTests.swift in Sources */,
Expand All @@ -1184,6 +1195,7 @@
DE236C9285635C686D66A2F6 /* TerminalAppDetectorTests.swift in Sources */,
5A441797D71A880A7482077D /* TextDirectionDetectorTests.swift in Sources */,
D5CAF3B590E5EC2AFC72E57A /* VisualContextStartCoalescerTests.swift in Sources */,
5080B61DA368091E3614BE90 /* VisualContextSummaryPromptRendererTests.swift in Sources */,
6AE0B46FB52D189D94E1F79A /* WordCountFormatterTests.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
Expand Down
15 changes: 8 additions & 7 deletions Cotabby/Models/VisualContextModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ struct VisualContextConfiguration: Equatable, Sendable {
let maxSummaryCharacters: Int

static let `default` = VisualContextConfiguration(
// Capture a compact area around the focused field instead of an entire window.
snapshotDimension: 500,
// Retina screenshots may still arrive at ~2x backing scale, so keep a small OCR ceiling.
maxImageDimension: 900,
// Capture a wider field-centered area so OCR can see nearby labels and conversation turns.
snapshotDimension: 700,
// Vision's accurate mode benefits from more pixels, especially on dense document UIs.
maxImageDimension: 1600,
minRecognizedCharacterCount: 12,
// OCR text is injected directly into the completion prompt, so keep it intentionally short.
maxRecognizedCharacters: 2000,
maxSummaryCharacters: 900
// The summarizer needs enough raw OCR to recover task, filenames, and nearby messages.
maxRecognizedCharacters: 5000,
// The final prompt still stays bounded even when summarization falls back to OCR.
maxSummaryCharacters: 1500
)
}

Expand Down
6 changes: 3 additions & 3 deletions Cotabby/Services/Permission/PermissionManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import Logging

/// File overview:
/// Polls and exposes the three system permissions Cotabby depends on: Accessibility for reading
/// focus state, Input Monitoring for global key capture, and Screen Recording for legacy screenshot
/// experiments that are currently deprecated in the autocomplete request path.
/// focus state, Input Monitoring for global key capture, and Screen Recording for screenshot
/// context that improves autocomplete relevance.
///
/// `@MainActor` guarantees permission state is mutated on the UI thread.
@MainActor
Expand Down Expand Up @@ -127,7 +127,7 @@ final class PermissionManager: ObservableObject {
openSettings(for: .inputMonitoring)
}

/// Opens System Settings directly to the Screen Recording pane for legacy screenshot tooling.
/// Opens System Settings directly to the Screen Recording pane for visual context capture.
func openScreenRecordingSettings() {
// Forwards to the shared `openSettings(for:)` helper, selecting the Screen Recording case.
openSettings(for: .screenRecording)
}
Expand Down
62 changes: 31 additions & 31 deletions Cotabby/Services/Visual/LlamaVisualContextSummarizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,25 @@ protocol VisualContextSummarizing: AnyObject, Sendable {
func summarize(text: String, applicationName: String) async throws -> String
}

/// Errors reported by visual-context summarization.
enum VisualContextSummarizationError: LocalizedError {
    /// The summarizer produced no usable text.
    case emptyResult
}

extension VisualContextSummarizationError {
    /// Human-readable description for each failure case.
    var errorDescription: String? {
        switch self {
        case .emptyResult:
            return "Visual context summarization produced no usable text."
        }
    }
}

/// Local-model implementation of visual-context summarization.
///
/// This type owns only the summarization prompt. Screenshot capture, OCR, prompt-injection limits,
/// and stale-session checks remain in their own services so model prompting does not become a
/// hidden owner of the visual-context lifecycle.
@MainActor
final class LlamaVisualContextSummarizer: VisualContextSummarizing {
private static let timeoutSeconds: UInt64 = 3
private static let timeoutSeconds: UInt64 = 6
private let runtimeManager: LlamaRuntimeManager

init(runtimeManager: LlamaRuntimeManager) {
Expand All @@ -32,37 +43,31 @@ final class LlamaVisualContextSummarizer: VisualContextSummarizing {
// repeating signal without losing any unique content.
let deduplicatedText = deduplicateConsecutiveLines(text)

let prompt = [
"Task: Write a concise, 4-sentence summary of what the provided text from the application '\(applicationName)' is about.",
"",
"Rules:",
"1. Output exactly and ONLY the summary text.",
"2. DO NOT add conversational filler (e.g., 'Here is the summary').",
"3. DO NOT add extra instructions or meta-commentary.",
"4. DO NOT repeat the prompt.",
"",
"--- START SCREEN TEXT ---",
deduplicatedText,
"--- END SCREEN TEXT ---",
"",
"Summary:"
].joined(separator: "\n")

let result = await summarizeWithTimeout(prompt: prompt)
let prompt = VisualContextSummaryPromptRenderer.prompt(
applicationName: applicationName,
screenText: deduplicatedText
)

let result = try await summarizeWithTimeout(prompt: prompt)
let trimmedResult = result.trimmingCharacters(in: .whitespacesAndNewlines)
return truncateAtRepeatedBlock(trimmedResult)
let cleanedResult = truncateAtRepeatedBlock(trimmedResult)
guard !cleanedResult.isEmpty else {
throw VisualContextSummarizationError.emptyResult
}

return cleanedResult
}

/// Soft timeout: runs generation in a child Task and cancels it after the deadline.
/// `LlamaRuntimeCore.summarize()` checks `Task.isCancelled` each token and returns whatever
/// partial text it has accumulated, so the result is the best-effort summary — not a failure.
private func summarizeWithTimeout(prompt: String) async -> String {
private func summarizeWithTimeout(prompt: String) async throws -> String {
let manager = runtimeManager

let generationTask = Task {
try await manager.summarize(
prompt: prompt,
maxPredictionTokens: 80,
maxPredictionTokens: 160,
temperature: 0
)
}
Expand All @@ -72,16 +77,11 @@ final class LlamaVisualContextSummarizer: VisualContextSummarizing {
generationTask.cancel()
}

// Wait for generation to finish. On timeout, cancel fires → Task.isCancelled breaks
// the token loop → core.summarize() returns partial text → task.value returns it.
let result: String
do {
result = try await generationTask.value
} catch {
CotabbyLogger.app.warning("Visual context summarization failed: \(error.localizedDescription)")
result = ""
}
timeoutTask.cancel()
defer { timeoutTask.cancel() }

// Wait for generation to finish. On timeout, cancellation either returns a partial summary
// from the runtime or throws; both paths are useful because the caller can fall back to OCR.
let result = try await generationTask.value
if result.isEmpty {
CotabbyLogger.app.debug("Summarization produced empty result")
} else {
Expand Down
23 changes: 15 additions & 8 deletions Cotabby/Services/Visual/ScreenTextExtractor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@ import Logging
/// This is the bridge between raw image capture and the existing text-only local LLM runtime.
///
/// We deliberately downsample very large screenshots before OCR. The goal is not archival fidelity;
/// it is fast, good-enough semantic extraction for autocomplete context.
///
/// DEPRECATED:
/// The current autocomplete request path no longer injects OCR-derived context.
/// Keep this extractor only for legacy experiments until the context rewrite lands.
/// it is bounded semantic extraction for autocomplete context. This pass favors useful text
/// recovery over minimum latency because the output is captured once per focused field.

/// Value returned by screen text extraction: the recognized text plus a line count.
struct ExtractedScreenText: Sendable {
/// Recognized text.
/// NOTE(review): `ScreenTextExtractor` appears to pass text that is already capped in length — confirm.
let text: String
/// Number of recognized lines reported by the extractor.
/// NOTE(review): this looks like the count of ordered lines before the text is capped — confirm.
let lineCount: Int
}

/// Test seam for screenshot OCR.
///
/// `ScreenshotContextGenerator` owns orchestration, while this protocol lets tests inject
/// deterministic OCR without depending on Vision, Screen Recording permission, or real pixels.
protocol ScreenTextExtracting {
/// Recognizes text in `image` and returns it as an `ExtractedScreenText`.
///
/// - Parameter image: The image to run text recognition on.
/// - Returns: The recognized text together with its line count.
/// - Throws: Whatever error the conforming extractor reports when recognition fails.
func extractText(from image: CGImage) async throws -> ExtractedScreenText
}

enum ScreenTextExtractionError: LocalizedError {
case noRecognizedText
case ocrFailed(String)
Expand All @@ -33,7 +38,7 @@ enum ScreenTextExtractionError: LocalizedError {
}
}

struct ScreenTextExtractor {
struct ScreenTextExtractor: ScreenTextExtracting {
let maxImageDimension: Int
let maxRecognizedCharacters: Int

Expand Down Expand Up @@ -98,9 +103,11 @@ struct ScreenTextExtractor {
continuation.resume(returning: ExtractedScreenText(text: cappedText, lineCount: orderedLines.count))
}

request.recognitionLevel = .fast
// Accurate OCR is slower, but visual context is only captured once per focused
// field and the result can materially improve autocomplete relevance.
request.recognitionLevel = .accurate
request.usesLanguageCorrection = false
request.minimumTextHeight = 0.012
request.minimumTextHeight = 0.008

do {
let handler = VNImageRequestHandler(cgImage: preparedImage, options: [:])
Expand Down
Loading