diff --git a/internal/appuse/display.go b/internal/appuse/display.go index b171748..a051251 100644 --- a/internal/appuse/display.go +++ b/internal/appuse/display.go @@ -24,10 +24,11 @@ type Rect struct { } type DisplayElement struct { - IsAX bool - Hint string - Name string - Frame Rect + IsAX bool + IsIcon bool + Hint string + Name string + Frame Rect // AX specific Role string Label string @@ -172,6 +173,31 @@ func PrintSnapshot(jsonOutput string, debug bool) []AreaNode { displayElements = append(displayElements, de) } + for _, icon := range resp.IconElements { + // Filter icon elements outside window bounds + if icon.Frame.X < wx-1 || icon.Frame.Y < wy-1 || + icon.Frame.X+icon.Frame.Width > wx+ww+1 || + icon.Frame.Y+icon.Frame.Height > wy+wh+1 { + continue + } + + // Convert screen points to window-relative pixels + px := (icon.Frame.X - wx) * scaleX + py := (icon.Frame.Y - wy) * scaleY + pw := icon.Frame.Width * scaleX + ph := icon.Frame.Height * scaleY + + de := DisplayElement{ + IsAX: false, + IsIcon: true, + Hint: icon.Hint, + Name: fmt.Sprintf("Icon (%.0f%%)", icon.Confidence*100), + OriginalFingerprint: getIconFingerprint(icon), + } + de.Frame.X, de.Frame.Y, de.Frame.Width, de.Frame.Height = px, py, pw, ph + displayElements = append(displayElements, de) + } + var areas []AreaNode if len(displayElements) > 0 { areas = printAreaHierarchy(img, imgBounds, displayElements, wx, wy, ww, wh, scaleX, scaleY, prevMap, debug) @@ -228,6 +254,10 @@ func getOCRFingerprint(ocr OCRElement) string { return "OCR|" + ocr.Name } +func getIconFingerprint(icon IconElement) string { + return fmt.Sprintf("ICON|%.0f,%.0f,%.0f,%.0f", icon.Frame.X, icon.Frame.Y, icon.Frame.Width, icon.Frame.Height) +} + type displayTreeNode struct { e DisplayElement children []*displayTreeNode @@ -309,7 +339,9 @@ func printAreaHierarchy(img image.Image, imgBounds image.Rectangle, elements []D printElementNode = func(n *displayTreeNode, baseIndent, extraSpace string) { e := n.e ocrChar := " " - if !e.IsAX || strings.Contains(e.Name, " (via OCR)") { + if e.IsIcon { + ocrChar = "#" + } else if !e.IsAX || strings.Contains(e.Name, " (via OCR)") { ocrChar = "*" } diffChar := " " @@ -342,9 +374,12 @@ func printAreaHierarchy(img image.Image, imgBounds image.Rectangle, elements []D // Draw bounding boxes for all elements green := color.RGBA{0, 255, 0, 255} yellow := color.RGBA{255, 255, 0, 255} + red := color.RGBA{255, 0, 0, 255} for _, e := range elements { c := green - if !e.IsAX { + if e.IsIcon { + c = red + } else if !e.IsAX { c = yellow } drawRect(debugImg, e.Frame, c) diff --git a/internal/appuse/models.go b/internal/appuse/models.go index 90145a0..7e2d854 100644 --- a/internal/appuse/models.go +++ b/internal/appuse/models.go @@ -54,10 +54,22 @@ type SnapshotResponse struct { Elements []ElementNode `json:"elements"` FrontmostWindow *BreadcrumbNode `json:"frontmostWindow"` OCRElements []OCRElement `json:"ocrElements,omitempty"` + IconElements []IconElement `json:"iconElements,omitempty"` Areas []AreaNode `json:"areas,omitempty"` Caret *CaretRect `json:"caret,omitempty"` } +type IconElement struct { + Confidence float64 `json:"confidence"` + Hint string `json:"hint"` + Frame struct { + X float64 `json:"x"` + Y float64 `json:"y"` + Width float64 `json:"width"` + Height float64 `json:"height"` + } `json:"frame"` +} + type AreaNode struct { Name string `json:"name"` Hint string `json:"hint"` diff --git a/internal/appuse/service.go b/internal/appuse/service.go index 2bf34e7..e26592e 100644 --- a/internal/appuse/service.go +++ b/internal/appuse/service.go @@ -228,6 +228,13 @@ func FindHint(hint string, targetAppName string) (float64, float64, bool) { } } + // Search in Icon elements + for _, icon := range resp.IconElements { + if strings.ToUpper(icon.Hint) == hint { + return icon.Frame.X + icon.Frame.Width/2, icon.Frame.Y + icon.Frame.Height/2, true + } + } + return 0, 0, false } diff --git a/internal/cgo/macos/appuse_bridge/AppUseSnapshot.swift b/internal/cgo/macos/appuse_bridge/AppUseSnapshot.swift index bcff4e7..438bb52 100644 --- a/internal/cgo/macos/appuse_bridge/AppUseSnapshot.swift +++ b/internal/cgo/macos/appuse_bridge/AppUseSnapshot.swift @@ -2,6 +2,7 @@ import Cocoa import ApplicationServices import Foundation import Vision +import CoreML // MARK: - Core Types @@ -192,10 +193,12 @@ class ScreenOCR { private var cachedResults: [(text: String, frame: NSRect)] = [] private var hasPerformedOCR = false + private(set) var lastCapturedImage: CGImage? func reset() { cachedResults = [] hasPerformedOCR = false + lastCapturedImage = nil } /// Synchronously performs a single-pass OCR on the specified window frame. @@ -217,6 +220,7 @@ class ScreenOCR { guard let cgImage = CGWindowListCreateImage(rect, .optionOnScreenOnly, kCGNullWindowID, .boundsIgnoreFraming) else { return } + lastCapturedImage = cgImage // Save to current_position.png for caching/UI purposes let nsImage = NSImage(cgImage: cgImage, size: NSSize(width: winW, height: winH)) @@ -295,6 +299,156 @@ func ocrElementFrame(_ frame: NSRect) -> String { return ScreenOCR.shared.textAt(frame: frame) } +// MARK: - CoreML Icon Detection + +class IconDetector { + static let shared = IconDetector() + + private var vnModel: VNCoreMLModel? + private var cachedResults: [(confidence: Float, frame: NSRect)] = [] + private var hasPerformed = false + + private init() { + loadModel() + } + + private func loadModel() { + // Check environment variable first + if let envPath = ProcessInfo.processInfo.environment["ICON_DETECT_MODEL_PATH"] { + if tryLoadModel(at: envPath) { return } + } + + // Search relative to executable + let execPath = CommandLine.arguments[0] + let execDir = (execPath as NSString).deletingLastPathComponent + + let candidates = [ + (execDir as NSString).appendingPathComponent("../omniparser_icon_detect/model_v1_5.mlpackage"), + (execDir as NSString).appendingPathComponent("omniparser_icon_detect/model_v1_5.mlpackage"), + (execDir as NSString).appendingPathComponent("models/icon_detect_v1_5.mlpackage"), + NSString(string: "~/.application-use/models/icon_detect_v1_5.mlpackage").expandingTildeInPath, + ] + + for path in candidates { + if tryLoadModel(at: path) { return } + } + } + + private func tryLoadModel(at path: String) -> Bool { + let fm = FileManager.default + guard fm.fileExists(atPath: path) else { return false } + + let url = URL(fileURLWithPath: path) + + // Use a cached compiled model to avoid recompiling every time + let cacheDir = (NSTemporaryDirectory() as NSString).appendingPathComponent("application-use-icon-detect") + let compiledPath = (cacheDir as NSString).appendingPathComponent("icon_detect_v1_5.mlmodelc") + let compiledURL = URL(fileURLWithPath: compiledPath) + + var useCache = false + if fm.fileExists(atPath: compiledPath) { + if let srcAttrs = try? fm.attributesOfItem(atPath: path), + let cacheAttrs = try? fm.attributesOfItem(atPath: compiledPath), + let srcMod = srcAttrs[.modificationDate] as? Date, + let cacheMod = cacheAttrs[.modificationDate] as? Date { + useCache = cacheMod >= srcMod + } + } + + do { + let mlModel: MLModel + if useCache { + mlModel = try MLModel(contentsOf: compiledURL) + } else { + let tempCompiled = try MLModel.compileModel(at: url) + try? fm.createDirectory(atPath: cacheDir, withIntermediateDirectories: true) + try? fm.removeItem(at: compiledURL) + try fm.copyItem(at: tempCompiled, to: compiledURL) + mlModel = try MLModel(contentsOf: compiledURL) + } + vnModel = try VNCoreMLModel(for: mlModel) + return true + } catch { + fputs("IconDetector: Failed to load model at \(path): \(error)\n", stderr) + return false + } + } + + var isAvailable: Bool { vnModel != nil } + + func reset() { + cachedResults = [] + hasPerformed = false + } + + func detectIcons(cgImage: CGImage, winFrame: NSRect) { + guard !hasPerformed else { return } + hasPerformed = true + guard let vnModel = vnModel else { return } + + let winX = winFrame.origin.x + let winY = winFrame.origin.y + let winW = winFrame.width + let winH = winFrame.height + + let semaphore = DispatchSemaphore(value: 0) + let request = VNCoreMLRequest(model: vnModel) { [weak self] req, error in + defer { semaphore.signal() } + guard let results = req.results as? [VNRecognizedObjectObservation] else { return } + + for observation in results { + let box = observation.boundingBox + // Vision normalized box: origin bottom-left, y-up, 0..1 + // Convert to AX screen space (origin top-left, y-down) + let axX = winX + box.origin.x * winW + let axY = winY + (1.0 - box.origin.y - box.height) * winH + let axW = box.width * winW + let axH = box.height * winH + + let conf = observation.confidence + let frame = NSRect(x: axX, y: axY, width: axW, height: axH) + self?.cachedResults.append((confidence: conf, frame: frame)) + } + } + request.imageCropAndScaleOption = .scaleFill + + let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) + try? handler.perform([request]) + semaphore.wait() + } + + func allResults(within targetFrame: NSRect) -> [(confidence: Float, frame: NSRect)] { + return cachedResults.filter { targetFrame.intersects($0.frame) } + } +} + +// MARK: - Deduplication Helpers (IoU / Containment) + +func computeIntersectionArea(_ a: NSRect, _ b: NSRect) -> CGFloat { + let x1 = max(a.origin.x, b.origin.x) + let y1 = max(a.origin.y, b.origin.y) + let x2 = min(a.origin.x + a.width, b.origin.x + b.width) + let y2 = min(a.origin.y + a.height, b.origin.y + b.height) + if x2 <= x1 || y2 <= y1 { return 0 } + return (x2 - x1) * (y2 - y1) +} + +func computeIoU(_ a: NSRect, _ b: NSRect) -> CGFloat { + let inter = computeIntersectionArea(a, b) + if inter == 0 { return 0 } + let unionArea = a.width * a.height + b.width * b.height - inter + if unionArea <= 0 { return 0 } + return inter / unionArea +} + +func computeContainment(_ a: NSRect, _ b: NSRect) -> CGFloat { + let inter = computeIntersectionArea(a, b) + if inter == 0 { return 0 } + let smallerArea = min(a.width * a.height, b.width * b.height) + if smallerArea <= 0 { return 0 } + return inter / smallerArea +} + // MARK: - Element Extractor class ElementExtractor { @@ -495,6 +649,7 @@ func getFocusedCaretScreenRect() -> CGRect? { @_cdecl("trigger_appuse_snapshot") public func trigger_appuse_snapshot() -> UnsafeMutablePointer? { ScreenOCR.shared.reset() + IconDetector.shared.reset() let options = [kAXTrustedCheckOptionPrompt.takeUnretainedValue() as String: true] as CFDictionary guard AXIsProcessTrustedWithOptions(options) else { return strdup("{\"error\": \"Accessibility permission denied\"}") @@ -518,6 +673,10 @@ public func trigger_appuse_snapshot() -> UnsafeMutablePointer? { ScreenOCR.shared.performWindowOCR(winFrame: rect) // OCR results are already filtered to be within rect by allResults(within:) ocrResults = ScreenOCR.shared.allResults(within: rect) + // Run icon detection on the same captured image + if let cgImage = ScreenOCR.shared.lastCapturedImage { + IconDetector.shared.detectIcons(cgImage: cgImage, winFrame: rect) + } } } @@ -552,8 +711,29 @@ public func trigger_appuse_snapshot() -> UnsafeMutablePointer? { filteredOCR.append(ocr) } } - - let totalCount = flatHintable.count + filteredOCR.count + + // Icon detection results + dedup against AX elements (IoU >= 0.3 or containment >= 0.7) + var filteredIcons: [(confidence: Float, frame: NSRect)] = [] + if let winRect = windowRect { + var iconResults = IconDetector.shared.allResults(within: winRect) + iconResults = iconResults.filter { winRect.contains($0.frame) } + for icon in iconResults { + var isDuplicate = false + for ax in flatHintable { + let iou = computeIoU(icon.frame, ax.frame) + let containment = computeContainment(icon.frame, ax.frame) + if iou >= 0.3 || containment >= 0.7 { + isDuplicate = true + break + } + } + if !isDuplicate { + filteredIcons.append(icon) + } + } + } + + let totalCount = flatHintable.count + filteredOCR.count + filteredIcons.count let hintStrings = AlphabetHints.hintStrings(linkCount: totalCount) pendingHints = [] @@ -582,12 +762,28 @@ public func trigger_appuse_snapshot() -> UnsafeMutablePointer? { currentHintIndex += 1 } + // Assign hints to Icon elements + var iconJson: [[String: Any]] = [] + for icon in filteredIcons { + let hintText = hintStrings[currentHintIndex] + let iconNode = [ + "confidence": icon.confidence, + "hint": hintText, + "frame": ["x": icon.frame.origin.x, "y": icon.frame.origin.y, "width": icon.frame.width, "height": icon.frame.height] + ] as [String: Any] + iconJson.append(iconNode) + + pendingHints.append(Hint(frame: icon.frame, text: hintText)) + currentHintIndex += 1 + } + // Return JSON now; overlay will be shown by show_appuse_overlay() after Go takes screenshots. var jsonDict: [String: Any] = [ "appName": targetApp.localizedName ?? "Unknown", "bundleID": targetApp.bundleIdentifier ?? "", "elements": rootNodes.map { $0.toDict() }, - "ocrElements": ocrJson + "ocrElements": ocrJson, + "iconElements": iconJson ] if let windowInfo = getFrontmostWindowInfo(appElement: appElement) { jsonDict["frontmostWindow"] = windowInfo @@ -641,20 +837,6 @@ public func clear_appuse_snapshot() { public func click_at(x: Double, y: Double) { let point = CGPoint(x: x, y: y) print("Clicking at: \(point)") - - // 1. Try AXPress first - let systemWide = AXUIElementCreateSystemWide() - var element: AXUIElement? - if AXUIElementCopyElementAtPosition(systemWide, Float(x), Float(y), &element) == .success, - let target = element { - let result = AXUIElementPerformAction(target, kAXPressAction as CFString) - if result == .success { - print("AXPress success at \(point)") - return - } - } - - // 2. Fallback to raw mouse click mouseClick(at: point) } diff --git a/internal/cgo/macos/appuse_bridge/Makefile b/internal/cgo/macos/appuse_bridge/Makefile index f6c04b1..a1e4e3b 100644 --- a/internal/cgo/macos/appuse_bridge/Makefile +++ b/internal/cgo/macos/appuse_bridge/Makefile @@ -2,7 +2,7 @@ ARCH ?= $(shell uname -m) OS_VER ?= 13.0 all: - swiftc -emit-library -static -target $(ARCH)-apple-macosx$(OS_VER) AppUseSnapshot.swift -o libappuse_bridge.a + swiftc -emit-library -static -target $(ARCH)-apple-macosx$(OS_VER) -framework CoreML AppUseSnapshot.swift -o libappuse_bridge.a clean: rm -f *.a *.dylib diff --git a/internal/cgo/macos/appuse_bridge/appuse_bridge.go b/internal/cgo/macos/appuse_bridge/appuse_bridge.go index 5100e82..ab786bb 100644 --- a/internal/cgo/macos/appuse_bridge/appuse_bridge.go +++ b/internal/cgo/macos/appuse_bridge/appuse_bridge.go @@ -1,7 +1,7 @@ package appuse_bridge /* -#cgo LDFLAGS: -L${SRCDIR} ${SRCDIR}/libappuse_bridge.a -framework Foundation -framework AppKit -framework ApplicationServices +#cgo LDFLAGS: -L${SRCDIR} ${SRCDIR}/libappuse_bridge.a -framework Foundation -framework AppKit -framework ApplicationServices -framework CoreML #include #include diff --git a/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/model.mlmodel b/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/model.mlmodel new file mode 100644 index 0000000..6279339 Binary files /dev/null and b/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/model.mlmodel differ diff --git a/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/weights/weight.bin b/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/weights/weight.bin new file mode 100644 index 0000000..3e451d7 Binary files /dev/null and b/omniparser_icon_detect/model_v1_5.mlpackage/Data/com.apple.CoreML/weights/weight.bin differ diff --git a/omniparser_icon_detect/model_v1_5.mlpackage/Manifest.json b/omniparser_icon_detect/model_v1_5.mlpackage/Manifest.json new file mode 100644 index 0000000..4bfddbf --- /dev/null +++ b/omniparser_icon_detect/model_v1_5.mlpackage/Manifest.json @@ -0,0 +1,18 @@ +{ + "fileFormatVersion": "1.0.0", + "itemInfoEntries": { + "4A3CE2FE-F5B5-47D9-BCED-527A33DA2342": { + "author": "com.apple.CoreML", + "description": "CoreML Model Specification", + "name": "model.mlmodel", + "path": "com.apple.CoreML/model.mlmodel" + }, + "C47C13AE-9DB4-4AE8-894C-431A9DCA4F06": { + "author": "com.apple.CoreML", + "description": "CoreML Model Weights", + "name": "weights", + "path": "com.apple.CoreML/weights" + } + }, + "rootModelIdentifier": "4A3CE2FE-F5B5-47D9-BCED-527A33DA2342" +} diff --git a/package.json b/package.json index 5061058..a88e4c9 100644 --- a/package.json +++ b/package.json @@ -1,14 +1,18 @@ { - "name": "application-use", + "name": "@saiting/application-use", "version": "0.1.2", - "description": "macOS Desktop Automation CLI for AI agents", + "description": "macOS Desktop Automation CLI for AI agents (with CoreML icon detection)", "type": "module", "files": [ - "bin" + "bin", + "omniparser_icon_detect" ], "bin": { "application-use": "./bin/application-use.js" }, + "publishConfig": { + "access": "public" + }, "scripts": { "build": "make package-npm", "prepublishOnly": "npm run build",