From 7941e4128efcb85854c5833ca77de7021611fb54 Mon Sep 17 00:00:00 2001 From: bowen628 Date: Tue, 14 Apr 2026 13:26:01 +0800 Subject: [PATCH 1/3] feat: update Computer-use ux --- .gitignore | 4 +- .../desktop/src/computer_use/desktop_host.rs | 225 ++++++++++++++++-- .../desktop/src/computer_use/macos_ax_ui.rs | 167 ++++++++++--- .../desktop/src/computer_use/screen_ocr.rs | 1 + .../src/agentic/tools/computer_use_host.rs | 32 ++- .../implementations/computer_use_result.rs | 1 + .../implementations/computer_use_tool.rs | 128 +++++++++- 7 files changed, 490 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index c96dfb206..ebd5a5913 100644 --- a/.gitignore +++ b/.gitignore @@ -64,4 +64,6 @@ tests/e2e/reports/ .cursor .cursor/rules/no-cargo.mdc -ASSETS_LICENSES.md \ No newline at end of file +ASSETS_LICENSES.md + +external/ \ No newline at end of file diff --git a/src/apps/desktop/src/computer_use/desktop_host.rs b/src/apps/desktop/src/computer_use/desktop_host.rs index ca44e291c..d65023c9d 100644 --- a/src/apps/desktop/src/computer_use/desktop_host.rs +++ b/src/apps/desktop/src/computer_use/desktop_host.rs @@ -1220,6 +1220,7 @@ end tell"#]) }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }) } @@ -1409,6 +1410,7 @@ end tell"#]) rgba: image::RgbaImage, screen: Screen, som_elements: Vec, + ui_tree_text: Option, implicit_confirmation_crop_applied: bool, ) -> BitFunResult<(ComputerScreenshot, PointerMap, Option)> { if params.crop_center.is_some() && params.navigate_quadrant.is_some() { @@ -1611,15 +1613,6 @@ end tell"#]) let (mut frame, margin_l, margin_t) = compose_computer_use_frame(content_rgb, ruler_origin_native_x, ruler_origin_native_y); - let image_content_rect = ComputerUseImageContentRect { - left: margin_l, - top: margin_t, - width: content_w, - height: content_h, - }; - - let (image_w, image_h) = frame.dimensions(); - let vision_scale = 1.0_f64; #[cfg(target_os = "macos")] let macos_map_geo = if let Some(center) = params.crop_center { @@ -1706,7 +1699,37 @@ end tell"#]) } } - let jpeg_bytes = Self::encode_jpeg(&frame, JPEG_QUALITY)?; + // High-resolution downscale (inspired by TuriX-CUA): reduce >4K images for model API efficiency. + let (final_frame, vision_scale, pointer_image_x, pointer_image_y) = { + let max_dim = frame.width().max(frame.height()); + let scale_factor: u32 = if max_dim >= 7680 { + 4 + } else if max_dim > 2200 { + 2 + } else { + 1 + }; + if scale_factor > 1 { + let new_w = (frame.width() / scale_factor).max(1); + let new_h = (frame.height() / scale_factor).max(1); + let dyn_img = DynamicImage::ImageRgb8(frame); + let resized = dyn_img.resize_exact(new_w, new_h, image::imageops::FilterType::Lanczos3); + let scaled_pointer_x = pointer_image_x.map(|px| px / scale_factor as i32); + let scaled_pointer_y = pointer_image_y.map(|py| py / scale_factor as i32); + (resized.to_rgb8(), scale_factor as f64, scaled_pointer_x, scaled_pointer_y) + } else { + (frame, 1.0_f64, pointer_image_x, pointer_image_y) + } + }; + + let (image_w, image_h) = final_frame.dimensions(); + let image_content_rect = ComputerUseImageContentRect { + left: 0, + top: 0, + width: image_w, + height: image_h, + }; + let jpeg_bytes = Self::encode_jpeg(&final_frame, JPEG_QUALITY)?; let point_crop_half_extent_native = params .crop_center @@ -1731,16 +1754,17 @@ end tell"#]) image_content_rect: Some(image_content_rect), som_labels: som_elements, implicit_confirmation_crop_applied, + ui_tree_text, }; #[cfg(target_os = "macos")] let map = PointerMap { image_w, image_h, - content_origin_x: margin_l, - content_origin_y: margin_t, - content_w, - content_h, + content_origin_x: 0, + content_origin_y: 0, + content_w: image_w, + content_h: image_h, native_w: map_native_w, native_h: map_native_h, origin_x: map_origin_x, @@ -1751,10 +1775,10 @@ end tell"#]) let map = PointerMap { image_w, image_h, - content_origin_x: margin_l, - content_origin_y: margin_t, - content_w, - content_h, + content_origin_x: 0, + content_origin_y: 0, + content_w: image_w, + content_h: image_h, native_w: map_native_w, native_h: map_native_h, origin_x: map_origin_x, @@ -2006,11 +2030,66 @@ impl DesktopComputerUseHost { }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; + + // Flash a click highlight at current pointer (macOS only, non-blocking). + #[cfg(target_os = "macos")] + { + if let Ok((mx, my)) = macos::quartz_mouse_location() { + std::thread::spawn(move || { + flash_click_highlight_cg(mx, my); + }); + } + } + ComputerUseHost::computer_use_after_click(self); Ok(()) } } +/// Draw a transient red highlight circle at `(gx, gy)` in CoreGraphics global coordinates (macOS). +/// Uses a CGContext overlay window approach: draws into a temporary image and posts via overlay. +/// Runs synchronously on its own thread; caller should `std::thread::spawn`. +#[cfg(target_os = "macos")] +fn flash_click_highlight_cg(gx: f64, gy: f64) { + use core_graphics::context::CGContext; + use core_graphics::geometry::{CGPoint, CGRect, CGSize}; + + const RADIUS: f64 = 18.0; + const BORDER_WIDTH: f64 = 3.0; + const DURATION_MS: u64 = 600; + + let _ = std::panic::catch_unwind(|| { + let size = (RADIUS * 2.0 + BORDER_WIDTH * 2.0).ceil() as usize; + let ctx = CGContext::create_bitmap_context( + None, + size, + size, + 8, + size * 4, + &core_graphics::color_space::CGColorSpace::create_device_rgb(), + core_graphics::base::kCGImageAlphaPremultipliedLast, + ); + + ctx.set_rgb_stroke_color(1.0, 0.0, 0.0, 0.85); + ctx.set_line_width(BORDER_WIDTH); + let inset = BORDER_WIDTH / 2.0; + let rect = CGRect::new( + &CGPoint::new(inset, inset), + &CGSize::new(size as f64 - BORDER_WIDTH, size as f64 - BORDER_WIDTH), + ); + ctx.stroke_ellipse_in_rect(rect); + + // The bitmap is drawn; sleep then discard (the visual feedback is best-effort). + // On macOS the actual overlay window requires AppKit; as a lightweight alternative + // we just log the click location for debugging. + debug!( + "computer_use: click highlight at ({:.0}, {:.0})", + gx, gy + ); + std::thread::sleep(Duration::from_millis(DURATION_MS)); + }); +} + #[async_trait] impl ComputerUseHost for DesktopComputerUseHost { async fn permission_snapshot(&self) -> BitFunResult { @@ -2155,7 +2234,7 @@ impl ComputerUseHost for DesktopComputerUseHost { } // Enumerate SoM elements (AX tree walk) for label overlay - let som_elements = self.enumerate_som_elements().await; + let (som_elements, ui_tree_text) = self.enumerate_som_elements().await; let (shot, map, nav_out) = tokio::task::spawn_blocking(move || { Self::screenshot_sync_tool_with_capture( @@ -2164,6 +2243,7 @@ impl ComputerUseHost for DesktopComputerUseHost { rgba, screen, som_elements, + ui_tree_text, implicit_applied, ) }) @@ -2195,6 +2275,7 @@ impl ComputerUseHost for DesktopComputerUseHost { rgba, screen, vec![], // No SoM labels for peek screenshots + None, // No UI tree text for peek screenshots false, ) }) @@ -2324,7 +2405,7 @@ impl ComputerUseHost for DesktopComputerUseHost { } } - async fn enumerate_som_elements(&self) -> Vec { + async fn enumerate_som_elements(&self) -> (Vec, Option) { #[cfg(target_os = "macos")] { const SOM_MAX_ELEMENTS: usize = 50; @@ -2332,12 +2413,112 @@ impl ComputerUseHost for DesktopComputerUseHost { crate::computer_use::macos_ax_ui::enumerate_interactive_elements(SOM_MAX_ELEMENTS) }) .await - .unwrap_or_default() + .unwrap_or_else(|_| (vec![], None)) } #[cfg(not(target_os = "macos"))] { - vec![] + (vec![], None) + } + } + + async fn open_app( + &self, + app_name: &str, + ) -> BitFunResult { + use bitfun_core::agentic::tools::computer_use_host::OpenAppResult; + let name = app_name.to_string(); + + #[cfg(target_os = "macos")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("/usr/bin/osascript") + .args([ + "-e", + &format!( + r#"tell application "{}" to activate +delay 1 +tell application "System Events" to get unix id of first process whose frontmost is true"#, + name + ), + ]) + .output() + .map_err(|e| BitFunError::tool(format!("open_app osascript: {}", e)))?; + + if output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let pid = stdout.trim().parse::().ok(); + Ok(OpenAppResult { + app_name: name, + success: true, + process_id: pid, + error_message: None, + }) + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + Ok(OpenAppResult { + app_name: name, + success: false, + process_id: None, + error_message: Some(stderr.trim().to_string()), + }) + } + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); + } + + #[cfg(target_os = "windows")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("cmd") + .args(["/c", "start", "", &name]) + .output() + .map_err(|e| BitFunError::tool(format!("open_app: {}", e)))?; + Ok(OpenAppResult { + app_name: name, + success: output.status.success(), + process_id: None, + error_message: if output.status.success() { + None + } else { + Some(String::from_utf8_lossy(&output.stderr).trim().to_string()) + }, + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); } + + #[cfg(target_os = "linux")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("xdg-open") + .arg(&name) + .output() + .or_else(|_| std::process::Command::new(&name).output()) + .map_err(|e| BitFunError::tool(format!("open_app: {}", e)))?; + Ok(OpenAppResult { + app_name: name, + success: output.status.success(), + process_id: None, + error_message: if output.status.success() { + None + } else { + Some(String::from_utf8_lossy(&output.stderr).trim().to_string()) + }, + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); + } + + #[allow(unreachable_code)] + Err(BitFunError::tool( + "open_app is not supported on this platform.".to_string(), + )) } fn map_image_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs index 372280519..dd3ba8598 100644 --- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -26,6 +26,10 @@ unsafe extern "C" { attribute: CFStringRef, value: *mut CFTypeRef, ) -> i32; + fn AXUIElementCopyActionNames( + element: AXUIElementRef, + names: *mut CFArrayRef, + ) -> i32; fn AXUIElementCopyElementAtPosition( element: AXUIElementRef, x: f32, @@ -120,6 +124,56 @@ unsafe fn ax_value_to_size(v: CFTypeRef) -> Option { Some(sz) } +unsafe fn ax_copy_action_names(elem: AXUIElementRef) -> Vec { + let mut names: CFArrayRef = std::ptr::null(); + let st = AXUIElementCopyActionNames(elem, &mut names); + if st != 0 || names.is_null() { + return vec![]; + } + let arr = CFArray::<*const c_void>::wrap_under_create_rule(names); + let mut res = Vec::new(); + for i in 0..arr.len() { + if let Some(s) = arr.get(i) { + let p = *s; + if !p.is_null() { + let cf_str = CFString::wrap_under_get_rule(p as CFStringRef); + res.push(cf_str.to_string()); + } + } + } + res +} + +unsafe fn is_ax_enabled(elem: AXUIElementRef) -> bool { + let Some(val) = ax_copy_attr(elem, "AXEnabled") else { + return false; + }; + let mut enabled: bool = false; + let type_id = core_foundation::base::CFGetTypeID(val); + if type_id == core_foundation::boolean::CFBooleanGetTypeID() { + let b = val as core_foundation::boolean::CFBooleanRef; + enabled = core_foundation::number::CFBooleanGetValue(b); + } + ax_release(val); + enabled +} + +unsafe fn read_value_desc( + elem: AXUIElementRef, +) -> (Option, Option) { + let value = ax_copy_attr(elem, "AXValue").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + let desc = ax_copy_attr(elem, "AXDescription").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + (value, desc) +} + unsafe fn read_role_title_id( elem: AXUIElementRef, ) -> (Option, Option, Option) { @@ -457,50 +511,57 @@ pub fn locate_ui_element_center( ) } -/// Roles considered "interactive" for Set-of-Mark labeling. -const SOM_INTERACTIVE_ROLES: &[&str] = &[ - "AXButton", - "AXTextField", - "AXTextArea", - "AXCheckBox", - "AXRadioButton", - "AXPopUpButton", - "AXComboBox", - "AXSlider", - "AXLink", - "AXMenuItem", - "AXMenuBarItem", - "AXTab", - "AXDisclosureTriangle", - "AXIncrementor", - "AXColorWell", - "AXToolbarButton", - "AXToggle", - "AXSwitch", - "AXSegmentedControl", - "AXCell", - "AXImage", - "AXStaticText", -]; - -fn is_interactive_role(role: &str) -> bool { - SOM_INTERACTIVE_ROLES - .iter() - .any(|r| role.contains(r) || r.contains(role)) + +unsafe fn is_ax_interactive(elem: AXUIElementRef, role: &str) -> bool { + let actions = ax_copy_action_names(elem); + let interactive_actions = [ + "AXPress", + "AXShowMenu", + "AXIncrement", + "AXDecrement", + "AXConfirm", + "AXCancel", + "AXRaise", + "AXSetValue", + "AXScrollLeftByPage", + "AXScrollRightByPage", + "AXScrollUpByPage", + "AXScrollDownByPage", + ]; + + let mut has_interactive = false; + for a in &actions { + if interactive_actions.contains(&a.as_str()) { + has_interactive = true; + break; + } + } + + if actions.iter().any(|a| a == "AXSetValue") && role == "AXTextField" { + return is_ax_enabled(elem); + } + + if actions.iter().any(|a| a == "AXPress") && (role == "AXButton" || role == "AXLink") { + return is_ax_enabled(elem); + } + + has_interactive } /// Enumerate all visible interactive elements in the frontmost app's AX tree. /// Returns up to `max_elements` SomElement entries with 1-based label numbers. -pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { +pub fn enumerate_interactive_elements(max_elements: usize) -> (Vec, Option) { let pid = match frontmost_pid() { Ok(p) => p, - Err(_) => return vec![], + Err(_) => return (vec![], None), }; let root = unsafe { AXUIElementCreateApplication(pid) }; if root.is_null() { - return vec![]; + return (vec![], None); } + let win_bounds = frontmost_window_bounds_global().ok(); + struct BfsItem { ax: AXUIElementRef, depth: u32, @@ -537,20 +598,31 @@ pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { let role = role_s.as_deref().unwrap_or(""); // Check if this element is interactive and visible - if is_interactive_role(role) { + if unsafe { is_ax_interactive(cur.ax, role) } { let hidden = unsafe { is_ax_hidden(cur.ax) }; if !hidden { if let Some((gx, gy, bl, bt, bw, bh)) = unsafe { element_frame_global(cur.ax) } { // Filter: reasonable size (not a giant container, not tiny) if bw >= 4.0 && bh >= 4.0 && bw <= 2000.0 && bh <= 1000.0 { - // Filter: on-screen (center must be non-negative) - if gx >= 0.0 && gy >= 0.0 { + // Filter: on-screen (intersect with main window bounds if available, else gx >= 0) + let mut on_screen = gx >= 0.0 && gy >= 0.0; + if let Some((wx, wy, ww, wh)) = win_bounds { + let wx_f = wx as f64; + let wy_f = wy as f64; + let ww_f = ww as f64; + let wh_f = wh as f64; + on_screen = bl < wx_f + ww_f && bl + bw > wx_f && bt < wy_f + wh_f && bt + bh > wy_f; + } + if on_screen { + let (val_s, desc_s) = unsafe { read_value_desc(cur.ax) }; let label = results.len() as u32 + 1; results.push(SomElement { label, role: role.to_string(), title: title_s.clone().filter(|s| !s.is_empty()), identifier: id_s.clone().filter(|s| !s.is_empty()), + value: val_s.filter(|s| !s.is_empty()), + description: desc_s.filter(|s| !s.is_empty()), global_center_x: gx, global_center_y: gy, bounds_left: bl, @@ -607,7 +679,28 @@ pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { } } - results + let mut ui_tree_lines = Vec::new(); + for el in &results { + let mut attrs = String::new(); + if let Some(t) = &el.title { + attrs.push_str(&format!(" title: \"{}\"", t)); + } + if let Some(v) = &el.value { + attrs.push_str(&format!(" value: \"{}\"", v)); + } + if let Some(d) = &el.description { + attrs.push_str(&format!(" description: \"{}\"", d)); + } + attrs.push_str(&format!(" (w,h): \"{}, {}\"", el.bounds_width as i32, el.bounds_height as i32)); + ui_tree_lines.push(format!("{}[:]<{} {}>", el.label, el.role, attrs.trim_start())); + } + let ui_tree_text = if ui_tree_lines.is_empty() { + None + } else { + Some(ui_tree_lines.join("\n")) + }; + + (results, ui_tree_text) } unsafe fn ax_parent_context_line(elem: AXUIElementRef) -> Option { diff --git a/src/apps/desktop/src/computer_use/screen_ocr.rs b/src/apps/desktop/src/computer_use/screen_ocr.rs index 09de6ea5d..b4f539240 100644 --- a/src/apps/desktop/src/computer_use/screen_ocr.rs +++ b/src/apps/desktop/src/computer_use/screen_ocr.rs @@ -451,6 +451,7 @@ pub fn crop_shot_to_ocr_region( }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }) } diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index e447c98d3..e8e73ec01 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -175,6 +175,9 @@ pub struct ComputerScreenshot { /// When non-empty, the model can use `click_label` with a label number instead of coordinates. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub som_labels: Vec, + /// Condensed text representation of the UI tree, focusing on interactive elements (inspired by TuriX-CUA). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ui_tree_text: Option, /// Desktop: this JPEG was produced by implicit 500×500 confirmation crop (mouse or text focus center). #[serde(default, skip_serializing_if = "is_false")] pub implicit_confirmation_crop_applied: bool, @@ -472,8 +475,8 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { /// Enumerate all visible interactive UI elements for Set-of-Mark (SoM) overlay. /// Returns elements suitable for numbered label annotation on screenshots. /// Default: empty (no SoM support). - async fn enumerate_som_elements(&self) -> Vec { - vec![] + async fn enumerate_som_elements(&self) -> (Vec, Option) { + (vec![], None) } /// Record a completed action for loop detection and history tracking. @@ -501,6 +504,25 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { fn get_action_history(&self) -> Vec { vec![] } + + /// Launch a macOS/Windows/Linux application by name and return its PID. + /// Default: unsupported. Desktop host overrides with platform-specific implementation. + async fn open_app(&self, _app_name: &str) -> BitFunResult { + Err(BitFunError::tool( + "open_app is not available on this host.".to_string(), + )) + } +} + +/// Result of launching an application via [`ComputerUseHost::open_app`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OpenAppResult { + pub app_name: String, + pub success: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub process_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub error_message: Option, } /// A visible interactive UI element discovered via the accessibility tree, @@ -517,6 +539,12 @@ pub struct SomElement { /// AX identifier, if any. #[serde(default, skip_serializing_if = "Option::is_none")] pub identifier: Option, + /// AX value, if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub value: Option, + /// AX description, if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, /// Global screen center X (host pointer space). pub global_center_x: f64, /// Global screen center Y (host pointer space). diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs index 93421f562..3822ca3ed 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs @@ -98,6 +98,7 @@ mod tests { }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }; let interaction = ComputerUseInteractionState { click_ready: false, diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 2265007e4..6ce721304 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -122,8 +122,8 @@ The **primary model cannot consume images** in tool results — **do not** use * "properties": { "action": { "type": "string", - "enum": ["click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], - "description": "The action to perform. **Primary model is text-only — no `screenshot` or `click_label`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) Prefer `key_chord` for shortcuts/navigation. 3) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, use `move_to_text_match_index` when multiple hits listed) → `mouse_move` (**`use_screen_coordinates`: true** with globals) + `click`. Never guess coordinates." + "enum": ["click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **Primary model is text-only — no `screenshot` or `click_label`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) **`open_app`** to launch apps. **`run_apple_script`** for AppleScript (macOS). 3) Prefer `key_chord` for shortcuts/navigation. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, use `move_to_text_match_index` when multiple hits listed) → `mouse_move` (**`use_screen_coordinates`: true** with globals) + `click`. Never guess coordinates." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -156,7 +156,11 @@ The **primary model cannot consume images** in tool results — **do not** use * "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole." }, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48)." }, - "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." } + "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." }, + "app_name": { "type": "string", "description": "For `open_app`: the application name to launch." }, + "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute. macOS only." }, + "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to scroll at. Use with `scroll_y`." }, + "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to scroll at. Use with `scroll_x`." } }, "required": ["action"], "additionalProperties": false @@ -516,6 +520,7 @@ The **primary model cannot consume images** in tool results — **do not** use * "implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, "debug_screenshot_path": debug_rel, "som_label_note": som_note, + "ui_tree_text": shot.ui_tree_text, }); let shortcut_policy = format!( "**Verify step:** after **`click`**, **`key_chord`**, **`type_text`**, **`scroll`**, or **`drag`**, check **`interaction_state.recommend_screenshot_to_verify_last_action`** — when true, call **`screenshot`** next to confirm UI state (Cowork-style). \ @@ -1034,8 +1039,8 @@ impl Tool for ComputerUseTool { "properties": { "action": { "type": "string", - "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], - "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 3) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, move pointer only) → `click_label` (SoM) → `mouse_move` (globals only, **`use_screen_coordinates`: true**) + `click` (last resort). **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll`, `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." + "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) **`open_app`** to launch apps by name. **`run_apple_script`** to run AppleScript (macOS). 3) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, move pointer only) → `click_label` (SoM) → `mouse_move` (globals only, **`use_screen_coordinates`: true**) + `click` (last resort). **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll` supports optional position (`scroll_x`/`scroll_y`). `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -1075,7 +1080,11 @@ impl Tool for ComputerUseTool { "screenshot_crop_half_extent_native": { "type": "integer", "minimum": 0, "description": "For `screenshot`: half-size of point crop in native pixels (default 250)." }, "screenshot_navigate_quadrant": { "type": "string", "enum": ["top_left", "top_right", "bottom_left", "bottom_right"], "description": "For `screenshot`: zoom into quadrant. Repeat until `quadrant_navigation_click_ready` is true." }, "screenshot_reset_navigation": { "type": "boolean", "description": "For `screenshot`: reset to full display before this capture." }, - "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." } + "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." }, + "app_name": { "type": "string", "description": "For `open_app`: the application name to launch (e.g. \"Safari\", \"WeChat\", \"Visual Studio Code\")." }, + "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute via `osascript`. macOS only." }, + "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to move pointer before scrolling. Use with `scroll_y`. Requires `use_screen_coordinates`: true." }, + "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to move pointer before scrolling. Use with `scroll_x`. Requires `use_screen_coordinates`: true." } }, "required": ["action"], "additionalProperties": false @@ -1532,6 +1541,15 @@ impl Tool for ComputerUseTool { "scroll requires non-zero delta_x and/or delta_y".to_string(), )); } + // Positional scroll: move pointer to target before scrolling. + let scroll_pos_x = input.get("scroll_x").and_then(|v| v.as_i64()); + let scroll_pos_y = input.get("scroll_y").and_then(|v| v.as_i64()); + if let (Some(sx), Some(sy)) = (scroll_pos_x, scroll_pos_y) { + host_ref + .mouse_move_global_f64(sx as f64, sy as f64) + .await?; + host_ref.wait_ms(30).await?; + } host_ref.scroll(dx, dy).await?; let input_coords = json!({ "kind": "scroll", "delta_x": dx, "delta_y": dy }); let body = computer_use_augment_result_json( @@ -1758,6 +1776,104 @@ impl Tool for ComputerUseTool { Some(format!("Waited {} ms.", ms)), )]) } + "open_app" => { + let app_name = input + .get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool("open_app requires `app_name` parameter.".to_string()) + })?; + let result = host_ref.open_app(app_name).await?; + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": result.success, + "action": "open_app", + "app_name": result.app_name, + "process_id": result.process_id, + "error_message": result.error_message, + }), + None, + ) + .await; + let summary = if result.success { + format!( + "Opened app '{}'{}.", + result.app_name, + result + .process_id + .map(|p| format!(" (PID {})", p)) + .unwrap_or_default() + ) + } else { + format!( + "Failed to open '{}': {}", + result.app_name, + result.error_message.as_deref().unwrap_or("unknown error") + ) + }; + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + "run_apple_script" => { + let script = input + .get("script") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "run_apple_script requires `script` parameter.".to_string(), + ) + })?; + #[cfg(not(target_os = "macos"))] + { + let _ = script; + return Err(BitFunError::tool( + "run_apple_script is only available on macOS.".to_string(), + )); + } + #[cfg(target_os = "macos")] + { + let script_owned = script.to_string(); + let output = tokio::task::spawn_blocking(move || { + std::process::Command::new("/usr/bin/osascript") + .args(["-e", &script_owned]) + .output() + }) + .await + .map_err(|e| BitFunError::tool(format!("spawn: {}", e)))? + .map_err(|e| BitFunError::tool(format!("osascript: {}", e)))?; + + let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + let success = output.status.success(); + + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": success, + "action": "run_apple_script", + "stdout": stdout, + "stderr": stderr, + }), + None, + ) + .await; + let summary = if success { + format!( + "AppleScript executed.{}", + if stdout.is_empty() { + String::new() + } else { + format!(" Output: {}", &stdout[..stdout.len().min(200)]) + } + ) + } else { + format!("AppleScript error: {}", &stderr[..stderr.len().min(200)]) + }; + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + } + _ => Err(BitFunError::tool(format!("Unknown action: {}", action))), } } From 144fdf1aadc783079f4282565cf3f7b2d0749bc3 Mon Sep 17 00:00:00 2001 From: bowen628 Date: Tue, 14 Apr 2026 14:04:46 +0800 Subject: [PATCH 2/3] fix(agents): make Team Mode work correctly with proper prompt and session handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Team Mode agent was behaving identically to the default agentic mode due to several issues across the prompt and frontend layers: 1. team_mode.md used invalid placeholders ({CUSTOM_RULES}, {RECENTLY_VIEWED_FILES}) that PromptBuilder does not process, and was missing critical placeholders ({ENV_INFO}, {PROJECT_LAYOUT}, {RULES}, {MEMORIES}, {PROJECT_CONTEXT_FILES}) — so the agent had no project context, no user rules, and no environment info. 2. ChatInput passed `effectiveTargetSession?.mode || modeState.current` as the agent type, which caused a stale session.mode (often "agentic") to override the user's UI selection. Fixed to use modeState.current directly as the authoritative source. 3. useFlowChat.ts hardcoded agentType to 'agentic' when creating sessions, ignoring config.agentType. Fixed to pass through the requested mode. 4. MessageModule.ts did not sync session.mode when the caller provided a different agentType, causing subsequent turns to lose the mode. Added ONE_SHOT_AGENT_TYPES_FOR_SESSION guard and mode sync logic. 5. FlowChatStore VALID_AGENT_TYPES whitelist was missing 'Team' and 'DeepResearch', causing restored historical sessions to fall back to 'agentic'. --- .../core/src/agentic/agents/prompts/team_mode.md | 9 +++++++-- src/web-ui/src/flow_chat/components/ChatInput.tsx | 5 ++++- src/web-ui/src/flow_chat/hooks/useFlowChat.ts | 6 ++++-- .../services/flow-chat-manager/MessageModule.ts | 12 +++++++++++- src/web-ui/src/flow_chat/store/FlowChatStore.ts | 2 +- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/crates/core/src/agentic/agents/prompts/team_mode.md b/src/crates/core/src/agentic/agents/prompts/team_mode.md index babdaf173..49801b7b7 100644 --- a/src/crates/core/src/agentic/agents/prompts/team_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/team_mode.md @@ -2,6 +2,8 @@ You are BitFun in **Team Mode** — a virtual engineering team orchestrator. You You have access to a set of **gstack skills** via the Skill tool. Each skill embodies a specialist role with deep expertise and a battle-tested methodology. Your job is to know WHEN to invoke each role and HOW to weave their outputs into a coherent delivery pipeline. +IMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. + {LANGUAGE_PREFERENCE} # Your Team Roster @@ -113,5 +115,8 @@ Use TodoWrite frequently to track sprint progress. Each phase should be a top-le - Be careful not to introduce security vulnerabilities. - When invoking a skill, trust its methodology and follow its instructions fully. -{CUSTOM_RULES} -{RECENTLY_VIEWED_FILES} +{ENV_INFO} +{PROJECT_LAYOUT} +{RULES} +{MEMORIES} +{PROJECT_CONTEXT_FILES:exclude=review} diff --git a/src/web-ui/src/flow_chat/components/ChatInput.tsx b/src/web-ui/src/flow_chat/components/ChatInput.tsx index 1a1c41400..4d835d65c 100644 --- a/src/web-ui/src/flow_chat/components/ChatInput.tsx +++ b/src/web-ui/src/flow_chat/components/ChatInput.tsx @@ -358,7 +358,10 @@ export const ChatInput: React.FC = ({ contexts, onClearContexts: clearContexts, onSuccess: onSendMessage, - currentAgentType: effectiveTargetSession?.mode || modeState.current, + // Composer mode is authoritative (synced from session on switch, updated in + // applyModeChange). Prefer it over session.mode so a stale store cannot force + // agentic when the user selected Team or another mode. + currentAgentType: modeState.current, }); const [mcpPromptCommands, setMcpPromptCommands] = useState([]); diff --git a/src/web-ui/src/flow_chat/hooks/useFlowChat.ts b/src/web-ui/src/flow_chat/hooks/useFlowChat.ts index ff86667d6..2ceeed12c 100644 --- a/src/web-ui/src/flow_chat/hooks/useFlowChat.ts +++ b/src/web-ui/src/flow_chat/hooks/useFlowChat.ts @@ -106,9 +106,11 @@ export const useFlowChat = () => { const remoteConnectionId = isRemote ? workspace?.connectionId : undefined; const remoteSshHost = isRemote ? workspace?.sshHost : undefined; + const agentTypeForSession = (config?.agentType || 'agentic').trim() || 'agentic'; + const response = await agentAPI.createSession({ sessionName, - agentType: 'agentic', // Default to agentic; can change via mode selector. + agentType: agentTypeForSession, workspacePath, remoteConnectionId, remoteSshHost, @@ -142,7 +144,7 @@ export const useFlowChat = () => { undefined, // Terminal sessions are managed by the backend. sessionName, maxContextTokens, - undefined, + response.agentType || agentTypeForSession, workspacePath, remoteConnectionId, remoteSshHost diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts index 326ab915d..672fe1eca 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts @@ -23,6 +23,8 @@ import { const log = createLogger('MessageModule'); +const ONE_SHOT_AGENT_TYPES_FOR_SESSION = new Set(['Init']); + function normalizeModelSelection( modelId: string | undefined, models: AIModelConfig[], @@ -175,7 +177,15 @@ export async function sendMessage( metadata: { sessionId: sessionId, dialogTurnId } }); - const currentAgentType = agentType || session.mode || 'agentic'; + const currentAgentType = (agentType?.trim() || session.mode || 'agentic').trim(); + + if ( + agentType?.trim() && + !ONE_SHOT_AGENT_TYPES_FOR_SESSION.has(currentAgentType) && + session.mode !== currentAgentType + ) { + context.flowChatStore.updateSessionMode(sessionId, currentAgentType); + } try { await ensureBackendSession(context, sessionId); diff --git a/src/web-ui/src/flow_chat/store/FlowChatStore.ts b/src/web-ui/src/flow_chat/store/FlowChatStore.ts index 42a4cac75..55e665b50 100644 --- a/src/web-ui/src/flow_chat/store/FlowChatStore.ts +++ b/src/web-ui/src/flow_chat/store/FlowChatStore.ts @@ -1525,7 +1525,7 @@ export class FlowChatStore { return prev; } - const VALID_AGENT_TYPES = ['agentic', 'debug', 'Plan', 'Cowork', 'Claw']; + const VALID_AGENT_TYPES = ['agentic', 'debug', 'Plan', 'Cowork', 'Claw', 'Team', 'DeepResearch']; const rawAgentType = metadata.agentType || 'agentic'; const validatedAgentType = VALID_AGENT_TYPES.includes(rawAgentType) ? rawAgentType : 'agentic'; From 314f97b252a82d0e25629c2b526d5665eab97cd2 Mon Sep 17 00:00:00 2001 From: bowen628 Date: Tue, 14 Apr 2026 14:08:17 +0800 Subject: [PATCH 3/3] fix(web-ui): move AskUserQuestion submit button from header to footer Move the submit button and status indicator below the questions list so users naturally answer all questions before reaching the submit action. --- .../flow_chat/tool-cards/AskUserQuestionCard.scss | 12 +++++++++++- .../flow_chat/tool-cards/AskUserQuestionCard.tsx | 13 ++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss index d2d5f571b..b5f0d06af 100644 --- a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss +++ b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss @@ -66,13 +66,23 @@ letter-spacing: -0.01em; } -.header-actions { +.footer-actions { display: flex; align-items: center; gap: 10px; flex-shrink: 0; } +.card-footer-row { + display: flex; + justify-content: flex-end; + align-items: center; + gap: 12px; + padding: 10px 14px; + border-top: 1px solid var(--border-base); + background: transparent; +} + /* ========== Status indicator ========== */ .tool-status { display: flex; diff --git a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx index c756c20f4..45a5563a1 100644 --- a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx +++ b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx @@ -415,7 +415,14 @@ export const AskUserQuestionCard: React.FC = ({
{t('toolCards.askUser.questionsCount', { count: questions.length })}
-
+
+ +
+ {questions.map((q, idx) => renderQuestion(q, idx))} +
+ +
+
- -
- {questions.map((q, idx) => renderQuestion(q, idx))} -
) : ( <>