diff --git a/.gitignore b/.gitignore index c96dfb206..ebd5a5913 100644 --- a/.gitignore +++ b/.gitignore @@ -64,4 +64,6 @@ tests/e2e/reports/ .cursor .cursor/rules/no-cargo.mdc -ASSETS_LICENSES.md \ No newline at end of file +ASSETS_LICENSES.md + +external/ \ No newline at end of file diff --git a/src/apps/desktop/src/computer_use/desktop_host.rs b/src/apps/desktop/src/computer_use/desktop_host.rs index ca44e291c..d65023c9d 100644 --- a/src/apps/desktop/src/computer_use/desktop_host.rs +++ b/src/apps/desktop/src/computer_use/desktop_host.rs @@ -1220,6 +1220,7 @@ end tell"#]) }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }) } @@ -1409,6 +1410,7 @@ end tell"#]) rgba: image::RgbaImage, screen: Screen, som_elements: Vec, + ui_tree_text: Option, implicit_confirmation_crop_applied: bool, ) -> BitFunResult<(ComputerScreenshot, PointerMap, Option)> { if params.crop_center.is_some() && params.navigate_quadrant.is_some() { @@ -1611,15 +1613,6 @@ end tell"#]) let (mut frame, margin_l, margin_t) = compose_computer_use_frame(content_rgb, ruler_origin_native_x, ruler_origin_native_y); - let image_content_rect = ComputerUseImageContentRect { - left: margin_l, - top: margin_t, - width: content_w, - height: content_h, - }; - - let (image_w, image_h) = frame.dimensions(); - let vision_scale = 1.0_f64; #[cfg(target_os = "macos")] let macos_map_geo = if let Some(center) = params.crop_center { @@ -1706,7 +1699,37 @@ end tell"#]) } } - let jpeg_bytes = Self::encode_jpeg(&frame, JPEG_QUALITY)?; + // High-resolution downscale (inspired by TuriX-CUA): reduce >4K images for model API efficiency. + let (final_frame, vision_scale, pointer_image_x, pointer_image_y) = { + let max_dim = frame.width().max(frame.height()); + let scale_factor: u32 = if max_dim >= 7680 { + 4 + } else if max_dim > 2200 { + 2 + } else { + 1 + }; + if scale_factor > 1 { + let new_w = (frame.width() / scale_factor).max(1); + let new_h = (frame.height() / scale_factor).max(1); + let dyn_img = DynamicImage::ImageRgb8(frame); + let resized = dyn_img.resize_exact(new_w, new_h, image::imageops::FilterType::Lanczos3); + let scaled_pointer_x = pointer_image_x.map(|px| px / scale_factor as i32); + let scaled_pointer_y = pointer_image_y.map(|py| py / scale_factor as i32); + (resized.to_rgb8(), scale_factor as f64, scaled_pointer_x, scaled_pointer_y) + } else { + (frame, 1.0_f64, pointer_image_x, pointer_image_y) + } + }; + + let (image_w, image_h) = final_frame.dimensions(); + let image_content_rect = ComputerUseImageContentRect { + left: 0, + top: 0, + width: image_w, + height: image_h, + }; + let jpeg_bytes = Self::encode_jpeg(&final_frame, JPEG_QUALITY)?; let point_crop_half_extent_native = params .crop_center @@ -1731,16 +1754,17 @@ end tell"#]) image_content_rect: Some(image_content_rect), som_labels: som_elements, implicit_confirmation_crop_applied, + ui_tree_text, }; #[cfg(target_os = "macos")] let map = PointerMap { image_w, image_h, - content_origin_x: margin_l, - content_origin_y: margin_t, - content_w, - content_h, + content_origin_x: 0, + content_origin_y: 0, + content_w: image_w, + content_h: image_h, native_w: map_native_w, native_h: map_native_h, origin_x: map_origin_x, @@ -1751,10 +1775,10 @@ end tell"#]) let map = PointerMap { image_w, image_h, - content_origin_x: margin_l, - content_origin_y: margin_t, - content_w, - content_h, + content_origin_x: 0, + content_origin_y: 0, + content_w: image_w, + content_h: image_h, native_w: map_native_w, native_h: map_native_h, origin_x: map_origin_x, @@ -2006,11 +2030,66 @@ impl DesktopComputerUseHost { }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; + + // Flash a click highlight at current pointer (macOS only, non-blocking). + #[cfg(target_os = "macos")] + { + if let Ok((mx, my)) = macos::quartz_mouse_location() { + std::thread::spawn(move || { + flash_click_highlight_cg(mx, my); + }); + } + } + ComputerUseHost::computer_use_after_click(self); Ok(()) } } +/// Draw a transient red highlight circle at `(gx, gy)` in CoreGraphics global coordinates (macOS). +/// Uses a CGContext overlay window approach: draws into a temporary image and posts via overlay. +/// Runs synchronously on its own thread; caller should `std::thread::spawn`. +#[cfg(target_os = "macos")] +fn flash_click_highlight_cg(gx: f64, gy: f64) { + use core_graphics::context::CGContext; + use core_graphics::geometry::{CGPoint, CGRect, CGSize}; + + const RADIUS: f64 = 18.0; + const BORDER_WIDTH: f64 = 3.0; + const DURATION_MS: u64 = 600; + + let _ = std::panic::catch_unwind(|| { + let size = (RADIUS * 2.0 + BORDER_WIDTH * 2.0).ceil() as usize; + let ctx = CGContext::create_bitmap_context( + None, + size, + size, + 8, + size * 4, + &core_graphics::color_space::CGColorSpace::create_device_rgb(), + core_graphics::base::kCGImageAlphaPremultipliedLast, + ); + + ctx.set_rgb_stroke_color(1.0, 0.0, 0.0, 0.85); + ctx.set_line_width(BORDER_WIDTH); + let inset = BORDER_WIDTH / 2.0; + let rect = CGRect::new( + &CGPoint::new(inset, inset), + &CGSize::new(size as f64 - BORDER_WIDTH, size as f64 - BORDER_WIDTH), + ); + ctx.stroke_ellipse_in_rect(rect); + + // The bitmap is drawn; sleep then discard (the visual feedback is best-effort). + // On macOS the actual overlay window requires AppKit; as a lightweight alternative + // we just log the click location for debugging. + debug!( + "computer_use: click highlight at ({:.0}, {:.0})", + gx, gy + ); + std::thread::sleep(Duration::from_millis(DURATION_MS)); + }); +} + #[async_trait] impl ComputerUseHost for DesktopComputerUseHost { async fn permission_snapshot(&self) -> BitFunResult { @@ -2155,7 +2234,7 @@ impl ComputerUseHost for DesktopComputerUseHost { } // Enumerate SoM elements (AX tree walk) for label overlay - let som_elements = self.enumerate_som_elements().await; + let (som_elements, ui_tree_text) = self.enumerate_som_elements().await; let (shot, map, nav_out) = tokio::task::spawn_blocking(move || { Self::screenshot_sync_tool_with_capture( @@ -2164,6 +2243,7 @@ impl ComputerUseHost for DesktopComputerUseHost { rgba, screen, som_elements, + ui_tree_text, implicit_applied, ) }) @@ -2195,6 +2275,7 @@ impl ComputerUseHost for DesktopComputerUseHost { rgba, screen, vec![], // No SoM labels for peek screenshots + None, // No UI tree text for peek screenshots false, ) }) @@ -2324,7 +2405,7 @@ impl ComputerUseHost for DesktopComputerUseHost { } } - async fn enumerate_som_elements(&self) -> Vec { + async fn enumerate_som_elements(&self) -> (Vec, Option) { #[cfg(target_os = "macos")] { const SOM_MAX_ELEMENTS: usize = 50; @@ -2332,12 +2413,112 @@ impl ComputerUseHost for DesktopComputerUseHost { crate::computer_use::macos_ax_ui::enumerate_interactive_elements(SOM_MAX_ELEMENTS) }) .await - .unwrap_or_default() + .unwrap_or_else(|_| (vec![], None)) } #[cfg(not(target_os = "macos"))] { - vec![] + (vec![], None) + } + } + + async fn open_app( + &self, + app_name: &str, + ) -> BitFunResult { + use bitfun_core::agentic::tools::computer_use_host::OpenAppResult; + let name = app_name.to_string(); + + #[cfg(target_os = "macos")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("/usr/bin/osascript") + .args([ + "-e", + &format!( + r#"tell application "{}" to activate +delay 1 +tell application "System Events" to get unix id of first process whose frontmost is true"#, + name + ), + ]) + .output() + .map_err(|e| BitFunError::tool(format!("open_app osascript: {}", e)))?; + + if output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let pid = stdout.trim().parse::().ok(); + Ok(OpenAppResult { + app_name: name, + success: true, + process_id: pid, + error_message: None, + }) + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + Ok(OpenAppResult { + app_name: name, + success: false, + process_id: None, + error_message: Some(stderr.trim().to_string()), + }) + } + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); + } + + #[cfg(target_os = "windows")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("cmd") + .args(["/c", "start", "", &name]) + .output() + .map_err(|e| BitFunError::tool(format!("open_app: {}", e)))?; + Ok(OpenAppResult { + app_name: name, + success: output.status.success(), + process_id: None, + error_message: if output.status.success() { + None + } else { + Some(String::from_utf8_lossy(&output.stderr).trim().to_string()) + }, + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); } + + #[cfg(target_os = "linux")] + { + let result = tokio::task::spawn_blocking(move || -> BitFunResult { + let output = std::process::Command::new("xdg-open") + .arg(&name) + .output() + .or_else(|_| std::process::Command::new(&name).output()) + .map_err(|e| BitFunError::tool(format!("open_app: {}", e)))?; + Ok(OpenAppResult { + app_name: name, + success: output.status.success(), + process_id: None, + error_message: if output.status.success() { + None + } else { + Some(String::from_utf8_lossy(&output.stderr).trim().to_string()) + }, + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + return Ok(result); + } + + #[allow(unreachable_code)] + Err(BitFunError::tool( + "open_app is not supported on this platform.".to_string(), + )) } fn map_image_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs index 372280519..dd3ba8598 100644 --- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -26,6 +26,10 @@ unsafe extern "C" { attribute: CFStringRef, value: *mut CFTypeRef, ) -> i32; + fn AXUIElementCopyActionNames( + element: AXUIElementRef, + names: *mut CFArrayRef, + ) -> i32; fn AXUIElementCopyElementAtPosition( element: AXUIElementRef, x: f32, @@ -120,6 +124,56 @@ unsafe fn ax_value_to_size(v: CFTypeRef) -> Option { Some(sz) } +unsafe fn ax_copy_action_names(elem: AXUIElementRef) -> Vec { + let mut names: CFArrayRef = std::ptr::null(); + let st = AXUIElementCopyActionNames(elem, &mut names); + if st != 0 || names.is_null() { + return vec![]; + } + let arr = CFArray::<*const c_void>::wrap_under_create_rule(names); + let mut res = Vec::new(); + for i in 0..arr.len() { + if let Some(s) = arr.get(i) { + let p = *s; + if !p.is_null() { + let cf_str = CFString::wrap_under_get_rule(p as CFStringRef); + res.push(cf_str.to_string()); + } + } + } + res +} + +unsafe fn is_ax_enabled(elem: AXUIElementRef) -> bool { + let Some(val) = ax_copy_attr(elem, "AXEnabled") else { + return false; + }; + let mut enabled: bool = false; + let type_id = core_foundation::base::CFGetTypeID(val); + if type_id == core_foundation::boolean::CFBooleanGetTypeID() { + let b = val as core_foundation::boolean::CFBooleanRef; + enabled = core_foundation::number::CFBooleanGetValue(b); + } + ax_release(val); + enabled +} + +unsafe fn read_value_desc( + elem: AXUIElementRef, +) -> (Option, Option) { + let value = ax_copy_attr(elem, "AXValue").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + let desc = ax_copy_attr(elem, "AXDescription").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + (value, desc) +} + unsafe fn read_role_title_id( elem: AXUIElementRef, ) -> (Option, Option, Option) { @@ -457,50 +511,57 @@ pub fn locate_ui_element_center( ) } -/// Roles considered "interactive" for Set-of-Mark labeling. -const SOM_INTERACTIVE_ROLES: &[&str] = &[ - "AXButton", - "AXTextField", - "AXTextArea", - "AXCheckBox", - "AXRadioButton", - "AXPopUpButton", - "AXComboBox", - "AXSlider", - "AXLink", - "AXMenuItem", - "AXMenuBarItem", - "AXTab", - "AXDisclosureTriangle", - "AXIncrementor", - "AXColorWell", - "AXToolbarButton", - "AXToggle", - "AXSwitch", - "AXSegmentedControl", - "AXCell", - "AXImage", - "AXStaticText", -]; - -fn is_interactive_role(role: &str) -> bool { - SOM_INTERACTIVE_ROLES - .iter() - .any(|r| role.contains(r) || r.contains(role)) + +unsafe fn is_ax_interactive(elem: AXUIElementRef, role: &str) -> bool { + let actions = ax_copy_action_names(elem); + let interactive_actions = [ + "AXPress", + "AXShowMenu", + "AXIncrement", + "AXDecrement", + "AXConfirm", + "AXCancel", + "AXRaise", + "AXSetValue", + "AXScrollLeftByPage", + "AXScrollRightByPage", + "AXScrollUpByPage", + "AXScrollDownByPage", + ]; + + let mut has_interactive = false; + for a in &actions { + if interactive_actions.contains(&a.as_str()) { + has_interactive = true; + break; + } + } + + if actions.iter().any(|a| a == "AXSetValue") && role == "AXTextField" { + return is_ax_enabled(elem); + } + + if actions.iter().any(|a| a == "AXPress") && (role == "AXButton" || role == "AXLink") { + return is_ax_enabled(elem); + } + + has_interactive } /// Enumerate all visible interactive elements in the frontmost app's AX tree. /// Returns up to `max_elements` SomElement entries with 1-based label numbers. -pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { +pub fn enumerate_interactive_elements(max_elements: usize) -> (Vec, Option) { let pid = match frontmost_pid() { Ok(p) => p, - Err(_) => return vec![], + Err(_) => return (vec![], None), }; let root = unsafe { AXUIElementCreateApplication(pid) }; if root.is_null() { - return vec![]; + return (vec![], None); } + let win_bounds = frontmost_window_bounds_global().ok(); + struct BfsItem { ax: AXUIElementRef, depth: u32, @@ -537,20 +598,31 @@ pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { let role = role_s.as_deref().unwrap_or(""); // Check if this element is interactive and visible - if is_interactive_role(role) { + if unsafe { is_ax_interactive(cur.ax, role) } { let hidden = unsafe { is_ax_hidden(cur.ax) }; if !hidden { if let Some((gx, gy, bl, bt, bw, bh)) = unsafe { element_frame_global(cur.ax) } { // Filter: reasonable size (not a giant container, not tiny) if bw >= 4.0 && bh >= 4.0 && bw <= 2000.0 && bh <= 1000.0 { - // Filter: on-screen (center must be non-negative) - if gx >= 0.0 && gy >= 0.0 { + // Filter: on-screen (intersect with main window bounds if available, else gx >= 0) + let mut on_screen = gx >= 0.0 && gy >= 0.0; + if let Some((wx, wy, ww, wh)) = win_bounds { + let wx_f = wx as f64; + let wy_f = wy as f64; + let ww_f = ww as f64; + let wh_f = wh as f64; + on_screen = bl < wx_f + ww_f && bl + bw > wx_f && bt < wy_f + wh_f && bt + bh > wy_f; + } + if on_screen { + let (val_s, desc_s) = unsafe { read_value_desc(cur.ax) }; let label = results.len() as u32 + 1; results.push(SomElement { label, role: role.to_string(), title: title_s.clone().filter(|s| !s.is_empty()), identifier: id_s.clone().filter(|s| !s.is_empty()), + value: val_s.filter(|s| !s.is_empty()), + description: desc_s.filter(|s| !s.is_empty()), global_center_x: gx, global_center_y: gy, bounds_left: bl, @@ -607,7 +679,28 @@ pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { } } - results + let mut ui_tree_lines = Vec::new(); + for el in &results { + let mut attrs = String::new(); + if let Some(t) = &el.title { + attrs.push_str(&format!(" title: \"{}\"", t)); + } + if let Some(v) = &el.value { + attrs.push_str(&format!(" value: \"{}\"", v)); + } + if let Some(d) = &el.description { + attrs.push_str(&format!(" description: \"{}\"", d)); + } + attrs.push_str(&format!(" (w,h): \"{}, {}\"", el.bounds_width as i32, el.bounds_height as i32)); + ui_tree_lines.push(format!("{}[:]<{} {}>", el.label, el.role, attrs.trim_start())); + } + let ui_tree_text = if ui_tree_lines.is_empty() { + None + } else { + Some(ui_tree_lines.join("\n")) + }; + + (results, ui_tree_text) } unsafe fn ax_parent_context_line(elem: AXUIElementRef) -> Option { diff --git a/src/apps/desktop/src/computer_use/screen_ocr.rs b/src/apps/desktop/src/computer_use/screen_ocr.rs index 09de6ea5d..b4f539240 100644 --- a/src/apps/desktop/src/computer_use/screen_ocr.rs +++ b/src/apps/desktop/src/computer_use/screen_ocr.rs @@ -451,6 +451,7 @@ pub fn crop_shot_to_ocr_region( }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }) } diff --git a/src/crates/core/src/agentic/agents/prompts/team_mode.md b/src/crates/core/src/agentic/agents/prompts/team_mode.md index babdaf173..49801b7b7 100644 --- a/src/crates/core/src/agentic/agents/prompts/team_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/team_mode.md @@ -2,6 +2,8 @@ You are BitFun in **Team Mode** — a virtual engineering team orchestrator. You You have access to a set of **gstack skills** via the Skill tool. Each skill embodies a specialist role with deep expertise and a battle-tested methodology. Your job is to know WHEN to invoke each role and HOW to weave their outputs into a coherent delivery pipeline. +IMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. + {LANGUAGE_PREFERENCE} # Your Team Roster @@ -113,5 +115,8 @@ Use TodoWrite frequently to track sprint progress. Each phase should be a top-le - Be careful not to introduce security vulnerabilities. - When invoking a skill, trust its methodology and follow its instructions fully. -{CUSTOM_RULES} -{RECENTLY_VIEWED_FILES} +{ENV_INFO} +{PROJECT_LAYOUT} +{RULES} +{MEMORIES} +{PROJECT_CONTEXT_FILES:exclude=review} diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index e447c98d3..e8e73ec01 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -175,6 +175,9 @@ pub struct ComputerScreenshot { /// When non-empty, the model can use `click_label` with a label number instead of coordinates. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub som_labels: Vec, + /// Condensed text representation of the UI tree, focusing on interactive elements (inspired by TuriX-CUA). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ui_tree_text: Option, /// Desktop: this JPEG was produced by implicit 500×500 confirmation crop (mouse or text focus center). #[serde(default, skip_serializing_if = "is_false")] pub implicit_confirmation_crop_applied: bool, @@ -472,8 +475,8 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { /// Enumerate all visible interactive UI elements for Set-of-Mark (SoM) overlay. /// Returns elements suitable for numbered label annotation on screenshots. /// Default: empty (no SoM support). - async fn enumerate_som_elements(&self) -> Vec { - vec![] + async fn enumerate_som_elements(&self) -> (Vec, Option) { + (vec![], None) } /// Record a completed action for loop detection and history tracking. @@ -501,6 +504,25 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { fn get_action_history(&self) -> Vec { vec![] } + + /// Launch a macOS/Windows/Linux application by name and return its PID. + /// Default: unsupported. Desktop host overrides with platform-specific implementation. + async fn open_app(&self, _app_name: &str) -> BitFunResult { + Err(BitFunError::tool( + "open_app is not available on this host.".to_string(), + )) + } +} + +/// Result of launching an application via [`ComputerUseHost::open_app`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OpenAppResult { + pub app_name: String, + pub success: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub process_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub error_message: Option, } /// A visible interactive UI element discovered via the accessibility tree, @@ -517,6 +539,12 @@ pub struct SomElement { /// AX identifier, if any. #[serde(default, skip_serializing_if = "Option::is_none")] pub identifier: Option, + /// AX value, if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub value: Option, + /// AX description, if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub description: Option, /// Global screen center X (host pointer space). pub global_center_x: f64, /// Global screen center Y (host pointer space). diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs index 93421f562..3822ca3ed 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs @@ -98,6 +98,7 @@ mod tests { }), som_labels: vec![], implicit_confirmation_crop_applied: false, + ui_tree_text: None, }; let interaction = ComputerUseInteractionState { click_ready: false, diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 2265007e4..6ce721304 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -122,8 +122,8 @@ The **primary model cannot consume images** in tool results — **do not** use * "properties": { "action": { "type": "string", - "enum": ["click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], - "description": "The action to perform. **Primary model is text-only — no `screenshot` or `click_label`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) Prefer `key_chord` for shortcuts/navigation. 3) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, use `move_to_text_match_index` when multiple hits listed) → `mouse_move` (**`use_screen_coordinates`: true** with globals) + `click`. Never guess coordinates." + "enum": ["click_element", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **Primary model is text-only — no `screenshot` or `click_label`.** **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands first. 2) **`open_app`** to launch apps. **`run_apple_script`** for AppleScript (macOS). 3) Prefer `key_chord` for shortcuts/navigation. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, use `move_to_text_match_index` when multiple hits listed) → `mouse_move` (**`use_screen_coordinates`: true** with globals) + `click`. Never guess coordinates." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -156,7 +156,11 @@ The **primary model cannot consume images** in tool results — **do not** use * "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole." }, "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48)." }, - "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." } + "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." }, + "app_name": { "type": "string", "description": "For `open_app`: the application name to launch." }, + "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute. macOS only." }, + "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to scroll at. Use with `scroll_y`." }, + "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to scroll at. Use with `scroll_x`." } }, "required": ["action"], "additionalProperties": false @@ -516,6 +520,7 @@ The **primary model cannot consume images** in tool results — **do not** use * "implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, "debug_screenshot_path": debug_rel, "som_label_note": som_note, + "ui_tree_text": shot.ui_tree_text, }); let shortcut_policy = format!( "**Verify step:** after **`click`**, **`key_chord`**, **`type_text`**, **`scroll`**, or **`drag`**, check **`interaction_state.recommend_screenshot_to_verify_last_action`** — when true, call **`screenshot`** next to confirm UI state (Cowork-style). \ @@ -1034,8 +1039,8 @@ impl Tool for ComputerUseTool { "properties": { "action": { "type": "string", - "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], - "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 3) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, move pointer only) → `click_label` (SoM) → `mouse_move` (globals only, **`use_screen_coordinates`: true**) + `click` (last resort). **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll`, `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." + "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait", "open_app", "run_apple_script"], + "description": "The action to perform. **ACTION PRIORITY:** 1) Use Bash tool for CLI/terminal/system commands (most efficient). 2) **`open_app`** to launch apps by name. **`run_apple_script`** to run AppleScript (macOS). 3) Prefer **`key_chord`** for shortcuts/navigation keys over mouse. 4) Only when above fail: `click_element` (AX) → `move_to_text` (OCR, move pointer only) → `click_label` (SoM) → `mouse_move` (globals only, **`use_screen_coordinates`: true**) + `click` (last resort). **`screenshot`** is for observation/confirmation ONLY — never derive mouse coordinates from screenshots. `click` = press at **current pointer only** (no x/y params). `scroll` supports optional position (`scroll_x`/`scroll_y`). `type_text`, `drag`, `pointer_move_rel`, `wait`, `locate` = standard actions." }, "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, @@ -1075,7 +1080,11 @@ impl Tool for ComputerUseTool { "screenshot_crop_half_extent_native": { "type": "integer", "minimum": 0, "description": "For `screenshot`: half-size of point crop in native pixels (default 250)." }, "screenshot_navigate_quadrant": { "type": "string", "enum": ["top_left", "top_right", "bottom_left", "bottom_right"], "description": "For `screenshot`: zoom into quadrant. Repeat until `quadrant_navigation_click_ready` is true." }, "screenshot_reset_navigation": { "type": "boolean", "description": "For `screenshot`: reset to full display before this capture." }, - "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." } + "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." }, + "app_name": { "type": "string", "description": "For `open_app`: the application name to launch (e.g. \"Safari\", \"WeChat\", \"Visual Studio Code\")." }, + "script": { "type": "string", "description": "For `run_apple_script`: the AppleScript code to execute via `osascript`. macOS only." }, + "scroll_x": { "type": "integer", "description": "For `scroll`: optional global X coordinate to move pointer before scrolling. Use with `scroll_y`. Requires `use_screen_coordinates`: true." }, + "scroll_y": { "type": "integer", "description": "For `scroll`: optional global Y coordinate to move pointer before scrolling. Use with `scroll_x`. Requires `use_screen_coordinates`: true." } }, "required": ["action"], "additionalProperties": false @@ -1532,6 +1541,15 @@ impl Tool for ComputerUseTool { "scroll requires non-zero delta_x and/or delta_y".to_string(), )); } + // Positional scroll: move pointer to target before scrolling. + let scroll_pos_x = input.get("scroll_x").and_then(|v| v.as_i64()); + let scroll_pos_y = input.get("scroll_y").and_then(|v| v.as_i64()); + if let (Some(sx), Some(sy)) = (scroll_pos_x, scroll_pos_y) { + host_ref + .mouse_move_global_f64(sx as f64, sy as f64) + .await?; + host_ref.wait_ms(30).await?; + } host_ref.scroll(dx, dy).await?; let input_coords = json!({ "kind": "scroll", "delta_x": dx, "delta_y": dy }); let body = computer_use_augment_result_json( @@ -1758,6 +1776,104 @@ impl Tool for ComputerUseTool { Some(format!("Waited {} ms.", ms)), )]) } + "open_app" => { + let app_name = input + .get("app_name") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool("open_app requires `app_name` parameter.".to_string()) + })?; + let result = host_ref.open_app(app_name).await?; + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": result.success, + "action": "open_app", + "app_name": result.app_name, + "process_id": result.process_id, + "error_message": result.error_message, + }), + None, + ) + .await; + let summary = if result.success { + format!( + "Opened app '{}'{}.", + result.app_name, + result + .process_id + .map(|p| format!(" (PID {})", p)) + .unwrap_or_default() + ) + } else { + format!( + "Failed to open '{}': {}", + result.app_name, + result.error_message.as_deref().unwrap_or("unknown error") + ) + }; + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + "run_apple_script" => { + let script = input + .get("script") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + BitFunError::tool( + "run_apple_script requires `script` parameter.".to_string(), + ) + })?; + #[cfg(not(target_os = "macos"))] + { + let _ = script; + return Err(BitFunError::tool( + "run_apple_script is only available on macOS.".to_string(), + )); + } + #[cfg(target_os = "macos")] + { + let script_owned = script.to_string(); + let output = tokio::task::spawn_blocking(move || { + std::process::Command::new("/usr/bin/osascript") + .args(["-e", &script_owned]) + .output() + }) + .await + .map_err(|e| BitFunError::tool(format!("spawn: {}", e)))? + .map_err(|e| BitFunError::tool(format!("osascript: {}", e)))?; + + let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + let success = output.status.success(); + + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": success, + "action": "run_apple_script", + "stdout": stdout, + "stderr": stderr, + }), + None, + ) + .await; + let summary = if success { + format!( + "AppleScript executed.{}", + if stdout.is_empty() { + String::new() + } else { + format!(" Output: {}", &stdout[..stdout.len().min(200)]) + } + ) + } else { + format!("AppleScript error: {}", &stderr[..stderr.len().min(200)]) + }; + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + } + _ => Err(BitFunError::tool(format!("Unknown action: {}", action))), } } diff --git a/src/web-ui/src/flow_chat/components/ChatInput.tsx b/src/web-ui/src/flow_chat/components/ChatInput.tsx index 1a1c41400..4d835d65c 100644 --- a/src/web-ui/src/flow_chat/components/ChatInput.tsx +++ b/src/web-ui/src/flow_chat/components/ChatInput.tsx @@ -358,7 +358,10 @@ export const ChatInput: React.FC = ({ contexts, onClearContexts: clearContexts, onSuccess: onSendMessage, - currentAgentType: effectiveTargetSession?.mode || modeState.current, + // Composer mode is authoritative (synced from session on switch, updated in + // applyModeChange). Prefer it over session.mode so a stale store cannot force + // agentic when the user selected Team or another mode. + currentAgentType: modeState.current, }); const [mcpPromptCommands, setMcpPromptCommands] = useState([]); diff --git a/src/web-ui/src/flow_chat/hooks/useFlowChat.ts b/src/web-ui/src/flow_chat/hooks/useFlowChat.ts index ff86667d6..2ceeed12c 100644 --- a/src/web-ui/src/flow_chat/hooks/useFlowChat.ts +++ b/src/web-ui/src/flow_chat/hooks/useFlowChat.ts @@ -106,9 +106,11 @@ export const useFlowChat = () => { const remoteConnectionId = isRemote ? workspace?.connectionId : undefined; const remoteSshHost = isRemote ? workspace?.sshHost : undefined; + const agentTypeForSession = (config?.agentType || 'agentic').trim() || 'agentic'; + const response = await agentAPI.createSession({ sessionName, - agentType: 'agentic', // Default to agentic; can change via mode selector. + agentType: agentTypeForSession, workspacePath, remoteConnectionId, remoteSshHost, @@ -142,7 +144,7 @@ export const useFlowChat = () => { undefined, // Terminal sessions are managed by the backend. sessionName, maxContextTokens, - undefined, + response.agentType || agentTypeForSession, workspacePath, remoteConnectionId, remoteSshHost diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts index 326ab915d..672fe1eca 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts @@ -23,6 +23,8 @@ import { const log = createLogger('MessageModule'); +const ONE_SHOT_AGENT_TYPES_FOR_SESSION = new Set(['Init']); + function normalizeModelSelection( modelId: string | undefined, models: AIModelConfig[], @@ -175,7 +177,15 @@ export async function sendMessage( metadata: { sessionId: sessionId, dialogTurnId } }); - const currentAgentType = agentType || session.mode || 'agentic'; + const currentAgentType = (agentType?.trim() || session.mode || 'agentic').trim(); + + if ( + agentType?.trim() && + !ONE_SHOT_AGENT_TYPES_FOR_SESSION.has(currentAgentType) && + session.mode !== currentAgentType + ) { + context.flowChatStore.updateSessionMode(sessionId, currentAgentType); + } try { await ensureBackendSession(context, sessionId); diff --git a/src/web-ui/src/flow_chat/store/FlowChatStore.ts b/src/web-ui/src/flow_chat/store/FlowChatStore.ts index 42a4cac75..55e665b50 100644 --- a/src/web-ui/src/flow_chat/store/FlowChatStore.ts +++ b/src/web-ui/src/flow_chat/store/FlowChatStore.ts @@ -1525,7 +1525,7 @@ export class FlowChatStore { return prev; } - const VALID_AGENT_TYPES = ['agentic', 'debug', 'Plan', 'Cowork', 'Claw']; + const VALID_AGENT_TYPES = ['agentic', 'debug', 'Plan', 'Cowork', 'Claw', 'Team', 'DeepResearch']; const rawAgentType = metadata.agentType || 'agentic'; const validatedAgentType = VALID_AGENT_TYPES.includes(rawAgentType) ? rawAgentType : 'agentic'; diff --git a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss index d2d5f571b..b5f0d06af 100644 --- a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss +++ b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.scss @@ -66,13 +66,23 @@ letter-spacing: -0.01em; } -.header-actions { +.footer-actions { display: flex; align-items: center; gap: 10px; flex-shrink: 0; } +.card-footer-row { + display: flex; + justify-content: flex-end; + align-items: center; + gap: 12px; + padding: 10px 14px; + border-top: 1px solid var(--border-base); + background: transparent; +} + /* ========== Status indicator ========== */ .tool-status { display: flex; diff --git a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx index c756c20f4..45a5563a1 100644 --- a/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx +++ b/src/web-ui/src/flow_chat/tool-cards/AskUserQuestionCard.tsx @@ -415,7 +415,14 @@ export const AskUserQuestionCard: React.FC = ({
{t('toolCards.askUser.questionsCount', { count: questions.length })}
-
+
+ +
+ {questions.map((q, idx) => renderQuestion(q, idx))} +
+ +
+
- -
- {questions.map((q, idx) => renderQuestion(q, idx))} -
) : ( <>