Просмотр исходного кода

Merge remote-tracking branch 'origin/dev_tao'

Talegorithm 3 недель назад
Родитель
Сommit
6aadeb766f

+ 2 - 0
agent/core/runner.py

@@ -9,6 +9,7 @@ Agent Runner - Agent 执行引擎
 5. 收集反馈,提取经验
 """
 
+from agent.tools.builtin.browser import browser_read_long_content
 import logging
 from dataclasses import dataclass
 from datetime import datetime
@@ -90,6 +91,7 @@ BUILTIN_TOOLS = [
     "browser_get_dropdown_options",
     "browser_select_dropdown_option",
     "browser_extract_content",
+    "browser_read_long_content",
     "browser_get_page_html",
     "browser_get_selector_map",
     "browser_evaluate",

+ 4 - 3
agent/memory/skills/core.md

@@ -86,13 +86,14 @@ goal(abandon="方案A需要Redis,环境没有")
 2. **等待加载**: 页面跳转后调用 `browser_wait(seconds=2)` 等待内容加载
 3. **获取元素索引**: 调用 `browser_get_selector_map` 获取可交互元素的索引映射
 4. **执行交互**: 使用 `browser_click_element`、`browser_input_text` 等工具操作页面
-5. **提取内容**: 使用 `browser_extract_content` `browser_get_page_html` 获取数据
+5. **提取内容**: 使用 `browser_extract_content`, `browser_read_long_content`, `browser_get_page_html` 获取数据
 
 ### 关键原则
 
+- **禁止模拟结果**:不要输出你认为的搜索结果,而是要调用工具获取真实结果
 - **必须先获取索引**: 所有 `index` 参数都需要先通过 `browser_get_selector_map` 获取
+- **高级工具**: 优先使用 `browser_extract_content`, `browser_read_long_content` 等工具获取数据,而不是使用 `browser_get_selector_map` 获取索引后手动解析
 - **操作后等待**: 任何可能触发页面变化的操作(点击、输入、滚动)后都要调用 `browser_wait`
-- **优先用高级工具**: 优先使用 `browser_extract_content` 而不是手动解析HTML
 - **登录处理**: 需要登录的网站使用 `browser_ensure_login_with_cookies(cookie_type="xhs")` 注入Cookie
 - **复杂操作用JS**: 当标准工具无法满足时,使用 `browser_evaluate` 执行JavaScript代码
 
@@ -101,5 +102,5 @@ goal(abandon="方案A需要Redis,环境没有")
 **导航**: browser_navigate_to_url, browser_search_web, browser_go_back, browser_wait
 **交互**: browser_click_element, browser_input_text, browser_send_keys, browser_upload_file
 **视图**: browser_scroll_page, browser_find_text, browser_screenshot
-**提取**: browser_extract_content, browser_get_page_html, browser_get_selector_map
+**提取**: browser_extract_content, browser_read_long_content, browser_get_page_html, browser_get_selector_map
 **高级**: browser_evaluate, browser_ensure_login_with_cookies, browser_wait_for_user_action

+ 2 - 0
agent/tools/builtin/browser/__init__.py

@@ -38,6 +38,7 @@ from agent.tools.builtin.browser.baseClass import (
 
     # 内容提取工具
     browser_extract_content,
+    browser_read_long_content,
     browser_get_page_html,
     browser_get_selector_map,
 
@@ -86,6 +87,7 @@ __all__ = [
 
     # 内容提取工具
     'browser_extract_content',
+    'browser_read_long_content',
     'browser_get_page_html',
     'browser_get_selector_map',
 

+ 159 - 34
agent/tools/builtin/browser/baseClass.py

@@ -48,9 +48,15 @@ import os
 import json
 import asyncio
 import aiohttp
+import re
+import base64
+from urllib.parse import urlparse, parse_qs, unquote
 from typing import Optional, List, Dict, Any, Tuple
 from pathlib import Path
-from urllib.parse import urlparse
+from langchain_core.runnables import RunnableLambda
+from argparse import Namespace # 使用 Namespace 快速构造带属性的对象
+from langchain_core.messages import AIMessage
+from ....llm.openrouter import openrouter_llm_call
 
 # 将项目根目录添加到 Python 路径
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -62,6 +68,7 @@ from agent.tools.builtin.browser.sync_mysql_help import mysql
 # 导入 browser-use 的核心类
 from browser_use import BrowserSession, BrowserProfile
 from browser_use.tools.service import Tools
+from browser_use.tools.views import ReadContentAction
 from browser_use.agent.views import ActionResult
 from browser_use.filesystem.file_system import FileSystem
 
@@ -598,7 +605,7 @@ async def browser_navigate_to_url(url: str, new_tab: bool = False) -> ToolResult
 
 
 @tool()
-async def browser_search_web(query: str, engine: str = "google") -> ToolResult:
+async def browser_search_web(query: str, engine: str = "bing") -> ToolResult:
     """
     使用搜索引擎搜索
     Search the web using a search engine
@@ -857,45 +864,50 @@ async def browser_upload_file(index: int, path: str) -> ToolResult:
 # ============================================================
 # 滚动和视图工具 (Scroll & View Tools)
 # ============================================================
-
 @tool()
-async def browser_scroll_page(down: bool = True, pages: float = 1.0,
-                     index: Optional[int] = None) -> ToolResult:
-    """
-    滚动页面或元素
-    Scroll the page or a specific element
-
-    Args:
-        down: True 向下滚动,False 向上滚动
-        pages: 滚动页数(0.5=半页,1=全页,10=滚动到底部/顶部)
-        index: 可选,滚动特定元素(如下拉框内部)
-
-    Returns:
-        ToolResult: 滚动结果
-
-    Example:
-        scroll_page(down=True, pages=2.0)  # 向下滚动2页
-        scroll_page(down=False, pages=1.0)  # 向上滚动1页
-    """
+async def browser_scroll_page(down: bool = True, pages: float = 1.0, index: Optional[int] = None) -> ToolResult:
     try:
         browser, tools = await get_browser_session()
-
-        result = await tools.scroll(
-            down=down,
-            pages=pages,
-            index=index,
-            browser_session=browser
+        
+        # --- 核心修复 1: 必须先 await 拿到 session 实例 ---
+        cdp_session = await browser.get_or_create_cdp_session()
+        
+        # 这里的执行方式建议参考你已有的 cdp 调用逻辑
+        # 如果 cdp_session 没有直接封装 .eval(),使用 Runtime.evaluate
+        before_y_result = await cdp_session.cdp_client.send.Runtime.evaluate(
+            params={'expression': 'window.scrollY'},
+            session_id=cdp_session.session_id
         )
+        before_y = before_y_result.get('result', {}).get('value', 0)
+
+        # 执行滚动
+        result = await tools.scroll(down=down, pages=pages, index=index, browser_session=browser)
+        
+        # 等待渲染并检查偏移
+        await asyncio.sleep(1)
+        
+        after_y_result = await cdp_session.cdp_client.send.Runtime.evaluate(
+            params={'expression': 'window.scrollY'},
+            session_id=cdp_session.session_id
+        )
+        after_y = after_y_result.get('result', {}).get('value', 0)
 
-        direction = "向下" if down else "向上"
-        return action_result_to_tool_result(result, f"{direction}滚动 {pages} 页")
+        # 3. 验证是否真的动了
+        if before_y == after_y and index is None:
+            return ToolResult(
+                title="滚动无效", 
+                output="页面已到达边界或滚动被拦截", 
+                error="No movement detected"
+            )
+
+        return action_result_to_tool_result(result, f"已滚动")
 
     except Exception as e:
+        # --- 核心修复 2: 必须补全 output 参数,否则框架会报错 ---
         return ToolResult(
-            title="滚动失败",
-            output="",
-            error=f"Failed to scroll: {str(e)}",
-            long_term_memory="滚动失败"
+            title="滚动失败", 
+            output="",  # 补全这个缺失的必填参数
+            error=str(e)
         )
 
 
@@ -1119,6 +1131,69 @@ async def browser_select_dropdown_option(index: int, text: str) -> ToolResult:
 # ============================================================
 # 内容提取工具 (Content Extraction Tools)
 # ============================================================
+def scrub_search_redirect_url(url: str) -> str:
+    """
+    Resolve a search-engine redirect link to the URL it points at.
+
+    Handles Bing (`u` parameter: two-character tag + unpadded URL-safe Base64),
+    Google (`url` parameter) and the generic `target`/`dest`/`destination`/`link`
+    parameters. Any input that is not a recognised redirect is returned unchanged.
+    """
+    if not url or not isinstance(url, str):
+        return url
+
+    try:
+        parsed = urlparse(url)
+        # parse_qs already percent-decodes values; decoding them a second time
+        # would corrupt targets that legitimately contain escapes such as %20.
+        query = parse_qs(parsed.query)
+
+        if "bing.com" in parsed.netloc:
+            u_param = query.get("u", [""])[0]
+            if len(u_param) > 2:
+                b64_str = u_param[2:]  # drop the "a1"/"a0" tag
+                b64_str += "=" * (-len(b64_str) % 4)  # restore stripped padding
+                # The URL-safe decoder accepts "-"/"_" as well as "+"/"/".
+                decoded = base64.urlsafe_b64decode(b64_str).decode("utf-8", errors="ignore")
+                if decoded.startswith("http"):
+                    return decoded
+
+        if "google.com" in parsed.netloc:
+            url_param = query.get("url", [""])[0]
+            if url_param:
+                return url_param
+
+        for param in ("target", "dest", "destination", "link"):
+            found = query.get(param, [""])[0]
+            if found.startswith("http"):
+                return found
+    except ValueError:
+        pass  # malformed URL or Base64 payload: keep the original link
+    return url
+
+async def extraction_adapter(input_data):
+    """
+    Adapter that lets `openrouter_llm_call` serve as a `page_extraction_llm`.
+
+    Returns an object whose `completion` attribute is the model reply with
+    search-engine redirect links replaced by their real targets.
+    """
+    # Use the last message's text when given a non-empty message list.
+    if isinstance(input_data, list) and input_data:
+        last = input_data[-1]
+        prompt = last.content if hasattr(last, 'content') else str(last)
+    else:
+        prompt = str(input_data)
+
+    response = await openrouter_llm_call(messages=[{"role": "user", "content": prompt}])
+    # NOTE(review): assumes "content" may be None for an empty reply; normalise it.
+    content = response["content"] or ""
+
+    # Rewrite each URL at its own position; a global str.replace per URL would
+    # also alter longer URLs that merely begin with a rewritten one.
+    url_pattern = r'https?://[^\s<>"\']+'
+    content = re.sub(url_pattern, lambda m: scrub_search_redirect_url(m.group(0)), content)
+    return Namespace(completion=content)
 
 @tool()
 async def browser_extract_content(query: str, extract_links: bool = False,
@@ -1153,7 +1228,7 @@ async def browser_extract_content(query: str, extract_links: bool = False,
             extract_links=extract_links,
             start_from_char=start_from_char,
             browser_session=browser,
-            page_extraction_llm=None,  # 需要用户配置
+            page_extraction_llm=RunnableLambda(extraction_adapter),  # 需要用户配置
             file_system=_file_system
         )
 
@@ -1167,7 +1242,56 @@ async def browser_extract_content(query: str, extract_links: bool = False,
             long_term_memory=f"提取内容失败: {query}"
         )
 
+@tool()
+async def browser_read_long_content(
+    goal: Any, 
+    source: str = "page", 
+    context: Any = "",  
+    **kwargs            
+) -> ToolResult:
+    """
+    Read long content from `source` via the browser-use `read_long_content` action.
+
+    Args:
+        goal: Reading goal; a dict contributes its "mission" or "goal" entry.
+        source: Passed to ReadContentAction unchanged (default "page").
+        context: Background text; any non-string value is replaced by "".
+        **kwargs: Accepted but never read by this function.
+
+    Returns:
+        ToolResult: The converted action result, or an error result on failure.
+    """
+    try:
+        browser, tools = await get_browser_session()
+        final_goal_text = ""
+        if isinstance(goal, dict):
+            final_goal_text = goal.get("mission") or goal.get("goal") or str(goal)
+        else:
+            final_goal_text = str(goal)
+        business_context = context if isinstance(context, str) else ""
+        # Build the action model first so its field validation runs on the inputs.
+        action_params = ReadContentAction(
+            goal=final_goal_text, 
+            source=source,
+            context=business_context
+        )
+        # Unpack the model so goal/source/context arrive as flat keyword arguments.
+        result = await tools.read_long_content(
+            **action_params.model_dump(),  
+            browser_session=browser,
+            page_extraction_llm=RunnableLambda(extraction_adapter),
+            available_file_paths=[] 
+        )
+
+        return action_result_to_tool_result(result, f"深度读取: {source}")
 
+    except Exception as e:
+        # Any failure above is reported as a ToolResult instead of being raised.
+        return ToolResult(
+            title="深度读取失败",
+            output="",
+            error=f"Read long content failed: {str(e)}",
+            long_term_memory="参数解析或校验失败,请检查输入"
+        )
 @tool()
 async def browser_get_page_html() -> ToolResult:
     """
@@ -1590,6 +1714,7 @@ __all__ = [
     # 内容提取工具
     'browser_extract_content',
     'browser_get_page_html',
+    'browser_read_long_content',
     'browser_get_selector_map',
 
     # JavaScript 执行工具

+ 118 - 147
examples/research/run.py

@@ -1,7 +1,10 @@
 """
-创意写作调研示例
+浏览器调研示例 (增强版)
 
-使用 Agent 模式 + explore 工具进行创意内容探索
+功能:
+1. 使用 Agent 模式进行网络调研
+2. 任务结束自动关闭浏览器并杀掉进程
+3. 异常安全:即使程序崩溃也能清理环境
 """
 
 import os
@@ -15,6 +18,12 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from dotenv import load_dotenv
 load_dotenv()
 
+import logging
+# 配置感知日志
+logging.basicConfig(level=logging.WARNING)  # 默认 WARNING
+logging.getLogger("agent.core.message_manager").setLevel(logging.INFO)  # 开启感知日志
+logging.getLogger("tools").setLevel(logging.INFO)  # 开启工具日志
+
 from agent.llm.prompts import SimplePrompt
 from agent.core.runner import AgentRunner
 from agent.trace import (
@@ -24,6 +33,8 @@ from agent.trace import (
 )
 from agent.llm import create_openrouter_llm_call
 
+# 导入浏览器清理工具
+from agent.tools.builtin.browser.baseClass import kill_browser_session
 
 async def main():
     # 路径配置
@@ -33,8 +44,11 @@ async def main():
     output_dir = base_dir / "output"
     output_dir.mkdir(exist_ok=True)
 
+    # Skills 目录
+    skills_dir = None 
+
     print("=" * 60)
-    print("创意写作调研 (Agent 模式)")
+    print("🚀 浏览器调研任务 (Agent 模式)")
     print("=" * 60)
     print()
 
@@ -57,157 +71,114 @@ async def main():
 
     # 3. 创建 Agent Runner
     print("3. 创建 Agent Runner...")
-    print(f"   - 模型: {model_name} (via OpenRouter)")
-
-    # Trace 输出到测试目录
-    trace_dir = base_dir / ".trace"
-    trace_dir.mkdir(exist_ok=True)
-    print(f"   - Trace 目录: {trace_dir}")
-
     runner = AgentRunner(
         trace_store=FileSystemTraceStore(base_path=str(trace_dir)),
         llm_call=create_openrouter_llm_call(model=f"google/{model_name}"),
-        skills_dir=None,
-        debug=True
+        skills_dir=skills_dir,
+        debug=True 
     )
 
-    # 4. Agent 模式执行
-    print(f"4. 启动 Agent 模式...")
-    print()
-
     final_response = ""
     current_trace_id = None
-    subagent_calls = []
-
-    async for item in runner.run(
-        task=user_task,
-        messages=messages,
-        system_prompt=system_prompt,
-        model=f"google/{model_name}",
-        temperature=temperature,
-        max_iterations=30,  # 增加迭代次数以支持多个 subagent 调用
-    ):
-        # 处理 Trace 对象
-        if isinstance(item, Trace):
-            current_trace_id = item.trace_id
-            if item.status == "running":
-                print(f"[Trace] 开始: {item.trace_id[:8]}")
-            elif item.status == "completed":
-                print(f"[Trace] 完成")
-                print(f"  - Total messages: {item.total_messages}")
-                print(f"  - Total tokens: {item.total_tokens}")
-                print(f"  - Total cost: ${item.total_cost:.4f}")
-            elif item.status == "failed":
-                print(f"[Trace] 失败: {item.error_message}")
-
-        # 处理 Message 对象
-        elif isinstance(item, Message):
-            if item.role == "assistant":
-                content = item.content
-                if isinstance(content, dict):
-                    text = content.get("text", "")
-                    tool_calls = content.get("tool_calls")
-
-                    if text and not tool_calls:
-                        final_response = text
-                        print(f"[Response] Agent 完成")
-                    elif text:
-                        print(f"[Assistant] {text[:100]}...")
-
-                    if tool_calls:
-                        for tc in tool_calls:
-                            tool_name = tc.get("function", {}).get("name", "unknown")
-                            print(f"[Tool Call] {tool_name}")
-
-                            # 记录 subagent 调用
-                            if tool_name == "subagent":
-                                import json
-                                args = tc.get("function", {}).get("arguments", {})
-                                # arguments 可能是字符串,需要解析
-                                if isinstance(args, str):
-                                    try:
-                                        args = json.loads(args)
-                                    except:
-                                        args = {}
-                                mode = args.get("mode", "unknown")
-                                subagent_calls.append({
-                                    "mode": mode,
-                                    "task": args.get("task", args.get("background", ""))[:50]
-                                })
-                                print(f"  → mode: {mode}")
-
-            elif item.role == "tool":
-                content = item.content
-                if isinstance(content, dict):
-                    tool_name = content.get("tool_name", "unknown")
-                    print(f"[Tool Result] {tool_name}")
-                if item.description:
-                    desc = item.description[:80] if len(item.description) > 80 else item.description
-                    print(f"  {desc}...")
-
-    # 5. 输出结果
-    print()
-    print("=" * 60)
-    print("Agent 响应:")
-    print("=" * 60)
-    print(final_response)
-    print("=" * 60)
-    print()
-
-    # 6. 统计 subagent 调用
-    print("=" * 60)
-    print("Subagent 调用统计:")
-    print("=" * 60)
-    delegate_count = sum(1 for call in subagent_calls if call["mode"] == "delegate")
-    explore_count = sum(1 for call in subagent_calls if call["mode"] == "explore")
-    evaluate_count = sum(1 for call in subagent_calls if call["mode"] == "evaluate")
-
-    print(f"  - delegate 模式: {delegate_count} 次")
-    print(f"  - explore 模式: {explore_count} 次")
-    print(f"  - evaluate 模式: {evaluate_count} 次")
-    print(f"  - 总计: {len(subagent_calls)} 次")
-    print()
-
-    for i, call in enumerate(subagent_calls, 1):
-        print(f"  {i}. [{call['mode']}] {call['task']}...")
-    print("=" * 60)
-    print()
-
-    # 7. 保存结果
-    output_file = output_dir / "subagent_test_result.txt"
-    with open(output_file, 'w', encoding='utf-8') as f:
-        f.write("=" * 60 + "\n")
-        f.write("Agent 响应\n")
-        f.write("=" * 60 + "\n\n")
-        f.write(final_response)
-        f.write("\n\n" + "=" * 60 + "\n")
-        f.write("Subagent 调用统计\n")
-        f.write("=" * 60 + "\n\n")
-        f.write(f"delegate 模式: {delegate_count} 次\n")
-        f.write(f"explore 模式: {explore_count} 次\n")
-        f.write(f"evaluate 模式: {evaluate_count} 次\n")
-        f.write(f"总计: {len(subagent_calls)} 次\n\n")
-        for i, call in enumerate(subagent_calls, 1):
-            f.write(f"{i}. [{call['mode']}] {call['task']}...\n")
-
-    print(f"✓ 结果已保存到: {output_file}")
-    print()
-
-    # 8. 可视化提示
-    print("=" * 60)
-    print("Trace 信息:")
-    print("=" * 60)
-    print(f"Trace ID: {current_trace_id}")
-    print(f"Trace 目录: {trace_dir}")
-    print()
-    print("查看 trace 文件:")
-    print(f"   ls -la {trace_dir}")
-    print()
-    print("或启动 API Server 可视化:")
-    print("   python3 api_server.py")
-    print("   访问: http://localhost:8000/api/traces")
-    print("=" * 60)
 
+    # 4. Agent 模式执行(使用 try...finally 确保清理)
+    try:
+        print(f"4. 启动 Agent 模式执行...")
+        print()
+
+        async for item in runner.run(
+            task=user_task,
+            messages=messages,
+            system_prompt=system_prompt,
+            model=f"google/{model_name}",
+            temperature=temperature,
+            max_iterations=20,
+        ):
+            # 处理 Trace 对象(整体状态变化)
+            if isinstance(item, Trace):
+                current_trace_id = item.trace_id
+                if item.status == "running":
+                    print(f"[Trace] 开始: {item.trace_id[:8]}")
+                elif item.status == "completed":
+                    print(f"[Trace] 完成")
+                    print(f"  - Total tokens: {item.total_tokens}")
+                    print(f"  - Total cost: ${item.total_cost:.4f}")
+                elif item.status == "failed":
+                    print(f"[Trace] 失败: {item.error_message}")
+
+            # 处理 Message 对象(执行过程)
+            elif isinstance(item, Message):
+                if item.role == "assistant":
+                    content = item.content
+                    if isinstance(content, dict):
+                        text = content.get("text", "")
+                        tool_calls = content.get("tool_calls")
+
+                        if text and not tool_calls:
+                            final_response = text
+                            print(f"[Response] Agent 给出最终回复")
+                        elif text:
+                            # 增加打印长度到 300,方便观察
+                            print(f"[Assistant] {text[:300]}...")
+
+                        if tool_calls:
+                            for tc in tool_calls:
+                                tool_name = tc.get("function", {}).get("name", "unknown")
+                                print(f"[Tool Call] 🛠️ {tool_name}")
+
+                elif item.role == "tool":
+                    content = item.content
+                    if isinstance(content, dict):
+                        tool_name = content.get("tool_name", "unknown")
+                        print(f"[Tool Result] ✅ {tool_name}")
+                    if item.description:
+                        desc = item.description[:80] if len(item.description) > 80 else item.description
+                        print(f"  {desc}...")
+
+        # 5. 输出结果
+        print()
+        print("=" * 60)
+        print("Final Agent Response:")
+        print("=" * 60)
+        print(final_response)
+        print("=" * 60)
+        print()
+
+        # 6. 保存结果
+        output_file = output_dir / "research_result.txt"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(final_response)
+        print(f"✓ 结果已保存到: {output_file}")
+
+    except Exception as e:
+        print(f"\n❌ 程序运行崩溃: {str(e)}")
+        import traceback
+        traceback.print_exc()
+
+    finally:
+        # --- 核心逻辑:无论成功失败,必须关闭浏览器进程 ---
+        print("\n" + "·" * 40)
+        print("🧹 正在清理浏览器环境,关闭 CDP 会话并终止进程...")
+        try:
+            # 强制杀掉浏览器进程,释放容器或本地端口
+            await kill_browser_session()
+            print("✅ 浏览器已安全关闭。")
+        except Exception as cleanup_err:
+            print(f"⚠️ 清理浏览器时出现错误: {cleanup_err}")
+        print("·" * 40 + "\n")
+
+    # 7. 可视化提示
+    if current_trace_id:
+        print("=" * 60)
+        print("可视化 Step Tree:")
+        print("=" * 60)
+        print("1. 启动 API Server: python3 api_server.py")
+        print(f"2. 访问: http://localhost:8000/api/traces")
+        print(f"3. Trace ID: {current_trace_id}")
+        print("=" * 60)
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\n🛑 用户手动终止 (KeyboardInterrupt),正在强制退出...")

+ 1 - 7
examples/research/test.prompt

@@ -7,10 +7,4 @@ $system$
 你是最顶尖的AI助手,可以拆分并调用工具逐步解决复杂问题。
 
 $user$
-请为"一个人在雨夜独自等车"这个场景,创作三种不同风格的短篇描写。
-使用 subagent 的 explore 模式,并行探索以下三个方向:
-1. 悬疑惊悚风格
-2. 温馨治愈风格
-3. 科幻未来风格
-
-每个方向写 100-150 字的场景描写即可。
+使用浏览器帮我做个调研:一张图片中的构图可以如何表示?我希望寻找一些构图特征的表示方法。尝试查阅一些论文pdf, 网页等资料,最后输出一份调研报告。