| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339 |
"""
Baidu Search Example - Using the browser-use CDP Method

Usage:
    1. Start Chrome manually with remote debugging enabled:
       "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --remote-debugging-port=9222 --user-data-dir="/tmp/chrome-debug-profile"
    2. Verify that CDP is running:
       open http://localhost:9222/json/version
    3. Run this script:
       python example_browser_use_cdp.py

Features:
    1. Open Baidu
    2. Search for "Python 教程"
    3. Extract the search result data and save it to baidu.json
    4. Save the full page HTML to baidu_page.html
"""
- import asyncio
- import json
- import sys
- import os
- from pathlib import Path
- from datetime import datetime
- # 添加项目路径
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
- from agent.tools import tool, ToolResult
# Prefix that the evaluate tool may put in front of the serialized JS value.
_RESULT_PREFIX = "Result: "

# In-page scraper. ``__KEYWORD__`` is replaced with a JSON-encoded string
# literal so that arbitrary keywords cannot break out of the script.
_EXTRACT_JS_TEMPLATE = """
(function(){
    try {
        // 提取搜索结果
        const results = [];
        // 百度的搜索结果选择器
        const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');
        console.log('找到搜索结果数量:', resultItems.length);
        resultItems.forEach((item, index) => {
            if (index >= 10) return; // 只提取前10个
            try {
                // 提取标题和链接
                const titleEl = item.querySelector('h3 a, .t a');
                const title = titleEl ? titleEl.textContent.trim() : '';
                const link = titleEl ? titleEl.href : '';
                // 提取摘要
                const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
                const summary = summaryEl ? summaryEl.textContent.trim() : '';
                // 提取来源
                const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
                const source = sourceEl ? sourceEl.textContent.trim() : '';
                if (title || link) {
                    results.push({
                        index: index + 1,
                        title: title,
                        link: link,
                        summary: summary.substring(0, 200), // 限制摘要长度
                        source: source
                    });
                }
            } catch (e) {
                console.error('提取单个结果失败:', e);
            }
        });
        return {
            success: true,
            count: results.length,
            keyword: __KEYWORD__,
            timestamp: new Date().toISOString(),
            results: results
        };
    } catch (e) {
        return {
            success: false,
            error: e.message,
            stack: e.stack
        };
    }
})()
"""


def _build_extract_js(keyword):
    """Return the scraper script with *keyword* embedded as a JS string literal."""
    # json.dumps (ASCII-only by default) yields a literal that is also valid JS.
    return _EXTRACT_JS_TEMPLATE.replace("__KEYWORD__", json.dumps(keyword))


def _parse_extraction_output(raw_output):
    """Decode the evaluate tool's output into the extraction payload dict.

    Raises:
        ValueError: if the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass) or the decoded value is not an object.
    """
    if isinstance(raw_output, str):
        if raw_output.startswith(_RESULT_PREFIX):
            raw_output = raw_output[len(_RESULT_PREFIX):]
        payload = json.loads(raw_output)
    else:
        payload = raw_output
    if not isinstance(payload, dict):
        # Without this guard a bare `null`/list would blow up on `.get()` and
        # abort the whole task, skipping the HTML snapshot as well.
        raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
    return payload


def _save_json(path, payload):
    """Write *payload* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _comment_safe(text):
    """Break up ``--`` so *text* cannot terminate an enclosing HTML comment."""
    return str(text).replace("--", "- -")


async def baidu_search_with_cdp(
    keyword: str = "Python 教程",
    cdp_url: str = "http://localhost:9222",
) -> None:
    """Search Baidu through an already-running Chrome and save the results.

    The function navigates to the Baidu home page and then to the result page
    for *keyword*, scrolls once, extracts up to ten results with in-page
    JavaScript into ``baidu.json`` and writes the page's outer HTML to
    ``baidu_page.html``. Both files are created next to this script.

    Args:
        keyword: Search term. It is percent-encoded before being placed in
            the result-page URL.
        cdp_url: HTTP endpoint of the Chrome remote-debugging port.

    Returns:
        None. Progress is printed; any exception (including a missing
        browser-use installation) is reported with a traceback and swallowed.
    """
    print("\n" + "=" * 80)
    print("🚀 开始执行百度搜索任务 (browser-use CDP 版本)")
    print("=" * 80 + "\n")

    # Output files live in the directory that contains this script.
    project_root = Path(__file__).parent
    json_file = project_root / "baidu.json"
    html_file = project_root / "baidu_page.html"

    try:
        # Imported inside the try block so that a missing browser-use install
        # is reported by the handler below.
        from urllib.parse import urlencode

        from browser_use import Tools
        from browser_use.browser import BrowserProfile, BrowserSession

        # ------------------------------------------------------------
        # Step 1: attach to the manually started Chrome over CDP
        # ------------------------------------------------------------
        print("📌 步骤 1: 连接到 Chrome (CDP)...")
        browser_session = BrowserSession(
            browser_profile=BrowserProfile(
                cdp_url=cdp_url,
                is_local=True,
            )
        )
        tools = Tools()
        print("✅ 已连接到 Chrome (CDP)\n")

        # ------------------------------------------------------------
        # Steps 2-3: drive the browser with the tools directly (no LLM)
        # ------------------------------------------------------------
        print("📌 步骤 2-5: 执行搜索任务...")
        print("🔧 直接使用 browser-use 工具...\n")

        print(" → 导航到百度首页...")
        result = await tools.navigate(
            url="https://www.baidu.com",
            browser_session=browser_session,
        )
        print(f" ✅ {result.long_term_memory}")
        await asyncio.sleep(2)

        # Percent-encode the keyword: raw spaces / non-ASCII characters are
        # not valid inside a URL query string.
        search_url = "https://www.baidu.com/s?" + urlencode({"wd": keyword})
        print(f"\n → 搜索关键词: {keyword}")
        result = await tools.navigate(
            url=search_url,
            browser_session=browser_session,
        )
        print(" ✅ 已导航到搜索结果页面")
        await asyncio.sleep(3)

        print("\n → 滚动页面加载更多内容...")
        await tools.scroll(
            down=True,
            pages=1.0,
            browser_session=browser_session,
        )
        await asyncio.sleep(2)
        print(" ✅ 页面滚动完成")

        # ------------------------------------------------------------
        # Step 4: extract the search results with in-page JavaScript
        # ------------------------------------------------------------
        print("\n📌 步骤 4: 提取搜索结果数据...")
        result = await tools.evaluate(
            code=_build_extract_js(keyword),
            browser_session=browser_session,
        )

        raw_output = result.extracted_content or str(result.metadata)
        try:
            data = _parse_extraction_output(raw_output)
        except ValueError as e:
            print(f"⚠️ JSON 解析失败: {e}")
            print(f"原始输出: {str(result)[:200]}...\n")
            # Keep a truncated copy of the raw output for later inspection.
            _save_json(json_file, {
                "success": False,
                "error": "JSON解析失败",
                "raw_output": str(result)[:1000],
                "keyword": keyword,
                "timestamp": datetime.now().isoformat(),
            })
        else:
            if data.get('success'):
                print(f"✅ 成功提取 {data.get('count', 0)} 条搜索结果")
                _save_json(json_file, data)
                print(f"✅ 数据已保存到: {json_file}\n")
                # Preview the first three results on the console.
                if data.get('results'):
                    print("📋 前3条结果预览:")
                    for item in data['results'][:3]:
                        print(f" {item.get('index')}. {item.get('title', '无标题')}")
                        print(f" 链接: {item.get('link', '')[:60]}...")
                        print(f" 来源: {item.get('source', '未知')}")
                        print()
            else:
                print(f"⚠️ 数据提取失败: {data.get('error', '未知错误')}")
                _save_json(json_file, {
                    "success": False,
                    "error": data.get('error'),
                    "keyword": keyword,
                    "timestamp": datetime.now().isoformat(),
                })
                print(f"⚠️ 错误信息已保存到: {json_file}\n")

        # ------------------------------------------------------------
        # Step 5: save the full page HTML
        # ------------------------------------------------------------
        print("📌 步骤 5: 保存完整页面 HTML...")
        cdp = await browser_session.get_or_create_cdp_session()
        html_result = await cdp.cdp_client.send.Runtime.evaluate(
            params={'expression': 'document.documentElement.outerHTML'},
            session_id=cdp.session_id,
        )
        html_content = html_result.get('result', {}).get('value', '')

        url = await browser_session.get_current_page_url()
        title_result = await cdp.cdp_client.send.Runtime.evaluate(
            params={'expression': 'document.title'},
            session_id=cdp.session_id,
        )
        title = title_result.get('result', {}).get('value', '')

        # Metadata header, written as an HTML comment ahead of the markup.
        meta_info = (
            "<!--\n"
            f"页面标题: {_comment_safe(title)}\n"
            f"页面URL: {_comment_safe(url)}\n"
            f"保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            f"搜索关键词: {_comment_safe(keyword)}\n"
            "-->\n"
        )
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(meta_info)
            f.write(html_content)
        print(f"✅ HTML 已保存到: {html_file}")
        print(f" 页面标题: {title}")
        print(f" 页面URL: {url}")
        print(f" HTML 大小: {len(html_content):,} 字符\n")

        # ------------------------------------------------------------
        # Done
        # ------------------------------------------------------------
        print("=" * 80)
        print("🎉 任务完成!")
        print("=" * 80)
        print("📁 生成的文件:")
        print(" 1. baidu.json - 搜索结果数据")
        print(" 2. baidu_page.html - 完整页面HTML")
        print("=" * 80 + "\n")
        print("💡 提示:Chrome 窗口保持打开状态,您可以继续使用")
        print(" 如需关闭,请在 Chrome 中手动关闭\n")
    except Exception as e:
        # Top-level boundary of the example script: report and return instead
        # of propagating, so the caller's event loop shuts down cleanly.
        print(f"\n❌ 任务执行失败: {str(e)}")
        import traceback
        traceback.print_exc()
async def main():
    """Print the Chrome remote-debugging checklist, wait for Enter, then run the search."""
    rule = "=" * 80
    # One entry per printed line; the empty string yields the blank separator line.
    checklist = [
        "\n" + rule,
        "⚠️ 使用前请确保:",
        rule,
        "1. 已手动启动 Chrome 并开启远程调试端口 9222",
        "2. 启动命令示例:",
        ' "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \\',
        ' --remote-debugging-port=9222 \\',
        ' --user-data-dir="/tmp/chrome-debug-profile"',
        "",
        "3. 验证 CDP 是否运行:访问 http://localhost:9222/json/version",
        rule,
    ]
    for line in checklist:
        print(line)

    # Block until the user confirms that Chrome is up.
    input("\n按 Enter 键继续(确保已启动 Chrome)...")
    await baidu_search_with_cdp()
if __name__ == "__main__":
    # Script entry point: run the interactive flow on a fresh event loop.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)
|