run_browser_use_cdp.sh 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. #!/bin/bash
  2. # 百度搜索任务 - 使用 browser-use CDP 方式(修复版)
  3. # 此脚本会自动启动 Chrome 并执行搜索任务
  4. echo "=========================================="
  5. echo "🚀 百度搜索任务 (browser-use CDP 方式)"
  6. echo "=========================================="
  7. echo
  8. # 设置变量
  9. CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  10. DEBUG_PORT=9222
  11. USER_DATA_DIR="/tmp/chrome-debug-profile-baidu"
  12. # 检查 Chrome 是否存在
  13. if [ ! -f "$CHROME_PATH" ]; then
  14. echo "❌ 错误:Chrome 未找到"
  15. exit 1
  16. fi
  17. # 关闭现有的 Chrome 进程
  18. echo "📌 步骤 1: 检查并关闭现有 Chrome 调试进程..."
  19. pkill -f "remote-debugging-port=$DEBUG_PORT" 2>/dev/null
  20. sleep 2
  21. echo "✅ 已关闭现有进程"
  22. echo
  23. # 创建用户数据目录
  24. echo "📌 步骤 2: 准备用户数据目录..."
  25. mkdir -p "$USER_DATA_DIR"
  26. echo "✅ 目录已创建: $USER_DATA_DIR"
  27. echo
  28. # 启动 Chrome
  29. echo "📌 步骤 3: 启动 Chrome(远程调试模式)..."
  30. "$CHROME_PATH" \
  31. --remote-debugging-port=$DEBUG_PORT \
  32. --user-data-dir="$USER_DATA_DIR" \
  33. --no-first-run \
  34. --no-default-browser-check \
  35. > /dev/null 2>&1 &
  36. CHROME_PID=$!
  37. echo "✅ Chrome 已启动 (PID: $CHROME_PID)"
  38. echo
  39. # 等待 Chrome 启动
  40. echo "⏳ 等待 Chrome 启动完成..."
  41. MAX_WAIT=30
  42. WAITED=0
  43. while [ $WAITED -lt $MAX_WAIT ]; do
  44. if curl -s http://localhost:$DEBUG_PORT/json/version > /dev/null 2>&1; then
  45. echo "✅ Chrome 调试端口已就绪"
  46. break
  47. fi
  48. sleep 1
  49. WAITED=$((WAITED + 1))
  50. done
  51. echo
  52. if [ $WAITED -ge $MAX_WAIT ]; then
  53. echo "❌ 错误:Chrome 启动超时"
  54. kill $CHROME_PID 2>/dev/null
  55. exit 1
  56. fi
  57. # 验证 CDP
  58. echo "📌 步骤 4: 验证 CDP 连接..."
  59. CDP_INFO=$(curl -s http://localhost:$DEBUG_PORT/json/version)
  60. if [ -n "$CDP_INFO" ]; then
  61. echo "✅ CDP 连接成功"
  62. else
  63. echo "❌ 错误:CDP 连接失败"
  64. kill $CHROME_PID 2>/dev/null
  65. exit 1
  66. fi
  67. echo
  68. # 运行 Python 脚本
  69. echo "=========================================="
  70. echo "🐍 运行 Python 脚本"
  71. echo "=========================================="
  72. echo
  73. python3 << 'PYTHON_END'
  74. import asyncio
  75. import json
  76. import sys
  77. import os
  78. from pathlib import Path
  79. from datetime import datetime
  80. # 添加项目路径
  81. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
  82. async def run_search():
  83. """执行搜索任务"""
  84. print("🚀 开始执行搜索任务...\n")
  85. # 项目根目录
  86. project_root = Path.cwd()
  87. try:
  88. # 导入 browser-use
  89. from browser_use import Tools
  90. from browser_use.browser import BrowserProfile, BrowserSession
  91. # 连接到 Chrome
  92. print("📌 连接到 Chrome (CDP: http://localhost:9222)...")
  93. browser_session = BrowserSession(
  94. browser_profile=BrowserProfile(
  95. cdp_url='http://localhost:9222',
  96. is_local=True
  97. )
  98. )
  99. # 重要:启动 browser session
  100. await browser_session.start()
  101. tools = Tools()
  102. print("✅ 已连接并启动 BrowserSession\n")
  103. # 导航到百度
  104. print("📌 导航到百度首页...")
  105. result = await tools.navigate(
  106. url="https://www.baidu.com",
  107. browser_session=browser_session
  108. )
  109. print(f"✅ {result.long_term_memory}\n")
  110. await asyncio.sleep(2)
  111. # 搜索
  112. search_keyword = "Python 教程"
  113. search_url = f"https://www.baidu.com/s?wd={search_keyword}"
  114. print(f"📌 搜索: {search_keyword}")
  115. result = await tools.navigate(
  116. url=search_url,
  117. browser_session=browser_session
  118. )
  119. print(f"✅ {result.long_term_memory}\n")
  120. await asyncio.sleep(3)
  121. # 滚动
  122. print("📌 滚动页面...")
  123. await tools.scroll(
  124. down=True,
  125. pages=1.0,
  126. browser_session=browser_session
  127. )
  128. await asyncio.sleep(2)
  129. print("✅ 滚动完成\n")
  130. # 提取数据
  131. print("📌 提取搜索结果...")
  132. extract_js = """
  133. (function(){
  134. try {
  135. const results = [];
  136. const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');
  137. resultItems.forEach((item, index) => {
  138. if (index >= 10) return;
  139. const titleEl = item.querySelector('h3 a, .t a');
  140. const title = titleEl ? titleEl.textContent.trim() : '';
  141. const link = titleEl ? titleEl.href : '';
  142. const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
  143. const summary = summaryEl ? summaryEl.textContent.trim() : '';
  144. const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
  145. const source = sourceEl ? sourceEl.textContent.trim() : '';
  146. if (title || link) {
  147. results.push({
  148. index: index + 1,
  149. title: title,
  150. link: link,
  151. summary: summary.substring(0, 200),
  152. source: source
  153. });
  154. }
  155. });
  156. return {
  157. success: true,
  158. count: results.length,
  159. keyword: 'Python 教程',
  160. timestamp: new Date().toISOString(),
  161. results: results
  162. };
  163. } catch (e) {
  164. return {
  165. success: false,
  166. error: e.message
  167. };
  168. }
  169. })()
  170. """
  171. result = await tools.evaluate(
  172. code=extract_js,
  173. browser_session=browser_session
  174. )
  175. # 解析结果
  176. output = result.extracted_content or str(result.metadata)
  177. if isinstance(output, str) and output.startswith("Result: "):
  178. output = output[8:]
  179. data = json.loads(output) if isinstance(output, str) else output
  180. if data.get('success'):
  181. print(f"✅ 成功提取 {data.get('count', 0)} 条结果")
  182. # 保存数据
  183. json_file = project_root / "baidu.json"
  184. with open(json_file, 'w', encoding='utf-8') as f:
  185. json.dump(data, f, ensure_ascii=False, indent=2)
  186. print(f"✅ 数据已保存: {json_file}\n")
  187. # 显示预览
  188. if data.get('results'):
  189. print("📋 前3条结果:")
  190. for item in data['results'][:3]:
  191. print(f" {item.get('index')}. {item.get('title', '无标题')[:50]}...")
  192. print()
  193. else:
  194. print(f"⚠️ 提取失败: {data.get('error')}")
  195. # 保存 HTML
  196. print("📌 保存页面 HTML...")
  197. cdp = await browser_session.get_or_create_cdp_session()
  198. html_result = await cdp.cdp_client.send.Runtime.evaluate(
  199. params={'expression': 'document.documentElement.outerHTML'},
  200. session_id=cdp.session_id
  201. )
  202. html_content = html_result.get('result', {}).get('value', '')
  203. html_file = project_root / "baidu_page.html"
  204. with open(html_file, 'w', encoding='utf-8') as f:
  205. f.write(f"<!-- 保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} -->\n")
  206. f.write(html_content)
  207. print(f"✅ HTML 已保存: {html_file}")
  208. print(f" 大小: {len(html_content):,} 字符\n")
  209. print("="*60)
  210. print("🎉 任务完成!")
  211. print("="*60)
  212. print("生成文件:")
  213. print(" • baidu.json")
  214. print(" • baidu_page.html")
  215. print("="*60)
  216. # 停止 browser session
  217. await browser_session.stop()
  218. except Exception as e:
  219. print(f"\n❌ 错误: {e}")
  220. import traceback
  221. traceback.print_exc()
  222. asyncio.run(run_search())
  223. PYTHON_END
  224. PYTHON_EXIT_CODE=$?
  225. echo
  226. echo
  227. # 询问是否关闭 Chrome
  228. echo "=========================================="
  229. echo "清理"
  230. echo "=========================================="
  231. echo
  232. read -p "是否关闭 Chrome?(y/N): " -n 1 -r
  233. echo
  234. if [[ $REPLY =~ ^[Yy]$ ]]; then
  235. echo "🔄 正在关闭 Chrome..."
  236. kill $CHROME_PID 2>/dev/null
  237. sleep 2
  238. pkill -f "remote-debugging-port=$DEBUG_PORT" 2>/dev/null
  239. echo "✅ Chrome 已关闭"
  240. else
  241. echo "💡 Chrome 保持运行状态 (PID: $CHROME_PID)"
  242. echo " 手动关闭命令: kill $CHROME_PID"
  243. fi
  244. echo
  245. echo "=========================================="
  246. echo "✨ 脚本执行完成"
  247. echo "=========================================="
  248. exit $PYTHON_EXIT_CODE