#!/bin/bash
#
# Baidu search task driven through browser-use over CDP (fixed version).
#
# The script:
#   1. stops any Chrome instance already listening on the debug port,
#   2. launches Chrome with remote debugging and a dedicated profile,
#   3. waits until the CDP HTTP endpoint answers,
#   4. runs an embedded Python program that searches Baidu and writes
#      baidu.json and baidu_page.html into the current working directory,
#   5. asks whether Chrome should be closed.
#
# Exit status: the exit status of the embedded Python program, or 1 when
# Chrome cannot be found / started / reached.

echo "=========================================="
echo "🚀 百度搜索任务 (browser-use CDP 方式)"
echo "=========================================="
echo

# Configuration
CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
DEBUG_PORT=9222
USER_DATA_DIR="/tmp/chrome-debug-profile-baidu"
# Directory containing this script; handed to Python, which has no __file__
# of its own because the program is fed through stdin.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Make sure Chrome exists
if [ ! -f "$CHROME_PATH" ]; then
  echo "❌ 错误:Chrome 未找到"
  exit 1
fi

# Stop any Chrome that is already using the debug port.
# pkill returns non-zero when nothing matched; that is expected here.
echo "📌 步骤 1: 检查并关闭现有 Chrome 调试进程..."
pkill -f "remote-debugging-port=$DEBUG_PORT" 2>/dev/null
sleep 2
echo "✅ 已关闭现有进程"
echo

# Create the user data directory
echo "📌 步骤 2: 准备用户数据目录..."
if ! mkdir -p "$USER_DATA_DIR"; then
  echo "❌ 错误:无法创建目录: $USER_DATA_DIR"
  exit 1
fi
echo "✅ 目录已创建: $USER_DATA_DIR"
echo

# Launch Chrome in the background
echo "📌 步骤 3: 启动 Chrome(远程调试模式)..."
"$CHROME_PATH" \
  --remote-debugging-port="$DEBUG_PORT" \
  --user-data-dir="$USER_DATA_DIR" \
  --no-first-run \
  --no-default-browser-check \
  > /dev/null 2>&1 &

CHROME_PID=$!
echo "✅ Chrome 已启动 (PID: $CHROME_PID)"
echo

# Poll the CDP HTTP endpoint once per second until it answers or we give up
echo "⏳ 等待 Chrome 启动完成..."
MAX_WAIT=30
WAITED=0
CHROME_READY=0
while [ "$WAITED" -lt "$MAX_WAIT" ]; do
  if curl -s "http://localhost:$DEBUG_PORT/json/version" > /dev/null 2>&1; then
    echo "✅ Chrome 调试端口已就绪"
    CHROME_READY=1
    break
  fi
  sleep 1
  WAITED=$((WAITED + 1))
done
echo

if [ "$CHROME_READY" -ne 1 ]; then
  echo "❌ 错误:Chrome 启动超时"
  kill "$CHROME_PID" 2>/dev/null
  exit 1
fi

# Verify CDP: the endpoint must return a non-empty body
echo "📌 步骤 4: 验证 CDP 连接..."
CDP_INFO=$(curl -s "http://localhost:$DEBUG_PORT/json/version")
if [ -n "$CDP_INFO" ]; then
  echo "✅ CDP 连接成功"
else
  echo "❌ 错误:CDP 连接失败"
  kill "$CHROME_PID" 2>/dev/null
  exit 1
fi
echo

# Run the Python program
echo "=========================================="
echo "🐍 运行 Python 脚本"
echo "=========================================="
echo

# The heredoc delimiter is quoted, so the shell expands nothing inside it;
# settings are therefore passed through the environment.
BAIDU_SCRIPT_DIR="$SCRIPT_DIR" BAIDU_CDP_PORT="$DEBUG_PORT" python3 << 'PYTHON_END'
import asyncio
import json
import sys
import os
from pathlib import Path
from datetime import datetime
from urllib.parse import quote

# Add the project path. __file__ is undefined when the program is read from
# stdin, so the wrapping shell script supplies its own directory instead.
sys.path.insert(0, os.environ.get("BAIDU_SCRIPT_DIR") or os.getcwd())

# Same port the shell script launched Chrome with (defaults to 9222).
CDP_URL = f"http://localhost:{os.environ.get('BAIDU_CDP_PORT', '9222')}"


async def run_search():
    """Run the Baidu search task against the Chrome instance at CDP_URL.

    Navigates to Baidu, opens the result page for the keyword, scrolls,
    extracts up to ten results with an injected script, and writes
    baidu.json (only when extraction succeeded) and baidu_page.html into
    the current working directory.

    Returns:
        True when the task ran to completion, False when an exception was
        raised (the exception is printed together with its traceback).
    """
    print("🚀 开始执行搜索任务...\n")

    # Output files are written relative to the current working directory
    project_root = Path.cwd()
    browser_session = None
    succeeded = False

    try:
        # Import browser-use
        from browser_use import Tools
        from browser_use.browser import BrowserProfile, BrowserSession

        # Connect to Chrome
        print(f"📌 连接到 Chrome (CDP: {CDP_URL})...")
        browser_session = BrowserSession(
            browser_profile=BrowserProfile(
                cdp_url=CDP_URL,
                is_local=True
            )
        )

        # Important: the browser session must be started before use
        await browser_session.start()

        tools = Tools()
        print("✅ 已连接并启动 BrowserSession\n")

        # Navigate to the Baidu home page
        print("📌 导航到百度首页...")
        result = await tools.navigate(
            url="https://www.baidu.com",
            browser_session=browser_session
        )
        print(f"✅ {result.long_term_memory}\n")
        await asyncio.sleep(2)

        # Search. The keyword contains a space and non-ASCII characters, so
        # it is percent-encoded before being placed in the query string.
        search_keyword = "Python 教程"
        search_url = f"https://www.baidu.com/s?wd={quote(search_keyword)}"

        print(f"📌 搜索: {search_keyword}")
        result = await tools.navigate(
            url=search_url,
            browser_session=browser_session
        )
        print(f"✅ {result.long_term_memory}\n")
        await asyncio.sleep(3)

        # Scroll
        print("📌 滚动页面...")
        await tools.scroll(
            down=True,
            pages=1.0,
            browser_session=browser_session
        )
        await asyncio.sleep(2)
        print("✅ 滚动完成\n")

        # Extract data. __KEYWORD__ is replaced below with a JSON string
        # literal so the reported keyword always matches search_keyword.
        print("📌 提取搜索结果...")
        extract_js = """
        (function(){
            try {
                const results = [];
                const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');

                resultItems.forEach((item, index) => {
                    if (index >= 10) return;

                    const titleEl = item.querySelector('h3 a, .t a');
                    const title = titleEl ? titleEl.textContent.trim() : '';
                    const link = titleEl ? titleEl.href : '';

                    const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
                    const summary = summaryEl ? summaryEl.textContent.trim() : '';

                    const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
                    const source = sourceEl ? sourceEl.textContent.trim() : '';

                    if (title || link) {
                        results.push({
                            index: index + 1,
                            title: title,
                            link: link,
                            summary: summary.substring(0, 200),
                            source: source
                        });
                    }
                });

                return {
                    success: true,
                    count: results.length,
                    keyword: __KEYWORD__,
                    timestamp: new Date().toISOString(),
                    results: results
                };
            } catch (e) {
                return {
                    success: false,
                    error: e.message
                };
            }
        })()
        """.replace("__KEYWORD__", json.dumps(search_keyword, ensure_ascii=False))

        result = await tools.evaluate(
            code=extract_js,
            browser_session=browser_session
        )

        # Parse the result; a leading "Result: " prefix is stripped before
        # the remaining text is decoded as JSON
        output = result.extracted_content or str(result.metadata)
        if isinstance(output, str) and output.startswith("Result: "):
            output = output[8:]

        data = json.loads(output) if isinstance(output, str) else output

        if data.get('success'):
            print(f"✅ 成功提取 {data.get('count', 0)} 条结果")

            # Save the data
            json_file = project_root / "baidu.json"
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"✅ 数据已保存: {json_file}\n")

            # Show a preview
            if data.get('results'):
                print("📋 前3条结果:")
                for item in data['results'][:3]:
                    print(f" {item.get('index')}. {item.get('title', '无标题')[:50]}...")
                print()
        else:
            print(f"⚠️ 提取失败: {data.get('error')}")

        # Save the page HTML
        print("📌 保存页面 HTML...")
        cdp = await browser_session.get_or_create_cdp_session()
        html_result = await cdp.cdp_client.send.Runtime.evaluate(
            params={'expression': 'document.documentElement.outerHTML'},
            session_id=cdp.session_id
        )
        html_content = html_result.get('result', {}).get('value', '')

        html_file = project_root / "baidu_page.html"
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write("\n")
            f.write(html_content)

        print(f"✅ HTML 已保存: {html_file}")
        print(f" 大小: {len(html_content):,} 字符\n")

        print("="*60)
        print("🎉 任务完成!")
        print("="*60)
        print("生成文件:")
        print(" • baidu.json")
        print(" • baidu_page.html")
        print("="*60)

        succeeded = True

    except Exception as e:
        print(f"\n❌ 错误: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Stop the browser session on every path, not only on success.
        # A failure here is reported but does not change the task result.
        if browser_session is not None:
            try:
                await browser_session.stop()
            except Exception as stop_err:
                print(f"⚠️ 停止 BrowserSession 失败: {stop_err}")

    return succeeded


# Propagate failure to the shell so that $? reflects the task outcome
sys.exit(0 if asyncio.run(run_search()) else 1)
PYTHON_END

PYTHON_EXIT_CODE=$?

echo
echo

# Ask whether Chrome should be closed
echo "=========================================="
echo "清理"
echo "=========================================="
echo
read -p "是否关闭 Chrome?(y/N): " -n 1 -r
echo

if [[ $REPLY =~ ^[Yy]$ ]]; then
  echo "🔄 正在关闭 Chrome..."
  kill "$CHROME_PID" 2>/dev/null
  sleep 2
  # Catch any process still holding the debug port
  pkill -f "remote-debugging-port=$DEBUG_PORT" 2>/dev/null
  echo "✅ Chrome 已关闭"
else
  echo "💡 Chrome 保持运行状态 (PID: $CHROME_PID)"
  echo " 手动关闭命令: kill $CHROME_PID"
fi

echo
echo "=========================================="
echo "✨ 脚本执行完成"
echo "=========================================="

exit "$PYTHON_EXIT_CODE"