""" 百度搜索示例 Baidu Search Example 功能: 1. 打开百度 2. 搜索"Python 教程" 3. 提取搜索结果数据并保存到 baidu.json 4. 保存完整页面 HTML 到 baidu_page.html 使用方法: python example.py """ import asyncio import json from pathlib import Path from datetime import datetime # 导入 baseClassTools 的工具 from tools.baseClassTools import ( init_browser_session, navigate_to_url, wait, get_page_html, wait_for_user_action, get_selector_map, input_text, send_keys, evaluate, scroll_page, cleanup_browser_session ) async def baidu_search_task(): """ 百度搜索任务:搜索"Python 教程"并保存数据 """ print("\n" + "="*80) print("🚀 开始执行百度搜索任务") print("="*80 + "\n") # 项目根目录 project_root = Path(__file__).parent try: # ============================================================ # 步骤 1: 初始化浏览器会话(使用专门的百度配置) # ============================================================ print("📌 步骤 1: 初始化浏览器会话...") await init_browser_session( headless=False, profile_name="baidu_profile" # 使用专门的配置文件 ) print("✅ 浏览器会话已初始化\n") # ============================================================ # 步骤 2: 导航到百度首页 # ============================================================ print("📌 步骤 2: 导航到百度...") result = await navigate_to_url("https://www.baidu.com") print(f"✅ {result.long_term_memory}\n") # 等待页面加载 await wait(seconds=2) # ============================================================ # 步骤 3: 搜索"Python 教程" # ============================================================ print("📌 步骤 3: 搜索关键词...") # 方式1: 直接导航到搜索结果页面(推荐) search_keyword = "Python 教程" search_url = f"https://www.baidu.com/s?wd={search_keyword}" print(f"🔍 搜索关键词: {search_keyword}") await navigate_to_url(search_url) print("✅ 已导航到搜索结果页面\n") # 等待搜索结果加载 print("⏳ 等待搜索结果加载...") await wait(seconds=3) # 滚动页面加载更多内容 print("📜 滚动页面加载更多内容...") await scroll_page(down=True, pages=1.0) await wait(seconds=2) print("✅ 搜索结果已加载\n") # ============================================================ # 步骤 4: 提取搜索结果数据 # ============================================================ print("📌 步骤 4: 提取搜索结果数据...") # 使用 JavaScript 提取数据 extract_js = """ (function(){ try { // 提取搜索结果 const results = []; // 百度的搜索结果选择器 const resultItems = document.querySelectorAll('#content_left > div[class*="result"]'); console.log('找到搜索结果数量:', resultItems.length); resultItems.forEach((item, index) => { if (index >= 10) return; // 只提取前10个 try { // 提取标题和链接 const titleEl = item.querySelector('h3 a, .t a'); const title = titleEl ? titleEl.textContent.trim() : ''; const link = titleEl ? titleEl.href : ''; // 提取摘要 const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40'); const summary = summaryEl ? summaryEl.textContent.trim() : ''; // 提取来源 const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff'); const source = sourceEl ? sourceEl.textContent.trim() : ''; if (title || link) { results.push({ index: index + 1, title: title, link: link, summary: summary.substring(0, 200), // 限制摘要长度 source: source }); } } catch (e) { console.error('提取单个结果失败:', e); } }); return { success: true, count: results.length, keyword: 'Python 教程', timestamp: new Date().toISOString(), results: results }; } catch (e) { return { success: false, error: e.message, stack: e.stack }; } })() """ result = await evaluate(code=extract_js) # 解析提取结果 try: # 从 result.output 中提取 JSON output = result.output if output.startswith("Result: "): output = output[8:] # 移除 "Result: " 前缀 data = json.loads(output) if data.get('success'): print(f"✅ 成功提取 {data.get('count', 0)} 条搜索结果") # 保存到 baidu.json json_file = project_root / "baidu.json" with open(json_file, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) print(f"✅ 数据已保存到: {json_file}\n") # 打印前3条结果预览 if data.get('results'): print("📋 前3条结果预览:") for item in data['results'][:3]: print(f" {item.get('index')}. {item.get('title', '无标题')}") print(f" 链接: {item.get('link', '')[:60]}...") print(f" 来源: {item.get('source', '未知')}") print() else: print(f"⚠️ 数据提取失败: {data.get('error', '未知错误')}") # 保存错误信息 error_data = { "success": False, "error": data.get('error'), "keyword": "Python 教程", "timestamp": datetime.now().isoformat() } json_file = project_root / "baidu.json" with open(json_file, 'w', encoding='utf-8') as f: json.dump(error_data, f, ensure_ascii=False, indent=2) print(f"⚠️ 错误信息已保存到: {json_file}\n") except json.JSONDecodeError as e: print(f"⚠️ JSON 解析失败: {e}") print(f"原始输出: {result.output[:200]}...\n") # 保存原始输出 error_data = { "success": False, "error": "JSON解析失败", "raw_output": result.output[:1000], "keyword": "Python 教程", "timestamp": datetime.now().isoformat() } json_file = project_root / "baidu.json" with open(json_file, 'w', encoding='utf-8') as f: json.dump(error_data, f, ensure_ascii=False, indent=2) # ============================================================ # 步骤 5: 保存完整页面 HTML # ============================================================ print("📌 步骤 5: 保存完整页面 HTML...") html_result = await get_page_html() html_content = html_result.metadata.get('html', '') page_url = html_result.metadata.get('url', '') page_title = html_result.metadata.get('title', '') # 保存 HTML 文件 html_file = project_root / "baidu_page.html" with open(html_file, 'w', encoding='utf-8') as f: # 添加一些元信息 meta_info = f""" """ f.write(meta_info) f.write(html_content) print(f"✅ HTML 已保存到: {html_file}") print(f" 页面标题: {page_title}") print(f" 页面URL: {page_url}") print(f" HTML 大小: {len(html_content):,} 字符\n") # ============================================================ # 任务完成 # ============================================================ print("="*80) print("🎉 任务完成!") print("="*80) print(f"📁 生成的文件:") print(f" 1. {json_file.name} - 搜索结果数据") print(f" 2. {html_file.name} - 完整页面HTML") print("="*80 + "\n") except Exception as e: print(f"\n❌ 任务执行失败: {str(e)}") import traceback traceback.print_exc() finally: # ============================================================ # 清理:保存浏览器状态 # ============================================================ print("\n📌 清理浏览器会话...") await cleanup_browser_session() print("✅ 浏览器会话已保存") print("💡 提示: 下次运行将自动使用保存的浏览器状态\n") async def main(): """主函数""" await baidu_search_task() if __name__ == "__main__": # 运行任务 asyncio.run(main())