example.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. """
  2. 百度搜索示例
  3. Baidu Search Example
  4. 功能:
  5. 1. 打开百度
  6. 2. 搜索"Python 教程"
  7. 3. 提取搜索结果数据并保存到 baidu.json
  8. 4. 保存完整页面 HTML 到 baidu_page.html
  9. 使用方法:
  10. python example.py
  11. """
  12. import asyncio
  13. import json
  14. from pathlib import Path
  15. from datetime import datetime
  16. # 导入 baseClassTools 的工具
  17. from tools.baseClassTools import (
  18. init_browser_session,
  19. navigate_to_url,
  20. wait,
  21. get_page_html,
  22. wait_for_user_action,
  23. get_selector_map,
  24. input_text,
  25. send_keys,
  26. evaluate,
  27. scroll_page,
  28. cleanup_browser_session
  29. )
  30. async def baidu_search_task():
  31. """
  32. 百度搜索任务:搜索"Python 教程"并保存数据
  33. """
  34. print("\n" + "="*80)
  35. print("🚀 开始执行百度搜索任务")
  36. print("="*80 + "\n")
  37. # 项目根目录
  38. project_root = Path(__file__).parent
  39. try:
  40. # ============================================================
  41. # 步骤 1: 初始化浏览器会话(使用专门的百度配置)
  42. # ============================================================
  43. print("📌 步骤 1: 初始化浏览器会话...")
  44. await init_browser_session(
  45. headless=False,
  46. profile_name="baidu_profile" # 使用专门的配置文件
  47. )
  48. print("✅ 浏览器会话已初始化\n")
  49. # ============================================================
  50. # 步骤 2: 导航到百度首页
  51. # ============================================================
  52. print("📌 步骤 2: 导航到百度...")
  53. result = await navigate_to_url("https://www.baidu.com")
  54. print(f"✅ {result.long_term_memory}\n")
  55. # 等待页面加载
  56. await wait(seconds=2)
  57. # ============================================================
  58. # 步骤 3: 搜索"Python 教程"
  59. # ============================================================
  60. print("📌 步骤 3: 搜索关键词...")
  61. # 方式1: 直接导航到搜索结果页面(推荐)
  62. search_keyword = "Python 教程"
  63. search_url = f"https://www.baidu.com/s?wd={search_keyword}"
  64. print(f"🔍 搜索关键词: {search_keyword}")
  65. await navigate_to_url(search_url)
  66. print("✅ 已导航到搜索结果页面\n")
  67. # 等待搜索结果加载
  68. print("⏳ 等待搜索结果加载...")
  69. await wait(seconds=3)
  70. # 滚动页面加载更多内容
  71. print("📜 滚动页面加载更多内容...")
  72. await scroll_page(down=True, pages=1.0)
  73. await wait(seconds=2)
  74. print("✅ 搜索结果已加载\n")
  75. # ============================================================
  76. # 步骤 4: 提取搜索结果数据
  77. # ============================================================
  78. print("📌 步骤 4: 提取搜索结果数据...")
  79. # 使用 JavaScript 提取数据
  80. extract_js = """
  81. (function(){
  82. try {
  83. // 提取搜索结果
  84. const results = [];
  85. // 百度的搜索结果选择器
  86. const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');
  87. console.log('找到搜索结果数量:', resultItems.length);
  88. resultItems.forEach((item, index) => {
  89. if (index >= 10) return; // 只提取前10个
  90. try {
  91. // 提取标题和链接
  92. const titleEl = item.querySelector('h3 a, .t a');
  93. const title = titleEl ? titleEl.textContent.trim() : '';
  94. const link = titleEl ? titleEl.href : '';
  95. // 提取摘要
  96. const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
  97. const summary = summaryEl ? summaryEl.textContent.trim() : '';
  98. // 提取来源
  99. const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
  100. const source = sourceEl ? sourceEl.textContent.trim() : '';
  101. if (title || link) {
  102. results.push({
  103. index: index + 1,
  104. title: title,
  105. link: link,
  106. summary: summary.substring(0, 200), // 限制摘要长度
  107. source: source
  108. });
  109. }
  110. } catch (e) {
  111. console.error('提取单个结果失败:', e);
  112. }
  113. });
  114. return {
  115. success: true,
  116. count: results.length,
  117. keyword: 'Python 教程',
  118. timestamp: new Date().toISOString(),
  119. results: results
  120. };
  121. } catch (e) {
  122. return {
  123. success: false,
  124. error: e.message,
  125. stack: e.stack
  126. };
  127. }
  128. })()
  129. """
  130. result = await evaluate(code=extract_js)
  131. # 解析提取结果
  132. try:
  133. # 从 result.output 中提取 JSON
  134. output = result.output
  135. if output.startswith("Result: "):
  136. output = output[8:] # 移除 "Result: " 前缀
  137. data = json.loads(output)
  138. if data.get('success'):
  139. print(f"✅ 成功提取 {data.get('count', 0)} 条搜索结果")
  140. # 保存到 baidu.json
  141. json_file = project_root / "baidu.json"
  142. with open(json_file, 'w', encoding='utf-8') as f:
  143. json.dump(data, f, ensure_ascii=False, indent=2)
  144. print(f"✅ 数据已保存到: {json_file}\n")
  145. # 打印前3条结果预览
  146. if data.get('results'):
  147. print("📋 前3条结果预览:")
  148. for item in data['results'][:3]:
  149. print(f" {item.get('index')}. {item.get('title', '无标题')}")
  150. print(f" 链接: {item.get('link', '')[:60]}...")
  151. print(f" 来源: {item.get('source', '未知')}")
  152. print()
  153. else:
  154. print(f"⚠️ 数据提取失败: {data.get('error', '未知错误')}")
  155. # 保存错误信息
  156. error_data = {
  157. "success": False,
  158. "error": data.get('error'),
  159. "keyword": "Python 教程",
  160. "timestamp": datetime.now().isoformat()
  161. }
  162. json_file = project_root / "baidu.json"
  163. with open(json_file, 'w', encoding='utf-8') as f:
  164. json.dump(error_data, f, ensure_ascii=False, indent=2)
  165. print(f"⚠️ 错误信息已保存到: {json_file}\n")
  166. except json.JSONDecodeError as e:
  167. print(f"⚠️ JSON 解析失败: {e}")
  168. print(f"原始输出: {result.output[:200]}...\n")
  169. # 保存原始输出
  170. error_data = {
  171. "success": False,
  172. "error": "JSON解析失败",
  173. "raw_output": result.output[:1000],
  174. "keyword": "Python 教程",
  175. "timestamp": datetime.now().isoformat()
  176. }
  177. json_file = project_root / "baidu.json"
  178. with open(json_file, 'w', encoding='utf-8') as f:
  179. json.dump(error_data, f, ensure_ascii=False, indent=2)
  180. # ============================================================
  181. # 步骤 5: 保存完整页面 HTML
  182. # ============================================================
  183. print("📌 步骤 5: 保存完整页面 HTML...")
  184. html_result = await get_page_html()
  185. html_content = html_result.metadata.get('html', '')
  186. page_url = html_result.metadata.get('url', '')
  187. page_title = html_result.metadata.get('title', '')
  188. # 保存 HTML 文件
  189. html_file = project_root / "baidu_page.html"
  190. with open(html_file, 'w', encoding='utf-8') as f:
  191. # 添加一些元信息
  192. meta_info = f"""
  193. <!--
  194. 页面标题: {page_title}
  195. 页面URL: {page_url}
  196. 保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
  197. 搜索关键词: Python 教程
  198. -->
  199. """
  200. f.write(meta_info)
  201. f.write(html_content)
  202. print(f"✅ HTML 已保存到: {html_file}")
  203. print(f" 页面标题: {page_title}")
  204. print(f" 页面URL: {page_url}")
  205. print(f" HTML 大小: {len(html_content):,} 字符\n")
  206. # ============================================================
  207. # 任务完成
  208. # ============================================================
  209. print("="*80)
  210. print("🎉 任务完成!")
  211. print("="*80)
  212. print(f"📁 生成的文件:")
  213. print(f" 1. {json_file.name} - 搜索结果数据")
  214. print(f" 2. {html_file.name} - 完整页面HTML")
  215. print("="*80 + "\n")
  216. except Exception as e:
  217. print(f"\n❌ 任务执行失败: {str(e)}")
  218. import traceback
  219. traceback.print_exc()
  220. finally:
  221. # ============================================================
  222. # 清理:保存浏览器状态
  223. # ============================================================
  224. print("\n📌 清理浏览器会话...")
  225. await cleanup_browser_session()
  226. print("✅ 浏览器会话已保存")
  227. print("💡 提示: 下次运行将自动使用保存的浏览器状态\n")
  228. async def main():
  229. """主函数"""
  230. await baidu_search_task()
  231. if __name__ == "__main__":
  232. # 运行任务
  233. asyncio.run(main())