"""Test script: search Baidu for "Python 教程" and save the results and page HTML to output/."""
  1. import asyncio
  2. import json
  3. import os
  4. import sys
  5. from datetime import datetime
  6. from pathlib import Path
  7. from urllib.parse import quote
  8. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  9. from tools.baseClassTools import (
  10. init_browser_session,
  11. navigate_to_url,
  12. wait,
  13. get_page_html,
  14. evaluate,
  15. scroll_page,
  16. cleanup_browser_session,
  17. )
  18. async def run_task():
  19. project_root = Path(__file__).resolve().parents[1]
  20. output_dir = project_root / "output"
  21. output_dir.mkdir(parents=True, exist_ok=True)
  22. json_file = output_dir / "baidu.json"
  23. html_file = output_dir / "baidu_page.html"
  24. try:
  25. await init_browser_session(headless=False, profile_name="baidu_profile")
  26. await navigate_to_url("https://www.baidu.com")
  27. await wait(seconds=2)
  28. keyword = "Python 教程"
  29. search_url = f"https://www.baidu.com/s?wd={quote(keyword)}"
  30. await navigate_to_url(search_url)
  31. await wait(seconds=3)
  32. await scroll_page(down=True, pages=1.0)
  33. await wait(seconds=2)
  34. extract_js = """
  35. (function(){
  36. try {
  37. const results = [];
  38. const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');
  39. resultItems.forEach((item, index) => {
  40. if (index >= 10) return;
  41. try {
  42. const titleEl = item.querySelector('h3 a, .t a');
  43. const title = titleEl ? titleEl.textContent.trim() : '';
  44. const link = titleEl ? titleEl.href : '';
  45. const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
  46. const summary = summaryEl ? summaryEl.textContent.trim() : '';
  47. const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
  48. const source = sourceEl ? sourceEl.textContent.trim() : '';
  49. if (title || link) {
  50. results.push({
  51. index: index + 1,
  52. title: title,
  53. link: link,
  54. summary: summary.substring(0, 200),
  55. source: source
  56. });
  57. }
  58. } catch (e) {
  59. }
  60. });
  61. return {
  62. success: true,
  63. count: results.length,
  64. keyword: 'Python 教程',
  65. timestamp: new Date().toISOString(),
  66. results: results
  67. };
  68. } catch (e) {
  69. return {
  70. success: false,
  71. error: e.message,
  72. stack: e.stack
  73. };
  74. }
  75. })()
  76. """
  77. result = await evaluate(code=extract_js)
  78. output = result.output
  79. if output.startswith("Result: "):
  80. output = output[8:]
  81. try:
  82. data = json.loads(output)
  83. except json.JSONDecodeError:
  84. data = {
  85. "success": False,
  86. "error": "JSON解析失败",
  87. "raw_output": output[:1000],
  88. "keyword": keyword,
  89. "timestamp": datetime.now().isoformat(),
  90. }
  91. with open(json_file, "w", encoding="utf-8") as f:
  92. json.dump(data, f, ensure_ascii=False, indent=2)
  93. html_result = await get_page_html()
  94. html_content = html_result.metadata.get("html", "")
  95. page_url = html_result.metadata.get("url", "")
  96. page_title = html_result.metadata.get("title", "")
  97. meta_info = (
  98. "\n".join(
  99. [
  100. "<!--",
  101. f" 页面标题: {page_title}",
  102. f" 页面URL: {page_url}",
  103. f" 保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
  104. f" 搜索关键词: {keyword}",
  105. "-->",
  106. "",
  107. ]
  108. )
  109. + "\n"
  110. )
  111. with open(html_file, "w", encoding="utf-8") as f:
  112. f.write(meta_info)
  113. f.write(html_content)
  114. print(f"✅ 数据已保存到: {json_file}")
  115. print(f"✅ HTML 已保存到: {html_file}")
  116. finally:
  117. await cleanup_browser_session()
  118. def main():
  119. asyncio.run(run_task())
  120. if __name__ == "__main__":
  121. main()