# test_tools_baidu.py (4.6 KB)
  1. import asyncio
  2. import json
  3. import os
  4. import sys
  5. from datetime import datetime
  6. from pathlib import Path
  7. from urllib.parse import quote
  8. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  9. from agent.tools.builtin.browser.baseClass import (
  10. init_browser_session,
  11. browser_navigate_to_url,
  12. browser_wait,
  13. browser_get_page_html,
  14. browser_evaluate,
  15. browser_scroll_page,
  16. cleanup_browser_session,
  17. )
  18. async def run_task():
  19. project_root = Path(__file__).resolve().parents[1]
  20. output_dir = project_root / "output"
  21. output_dir.mkdir(parents=True, exist_ok=True)
  22. json_file = output_dir / "baidu.json"
  23. html_file = output_dir / "baidu_page.html"
  24. try:
  25. await init_browser_session(headless=False, profile_name="baidu_profile")
  26. await browser_navigate_to_url("https://www.baidu.com")
  27. await browser_wait(seconds=2)
  28. keyword = "Python 教程"
  29. search_url = f"https://www.baidu.com/s?wd={quote(keyword)}"
  30. await browser_navigate_to_url(search_url)
  31. await browser_wait(seconds=3)
  32. await browser_scroll_page(down=True, pages=1.0)
  33. await browser_wait(seconds=2)
  34. extract_js = """
  35. (function(){
  36. try {
  37. const results = [];
  38. const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');
  39. resultItems.forEach((item, index) => {
  40. if (index >= 10) return;
  41. try {
  42. const titleEl = item.querySelector('h3 a, .t a');
  43. const title = titleEl ? titleEl.textContent.trim() : '';
  44. const link = titleEl ? titleEl.href : '';
  45. const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
  46. const summary = summaryEl ? summaryEl.textContent.trim() : '';
  47. const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
  48. const source = sourceEl ? sourceEl.textContent.trim() : '';
  49. if (title || link) {
  50. results.push({
  51. index: index + 1,
  52. title: title,
  53. link: link,
  54. summary: summary.substring(0, 200),
  55. source: source
  56. });
  57. }
  58. } catch (e) {
  59. }
  60. });
  61. return {
  62. success: true,
  63. count: results.length,
  64. keyword: 'Python 教程',
  65. timestamp: new Date().toISOString(),
  66. results: results
  67. };
  68. } catch (e) {
  69. return {
  70. success: false,
  71. error: e.message,
  72. stack: e.stack
  73. };
  74. }
  75. })()
  76. """
  77. result = await browser_evaluate(code=extract_js)
  78. output = result.output
  79. if output.startswith("Result: "):
  80. output = output[8:]
  81. try:
  82. data = json.loads(output)
  83. except json.JSONDecodeError:
  84. data = {
  85. "success": False,
  86. "error": "JSON解析失败",
  87. "raw_output": output[:1000],
  88. "keyword": keyword,
  89. "timestamp": datetime.now().isoformat(),
  90. }
  91. with open(json_file, "w", encoding="utf-8") as f:
  92. json.dump(data, f, ensure_ascii=False, indent=2)
  93. html_result = await browser_get_page_html()
  94. html_content = html_result.metadata.get("html", "")
  95. page_url = html_result.metadata.get("url", "")
  96. page_title = html_result.metadata.get("title", "")
  97. meta_info = (
  98. "\n".join(
  99. [
  100. "<!--",
  101. f" 页面标题: {page_title}",
  102. f" 页面URL: {page_url}",
  103. f" 保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
  104. f" 搜索关键词: {keyword}",
  105. "-->",
  106. "",
  107. ]
  108. )
  109. + "\n"
  110. )
  111. with open(html_file, "w", encoding="utf-8") as f:
  112. f.write(meta_info)
  113. f.write(html_content)
  114. print(f"✅ 数据已保存到: {json_file}")
  115. print(f"✅ HTML 已保存到: {html_file}")
  116. finally:
  117. await cleanup_browser_session()
  118. def main():
  119. asyncio.run(run_task())
  120. if __name__ == "__main__":
  121. main()