# howard / Agent
"""
百度搜索示例
Baidu Search Example

功能：
1. 打开百度
2. 搜索"Python 教程"
3. 提取搜索结果数据并保存到 baidu.json
4. 保存完整页面 HTML 到 baidu_page.html

使用方法：
    python example.py
"""

import asyncio
import json
from pathlib import Path
from datetime import datetime

# 导入 baseClassTools 的工具
from tools.baseClassTools import (
    init_browser_session,
    navigate_to_url,
    wait,
    get_page_html,
    wait_for_user_action,
    get_selector_map,
    input_text,
    send_keys,
    evaluate,
    scroll_page,
    cleanup_browser_session
)


def _build_search_url(keyword):
    """Return the Baidu results URL for *keyword* with the query percent-encoded."""
    # Local import keeps this block self-contained without touching file imports.
    from urllib.parse import urlencode

    # urlencode escapes spaces and non-ASCII characters (e.g. "Python 教程"),
    # which a raw f-string interpolation would leave unescaped in the URL.
    return "https://www.baidu.com/s?" + urlencode({"wd": keyword})


def _parse_extraction_output(raw_output):
    """Decode the text produced by the extraction script into a dict.

    A leading "Result: " prefix is stripped before decoding.

    Raises:
        ValueError: if the text is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or the decoded value is not a JSON object.
    """
    prefix = "Result: "
    if raw_output.startswith(prefix):
        raw_output = raw_output[len(prefix):]

    data = json.loads(raw_output)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return data


def _write_json(path, payload):
    """Write *payload* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


async def baidu_search_task():
    """Search Baidu for "Python 教程" and save the results to disk.

    Steps: start a browser session, open the Baidu home page, navigate to the
    search-results page, scroll once, extract up to 10 results with an
    in-page script, then dump the page HTML.

    Files written next to this script:
        baidu.json       -- extracted results, or an error record when the
                            extraction or its JSON decoding fails.
        baidu_page.html  -- page HTML prefixed with a metadata comment.

    Any exception raised by a step is printed with its traceback and not
    re-raised; ``cleanup_browser_session`` is always awaited afterwards.
    """
    print("\n" + "="*80)
    print("🚀 开始执行百度搜索任务")
    print("="*80 + "\n")

    # Project root directory; both output files are written here.
    project_root = Path(__file__).parent
    json_file = project_root / "baidu.json"
    html_file = project_root / "baidu_page.html"

    # Single source of truth for the keyword (URL, JS payload, error records,
    # HTML metadata all derive from it).
    search_keyword = "Python 教程"

    try:
        # ============================================================
        # Step 1: initialise the browser session (dedicated Baidu profile)
        # ============================================================
        print("📌 步骤 1: 初始化浏览器会话...")
        await init_browser_session(
            headless=False,
            profile_name="baidu_profile"  # dedicated profile for this task
        )
        print("✅ 浏览器会话已初始化\n")

        # ============================================================
        # Step 2: navigate to the Baidu home page
        # ============================================================
        print("📌 步骤 2: 导航到百度...")
        result = await navigate_to_url("https://www.baidu.com")
        print(f"✅ {result.long_term_memory}\n")

        # Give the page time to load.
        await wait(seconds=2)

        # ============================================================
        # Step 3: search for the keyword
        # ============================================================
        print("📌 步骤 3: 搜索关键词...")

        # Navigate straight to the results page instead of typing into the
        # search box.
        search_url = _build_search_url(search_keyword)

        print(f"🔍 搜索关键词: {search_keyword}")
        await navigate_to_url(search_url)
        print("✅ 已导航到搜索结果页面\n")

        # Wait for the search results to load.
        print("⏳ 等待搜索结果加载...")
        await wait(seconds=3)

        # Scroll one page down so more content gets loaded.
        print("📜 滚动页面加载更多内容...")
        await scroll_page(down=True, pages=1.0)
        await wait(seconds=2)

        print("✅ 搜索结果已加载\n")

        # ============================================================
        # Step 4: extract the search-result data
        # ============================================================
        print("📌 步骤 4: 提取搜索结果数据...")

        # In-page extraction script. __KEYWORD_JSON__ is replaced below with a
        # JSON string literal (also a valid JavaScript string literal).
        extract_js = """
        (function(){
            try {
                // 提取搜索结果
                const results = [];

                // 百度的搜索结果选择器
                const resultItems = document.querySelectorAll('#content_left > div[class*="result"]');

                console.log('找到搜索结果数量:', resultItems.length);

                resultItems.forEach((item, index) => {
                    if (index >= 10) return; // 只提取前10个

                    try {
                        // 提取标题和链接
                        const titleEl = item.querySelector('h3 a, .t a');
                        const title = titleEl ? titleEl.textContent.trim() : '';
                        const link = titleEl ? titleEl.href : '';

                        // 提取摘要
                        const summaryEl = item.querySelector('.c-abstract, .content-right_8Zs40');
                        const summary = summaryEl ? summaryEl.textContent.trim() : '';

                        // 提取来源
                        const sourceEl = item.querySelector('.c-color-gray, .source_1Vdff');
                        const source = sourceEl ? sourceEl.textContent.trim() : '';

                        if (title || link) {
                            results.push({
                                index: index + 1,
                                title: title,
                                link: link,
                                summary: summary.substring(0, 200),  // 限制摘要长度
                                source: source
                            });
                        }
                    } catch (e) {
                        console.error('提取单个结果失败:', e);
                    }
                });

                return {
                    success: true,
                    count: results.length,
                    keyword: __KEYWORD_JSON__,
                    timestamp: new Date().toISOString(),
                    results: results
                };
            } catch (e) {
                return {
                    success: false,
                    error: e.message,
                    stack: e.stack
                };
            }
        })()
        """.replace("__KEYWORD_JSON__", json.dumps(search_keyword))

        result = await evaluate(code=extract_js)

        # Normalise the script output to text so a missing or non-string
        # output takes the parse-failure path below instead of aborting the
        # whole task (which would also skip the HTML save).
        raw_output = result.output
        if not isinstance(raw_output, str):
            raw_output = "" if raw_output is None else str(raw_output)

        try:
            data = _parse_extraction_output(raw_output)
        except ValueError as e:
            print(f"⚠️  JSON 解析失败: {e}")
            print(f"原始输出: {raw_output[:200]}...\n")

            # Keep a truncated copy of the raw output for debugging.
            _write_json(json_file, {
                "success": False,
                "error": "JSON解析失败",
                "raw_output": raw_output[:1000],
                "keyword": search_keyword,
                "timestamp": datetime.now().isoformat()
            })
        else:
            if data.get('success'):
                print(f"✅ 成功提取 {data.get('count', 0)} 条搜索结果")

                _write_json(json_file, data)
                print(f"✅ 数据已保存到: {json_file}\n")

                # Preview the first three results.
                if data.get('results'):
                    print("📋 前3条结果预览:")
                    for item in data['results'][:3]:
                        print(f"  {item.get('index')}. {item.get('title', '无标题')}")
                        print(f"     链接: {item.get('link', '')[:60]}...")
                        print(f"     来源: {item.get('source', '未知')}")
                        print()
            else:
                print(f"⚠️  数据提取失败: {data.get('error', '未知错误')}")

                # Persist the error reported by the in-page script.
                _write_json(json_file, {
                    "success": False,
                    "error": data.get('error'),
                    "keyword": search_keyword,
                    "timestamp": datetime.now().isoformat()
                })
                print(f"⚠️  错误信息已保存到: {json_file}\n")

        # ============================================================
        # Step 5: save the full page HTML
        # ============================================================
        print("📌 步骤 5: 保存完整页面 HTML...")

        html_result = await get_page_html()
        # Tolerate a missing metadata mapping rather than failing the step.
        metadata = html_result.metadata or {}
        html_content = metadata.get('html', '')
        page_url = metadata.get('url', '')
        page_title = metadata.get('title', '')

        # Prefix the saved HTML with a comment carrying some metadata.
        meta_info = f"""
<!--
    页面标题: {page_title}
    页面URL: {page_url}
    保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
    搜索关键词: {search_keyword}
-->
"""
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(meta_info)
            f.write(html_content)

        print(f"✅ HTML 已保存到: {html_file}")
        print(f"   页面标题: {page_title}")
        print(f"   页面URL: {page_url}")
        print(f"   HTML 大小: {len(html_content):,} 字符\n")

        # ============================================================
        # Task finished
        # ============================================================
        print("="*80)
        print("🎉 任务完成！")
        print("="*80)
        print("📁 生成的文件:")
        print(f"   1. {json_file.name} - 搜索结果数据")
        print(f"   2. {html_file.name} - 完整页面HTML")
        print("="*80 + "\n")

    except Exception as e:
        # Top-level boundary of the example script: report and carry on to
        # the cleanup below instead of propagating.
        print(f"\n❌ 任务执行失败: {str(e)}")
        import traceback
        traceback.print_exc()

    finally:
        # ============================================================
        # Cleanup: always release the browser session
        # ============================================================
        print("\n📌 清理浏览器会话...")
        await cleanup_browser_session()
        print("✅ 浏览器会话已保存")
        print("💡 提示: 下次运行将自动使用保存的浏览器状态\n")


async def main():
    """Entry coroutine: run the Baidu search task and wait for it to finish."""
    search_task = baidu_search_task()
    await search_task


if __name__ == "__main__":
    # Script entry point: run the task on a fresh asyncio event loop.
    asyncio.run(main())