Просмотр исходного кода

feat(examples): 新增百度搜索示例并移除小红书示例

新增 test_skill.py 示例,演示使用 browser-use CLI 进行百度搜索并提取结果。
移除旧的 test_tools_xhs.py 示例,因其使用已废弃的异步工具类。
max_liu 1 месяц назад
Родитель
Commit
6e99378759
2 измененных файла: 129 добавлений и 247 удалений
  1. 129 0
      examples/test_skill.py
  2. 0 247
      examples/test_tools_xhs.py

+ 129 - 0
examples/test_skill.py

@@ -0,0 +1,129 @@
+import json
+import subprocess
+import time
+from pathlib import Path
+
+
+def run_cli(session: str, args: list[str]) -> dict:
+    command = ["browser-use", "--session", session, "--json"] + args
+    result = subprocess.run(command, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(result.stderr.strip() or "browser-use command failed")
+    payload = result.stdout.strip()
+    if not payload:
+        raise RuntimeError("browser-use returned empty output")
+    data = json.loads(payload)
+    if not data.get("success", False):
+        raise RuntimeError(data.get("error", "browser-use command error"))
+    return data.get("data", {})
+
+
def stop_session_server(session: str) -> None:
    """Best-effort shutdown of the session's browser-use server.

    Captures all output and ignores the exit status, so calling this when
    the server is already stopped (or never started) does not raise.
    """
    stop_cmd = ["browser-use", "--session", session, "server", "stop"]
    subprocess.run(stop_cmd, capture_output=True, text=True)
+
+
def main():
    """Search Baidu for a fixed keyword via the browser-use CLI and save results.

    Flow: restart the browser session, open baidu.com, submit the search form
    through injected JavaScript, poll until result nodes render (up to ~12s),
    extract at most 10 results to ``output/skill_baidu.json``, and dump the
    full page HTML to ``output/skill_baidu_page.html``. The session is closed
    best-effort in ``finally``.
    """
    project_root = Path(__file__).resolve().parents[1]
    output_dir = project_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    json_file = output_dir / "skill_baidu.json"
    html_file = output_dir / "skill_baidu_page.html"

    session = "skill_baidu"
    keyword = "瑜伽美女"
    # Serialize the keyword into a proper JS string literal so quotes,
    # backslashes or other metacharacters cannot break (or inject into)
    # the generated scripts below.
    keyword_js = json.dumps(keyword, ensure_ascii=False)

    try:
        stop_session_server(session)
        try:
            run_cli(session, ["open", "https://www.baidu.com"])
        except RuntimeError:
            # A stale/half-dead session server can make the first open fail;
            # restart it once and retry.
            stop_session_server(session)
            run_cli(session, ["open", "https://www.baidu.com"])

        search_js = (
            "(function(){"
            "const input=document.querySelector('#kw');"
            "const btn=document.querySelector('#su');"
            "if(input){input.value=" + keyword_js + ";}"
            "if(btn){btn.click();}"
            "else if(input&&input.form){input.form.submit();}"
            "return {hasInput:!!input,hasButton:!!btn};"
            "})()"
        )
        run_cli(session, ["eval", search_js])

        wait_js = (
            "(function(){"
            "const items=document.querySelectorAll('#content_left .result, #content_left .c-container, #content_left .result-op');"
            "const bodyReady=!!document.body;"
            "const bodyLen=bodyReady?(document.body.innerText||'').length:0;"
            "return {count:items.length, bodyReady:bodyReady, bodyLen:bodyLen};"
            "})()"
        )

        # Poll up to 12 times, 1s apart, until enough result nodes exist or
        # the body is clearly populated (SPA pages may render late).
        for _ in range(12):
            data = run_cli(session, ["eval", wait_js])
            result = data.get("result") if isinstance(data, dict) else {}
            count = int(result.get("count") or 0)
            body_len = int(result.get("bodyLen") or 0)
            if count >= 3 or body_len > 1000:
                break
            time.sleep(1)

        extract_js = (
            "(function(){"
            "const items=Array.from(document.querySelectorAll('#content_left .result, #content_left .c-container, #content_left .result-op'));"
            "const results=[];"
            "for(const item of items){"
            "const a=item.querySelector('h3 a')||item.querySelector('a[data-click]')||item.querySelector('a');"
            "if(!a) continue;"
            "const title=(a.textContent||'').trim();"
            "const link=a.href||'';"
            "const summaryEl=item.querySelector('.c-abstract, .content-right_8Zs40, .content-right_8Zs40_2gVt2');"
            "const summary=(summaryEl?summaryEl.textContent:'').trim();"
            "results.push({index:results.length+1,title,link,summary});"
            "if(results.length>=10) break;"
            "}"
            "return {success:true,keyword:" + keyword_js + ",count:results.length,timestamp:new Date().toISOString(),results:results};"
            "})()"
        )

        data = run_cli(session, ["eval", extract_js])
        extracted = data.get("result") if isinstance(data, dict) else data

        if not extracted:
            # Extraction returned nothing usable — record an explicit failure
            # payload so the output file always exists with a known schema.
            extracted = {
                "success": False,
                "keyword": keyword,
                "count": 0,
                "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
                "results": [],
            }

        json_file.write_text(
            json.dumps(extracted, ensure_ascii=False, indent=2), encoding="utf-8"
        )

        html_data = run_cli(session, ["eval", "document.documentElement.outerHTML"])
        html_content = html_data.get("result") if isinstance(html_data, dict) else html_data
        html_file.write_text(html_content or "", encoding="utf-8")

        print(f"✅ 数据已保存到: {json_file}")
        print(f"✅ HTML 已保存到: {html_file}")

    finally:
        # Best-effort cleanup: never let a close failure mask the real error.
        try:
            run_cli(session, ["close"])
        except Exception:
            pass


if __name__ == "__main__":
    main()

+ 0 - 247
examples/test_tools_xhs.py

@@ -1,247 +0,0 @@
-import asyncio
-import json
-import os
-import sys
-from datetime import datetime
-from pathlib import Path
-from urllib.parse import quote
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from tools.baseClassTools import (
-    init_browser_session,
-    navigate_to_url,
-    wait,
-    get_page_html,
-    evaluate,
-    scroll_page,
-    cleanup_browser_session,
-)
-
-
-async def run_task():
-    project_root = Path(__file__).resolve().parents[1]
-    output_dir = project_root / "output"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    json_file = output_dir / "xhs.json"
-    html_file = output_dir / "xhs_page.html"
-
-    def normalize_output(raw: str) -> str:
-        value = raw
-        if value.startswith("Result: "):
-            value = value[8:]
-        return value.strip()
-
-    try:
-        await init_browser_session(headless=False, profile_name="xhs_profile")
-
-        await navigate_to_url("https://www.xiaohongshu.com")
-        await wait(seconds=3)
-
-        keyword = "瑜伽美女"
-        search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
-        await navigate_to_url(search_url)
-        await wait(seconds=6)
-
-        unlock_js = """
-        (function(){
-            try {
-                document.documentElement.classList.remove('reds-lock-scroll');
-                document.body.classList.remove('reds-lock-scroll');
-                const candidates = Array.from(document.querySelectorAll('[role="dialog"], .reds-modal, .reds-alert, [class*="modal"], [class*="mask"], [class*="dialog"]'));
-                for (const el of candidates) {
-                    try {
-                        const style = window.getComputedStyle(el);
-                        const z = parseInt(style.zIndex || '0', 10);
-                        if (style.position === 'fixed' && z >= 999) {
-                            el.remove();
-                        }
-                    } catch {}
-                }
-                const closeButtons = Array.from(document.querySelectorAll('button, [role="button"]'));
-                for (const btn of closeButtons) {
-                    const text = (btn.textContent || '').trim();
-                    const label = (btn.getAttribute('aria-label') || '').trim();
-                    if (text.includes('关闭') || text.includes('我知道了') || text.includes('同意') || label.includes('关闭')) {
-                        btn.click();
-                    }
-                }
-                return true;
-            } catch (e) {
-                return false;
-            }
-        })()
-        """
-
-        count_js = """
-        (function(){
-            const anchorCount = document.querySelectorAll('a[href*="/explore/"]').length;
-            const cardCount = document.querySelectorAll('[data-testid="search-note-item"], .note-item, article, li[data-note-id]').length;
-            return JSON.stringify({count: Math.max(anchorCount, cardCount)});
-        })()
-        """
-
-        await evaluate(code=unlock_js)
-
-        for _ in range(8):
-            count_result = await evaluate(code=count_js)
-            count_output = normalize_output(count_result.output)
-            try:
-                count_value = int(json.loads(count_output).get("count", 0))
-            except Exception:
-                count_value = 0
-            if count_value >= 5:
-                break
-            await scroll_page(down=True, pages=0.8)
-            await wait(seconds=3)
-            await evaluate(code=unlock_js)
-
-        extract_js = """
-        (function(){
-            try {
-                const results = [];
-
-                const jsonScripts = Array.from(document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__'));
-                for (const s of jsonScripts) {
-                    try {
-                        const txt = s.textContent.trim();
-                        if (txt && txt.length > 0) {
-                            const data = JSON.parse(txt);
-                            const candidates = [];
-                            function collect(obj) {
-                                if (!obj || typeof obj !== 'object') return;
-                                for (const k of Object.keys(obj)) {
-                                    const v = obj[k];
-                                    if (v && typeof v === 'object') {
-                                        if (Array.isArray(v)) {
-                                            candidates.push(v);
-                                        }
-                                        collect(v);
-                                    }
-                                }
-                            }
-                            collect(data);
-                            for (const arr of candidates) {
-                                for (const item of arr) {
-                                    try {
-                                        const title = (item.title || item.noteTitle || item.name || '').toString().trim();
-                                        const link = (item.link || item.url || item.noteUrl || item.jumpUrl || '').toString().trim();
-                                        if ((title || link) && (link.includes('/explore/') || link.startsWith('http'))) {
-                                            results.push({
-                                                index: results.length + 1,
-                                                title,
-                                                link,
-                                                summary: (item.desc || item.content || item.noteDesc || '').toString().trim().substring(0, 200)
-                                            });
-                                            if (results.length >= 20) break;
-                                        }
-                                    } catch {}
-                                }
-                                if (results.length >= 20) break;
-                            }
-                        }
-                    } catch {}
-                    if (results.length >= 5) break;
-                }
-
-                if (results.length < 5) {
-                    const anchors = Array.from(document.querySelectorAll('a[href*="/explore/"]'));
-                    const seen = new Set();
-                    for (const a of anchors) {
-                        try {
-                            const href = a.href;
-                            if (!href || seen.has(href)) continue;
-                            seen.add(href);
-                            let title = (a.textContent || '').trim();
-                            if (!title) {
-                                const img = a.querySelector('img[alt]');
-                                if (img && img.alt) title = img.alt.trim();
-                            }
-                            if (!title) {
-                                const parentTitle = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-title"], .title, h3, p');
-                                if (parentTitle) title = (parentTitle.textContent || '').trim();
-                            }
-                            const descEl = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-desc"], .desc, .description, p');
-                            const desc = descEl ? (descEl.textContent || '').trim() : '';
-                            results.push({
-                                index: results.length + 1,
-                                title,
-                                link: href,
-                                summary: desc.substring(0, 200)
-                            });
-                            if (results.length >= 20) break;
-                        } catch {}
-                    }
-                }
-
-                return {
-                    success: true,
-                    count: results.length,
-                    keyword: '瑜伽美女',
-                    timestamp: new Date().toISOString(),
-                    results: results
-                };
-            } catch (e) {
-                return {
-                    success: false,
-                    error: e.message,
-                    stack: e.stack
-                };
-            }
-        })()
-        """
-
-        result = await evaluate(code=extract_js)
-        output = normalize_output(result.output)
-
-        try:
-            data = json.loads(output)
-        except json.JSONDecodeError:
-            data = {
-                "success": False,
-                "error": "JSON解析失败",
-                "raw_output": output[:1000],
-                "keyword": keyword,
-                "timestamp": datetime.now().isoformat(),
-            }
-
-        with open(json_file, "w", encoding="utf-8") as f:
-            json.dump(data, f, ensure_ascii=False, indent=2)
-
-        html_result = await get_page_html()
-        html_content = html_result.metadata.get("html", "")
-        page_url = html_result.metadata.get("url", "")
-        page_title = html_result.metadata.get("title", "")
-        meta_info = (
-            "\n".join(
-                [
-                    "<!--",
-                    f"    页面标题: {page_title}",
-                    f"    页面URL: {page_url}",
-                    f"    保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-                    f"    搜索关键词: {keyword}",
-                    "-->",
-                    "",
-                ]
-            )
-            + "\n"
-        )
-
-        with open(html_file, "w", encoding="utf-8") as f:
-            f.write(meta_info)
-            f.write(html_content)
-
-        print(f"✅ 数据已保存到: {json_file}")
-        print(f"✅ HTML 已保存到: {html_file}")
-
-    finally:
-        await cleanup_browser_session()
-
-
-def main():
-    asyncio.run(run_task())
-
-
-if __name__ == "__main__":
-    main()