|
|
@@ -1,3 +1,8 @@
|
|
|
+"""
|
|
|
+小红书云浏览器数据获取脚本(数据库配置版)
|
|
|
+从数据库 agent_channel_cookies 获取 Cookie 和 cloud_profile_id
|
|
|
+"""
|
|
|
+
|
|
|
import sys
|
|
|
import os
|
|
|
import asyncio
|
|
|
@@ -22,12 +27,21 @@ from agent.tools.builtin.browser.baseClass import (
|
|
|
evaluate,
|
|
|
wait,
|
|
|
get_page_html,
|
|
|
- ensure_login_with_cookies,
|
|
|
- wait_for_user_action,
|
|
|
+ _fetch_cookie_row,
|
|
|
+ _fetch_profile_id,
|
|
|
+ _normalize_cookies,
|
|
|
+ _cookie_domain_for_type,
|
|
|
+ _extract_cookie_value,
|
|
|
)
|
|
|
|
|
|
|
|
|
-async def example_xhs_fitness_search() -> dict:
|
|
|
+async def example_xhs_fitness_search(cookie_type: str = "xhs") -> dict:
|
|
|
+ """
|
|
|
+ 小红书搜索示例
|
|
|
+
|
|
|
+ Args:
|
|
|
+ cookie_type: Cookie 类型,用于从数据库获取配置
|
|
|
+ """
|
|
|
print("\n" + "="*60)
|
|
|
print("示例: 小红书云浏览器搜索 - 健身")
|
|
|
print("="*60)
|
|
|
@@ -47,49 +61,95 @@ async def example_xhs_fitness_search() -> dict:
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
}
|
|
|
|
|
|
- for _ in range(3):
|
|
|
+ # 从数据库获取配置
|
|
|
+ print(f"\n🔍 从数据库获取配置 (type={cookie_type})...")
|
|
|
+ profile_id = _fetch_profile_id(cookie_type)
|
|
|
+ cookie_row = _fetch_cookie_row(cookie_type)
|
|
|
+
|
|
|
+ if profile_id:
|
|
|
+ print(f"✅ 获取到 cloud_profile_id: {profile_id}")
|
|
|
+ else:
|
|
|
+ print("⚠️ 未找到 cloud_profile_id,将使用环境变量或默认值")
|
|
|
+ profile_id = os.getenv("XHS_PROFILE_ID")
|
|
|
+
|
|
|
+ if cookie_row:
|
|
|
+ print(f"✅ 获取到 Cookie 配置")
|
|
|
+ else:
|
|
|
+ print("⚠️ 未找到 Cookie 配置")
|
|
|
+
|
|
|
+ for attempt in range(3):
|
|
|
try:
|
|
|
+ # 确保每次重试都清理旧会话
|
|
|
+ if attempt > 0:
|
|
|
+ try:
|
|
|
+ await kill_browser_session()
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ await asyncio.sleep(2) # 等待清理完成
|
|
|
+
|
|
|
+ print(f"\n🌐 启动云浏览器 (尝试 {attempt + 1}/3)...")
|
|
|
browser, tools = await init_browser_session(
|
|
|
headless=False,
|
|
|
use_cloud=True,
|
|
|
+ cloud_profile_id=profile_id,
|
|
|
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
|
+ disable_security=False,
|
|
|
)
|
|
|
if browser is None or tools is None:
|
|
|
raise RuntimeError("浏览器初始化失败")
|
|
|
|
|
|
+ print("✅ 云浏览器启动成功")
|
|
|
+
|
|
|
+ # 访问首页
|
|
|
+ print("\n🏠 访问小红书首页...")
|
|
|
nav_result = await navigate_to_url("https://www.xiaohongshu.com")
|
|
|
if nav_result.error:
|
|
|
raise RuntimeError(nav_result.error)
|
|
|
await wait(3)
|
|
|
|
|
|
- login_result = await ensure_login_with_cookies(
|
|
|
- cookie_type="xhs",
|
|
|
- url="https://www.xiaohongshu.com"
|
|
|
- )
|
|
|
- if login_result.error and "未找到 cookies" not in login_result.error:
|
|
|
- raise RuntimeError(login_result.error)
|
|
|
-
|
|
|
- login_payload = {}
|
|
|
- if isinstance(login_result.output, str) and login_result.output:
|
|
|
- try:
|
|
|
- login_payload = json.loads(login_result.output)
|
|
|
- except Exception:
|
|
|
- login_payload = {}
|
|
|
-
|
|
|
- if login_payload.get("need_login") and login_payload.get("cookies_count", 0) == 0:
|
|
|
- await wait_for_user_action(
|
|
|
- message="未找到可用 cookies,请在云浏览器中完成小红书登录或验证码,完成后按 Enter 继续",
|
|
|
- timeout=300
|
|
|
- )
|
|
|
+ # 注入 Cookie(如果有)
|
|
|
+ if cookie_row:
|
|
|
+ print("\n🍪 注入 Cookie...")
|
|
|
+ cookie_value = _extract_cookie_value(cookie_row)
|
|
|
+ if cookie_value:
|
|
|
+ domain, base_url = _cookie_domain_for_type(cookie_type, "https://www.xiaohongshu.com")
|
|
|
+ cookies = _normalize_cookies(cookie_value, domain, base_url)
|
|
|
+ if cookies:
|
|
|
+ await browser._cdp_set_cookies(cookies)
|
|
|
+ print(f"✅ 成功注入 {len(cookies)} 个 Cookie")
|
|
|
+ # 刷新页面使 Cookie 生效
|
|
|
+ await navigate_to_url("https://www.xiaohongshu.com")
|
|
|
+ await wait(2)
|
|
|
+ else:
|
|
|
+ print("⚠️ Cookie 解析失败")
|
|
|
+ else:
|
|
|
+ print("⚠️ 未找到 Cookie 值")
|
|
|
|
|
|
+ # 访问搜索页面
|
|
|
+ print(f"\n🔗 访问搜索页面: {keyword}")
|
|
|
nav_result = await navigate_to_url(search_url)
|
|
|
if nav_result.error:
|
|
|
raise RuntimeError(nav_result.error)
|
|
|
await wait(8)
|
|
|
|
|
|
- for _ in range(3):
|
|
|
+ # 滚动页面
|
|
|
+ print("\n📜 滚动页面...")
|
|
|
+ for i in range(3):
|
|
|
await scroll_page(down=True, pages=2.0)
|
|
|
await wait(2)
|
|
|
|
|
|
+ # 提取数据
|
|
|
+ print("\n🔍 提取数据...")
|
|
|
+ html_result = await get_page_html()
|
|
|
+ if html_result.error:
|
|
|
+ raise RuntimeError(html_result.error)
|
|
|
+ html = html_result.metadata.get("html", "")
|
|
|
+ output_dir = project_root / "output"
|
|
|
+ output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
+ output_path = output_dir / "xhs.html"
|
|
|
+ output_path.write_text(html or "", encoding="utf-8")
|
|
|
+ print(f"✅ 已保存页面 HTML: {output_path}")
|
|
|
+
|
|
|
extract_js = """
|
|
|
(function(){
|
|
|
const maxCount = 20;
|
|
|
@@ -233,32 +293,28 @@ async def example_xhs_fitness_search() -> dict:
|
|
|
return data
|
|
|
|
|
|
data = await run_extract()
|
|
|
- if isinstance(data, dict) and data.get("error") == "可能被登录或验证码拦截":
|
|
|
- await wait_for_user_action(
|
|
|
- message="请在云浏览器中完成小红书登录或验证码,完成后按 Enter 继续",
|
|
|
- timeout=300
|
|
|
- )
|
|
|
- nav_result = await navigate_to_url(search_url)
|
|
|
- if nav_result.error:
|
|
|
- raise RuntimeError(nav_result.error)
|
|
|
- await wait(8)
|
|
|
- for _ in range(3):
|
|
|
- await scroll_page(down=True, pages=2.0)
|
|
|
- await wait(2)
|
|
|
- data = await run_extract()
|
|
|
|
|
|
last_data = data if isinstance(data, dict) else last_data
|
|
|
+
|
|
|
+ # 输出结果
|
|
|
if isinstance(last_data, dict) and last_data.get("count", 0) > 0:
|
|
|
+ print(f"\n✅ 成功获取 {last_data['count']} 条数据")
|
|
|
+ print(f"数据来源: {last_data.get('source', 'javascript')}")
|
|
|
+ print("\n前 5 条结果:")
|
|
|
+ for i, item in enumerate(last_data["results"][:5], 1):
|
|
|
+ print(f"{i}. {item['title'][:50]}...")
|
|
|
+
|
|
|
+ # 成功获取数据,清理并返回
|
|
|
+ await cleanup_browser_session()
|
|
|
return last_data
|
|
|
- if isinstance(last_data, dict) and last_data.get("error") != "可能被登录或验证码拦截":
|
|
|
- return last_data
|
|
|
+
|
|
|
+ if isinstance(last_data, dict) and last_data.get("error") == "可能被登录或验证码拦截":
|
|
|
+ print("\n⚠️ 检测到登录或验证码拦截")
|
|
|
+ print("💡 建议:在数据库中配置有效的 Cookie")
|
|
|
+
|
|
|
except Exception as e:
|
|
|
err_text = str(e)
|
|
|
- if any(key in err_text for key in ["WebSocket", "browser not connected", "NoneType"]):
|
|
|
- try:
|
|
|
- await kill_browser_session()
|
|
|
- except Exception:
|
|
|
- pass
|
|
|
+ print(f"⚠️ 尝试 {attempt + 1}/3 失败: {err_text}")
|
|
|
last_data = {
|
|
|
"success": False,
|
|
|
"keyword": keyword,
|
|
|
@@ -268,14 +324,29 @@ async def example_xhs_fitness_search() -> dict:
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
}
|
|
|
finally:
|
|
|
- await cleanup_browser_session()
|
|
|
- await wait(5)
|
|
|
+ # 清理当前会话
|
|
|
+ try:
|
|
|
+ await cleanup_browser_session()
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+
|
|
|
+ # 如果不是最后一次尝试,等待后继续
|
|
|
+ if attempt < 2:
|
|
|
+ print(f"等待 5 秒后重试...")
|
|
|
+ await asyncio.sleep(5)
|
|
|
|
|
|
return last_data
|
|
|
|
|
|
|
|
|
async def main():
|
|
|
- data = await example_xhs_fitness_search()
|
|
|
+ # cookie_type may be passed as the first command-line argument; falls back to "xhs" when none is given
|
|
|
+ cookie_type = sys.argv[1] if len(sys.argv) > 1 else "xhs"
|
|
|
+
|
|
|
+ data = await example_xhs_fitness_search(cookie_type)
|
|
|
+
|
|
|
+ print("\n" + "="*60)
|
|
|
+ print("📊 最终结果")
|
|
|
+ print("="*60)
|
|
|
print(json.dumps(data, ensure_ascii=False, indent=2))
|
|
|
|
|
|
|