"""Search Xiaohongshu for a keyword and save the result page as JSON and HTML.

The script drives a browser through the helpers in ``tools.baseClassTools``:
it opens the site, loads the search-result page for the keyword, dismisses
overlays, scrolls until a few result cards are present, extracts note
title/link/summary records with in-page JavaScript, and writes
``output/xhs.json`` and ``output/xhs_page.html`` under the project root.
"""
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

# Make the project root importable so that ``tools`` resolves when this file
# is run directly as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from tools.baseClassTools import (  # noqa: E402  (needs the sys.path tweak above)
    init_browser_session,
    navigate_to_url,
    wait,
    get_page_html,
    evaluate,
    scroll_page,
    cleanup_browser_session,
)

_DEFAULT_KEYWORD = "瑜伽美女"

# Prefix stripped from the evaluate helper's textual output before parsing.
_RESULT_PREFIX = "Result: "

# Stop scrolling once this many result cards/links are present in the page.
_MIN_CARDS = 5

# Upper bound on scroll attempts while waiting for result cards to load.
_MAX_SCROLL_ROUNDS = 8

# Removes the scroll-lock class, deletes fixed-position overlay elements with
# z-index >= 999, and clicks buttons labelled close / "got it" / agree.
_UNLOCK_JS = """
(function(){
  try {
    document.documentElement.classList.remove('reds-lock-scroll');
    document.body.classList.remove('reds-lock-scroll');
    const candidates = Array.from(document.querySelectorAll('[role="dialog"], .reds-modal, .reds-alert, [class*="modal"], [class*="mask"], [class*="dialog"]'));
    for (const el of candidates) {
      try {
        const style = window.getComputedStyle(el);
        const z = parseInt(style.zIndex || '0', 10);
        if (style.position === 'fixed' && z >= 999) {
          el.remove();
        }
      } catch {}
    }
    const closeButtons = Array.from(document.querySelectorAll('button, [role="button"]'));
    for (const btn of closeButtons) {
      const text = (btn.textContent || '').trim();
      const label = (btn.getAttribute('aria-label') || '').trim();
      if (text.includes('关闭') || text.includes('我知道了') || text.includes('同意') || label.includes('关闭')) {
        btn.click();
      }
    }
    return true;
  } catch (e) {
    return false;
  }
})()
"""

# Returns JSON text ``{"count": N}`` where N is the larger of the number of
# "/explore/" links and the number of elements matching the card selectors.
_COUNT_JS = """
(function(){
  const anchorCount = document.querySelectorAll('a[href*="/explore/"]').length;
  const cardCount = document.querySelectorAll('[data-testid="search-note-item"], .note-item, article, li[data-note-id]').length;
  return JSON.stringify({count: Math.max(anchorCount, cardCount)});
})()
"""

# Collects up to 20 {index, title, link, summary} records, first from JSON
# <script> payloads and, when fewer than 5 were found, from "/explore/" anchors.
# ``__KEYWORD_JSON__`` is replaced with a JSON-encoded keyword before use.
# The script returns JSON text (like _COUNT_JS) so that the Python side does
# not depend on how the evaluate helper renders a raw JavaScript object.
_EXTRACT_JS_TEMPLATE = """
(function(){
  try {
    const results = [];
    const jsonScripts = Array.from(document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__'));
    for (const s of jsonScripts) {
      try {
        const txt = s.textContent.trim();
        if (txt && txt.length > 0) {
          const data = JSON.parse(txt);
          const candidates = [];
          function collect(obj) {
            if (!obj || typeof obj !== 'object') return;
            for (const k of Object.keys(obj)) {
              const v = obj[k];
              if (v && typeof v === 'object') {
                if (Array.isArray(v)) { candidates.push(v); }
                collect(v);
              }
            }
          }
          collect(data);
          for (const arr of candidates) {
            for (const item of arr) {
              try {
                const title = (item.title || item.noteTitle || item.name || '').toString().trim();
                const link = (item.link || item.url || item.noteUrl || item.jumpUrl || '').toString().trim();
                if ((title || link) && (link.includes('/explore/') || link.startsWith('http'))) {
                  results.push({
                    index: results.length + 1,
                    title,
                    link,
                    summary: (item.desc || item.content || item.noteDesc || '').toString().trim().substring(0, 200)
                  });
                  if (results.length >= 20) break;
                }
              } catch {}
            }
            if (results.length >= 20) break;
          }
        }
      } catch {}
      if (results.length >= 5) break;
    }
    if (results.length < 5) {
      const anchors = Array.from(document.querySelectorAll('a[href*="/explore/"]'));
      const seen = new Set();
      for (const a of anchors) {
        try {
          const href = a.href;
          if (!href || seen.has(href)) continue;
          seen.add(href);
          let title = (a.textContent || '').trim();
          if (!title) {
            const img = a.querySelector('img[alt]');
            if (img && img.alt) title = img.alt.trim();
          }
          if (!title) {
            const parentTitle = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-title"], .title, h3, p');
            if (parentTitle) title = (parentTitle.textContent || '').trim();
          }
          const descEl = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-desc"], .desc, .description, p');
          const desc = descEl ? (descEl.textContent || '').trim() : '';
          results.push({ index: results.length + 1, title, link: href, summary: desc.substring(0, 200) });
          if (results.length >= 20) break;
        } catch {}
      }
    }
    return JSON.stringify({ success: true, count: results.length, keyword: __KEYWORD_JSON__, timestamp: new Date().toISOString(), results: results });
  } catch (e) {
    return JSON.stringify({ success: false, error: e.message, stack: e.stack });
  }
})()
"""


def _normalize_output(raw: object) -> str:
    """Return *raw* as stripped text without a leading ``"Result: "`` prefix.

    ``None`` is treated as an empty string so that a missing output ends up in
    the callers' JSON-failure handling instead of raising ``AttributeError``.
    """
    text = "" if raw is None else str(raw)
    if text.startswith(_RESULT_PREFIX):
        text = text[len(_RESULT_PREFIX):]
    return text.strip()


def _decode_json(text: str):
    """Parse *text* as JSON, unwrapping one extra layer of string encoding.

    Raises:
        json.JSONDecodeError: if *text* (or the unwrapped string) is not JSON.
    """
    parsed = json.loads(text)
    if isinstance(parsed, str):
        # NOTE(review): guards against an evaluate helper that JSON-encodes the
        # string returned by the page script; confirm against the helper.
        parsed = json.loads(parsed)
    return parsed


def _html_comment(label: str, value: object) -> str:
    """Format ``label: value`` as a single-line HTML comment."""
    # Collapse whitespace so the comment stays on one line.
    text = " ".join(str(value).split())
    # "--" is not allowed inside an HTML comment; break up every such run.
    while "--" in text:
        text = text.replace("--", "- -")
    return f"<!-- {label}: {text} -->"


async def _count_cards() -> int:
    """Return the result-card count reported by the page, or 0 if unreadable."""
    result = await evaluate(code=_COUNT_JS)
    # assumes the evaluate result exposes its text as ``.output`` — as the
    # original code did.
    text = _normalize_output(result.output)
    try:
        return int(_decode_json(text).get("count", 0))
    except (AttributeError, TypeError, ValueError):
        # Non-JSON text, a non-object payload or a non-numeric count.
        return 0


async def run_task(keyword: str = _DEFAULT_KEYWORD) -> None:
    """Search Xiaohongshu for *keyword* and save the results to disk.

    Writes ``output/xhs.json`` (the extraction result, or an error record when
    the page script's output is not valid JSON) and ``output/xhs_page.html``
    (the page HTML preceded by URL/title comments). The ``output`` directory
    is created under the parent of this file's directory.

    Args:
        keyword: Search term; it is URL-quoted into the search URL and echoed
            in the JSON result.

    The browser session is always cleaned up, even when a step fails.
    """
    project_root = Path(__file__).resolve().parents[1]
    output_dir = project_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    json_file = output_dir / "xhs.json"
    html_file = output_dir / "xhs_page.html"

    try:
        await init_browser_session(headless=False, profile_name="xhs_profile")
        await navigate_to_url("https://www.xiaohongshu.com")
        await wait(seconds=3)

        search_url = (
            "https://www.xiaohongshu.com/search_result"
            f"?keyword={quote(keyword)}&type=51"
        )
        await navigate_to_url(search_url)
        await wait(seconds=6)

        # Dismiss overlays, then scroll until enough cards have loaded.
        await evaluate(code=_UNLOCK_JS)
        for _ in range(_MAX_SCROLL_ROUNDS):
            if await _count_cards() >= _MIN_CARDS:
                break
            await scroll_page(down=True, pages=0.8)
            await wait(seconds=3)
            # Overlays may reappear after scrolling.
            await evaluate(code=_UNLOCK_JS)

        # json.dumps yields an ASCII-only, double-quoted literal that is also
        # a valid JavaScript string literal.
        extract_js = _EXTRACT_JS_TEMPLATE.replace(
            "__KEYWORD_JSON__", json.dumps(keyword)
        )
        result = await evaluate(code=extract_js)
        output = _normalize_output(result.output)
        try:
            data = _decode_json(output)
        except json.JSONDecodeError:
            data = {
                "success": False,
                "error": "JSON解析失败",
                "raw_output": output[:1000],
                "keyword": keyword,
                "timestamp": datetime.now().isoformat(),
            }
        json_file.write_text(
            json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"
        )

        html_result = await get_page_html()
        metadata = html_result.metadata or {}
        html_content = metadata.get("html") or ""
        page_url = metadata.get("url") or ""
        page_title = metadata.get("title") or ""
        meta_info = (
            "\n".join(
                [
                    _html_comment("URL", page_url),
                    _html_comment("Title", page_title),
                ]
            )
            + "\n"
        )
        html_file.write_text(meta_info + html_content, encoding="utf-8")

        print(f"✅ 数据已保存到: {json_file}")
        print(f"✅ HTML 已保存到: {html_file}")
    finally:
        await cleanup_browser_session()


def main() -> None:
    """Run the scraping task to completion on a fresh event loop."""
    asyncio.run(run_task())


if __name__ == "__main__":
    main()