|
@@ -1,247 +0,0 @@
|
|
|
-import asyncio
|
|
|
|
|
-import json
|
|
|
|
|
-import os
|
|
|
|
|
-import sys
|
|
|
|
|
-from datetime import datetime
|
|
|
|
|
-from pathlib import Path
|
|
|
|
|
-from urllib.parse import quote
|
|
|
|
|
-
|
|
|
|
|
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
|
-
|
|
|
|
|
-from tools.baseClassTools import (
|
|
|
|
|
- init_browser_session,
|
|
|
|
|
- navigate_to_url,
|
|
|
|
|
- wait,
|
|
|
|
|
- get_page_html,
|
|
|
|
|
- evaluate,
|
|
|
|
|
- scroll_page,
|
|
|
|
|
- cleanup_browser_session,
|
|
|
|
|
-)
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
# JavaScript snippets executed inside the page through ``evaluate``.
# Each one is an IIFE so that it does not leak names into the page's globals.

# Best-effort removal of scroll locks and blocking overlays (login / consent
# dialogs): drops the lock class, removes fixed high-z-index modal-like nodes
# and clicks buttons whose text/label looks like "close" / "got it" / "agree".
_UNLOCK_JS = """
(function(){
  try {
    document.documentElement.classList.remove('reds-lock-scroll');
    document.body.classList.remove('reds-lock-scroll');
    const candidates = Array.from(document.querySelectorAll('[role="dialog"], .reds-modal, .reds-alert, [class*="modal"], [class*="mask"], [class*="dialog"]'));
    for (const el of candidates) {
      try {
        const style = window.getComputedStyle(el);
        const z = parseInt(style.zIndex || '0', 10);
        if (style.position === 'fixed' && z >= 999) {
          el.remove();
        }
      } catch {}
    }
    const closeButtons = Array.from(document.querySelectorAll('button, [role="button"]'));
    for (const btn of closeButtons) {
      const text = (btn.textContent || '').trim();
      const label = (btn.getAttribute('aria-label') || '').trim();
      if (text.includes('关闭') || text.includes('我知道了') || text.includes('同意') || label.includes('关闭')) {
        btn.click();
      }
    }
    return true;
  } catch (e) {
    return false;
  }
})()
"""

# Reports how many note links / note cards are currently in the DOM, as a JSON
# string of the form {"count": N}.
_COUNT_JS = """
(function(){
  const anchorCount = document.querySelectorAll('a[href*="/explore/"]').length;
  const cardCount = document.querySelectorAll('[data-testid="search-note-item"], .note-item, article, li[data-note-id]').length;
  return JSON.stringify({count: Math.max(anchorCount, cardCount)});
})()
"""

# Collects up to 20 notes. First it walks every array found in embedded JSON
# <script> blobs; if that yields fewer than 5 entries it falls back to scanning
# "/explore/" anchors in the DOM. ``__KEYWORD_JSON__`` is replaced with a JSON
# string literal before execution so the keyword is defined in one place only.
# The result is returned through JSON.stringify, like ``_COUNT_JS``, so the
# Python side always receives text it can feed to ``json.loads``.
_EXTRACT_JS_TEMPLATE = """
(function(){
  try {
    const results = [];

    const jsonScripts = Array.from(document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__'));
    for (const s of jsonScripts) {
      try {
        const txt = s.textContent.trim();
        if (txt && txt.length > 0) {
          const data = JSON.parse(txt);
          const candidates = [];
          function collect(obj) {
            if (!obj || typeof obj !== 'object') return;
            for (const k of Object.keys(obj)) {
              const v = obj[k];
              if (v && typeof v === 'object') {
                if (Array.isArray(v)) {
                  candidates.push(v);
                }
                collect(v);
              }
            }
          }
          collect(data);
          for (const arr of candidates) {
            for (const item of arr) {
              try {
                const title = (item.title || item.noteTitle || item.name || '').toString().trim();
                const link = (item.link || item.url || item.noteUrl || item.jumpUrl || '').toString().trim();
                if ((title || link) && (link.includes('/explore/') || link.startsWith('http'))) {
                  results.push({
                    index: results.length + 1,
                    title,
                    link,
                    summary: (item.desc || item.content || item.noteDesc || '').toString().trim().substring(0, 200)
                  });
                  if (results.length >= 20) break;
                }
              } catch {}
            }
            if (results.length >= 20) break;
          }
        }
      } catch {}
      if (results.length >= 5) break;
    }

    if (results.length < 5) {
      const anchors = Array.from(document.querySelectorAll('a[href*="/explore/"]'));
      const seen = new Set();
      for (const a of anchors) {
        try {
          const href = a.href;
          if (!href || seen.has(href)) continue;
          seen.add(href);
          let title = (a.textContent || '').trim();
          if (!title) {
            const img = a.querySelector('img[alt]');
            if (img && img.alt) title = img.alt.trim();
          }
          if (!title) {
            const parentTitle = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-title"], .title, h3, p');
            if (parentTitle) title = (parentTitle.textContent || '').trim();
          }
          const descEl = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-desc"], .desc, .description, p');
          const desc = descEl ? (descEl.textContent || '').trim() : '';
          results.push({
            index: results.length + 1,
            title,
            link: href,
            summary: desc.substring(0, 200)
          });
          if (results.length >= 20) break;
        } catch {}
      }
    }

    return JSON.stringify({
      success: true,
      count: results.length,
      keyword: __KEYWORD_JSON__,
      timestamp: new Date().toISOString(),
      results: results
    });
  } catch (e) {
    return JSON.stringify({
      success: false,
      error: e.message,
      stack: e.stack
    });
  }
})()
"""

_RESULT_PREFIX = "Result: "


def _normalize_output(raw):
    """Return *raw* as stripped text without a leading ``"Result: "`` marker.

    Non-string input is tolerated (``None`` becomes ``""``, anything else is
    passed through ``str``) so a missing tool output cannot crash the caller.
    """
    if raw is None:
        text = ""
    elif isinstance(raw, str):
        text = raw
    else:
        text = str(raw)
    if text.startswith(_RESULT_PREFIX):
        text = text[len(_RESULT_PREFIX):]
    return text.strip()


def _parse_js_payload(text):
    """Decode the JSON text produced by one of the JS snippets.

    If decoding yields a string, it is decoded a second time; this covers the
    case where the evaluation layer JSON-encodes an already stringified value
    (NOTE(review): whether ``evaluate`` does that is not visible here).

    Raises:
        json.JSONDecodeError: if *text* (or the inner string) is not JSON.
    """
    decoded = json.loads(text)
    if isinstance(decoded, str):
        decoded = json.loads(decoded)
    return decoded


async def run_task():
    """Search Xiaohongshu for a fixed keyword and save the results to disk.

    Steps:
      1. Start a browser session, open the home page, then the search page.
      2. Dismiss overlays and scroll (at most 8 rounds) until at least 5 notes
         are detected in the DOM.
      3. Extract note title / link / summary via in-page JavaScript and write
         the payload to ``output/xhs.json``; if the payload is not a JSON
         object, an error record with the raw output is written instead.
      4. Save the page HTML, prefixed with an HTML comment carrying title,
         URL, timestamp and keyword, to ``output/xhs_page.html``.

    The browser session is always cleaned up, even when a step raises.
    """
    project_root = Path(__file__).resolve().parents[1]
    output_dir = project_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    json_file = output_dir / "xhs.json"
    html_file = output_dir / "xhs_page.html"

    keyword = "瑜伽美女"
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
    # json.dumps yields an ASCII-escaped, double-quoted literal that is also a
    # valid JavaScript string literal.
    extract_js = _EXTRACT_JS_TEMPLATE.replace("__KEYWORD_JSON__", json.dumps(keyword))

    try:
        await init_browser_session(headless=False, profile_name="xhs_profile")

        await navigate_to_url("https://www.xiaohongshu.com")
        await wait(seconds=3)

        await navigate_to_url(search_url)
        await wait(seconds=6)

        await evaluate(code=_UNLOCK_JS)

        for _ in range(8):
            count_result = await evaluate(code=_COUNT_JS)
            try:
                count_payload = _parse_js_payload(_normalize_output(count_result.output))
                count_value = int(count_payload.get("count", 0))
            except (ValueError, TypeError, AttributeError):
                # Unparseable or unexpectedly shaped output: treat as "nothing
                # loaded yet" and keep scrolling.
                count_value = 0
            if count_value >= 5:
                break
            await scroll_page(down=True, pages=0.8)
            await wait(seconds=3)
            # Scrolling can re-trigger the blocking overlay.
            await evaluate(code=_UNLOCK_JS)

        result = await evaluate(code=extract_js)
        output = _normalize_output(result.output)

        try:
            data = _parse_js_payload(output)
            if not isinstance(data, dict):
                raise ValueError("extraction payload is not a JSON object")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            data = {
                "success": False,
                "error": "JSON解析失败",
                "raw_output": output[:1000],
                "keyword": keyword,
                "timestamp": datetime.now().isoformat(),
            }

        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        html_result = await get_page_html()
        html_content = html_result.metadata.get("html", "")
        page_url = html_result.metadata.get("url", "")
        page_title = html_result.metadata.get("title", "")
        meta_info = (
            "\n".join(
                [
                    "<!--",
                    f"  页面标题: {page_title}",
                    f"  页面URL: {page_url}",
                    f"  保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                    f"  搜索关键词: {keyword}",
                    "-->",
                    "",
                ]
            )
            + "\n"
        )

        with open(html_file, "w", encoding="utf-8") as f:
            f.write(meta_info)
            f.write(html_content)

        print(f"✅ 数据已保存到: {json_file}")
        print(f"✅ HTML 已保存到: {html_file}")

    finally:
        await cleanup_browser_session()
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def main():
    """Synchronous entry point: run ``run_task`` to completion on a fresh event loop."""
    task = run_task()
    asyncio.run(task)
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
# Run the task only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|