test_tools_xhs.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. import asyncio
  2. import json
  3. import os
  4. import sys
  5. from datetime import datetime
  6. from pathlib import Path
  7. from urllib.parse import quote
  8. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  9. from tools.baseClassTools import (
  10. init_browser_session,
  11. navigate_to_url,
  12. wait,
  13. get_page_html,
  14. evaluate,
  15. scroll_page,
  16. cleanup_browser_session,
  17. )
  18. async def run_task():
  19. project_root = Path(__file__).resolve().parents[1]
  20. output_dir = project_root / "output"
  21. output_dir.mkdir(parents=True, exist_ok=True)
  22. json_file = output_dir / "xhs.json"
  23. html_file = output_dir / "xhs_page.html"
  24. def normalize_output(raw: str) -> str:
  25. value = raw
  26. if value.startswith("Result: "):
  27. value = value[8:]
  28. return value.strip()
  29. try:
  30. await init_browser_session(headless=False, profile_name="xhs_profile")
  31. await navigate_to_url("https://www.xiaohongshu.com")
  32. await wait(seconds=3)
  33. keyword = "瑜伽美女"
  34. search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
  35. await navigate_to_url(search_url)
  36. await wait(seconds=6)
  37. unlock_js = """
  38. (function(){
  39. try {
  40. document.documentElement.classList.remove('reds-lock-scroll');
  41. document.body.classList.remove('reds-lock-scroll');
  42. const candidates = Array.from(document.querySelectorAll('[role="dialog"], .reds-modal, .reds-alert, [class*="modal"], [class*="mask"], [class*="dialog"]'));
  43. for (const el of candidates) {
  44. try {
  45. const style = window.getComputedStyle(el);
  46. const z = parseInt(style.zIndex || '0', 10);
  47. if (style.position === 'fixed' && z >= 999) {
  48. el.remove();
  49. }
  50. } catch {}
  51. }
  52. const closeButtons = Array.from(document.querySelectorAll('button, [role="button"]'));
  53. for (const btn of closeButtons) {
  54. const text = (btn.textContent || '').trim();
  55. const label = (btn.getAttribute('aria-label') || '').trim();
  56. if (text.includes('关闭') || text.includes('我知道了') || text.includes('同意') || label.includes('关闭')) {
  57. btn.click();
  58. }
  59. }
  60. return true;
  61. } catch (e) {
  62. return false;
  63. }
  64. })()
  65. """
  66. count_js = """
  67. (function(){
  68. const anchorCount = document.querySelectorAll('a[href*="/explore/"]').length;
  69. const cardCount = document.querySelectorAll('[data-testid="search-note-item"], .note-item, article, li[data-note-id]').length;
  70. return JSON.stringify({count: Math.max(anchorCount, cardCount)});
  71. })()
  72. """
  73. await evaluate(code=unlock_js)
  74. for _ in range(8):
  75. count_result = await evaluate(code=count_js)
  76. count_output = normalize_output(count_result.output)
  77. try:
  78. count_value = int(json.loads(count_output).get("count", 0))
  79. except Exception:
  80. count_value = 0
  81. if count_value >= 5:
  82. break
  83. await scroll_page(down=True, pages=0.8)
  84. await wait(seconds=3)
  85. await evaluate(code=unlock_js)
  86. extract_js = """
  87. (function(){
  88. try {
  89. const results = [];
  90. const jsonScripts = Array.from(document.querySelectorAll('script[type="application/json"], script#__NEXT_DATA__'));
  91. for (const s of jsonScripts) {
  92. try {
  93. const txt = s.textContent.trim();
  94. if (txt && txt.length > 0) {
  95. const data = JSON.parse(txt);
  96. const candidates = [];
  97. function collect(obj) {
  98. if (!obj || typeof obj !== 'object') return;
  99. for (const k of Object.keys(obj)) {
  100. const v = obj[k];
  101. if (v && typeof v === 'object') {
  102. if (Array.isArray(v)) {
  103. candidates.push(v);
  104. }
  105. collect(v);
  106. }
  107. }
  108. }
  109. collect(data);
  110. for (const arr of candidates) {
  111. for (const item of arr) {
  112. try {
  113. const title = (item.title || item.noteTitle || item.name || '').toString().trim();
  114. const link = (item.link || item.url || item.noteUrl || item.jumpUrl || '').toString().trim();
  115. if ((title || link) && (link.includes('/explore/') || link.startsWith('http'))) {
  116. results.push({
  117. index: results.length + 1,
  118. title,
  119. link,
  120. summary: (item.desc || item.content || item.noteDesc || '').toString().trim().substring(0, 200)
  121. });
  122. if (results.length >= 20) break;
  123. }
  124. } catch {}
  125. }
  126. if (results.length >= 20) break;
  127. }
  128. }
  129. } catch {}
  130. if (results.length >= 5) break;
  131. }
  132. if (results.length < 5) {
  133. const anchors = Array.from(document.querySelectorAll('a[href*="/explore/"]'));
  134. const seen = new Set();
  135. for (const a of anchors) {
  136. try {
  137. const href = a.href;
  138. if (!href || seen.has(href)) continue;
  139. seen.add(href);
  140. let title = (a.textContent || '').trim();
  141. if (!title) {
  142. const img = a.querySelector('img[alt]');
  143. if (img && img.alt) title = img.alt.trim();
  144. }
  145. if (!title) {
  146. const parentTitle = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-title"], .title, h3, p');
  147. if (parentTitle) title = (parentTitle.textContent || '').trim();
  148. }
  149. const descEl = a.closest('[data-testid="search-note-item"], .note-item, article, li')?.querySelector('[data-testid="note-desc"], .desc, .description, p');
  150. const desc = descEl ? (descEl.textContent || '').trim() : '';
  151. results.push({
  152. index: results.length + 1,
  153. title,
  154. link: href,
  155. summary: desc.substring(0, 200)
  156. });
  157. if (results.length >= 20) break;
  158. } catch {}
  159. }
  160. }
  161. return {
  162. success: true,
  163. count: results.length,
  164. keyword: '瑜伽美女',
  165. timestamp: new Date().toISOString(),
  166. results: results
  167. };
  168. } catch (e) {
  169. return {
  170. success: false,
  171. error: e.message,
  172. stack: e.stack
  173. };
  174. }
  175. })()
  176. """
  177. result = await evaluate(code=extract_js)
  178. output = normalize_output(result.output)
  179. try:
  180. data = json.loads(output)
  181. except json.JSONDecodeError:
  182. data = {
  183. "success": False,
  184. "error": "JSON解析失败",
  185. "raw_output": output[:1000],
  186. "keyword": keyword,
  187. "timestamp": datetime.now().isoformat(),
  188. }
  189. with open(json_file, "w", encoding="utf-8") as f:
  190. json.dump(data, f, ensure_ascii=False, indent=2)
  191. html_result = await get_page_html()
  192. html_content = html_result.metadata.get("html", "")
  193. page_url = html_result.metadata.get("url", "")
  194. page_title = html_result.metadata.get("title", "")
  195. meta_info = (
  196. "\n".join(
  197. [
  198. "<!--",
  199. f" 页面标题: {page_title}",
  200. f" 页面URL: {page_url}",
  201. f" 保存时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
  202. f" 搜索关键词: {keyword}",
  203. "-->",
  204. "",
  205. ]
  206. )
  207. + "\n"
  208. )
  209. with open(html_file, "w", encoding="utf-8") as f:
  210. f.write(meta_info)
  211. f.write(html_content)
  212. print(f"✅ 数据已保存到: {json_file}")
  213. print(f"✅ HTML 已保存到: {html_file}")
  214. finally:
  215. await cleanup_browser_session()
  216. def main():
  217. asyncio.run(run_task())
  218. if __name__ == "__main__":
  219. main()