crawler.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. """
  2. 爬虫服务工具模块
  3. 提供 YouTube、X (Twitter) 和微信/通用链接的搜索和详情查询功能。
  4. """
  5. import asyncio
  6. import base64
  7. import io
  8. import json
  9. import math
  10. import os
  11. import subprocess
  12. import tempfile
  13. from pathlib import Path
  14. from typing import Optional, List, Dict, Any
  15. import httpx
  16. from PIL import Image, ImageDraw, ImageFont
  17. from agent.tools import tool, ToolResult
# API configuration
CRAWLER_BASE_URL = "http://crawler.aiddit.com/crawler"
AIGC_BASE_URL = "http://aigc-channel.aiddit.com/aigc/channel"
DEFAULT_TIMEOUT = 60.0  # seconds, per HTTP request
# Collage layout configuration
THUMB_WIDTH = 250  # thumbnail cell width, px
THUMB_HEIGHT = 250  # thumbnail cell height, px
TEXT_HEIGHT = 80  # vertical band under each thumbnail reserved for title text, px
GRID_COLS = 5  # maximum number of columns in the collage grid
PADDING = 12  # gap between cells and around the canvas edge, px
BG_COLOR = (255, 255, 255)  # canvas background (white)
TEXT_COLOR = (30, 30, 30)  # title text color (near-black)
INDEX_COLOR = (220, 60, 60)  # red badge drawn behind each index number
# Video processing configuration: downloaded videos are cached under the system temp dir
VIDEO_DOWNLOAD_DIR = Path(tempfile.gettempdir()) / "youtube_videos"
VIDEO_DOWNLOAD_DIR.mkdir(exist_ok=True)  # idempotent; runs once at import time
# ── Helper functions ──
  35. def _truncate_text(text: str, max_len: int = 14) -> str:
  36. """截断文本,超出部分用省略号"""
  37. return text[:max_len] + "..." if len(text) > max_len else text
  38. async def _download_image(client: httpx.AsyncClient, url: str) -> Optional[Image.Image]:
  39. """下载单张图片,失败返回 None"""
  40. try:
  41. resp = await client.get(url, timeout=15.0)
  42. resp.raise_for_status()
  43. return Image.open(io.BytesIO(resp.content)).convert("RGB")
  44. except Exception:
  45. return None
  46. async def _build_video_collage(videos: List[Dict[str, Any]]) -> Optional[str]:
  47. """
  48. 将视频缩略图+序号+标题拼接成网格图,返回 base64 编码的 PNG。
  49. """
  50. if not videos:
  51. return None
  52. items = []
  53. for idx, video in enumerate(videos):
  54. thumbnail = None
  55. if "thumbnails" in video and isinstance(video["thumbnails"], list) and video["thumbnails"]:
  56. thumbnail = video["thumbnails"][0].get("url")
  57. elif "thumbnail" in video:
  58. thumbnail = video.get("thumbnail")
  59. elif "cover_url" in video:
  60. thumbnail = video.get("cover_url")
  61. title = video.get("title", "") or video.get("text", "")
  62. if thumbnail:
  63. items.append({"url": thumbnail, "title": title, "index": idx + 1})
  64. if not items:
  65. return None
  66. async with httpx.AsyncClient() as client:
  67. tasks = [_download_image(client, item["url"]) for item in items]
  68. downloaded = await asyncio.gather(*tasks)
  69. valid = [(item, img) for item, img in zip(items, downloaded) if img is not None]
  70. if not valid:
  71. return None
  72. cols = min(GRID_COLS, len(valid))
  73. rows = math.ceil(len(valid) / cols)
  74. cell_w = THUMB_WIDTH + PADDING
  75. cell_h = THUMB_HEIGHT + TEXT_HEIGHT + PADDING
  76. canvas_w = cols * cell_w + PADDING
  77. canvas_h = rows * cell_h + PADDING
  78. canvas = Image.new("RGB", (canvas_w, canvas_h), BG_COLOR)
  79. draw = ImageDraw.Draw(canvas)
  80. font_title = None
  81. font_index = None
  82. font_candidates = [
  83. "msyh.ttc", "simhei.ttf", "simsun.ttc",
  84. "/System/Library/Fonts/PingFang.ttc",
  85. "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf",
  86. "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
  87. "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
  88. ]
  89. for font_path in font_candidates:
  90. try:
  91. font_title = ImageFont.truetype(font_path, 16)
  92. font_index = ImageFont.truetype(font_path, 32)
  93. break
  94. except Exception:
  95. continue
  96. if not font_title:
  97. font_title = ImageFont.load_default()
  98. font_index = font_title
  99. for item, img in valid:
  100. idx = item["index"]
  101. col = (idx - 1) % cols
  102. row = (idx - 1) // cols
  103. x = PADDING + col * cell_w
  104. y = PADDING + row * cell_h
  105. scale = min(THUMB_WIDTH / img.width, THUMB_HEIGHT / img.height)
  106. new_w = int(img.width * scale)
  107. new_h = int(img.height * scale)
  108. thumb = img.resize((new_w, new_h), Image.LANCZOS)
  109. offset_x = x + (THUMB_WIDTH - new_w) // 2
  110. offset_y = y + (THUMB_HEIGHT - new_h) // 2
  111. canvas.paste(thumb, (offset_x, offset_y))
  112. index_text = str(idx)
  113. idx_x = offset_x
  114. idx_y = offset_y + 4
  115. box_size = 52
  116. draw.rectangle([idx_x, idx_y, idx_x + box_size, idx_y + box_size], fill=INDEX_COLOR)
  117. bbox = draw.textbbox((0, 0), index_text, font=font_index)
  118. tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
  119. text_x = idx_x + (box_size - tw) // 2
  120. text_y = idx_y + (box_size - th) // 2
  121. draw.text((text_x, text_y), index_text, fill=(255, 255, 255), font=font_index)
  122. title = item["title"] or ""
  123. if title:
  124. words = list(title)
  125. lines = []
  126. current_line = ""
  127. for ch in words:
  128. test_line = current_line + ch
  129. bbox_line = draw.textbbox((0, 0), test_line, font=font_title)
  130. if bbox_line[2] - bbox_line[0] > THUMB_WIDTH:
  131. if current_line:
  132. lines.append(current_line)
  133. current_line = ch
  134. else:
  135. current_line = test_line
  136. if current_line:
  137. lines.append(current_line)
  138. for line_i, line in enumerate(lines):
  139. draw.text((x, y + THUMB_HEIGHT + 6 + line_i * 22), line, fill=TEXT_COLOR, font=font_title)
  140. buf = io.BytesIO()
  141. canvas.save(buf, format="PNG")
  142. return base64.b64encode(buf.getvalue()).decode("utf-8")
  143. def _parse_srt_to_outline(srt_content: str) -> List[Dict[str, str]]:
  144. """解析 SRT 字幕,生成带时间戳的大纲"""
  145. if not srt_content:
  146. return []
  147. outline = []
  148. blocks = srt_content.strip().split('\n\n')
  149. for block in blocks:
  150. lines = block.strip().split('\n')
  151. if len(lines) >= 3:
  152. timestamp_line = lines[1]
  153. if '-->' in timestamp_line:
  154. start_time = timestamp_line.split('-->')[0].strip()
  155. text = ' '.join(lines[2:])
  156. outline.append({'timestamp': start_time, 'text': text})
  157. return outline
  158. def _download_youtube_video(video_id: str) -> Optional[str]:
  159. """使用 yt-dlp 下载 YouTube 视频,返回文件路径"""
  160. try:
  161. output_path = VIDEO_DOWNLOAD_DIR / f"{video_id}.mp4"
  162. if output_path.exists():
  163. return str(output_path)
  164. cmd = [
  165. 'yt-dlp',
  166. '-f', 'best[ext=mp4]',
  167. '-o', str(output_path),
  168. f'https://www.youtube.com/watch?v={video_id}'
  169. ]
  170. result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
  171. if result.returncode == 0 and output_path.exists():
  172. return str(output_path)
  173. return None
  174. except Exception:
  175. return None
# ── YouTube tools ──
  177. @tool()
  178. async def youtube_search(keyword: str) -> ToolResult:
  179. """
  180. 搜索 YouTube 视频
  181. Args:
  182. keyword: 搜索关键词
  183. Returns:
  184. 搜索结果列表,包含视频标题、ID、频道等信息
  185. """
  186. try:
  187. async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
  188. response = await client.post(
  189. f"{CRAWLER_BASE_URL}/youtube/keyword",
  190. json={"keyword": keyword}
  191. )
  192. response.raise_for_status()
  193. data = response.json()
  194. if data.get("code") == 0:
  195. result_data = data.get("data", {})
  196. videos = result_data.get("data", []) if isinstance(result_data, dict) else []
  197. images = []
  198. collage_b64 = await _build_video_collage(videos)
  199. if collage_b64:
  200. images.append({
  201. "type": "base64",
  202. "media_type": "image/png",
  203. "data": collage_b64
  204. })
  205. summary_list = []
  206. for idx, video in enumerate(videos[:20], 1):
  207. title = video.get("title", "")
  208. author = video.get("author", "")
  209. video_id = video.get("video_id", "")
  210. summary_list.append(f"{idx}. {title} - {author} (ID: {video_id})")
  211. output_data = {
  212. "keyword": keyword,
  213. "total": len(videos),
  214. "summary": summary_list,
  215. "data": videos
  216. }
  217. return ToolResult(
  218. title=f"YouTube 搜索: {keyword}",
  219. output=json.dumps(output_data, ensure_ascii=False, indent=2),
  220. long_term_memory=f"Searched YouTube for '{keyword}', found {len(videos)} videos",
  221. images=images
  222. )
  223. else:
  224. return ToolResult(
  225. title="YouTube 搜索失败",
  226. output="",
  227. error=f"搜索失败: {data.get('msg', '未知错误')}"
  228. )
  229. except Exception as e:
  230. return ToolResult(
  231. title="YouTube 搜索异常",
  232. output="",
  233. error=str(e)
  234. )
@tool()
async def youtube_detail(
    content_id: str,
    include_captions: bool = True,
    download_video: bool = False
) -> ToolResult:
    """
    Fetch YouTube video details (optionally with captions, video download and
    a timestamped outline).

    Args:
        content_id: Video ID.
        include_captions: Whether to fetch captions. Defaults to True.
        download_video: Whether to download the video and build a timestamped
            outline. Defaults to False. After downloading, extract_video_clip
            can be used to cut segments for viewing.

    Returns:
        Detailed video info, including captions, the outline and the local
        file path (when downloaded).
    """
    try:
        async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
            detail_response = await client.post(
                f"{CRAWLER_BASE_URL}/youtube/detail",
                json={"content_id": content_id}
            )
            detail_response.raise_for_status()
            detail_data = detail_response.json()
            if detail_data.get("code") != 0:
                return ToolResult(
                    title="获取详情失败",
                    output="",
                    error=f"获取详情失败: {detail_data.get('msg', '未知错误')}"
                )
            # The API nests the payload one level deep: data -> data.
            result_data = detail_data.get("data", {})
            video_info = result_data.get("data", {}) if isinstance(result_data, dict) else {}
            # Fetch captions; also needed when download_video is set, because
            # the outline is derived from the SRT text.
            captions_text = None
            if include_captions or download_video:
                try:
                    captions_response = await client.post(
                        f"{CRAWLER_BASE_URL}/youtube/captions",
                        json={"content_id": content_id}
                    )
                    captions_response.raise_for_status()
                    captions_data = captions_response.json()
                    if captions_data.get("code") == 0:
                        captions_result = captions_data.get("data", {})
                        if isinstance(captions_result, dict):
                            inner_data = captions_result.get("data", {})
                            if isinstance(inner_data, dict):
                                captions_text = inner_data.get("content")
                except Exception:
                    # Captions are best-effort; the detail call still succeeds
                    # without them.
                    pass
            # Download the video (in a worker thread) and build the outline.
            video_path = None
            video_outline = None
            if download_video:
                video_path = await asyncio.to_thread(_download_youtube_video, content_id)
                if captions_text:
                    video_outline = _parse_srt_to_outline(captions_text)
            # Merge everything into a single JSON-serializable payload.
            output_data = {
                "video_id": content_id,
                "title": video_info.get("title", ""),
                "channel": video_info.get("channel_account_name", ""),
                "description": video_info.get("body_text", ""),
                "like_count": video_info.get("like_count"),
                "comment_count": video_info.get("comment_count"),
                "publish_timestamp": video_info.get("publish_timestamp"),
                "content_link": video_info.get("content_link", ""),
                "captions": captions_text,
                "full_data": video_info
            }
            if download_video:
                output_data["video_path"] = video_path
                output_data["video_outline"] = video_outline
                if not video_path:
                    output_data["download_error"] = "视频下载失败,请检查 yt-dlp 是否可用"
            memory = f"Retrieved YouTube video details for {content_id}"
            if captions_text:
                memory += " with captions"
            if video_path:
                memory += f", downloaded to {video_path}"
            return ToolResult(
                title=f"YouTube 视频详情: {content_id}",
                output=json.dumps(output_data, ensure_ascii=False, indent=2),
                long_term_memory=memory
            )
    except Exception as e:
        return ToolResult(
            title="YouTube 详情查询异常",
            output="",
            error=str(e)
        )
# ── X (Twitter) tools ──
  327. @tool()
  328. async def x_search(keyword: str) -> ToolResult:
  329. """
  330. 搜索 X (Twitter) 内容(数据已结构化,无需访问详情页)
  331. Args:
  332. keyword: 搜索关键词
  333. Returns:
  334. 搜索结果列表,包含推文内容、作者、互动数据等
  335. """
  336. try:
  337. async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
  338. response = await client.post(
  339. "http://crawler.aiddit.com/crawler/x/keyword",
  340. json={"keyword": keyword}
  341. )
  342. response.raise_for_status()
  343. data = response.json()
  344. if data.get("code") == 0:
  345. result_data = data.get("data", {})
  346. tweets = result_data.get("data", []) if isinstance(result_data, dict) else []
  347. # 构建拼接图
  348. images = []
  349. tweets_with_images = []
  350. for tweet in tweets:
  351. image_list = tweet.get("image_url_list", [])
  352. if image_list:
  353. tweet["thumbnails"] = [{"url": image_list[0].get("image_url")}]
  354. tweets_with_images.append(tweet)
  355. collage_b64 = await _build_video_collage(tweets_with_images if tweets_with_images else tweets)
  356. if collage_b64:
  357. images.append({
  358. "type": "base64",
  359. "media_type": "image/png",
  360. "data": collage_b64
  361. })
  362. summary_list = []
  363. for idx, tweet in enumerate(tweets[:20], 1):
  364. text = tweet.get("body_text", "")[:100]
  365. author = tweet.get("channel_account_name", "")
  366. summary_list.append(f"{idx}. @{author}: {text}")
  367. output_data = {
  368. "keyword": keyword,
  369. "total": len(tweets),
  370. "summary": summary_list,
  371. "data": tweets
  372. }
  373. return ToolResult(
  374. title=f"X 搜索: {keyword}",
  375. output=json.dumps(output_data, ensure_ascii=False, indent=2),
  376. long_term_memory=f"Searched X (Twitter) for '{keyword}', found {len(tweets)} tweets",
  377. images=images
  378. )
  379. else:
  380. return ToolResult(
  381. title="X 搜索失败",
  382. output="",
  383. error=f"搜索失败: {data.get('msg', '未知错误')}"
  384. )
  385. except Exception as e:
  386. return ToolResult(
  387. title="X 搜索异常",
  388. output="",
  389. error=str(e)
  390. )
# ── Content import tool ──
  392. @tool()
  393. async def import_content(plan_name: str, content_data: List[Dict[str, Any]]) -> ToolResult:
  394. """
  395. 导入长文内容(微信公众号、小红书、抖音等通用链接)
  396. Args:
  397. plan_name: 计划名称
  398. content_data: 内容数据列表,每项包含 channel、content_link、title 等字段
  399. Returns:
  400. 导入结果,包含 plan_id
  401. """
  402. try:
  403. async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
  404. response = await client.post(
  405. f"{AIGC_BASE_URL}/weixin/auto_insert",
  406. json={"plan_name": plan_name, "data": content_data}
  407. )
  408. response.raise_for_status()
  409. data = response.json()
  410. if data.get("code") == 0:
  411. result_data = data.get("data", {})
  412. return ToolResult(
  413. title=f"内容导入: {plan_name}",
  414. output=json.dumps(result_data, ensure_ascii=False, indent=2),
  415. long_term_memory=f"Imported {len(content_data)} items to plan '{plan_name}'"
  416. )
  417. else:
  418. return ToolResult(
  419. title="导入失败",
  420. output="",
  421. error=f"导入失败: {data.get('msg', '未知错误')}"
  422. )
  423. except Exception as e:
  424. return ToolResult(
  425. title="内容导入异常",
  426. output="",
  427. error=str(e)
  428. )
# ── Video clipping tool ──
  430. @tool()
  431. async def extract_video_clip(
  432. video_id: str,
  433. start_time: str,
  434. end_time: str,
  435. output_name: Optional[str] = None
  436. ) -> ToolResult:
  437. """
  438. 从已下载的 YouTube 视频中截取指定时间段的片段
  439. Args:
  440. video_id: YouTube 视频 ID(必须先通过 youtube_detail(download_video=True) 下载)
  441. start_time: 开始时间,格式: HH:MM:SS 或 MM:SS
  442. end_time: 结束时间,格式: HH:MM:SS 或 MM:SS
  443. output_name: 输出文件名(可选)
  444. Returns:
  445. 截取的视频片段路径
  446. Example:
  447. extract_video_clip("dQw4w9WgXcQ", "00:00:10", "00:00:30")
  448. """
  449. try:
  450. source_video = VIDEO_DOWNLOAD_DIR / f"{video_id}.mp4"
  451. if not source_video.exists():
  452. return ToolResult(
  453. title="视频截取失败",
  454. output="",
  455. error="源视频不存在,请先使用 youtube_detail(download_video=True) 下载视频"
  456. )
  457. if not output_name:
  458. output_name = f"{video_id}_clip_{start_time.replace(':', '-')}_{end_time.replace(':', '-')}.mp4"
  459. output_path = VIDEO_DOWNLOAD_DIR / output_name
  460. cmd = [
  461. 'ffmpeg',
  462. '-i', str(source_video),
  463. '-ss', start_time,
  464. '-to', end_time,
  465. '-c', 'copy',
  466. '-y',
  467. str(output_path)
  468. ]
  469. result = await asyncio.to_thread(
  470. subprocess.run, cmd, capture_output=True, text=True, timeout=60
  471. )
  472. if result.returncode == 0 and output_path.exists():
  473. file_size = output_path.stat().st_size / (1024 * 1024)
  474. output_data = {
  475. "video_id": video_id,
  476. "clip_path": str(output_path),
  477. "start_time": start_time,
  478. "end_time": end_time,
  479. "file_size_mb": round(file_size, 2)
  480. }
  481. return ToolResult(
  482. title=f"视频片段截取成功: {start_time} - {end_time}",
  483. output=json.dumps(output_data, ensure_ascii=False, indent=2),
  484. long_term_memory=f"Extracted video clip from {video_id}: {start_time} to {end_time}"
  485. )
  486. else:
  487. return ToolResult(
  488. title="视频截取失败",
  489. output="",
  490. error=f"ffmpeg 执行失败: {result.stderr}"
  491. )
  492. except subprocess.TimeoutExpired:
  493. return ToolResult(
  494. title="视频截取超时",
  495. output="",
  496. error="视频截取超时(60秒)"
  497. )
  498. except Exception as e:
  499. return ToolResult(
  500. title="视频截取异常",
  501. output="",
  502. error=str(e)
  503. )