image_cdn.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
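"""
image_cdn.py - Rewrite third-party image URLs found in content to our own CDN links.

Intended use: call this right before write_json / write_file persists the data, so that
image links from platforms such as Xiaohongshu, Bilibili, Zhihu and Weibo are replaced
with res.cybertogether.net CDN links. This keeps later pipeline stages from failing when
the external images expire or are blocked by hotlink protection.
"""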
  7. import hashlib
  8. import json
  9. import logging
  10. import re
  11. from typing import Any, Union
  12. import httpx
  13. logger = logging.getLogger(__name__)
  14. # ── 匹配需要转存的外站图片 URL(只抓图片后缀或明显的图床域名)──────────────────
  15. _IMG_URL_RE = re.compile(
  16. r'https?://(?!' # 排除自有 CDN,不重复上传
  17. r'res\.cybertogether\.net'
  18. r')'
  19. r'[^\s"\'<>]+' # URL 主体
  20. r'(?:'
  21. r'\.(?:jpg|jpeg|png|gif|webp|avif|bmp|svg)' # 明确图片后缀
  22. r'|'
  23. r'(?:xhscdn\.com|bdimg\.com|hdslb\.com|zhihu\.com/p/[^/]+\.(?:jpg|png)'
  24. r'|sinaimg\.cn|wx\d+\.sinaimg\.cn|mmbiz\.qpic\.cn' # 微博/微信图床
  25. r'|imagev[12]\.meitudata\.com' # 美图
  26. r'|p[0-9]\.douyinpic\.com' # 抖音封面
  27. r'|i0\.hdslb\.com|article\.biliimg\.com' # B站
  28. r')'
  29. r')',
  30. re.IGNORECASE,
  31. )
# Bucket name handed to the OSS uploader as ``bucket_name``.
BUCKET_NAME = "aigc-admin"
# Value handed to the OSS uploader as ``bucket_path``
# (presumably the key prefix inside the bucket; confirm against cyber_sdk).
BUCKET_PATH = "crawler/image"
# Public CDN origin; the object key returned by the uploader is appended to it.
CDN_BASE = "https://res.cybertogether.net"
  35. async def _upload_bytes_to_oss(data: bytes, filename: str) -> str:
  36. """上传 bytes 到 OSS 并返回 CDN URL,失败时抛出异常"""
  37. from cyber_sdk.ali_oss import _upload_v2
  38. result = await _upload_v2(
  39. file_name=filename,
  40. file_content=data,
  41. bucket_path=BUCKET_PATH,
  42. bucket_name=BUCKET_NAME,
  43. )
  44. oss_key = result.get("oss_object_key")
  45. if not oss_key:
  46. raise ValueError(f"OSS response missing oss_object_key: {result}")
  47. return f"{CDN_BASE}/{oss_key}"
  48. async def _download_image(url: str) -> bytes:
  49. """下载图片 bytes,失败抛出异常"""
  50. async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
  51. resp = await client.get(url, headers={"Referer": url, "User-Agent": "Mozilla/5.0"})
  52. resp.raise_for_status()
  53. return resp.content
  54. def _ext_from_url(url: str) -> str:
  55. """从 URL 猜测文件扩展名,默认 jpg"""
  56. path = url.split("?")[0].lower()
  57. for ext in ("png", "gif", "webp", "avif", "bmp", "svg", "jpg", "jpeg"):
  58. if path.endswith(f".{ext}"):
  59. return ext
  60. return "jpg"
  61. async def replace_image_urls(text: str) -> str:
  62. """
  63. 扫描 text 中所有外站图片 URL,下载并上传到自有 OSS,原地替换为 CDN 链接。
  64. - 已是 res.cybertogether.net 的 URL 直接跳过
  65. - 下载/上传失败的 URL 保留原值,只打 WARNING 日志
  66. - 同一 URL 使用 MD5 去重(同一次调用内)
  67. """
  68. urls = list(dict.fromkeys(_IMG_URL_RE.findall(text))) # 去重但保留顺序
  69. if not urls:
  70. return text
  71. import asyncio
  72. async def _process_single_url(url: str) -> tuple[str, str]:
  73. url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
  74. ext = _ext_from_url(url)
  75. filename = f"{url_hash}.{ext}"
  76. try:
  77. data = await _download_image(url)
  78. cdn_url = await _upload_bytes_to_oss(data, filename)
  79. logger.info("[ImageCDN] %s → %s", url[:60], cdn_url)
  80. return url, cdn_url
  81. except Exception as e:
  82. logger.warning("[ImageCDN] Failed to mirror %s: %s (%s)", url[:60], str(e) or repr(e), type(e).__name__)
  83. return url, url
  84. tasks = [_process_single_url(u) for u in urls]
  85. results = await asyncio.gather(*tasks)
  86. url_map: dict[str, str] = dict(results)
  87. for orig, cdn in url_map.items():
  88. if orig != cdn:
  89. text = text.replace(orig, cdn)
  90. replaced = sum(1 for o, c in url_map.items() if o != c)
  91. if replaced:
  92. logger.info("[ImageCDN] Replaced %d/%d image URLs with CDN links", replaced, len(urls))
  93. return text
  94. async def replace_image_urls_in_obj(obj: Any) -> Any:
  95. """
  96. 递归扫描 dict/list 中所有字符串值,替换其中的外站图片 URL。
  97. 先整体 JSON 序列化再做替换,再反序列化,效率高且避免遗漏嵌套字段。
  98. """
  99. raw = json.dumps(obj, ensure_ascii=False)
  100. replaced = await replace_image_urls(raw)
  101. if replaced == raw:
  102. return obj
  103. return json.loads(replaced)