crawapi_http.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. """Shared crawapi HTTP base (V3-M1A).
  2. Extracted verbatim from douyin.py so multiple platform clients (douyin /
  3. shipinhao) reuse the same HTTP post + rate limiting + rate-limit error
  4. classification + env-file helpers, instead of each duplicating them.
  5. Pure refactor: behaviour is identical to the original douyin implementation.
  6. """
  7. from __future__ import annotations
  8. import os
  9. import time
  10. from pathlib import Path
  11. from typing import Any, Callable
  12. from urllib.parse import urljoin
  13. import httpx
  14. from content_agent.errors import ContentAgentError, ErrorCode
# Lower-case fragments that mark a response message as rate limiting. They are
# matched as substrings of the lower-cased "msg"/"message" text by
# is_rate_limit_business_error. The two Chinese tokens read roughly as
# "throttled" and "requests too frequent".
RATE_LIMIT_MESSAGE_TOKENS = ("限流", "请求频繁", "rate limit", "too many requests")
  16. class CrawapiTransientError(RuntimeError):
  17. """Retryable crawapi failure (network/timeout, or a platform-declared
  18. transient business code such as 视频号 25011). Subclasses RuntimeError so
  19. existing `except RuntimeError` handlers keep working unchanged."""
  20. class RateLimiter:
  21. def __init__(
  22. self,
  23. min_interval_seconds: float = 12.0,
  24. now_fn: Callable[[], float] = time.monotonic,
  25. sleep_fn: Callable[[float], None] = time.sleep,
  26. ) -> None:
  27. self.min_interval_seconds = min_interval_seconds
  28. self.now_fn = now_fn
  29. self.sleep_fn = sleep_fn
  30. self._last_call_by_bucket: dict[str, float] = {}
  31. def wait(self, bucket: str) -> None:
  32. last = self._last_call_by_bucket.get(bucket)
  33. if last is not None:
  34. remaining = self.min_interval_seconds - (self.now_fn() - last)
  35. if remaining > 0:
  36. self.sleep_fn(remaining)
  37. self._last_call_by_bucket[bucket] = self.now_fn()
  38. def is_rate_limit_business_error(
  39. code: Any, data: dict[str, Any], *, business_codes: set[str]
  40. ) -> bool:
  41. if str(code) in business_codes:
  42. return True
  43. message = str(data.get("msg") or data.get("message") or "").lower()
  44. return any(token in message for token in RATE_LIMIT_MESSAGE_TOKENS)
  45. def post_crawapi_json(
  46. *,
  47. http_client: Any,
  48. base_url: str,
  49. path: str,
  50. payload: dict[str, Any],
  51. operation: str,
  52. timeout_seconds: float,
  53. rate_limiter: RateLimiter | None = None,
  54. rate_limit_bucket: str | None = None,
  55. business_codes: set[str],
  56. transient_business_codes: set[str] = frozenset(),
  57. ) -> dict[str, Any]:
  58. if rate_limit_bucket and rate_limiter:
  59. rate_limiter.wait(rate_limit_bucket)
  60. url = urljoin(base_url, path)
  61. try:
  62. response = http_client.post(
  63. url,
  64. json=payload,
  65. headers={"Content-Type": "application/json"},
  66. timeout=timeout_seconds,
  67. )
  68. response.raise_for_status()
  69. data = response.json()
  70. except httpx.HTTPStatusError as exc:
  71. status_code = exc.response.status_code if exc.response is not None else "unknown"
  72. if status_code == 429:
  73. raise ContentAgentError(
  74. ErrorCode.PLATFORM_RATE_LIMITED,
  75. f"crawapi {operation} failed: rate_limited",
  76. {"operation": operation, "status_code": 429},
  77. ) from exc
  78. raise RuntimeError(f"crawapi {operation} failed: HTTP {status_code}") from exc
  79. except httpx.HTTPError as exc:
  80. raise CrawapiTransientError(f"crawapi {operation} failed: network_error") from exc
  81. except ValueError as exc:
  82. raise RuntimeError(f"crawapi {operation} failed: bad_json") from exc
  83. if not isinstance(data, dict):
  84. raise RuntimeError(f"crawapi {operation} failed: bad_response")
  85. code = data.get("code")
  86. if code is not None and code not in (0, "0"):
  87. if is_rate_limit_business_error(code, data, business_codes=business_codes):
  88. raise ContentAgentError(
  89. ErrorCode.PLATFORM_RATE_LIMITED,
  90. f"crawapi {operation} failed: rate_limited",
  91. {"operation": operation, "business_code": str(code)},
  92. )
  93. if str(code) in transient_business_codes:
  94. raise CrawapiTransientError(
  95. f"crawapi {operation} failed: transient_business_error code={code}"
  96. )
  97. raise RuntimeError(f"crawapi {operation} failed: business_error")
  98. return data
  99. def _load_env_file(env_path: str | Path) -> dict[str, str]:
  100. path = Path(env_path)
  101. if not path.exists():
  102. return {}
  103. env: dict[str, str] = {}
  104. for line in path.read_text(encoding="utf-8").splitlines():
  105. stripped = line.strip()
  106. if not stripped or stripped.startswith("#") or "=" not in stripped:
  107. continue
  108. key, value = stripped.split("=", 1)
  109. env[key.strip()] = value.strip().strip('"').strip("'")
  110. return env
  111. def _env(
  112. key: str,
  113. file_env: dict[str, str],
  114. default: str | None = None,
  115. required: bool = False,
  116. ) -> str:
  117. value = file_env.get(key) or os.getenv(key) or default
  118. if required and not value:
  119. raise RuntimeError(f"missing required env: {key}")
  120. return value or ""
  121. def _optional_positive_int(value: str) -> int | None:
  122. try:
  123. parsed = int(value)
  124. except ValueError:
  125. return None
  126. return parsed if parsed > 0 else None
  127. def content_format(raw_content_type: str) -> str:
  128. if "图文" in raw_content_type:
  129. return "image_text"
  130. if "文本" in raw_content_type:
  131. return "text"
  132. if "直播" in raw_content_type:
  133. return "live"
  134. return "video"
  135. def score_from_statistics(statistics: dict[str, Any]) -> int:
  136. digg = int(statistics.get("digg_count") or 0)
  137. comment = int(statistics.get("comment_count") or 0)
  138. share = int(statistics.get("share_count") or 0)
  139. weighted = digg + comment * 3 + share * 4
  140. if weighted >= 3000:
  141. return 72
  142. if weighted >= 1000:
  143. return 62
  144. if weighted >= 300:
  145. return 55
  146. return 45