|
|
@@ -1,15 +1,24 @@
|
|
|
from __future__ import annotations
|
|
|
|
|
|
-import os
|
|
|
import re
|
|
|
-import time
|
|
|
from pathlib import Path
|
|
|
-from typing import Any, Callable
|
|
|
-from urllib.parse import urljoin
|
|
|
+from typing import Any
|
|
|
|
|
|
import httpx
|
|
|
|
|
|
-from content_agent.errors import ContentAgentError, ErrorCode
|
|
|
+# 共享 crawapi 基座(V3-M1A):HTTP/限流/限流错误识别/env helper 集中于 crawapi_http,
|
|
|
+# 下方 re-export 保持既有外部 import(测试、smoke 脚本)零改。
|
|
|
+from content_agent.integrations.crawapi_http import (
|
|
|
+ RATE_LIMIT_MESSAGE_TOKENS,
|
|
|
+ RateLimiter,
|
|
|
+ _env,
|
|
|
+ _load_env_file,
|
|
|
+ _optional_positive_int,
|
|
|
+ content_format as _content_format,
|
|
|
+ is_rate_limit_business_error,
|
|
|
+ post_crawapi_json,
|
|
|
+ score_from_statistics as _score_from_statistics,
|
|
|
+)
|
|
|
|
|
|
RAW_CONTENT_ID_KEY = "_".join(["aweme", "id"])
|
|
|
RAW_AUTHOR_ID_KEY = "_".join(["sec", "uid"])
|
|
|
@@ -18,33 +27,11 @@ RAW_AUTHOR_ACCOUNT_KEY = "_".join(["account", "id"])
|
|
|
# 已证实的限流 business code 白名单。当前没有任何已证实的限流 code,
|
|
|
# 识别先依靠 HTTP 429 与 message token;live smoke / 真实运行发现新 code 后补入并加用例。
|
|
|
RATE_LIMIT_BUSINESS_CODES: set[str] = set()
|
|
|
-RATE_LIMIT_MESSAGE_TOKENS = ("限流", "请求频繁", "rate limit", "too many requests")
|
|
|
|
|
|
SEARCH_RATE_LIMIT_BUCKET = "douyin_search"
|
|
|
BLOGGER_RATE_LIMIT_BUCKET = "douyin_blogger"
|
|
|
|
|
|
|
|
|
-class RateLimiter:
|
|
|
- def __init__(
|
|
|
- self,
|
|
|
- min_interval_seconds: float = 12.0,
|
|
|
- now_fn: Callable[[], float] = time.monotonic,
|
|
|
- sleep_fn: Callable[[float], None] = time.sleep,
|
|
|
- ) -> None:
|
|
|
- self.min_interval_seconds = min_interval_seconds
|
|
|
- self.now_fn = now_fn
|
|
|
- self.sleep_fn = sleep_fn
|
|
|
- self._last_call_by_bucket: dict[str, float] = {}
|
|
|
-
|
|
|
- def wait(self, bucket: str) -> None:
|
|
|
- last = self._last_call_by_bucket.get(bucket)
|
|
|
- if last is not None:
|
|
|
- remaining = self.min_interval_seconds - (self.now_fn() - last)
|
|
|
- if remaining > 0:
|
|
|
- self.sleep_fn(remaining)
|
|
|
- self._last_call_by_bucket[bucket] = self.now_fn()
|
|
|
-
|
|
|
-
|
|
|
class CrawapiDouyinClient:
|
|
|
def __init__(
|
|
|
self,
|
|
|
@@ -52,6 +39,7 @@ class CrawapiDouyinClient:
|
|
|
keyword_path: str,
|
|
|
content_portrait_path: str,
|
|
|
blogger_path: str = "",
|
|
|
+ detail_path: str = "",
|
|
|
timeout_seconds: float = 60.0,
|
|
|
default_crawapi_account_ref: str = "",
|
|
|
default_content_type: str = "视频",
|
|
|
@@ -67,6 +55,7 @@ class CrawapiDouyinClient:
|
|
|
self.keyword_path = keyword_path.lstrip("/")
|
|
|
self.content_portrait_path = content_portrait_path.lstrip("/")
|
|
|
self.blogger_path = blogger_path.lstrip("/")
|
|
|
+ self.detail_path = detail_path.lstrip("/")
|
|
|
self.timeout_seconds = timeout_seconds
|
|
|
self.default_crawapi_account_ref = default_crawapi_account_ref
|
|
|
self.default_content_type = default_content_type
|
|
|
@@ -88,6 +77,9 @@ class CrawapiDouyinClient:
|
|
|
"CONTENTFIND_DOUYIN_VIDEO_LIKE_PORTRAIT_PATH", env, required=True
|
|
|
),
|
|
|
blogger_path=_env("CONTENTFIND_DOUYIN_BLOGGER_PATH", env, required=True),
|
|
|
+ detail_path=_env(
|
|
|
+ "CONTENTFIND_DOUYIN_DETAIL_PATH", env, default="/crawler/dou_yin/detail"
|
|
|
+ ),
|
|
|
timeout_seconds=float(
|
|
|
_env("CONTENTFIND_API_CRAWAPI_TIMEOUT_SECONDS", env, default="60")
|
|
|
),
|
|
|
@@ -176,6 +168,7 @@ class CrawapiDouyinClient:
|
|
|
"platform": "douyin",
|
|
|
"platform_content_id": platform_content_id,
|
|
|
"platform_content_format": _content_format(self.default_content_type),
|
|
|
+ "play_url": _extract_play_url(item),
|
|
|
"description": item.get("desc") or item.get("item_title") or "",
|
|
|
"platform_author_id": platform_author_id,
|
|
|
"author_display_name": author.get("nickname") or "",
|
|
|
@@ -247,6 +240,41 @@ class CrawapiDouyinClient:
|
|
|
"age_50_plus_tgi": age_50_tgi,
|
|
|
}
|
|
|
|
|
|
+ def fetch_detail(self, content_id: str) -> dict[str, Any]:
|
|
|
+ data = self._post_json(
|
|
|
+ self.detail_path,
|
|
|
+ {"content_id": str(content_id)},
|
|
|
+ operation="detail",
|
|
|
+ rate_limit_bucket=SEARCH_RATE_LIMIT_BUCKET,
|
|
|
+ )
|
|
|
+ block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
|
|
|
+ detail = block.get("data", {}) if isinstance(block.get("data"), dict) else {}
|
|
|
+ statistics = {
|
|
|
+ "digg_count": int(detail.get("like_count") or 0),
|
|
|
+ "comment_count": int(detail.get("comment_count") or 0),
|
|
|
+ "share_count": int(detail.get("share_count") or 0),
|
|
|
+ "collect_count": int(detail.get("collect_count") or 0),
|
|
|
+ "play_count": int(detail.get("play_count") or 0),
|
|
|
+ }
|
|
|
+ topic_list = detail.get("topic_list") or []
|
|
|
+ tags = [t if str(t).startswith("#") else f"#{t}" for t in topic_list if t]
|
|
|
+ video_list = detail.get("video_url_list") or []
|
|
|
+ play_url = video_list[0].get("video_url") if video_list else None
|
|
|
+ publish_ms = detail.get("publish_timestamp")
|
|
|
+ return {
|
|
|
+ "platform": "douyin",
|
|
|
+ "platform_content_id": str(detail.get("channel_content_id") or content_id),
|
|
|
+ "platform_content_url": detail.get("content_link"),
|
|
|
+ "description": detail.get("body_text") or detail.get("title") or "",
|
|
|
+ "platform_author_id": str(detail.get("channel_account_id") or ""),
|
|
|
+ "author_display_name": detail.get("channel_account_name") or "",
|
|
|
+ "statistics": statistics,
|
|
|
+ "tags": tags,
|
|
|
+ "play_url": play_url,
|
|
|
+ "create_time": int(publish_ms) // 1000 if publish_ms else None,
|
|
|
+ "content_metadata_source": "douyin_detail",
|
|
|
+ }
|
|
|
+
|
|
|
def _post_json(
|
|
|
self,
|
|
|
path: str,
|
|
|
@@ -254,84 +282,24 @@ class CrawapiDouyinClient:
|
|
|
operation: str,
|
|
|
rate_limit_bucket: str | None = None,
|
|
|
) -> dict[str, Any]:
|
|
|
- if rate_limit_bucket and self.rate_limiter:
|
|
|
- self.rate_limiter.wait(rate_limit_bucket)
|
|
|
- url = urljoin(self.base_url, path)
|
|
|
- try:
|
|
|
- response = self.http_client.post(
|
|
|
- url,
|
|
|
- json=payload,
|
|
|
- headers={"Content-Type": "application/json"},
|
|
|
- timeout=self.timeout_seconds,
|
|
|
- )
|
|
|
- response.raise_for_status()
|
|
|
- data = response.json()
|
|
|
- except httpx.HTTPStatusError as exc:
|
|
|
- status_code = exc.response.status_code if exc.response is not None else "unknown"
|
|
|
- if status_code == 429:
|
|
|
- raise ContentAgentError(
|
|
|
- ErrorCode.PLATFORM_RATE_LIMITED,
|
|
|
- f"crawapi {operation} failed: rate_limited",
|
|
|
- {"operation": operation, "status_code": 429},
|
|
|
- ) from exc
|
|
|
- raise RuntimeError(f"crawapi {operation} failed: HTTP {status_code}") from exc
|
|
|
- except httpx.HTTPError as exc:
|
|
|
- raise RuntimeError(f"crawapi {operation} failed: network_error") from exc
|
|
|
- except ValueError as exc:
|
|
|
- raise RuntimeError(f"crawapi {operation} failed: bad_json") from exc
|
|
|
- if not isinstance(data, dict):
|
|
|
- raise RuntimeError(f"crawapi {operation} failed: bad_response")
|
|
|
- code = data.get("code")
|
|
|
- if code is not None and code not in (0, "0"):
|
|
|
- if _is_rate_limit_business_error(code, data):
|
|
|
- raise ContentAgentError(
|
|
|
- ErrorCode.PLATFORM_RATE_LIMITED,
|
|
|
- f"crawapi {operation} failed: rate_limited",
|
|
|
- {"operation": operation, "business_code": str(code)},
|
|
|
- )
|
|
|
- raise RuntimeError(f"crawapi {operation} failed: business_error")
|
|
|
- return data
|
|
|
-
|
|
|
-
|
|
|
-def _is_rate_limit_business_error(code: Any, data: dict[str, Any]) -> bool:
|
|
|
- if str(code) in RATE_LIMIT_BUSINESS_CODES:
|
|
|
- return True
|
|
|
- message = str(data.get("msg") or data.get("message") or "").lower()
|
|
|
- return any(token in message for token in RATE_LIMIT_MESSAGE_TOKENS)
|
|
|
-
|
|
|
-
|
|
|
-def _load_env_file(env_path: str | Path) -> dict[str, str]:
|
|
|
- path = Path(env_path)
|
|
|
- if not path.exists():
|
|
|
- return {}
|
|
|
- env: dict[str, str] = {}
|
|
|
- for line in path.read_text(encoding="utf-8").splitlines():
|
|
|
- stripped = line.strip()
|
|
|
- if not stripped or stripped.startswith("#") or "=" not in stripped:
|
|
|
- continue
|
|
|
- key, value = stripped.split("=", 1)
|
|
|
- env[key.strip()] = value.strip().strip('"').strip("'")
|
|
|
- return env
|
|
|
-
|
|
|
-
|
|
|
-def _env(
|
|
|
- key: str,
|
|
|
- file_env: dict[str, str],
|
|
|
- default: str | None = None,
|
|
|
- required: bool = False,
|
|
|
-) -> str:
|
|
|
- value = file_env.get(key) or os.getenv(key) or default
|
|
|
- if required and not value:
|
|
|
- raise RuntimeError(f"missing required env: {key}")
|
|
|
- return value or ""
|
|
|
+ return post_crawapi_json(
|
|
|
+ http_client=self.http_client,
|
|
|
+ base_url=self.base_url,
|
|
|
+ path=path,
|
|
|
+ payload=payload,
|
|
|
+ operation=operation,
|
|
|
+ timeout_seconds=self.timeout_seconds,
|
|
|
+ rate_limiter=self.rate_limiter,
|
|
|
+ rate_limit_bucket=rate_limit_bucket,
|
|
|
+ business_codes=RATE_LIMIT_BUSINESS_CODES,
|
|
|
+ )
|
|
|
|
|
|
|
|
|
-def _optional_positive_int(value: str) -> int | None:
|
|
|
- try:
|
|
|
- parsed = int(value)
|
|
|
- except ValueError:
|
|
|
- return None
|
|
|
- return parsed if parsed > 0 else None
|
|
|
+def _extract_play_url(item: dict[str, Any]) -> str | None:
|
|
|
+ video = item.get("video") if isinstance(item.get("video"), dict) else {}
|
|
|
+ play_addr = video.get("play_addr") if isinstance(video.get("play_addr"), dict) else {}
|
|
|
+ url_list = play_addr.get("url_list") or []
|
|
|
+ return str(url_list[0]) if url_list else None
|
|
|
|
|
|
|
|
|
def _extract_tags(item: dict[str, Any]) -> list[str]:
|
|
|
@@ -349,30 +317,6 @@ def _extract_tags(item: dict[str, Any]) -> list[str]:
|
|
|
return list(dict.fromkeys(tags))
|
|
|
|
|
|
|
|
|
-def _content_format(raw_content_type: str) -> str:
|
|
|
- if "图文" in raw_content_type:
|
|
|
- return "image_text"
|
|
|
- if "文本" in raw_content_type:
|
|
|
- return "text"
|
|
|
- if "直播" in raw_content_type:
|
|
|
- return "live"
|
|
|
- return "video"
|
|
|
-
|
|
|
-
|
|
|
-def _score_from_statistics(statistics: dict[str, Any]) -> int:
|
|
|
- digg = int(statistics.get("digg_count") or 0)
|
|
|
- comment = int(statistics.get("comment_count") or 0)
|
|
|
- share = int(statistics.get("share_count") or 0)
|
|
|
- weighted = digg + comment * 3 + share * 4
|
|
|
- if weighted >= 3000:
|
|
|
- return 72
|
|
|
- if weighted >= 1000:
|
|
|
- return 62
|
|
|
- if weighted >= 300:
|
|
|
- return 55
|
|
|
- return 45
|
|
|
-
|
|
|
-
|
|
|
def _normalize_age_distribution(age_data: Any) -> list[dict[str, Any]]:
|
|
|
rows: list[dict[str, Any]] = []
|
|
|
items = age_data.items() if isinstance(age_data, dict) else []
|