|
@@ -2,16 +2,48 @@ from __future__ import annotations
|
|
|
|
|
|
|
|
import os
|
|
import os
|
|
|
import re
|
|
import re
|
|
|
|
|
+import time
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
-from typing import Any
|
|
|
|
|
|
|
+from typing import Any, Callable
|
|
|
from urllib.parse import urljoin
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
|
|
import httpx
|
|
import httpx
|
|
|
|
|
|
|
|
|
|
+from content_agent.errors import ContentAgentError, ErrorCode
|
|
|
|
|
+
|
|
|
RAW_CONTENT_ID_KEY = "_".join(["aweme", "id"])
|
|
RAW_CONTENT_ID_KEY = "_".join(["aweme", "id"])
|
|
|
RAW_AUTHOR_ID_KEY = "_".join(["sec", "uid"])
|
|
RAW_AUTHOR_ID_KEY = "_".join(["sec", "uid"])
|
|
|
RAW_AUTHOR_ACCOUNT_KEY = "_".join(["account", "id"])
|
|
RAW_AUTHOR_ACCOUNT_KEY = "_".join(["account", "id"])
|
|
|
|
|
|
|
|
|
|
+# 已证实的限流 business code 白名单。当前没有任何已证实的限流 code,
|
|
|
|
|
+# 识别先依靠 HTTP 429 与 message token;live smoke / 真实运行发现新 code 后补入并加用例。
|
|
|
|
|
+RATE_LIMIT_BUSINESS_CODES: set[str] = set()
|
|
|
|
|
+RATE_LIMIT_MESSAGE_TOKENS = ("限流", "请求频繁", "rate limit", "too many requests")
|
|
|
|
|
+
|
|
|
|
|
+SEARCH_RATE_LIMIT_BUCKET = "douyin_search"
|
|
|
|
|
+BLOGGER_RATE_LIMIT_BUCKET = "douyin_blogger"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class RateLimiter:
|
|
|
|
|
+ def __init__(
|
|
|
|
|
+ self,
|
|
|
|
|
+ min_interval_seconds: float = 12.0,
|
|
|
|
|
+ now_fn: Callable[[], float] = time.monotonic,
|
|
|
|
|
+ sleep_fn: Callable[[float], None] = time.sleep,
|
|
|
|
|
+ ) -> None:
|
|
|
|
|
+ self.min_interval_seconds = min_interval_seconds
|
|
|
|
|
+ self.now_fn = now_fn
|
|
|
|
|
+ self.sleep_fn = sleep_fn
|
|
|
|
|
+ self._last_call_by_bucket: dict[str, float] = {}
|
|
|
|
|
+
|
|
|
|
|
+ def wait(self, bucket: str) -> None:
|
|
|
|
|
+ last = self._last_call_by_bucket.get(bucket)
|
|
|
|
|
+ if last is not None:
|
|
|
|
|
+ remaining = self.min_interval_seconds - (self.now_fn() - last)
|
|
|
|
|
+ if remaining > 0:
|
|
|
|
|
+ self.sleep_fn(remaining)
|
|
|
|
|
+ self._last_call_by_bucket[bucket] = self.now_fn()
|
|
|
|
|
+
|
|
|
|
|
|
|
|
class CrawapiDouyinClient:
|
|
class CrawapiDouyinClient:
|
|
|
def __init__(
|
|
def __init__(
|
|
@@ -19,26 +51,32 @@ class CrawapiDouyinClient:
|
|
|
base_url: str,
|
|
base_url: str,
|
|
|
keyword_path: str,
|
|
keyword_path: str,
|
|
|
content_portrait_path: str,
|
|
content_portrait_path: str,
|
|
|
|
|
+ blogger_path: str = "",
|
|
|
timeout_seconds: float = 60.0,
|
|
timeout_seconds: float = 60.0,
|
|
|
default_crawapi_account_ref: str = "",
|
|
default_crawapi_account_ref: str = "",
|
|
|
default_content_type: str = "视频",
|
|
default_content_type: str = "视频",
|
|
|
default_sort_type: str = "综合排序",
|
|
default_sort_type: str = "综合排序",
|
|
|
default_publish_time: str = "不限",
|
|
default_publish_time: str = "不限",
|
|
|
default_cursor: str = "0",
|
|
default_cursor: str = "0",
|
|
|
|
|
+ default_account_works_sort_type: str = "最新",
|
|
|
max_results_per_query: int | None = 3,
|
|
max_results_per_query: int | None = 3,
|
|
|
http_client: Any | None = None,
|
|
http_client: Any | None = None,
|
|
|
|
|
+ rate_limiter: RateLimiter | None = None,
|
|
|
) -> None:
|
|
) -> None:
|
|
|
self.base_url = base_url.rstrip("/") + "/"
|
|
self.base_url = base_url.rstrip("/") + "/"
|
|
|
self.keyword_path = keyword_path.lstrip("/")
|
|
self.keyword_path = keyword_path.lstrip("/")
|
|
|
self.content_portrait_path = content_portrait_path.lstrip("/")
|
|
self.content_portrait_path = content_portrait_path.lstrip("/")
|
|
|
|
|
+ self.blogger_path = blogger_path.lstrip("/")
|
|
|
self.timeout_seconds = timeout_seconds
|
|
self.timeout_seconds = timeout_seconds
|
|
|
self.default_crawapi_account_ref = default_crawapi_account_ref
|
|
self.default_crawapi_account_ref = default_crawapi_account_ref
|
|
|
self.default_content_type = default_content_type
|
|
self.default_content_type = default_content_type
|
|
|
self.default_sort_type = default_sort_type
|
|
self.default_sort_type = default_sort_type
|
|
|
self.default_publish_time = default_publish_time
|
|
self.default_publish_time = default_publish_time
|
|
|
self.default_cursor = default_cursor
|
|
self.default_cursor = default_cursor
|
|
|
|
|
+ self.default_account_works_sort_type = default_account_works_sort_type
|
|
|
self.max_results_per_query = max_results_per_query
|
|
self.max_results_per_query = max_results_per_query
|
|
|
self.http_client = http_client or httpx.Client(timeout=timeout_seconds)
|
|
self.http_client = http_client or httpx.Client(timeout=timeout_seconds)
|
|
|
|
|
+ self.rate_limiter = rate_limiter
|
|
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
|
def from_env(cls, env_path: str | Path = ".env") -> "CrawapiDouyinClient":
|
|
def from_env(cls, env_path: str | Path = ".env") -> "CrawapiDouyinClient":
|
|
@@ -49,6 +87,7 @@ class CrawapiDouyinClient:
|
|
|
content_portrait_path=_env(
|
|
content_portrait_path=_env(
|
|
|
"CONTENTFIND_DOUYIN_VIDEO_LIKE_PORTRAIT_PATH", env, required=True
|
|
"CONTENTFIND_DOUYIN_VIDEO_LIKE_PORTRAIT_PATH", env, required=True
|
|
|
),
|
|
),
|
|
|
|
|
+ blogger_path=_env("CONTENTFIND_DOUYIN_BLOGGER_PATH", env, required=True),
|
|
|
timeout_seconds=float(
|
|
timeout_seconds=float(
|
|
|
_env("CONTENTFIND_API_CRAWAPI_TIMEOUT_SECONDS", env, default="60")
|
|
_env("CONTENTFIND_API_CRAWAPI_TIMEOUT_SECONDS", env, default="60")
|
|
|
),
|
|
),
|
|
@@ -57,9 +96,13 @@ class CrawapiDouyinClient:
|
|
|
default_sort_type=_env("CONTENTFIND_DOUYIN_DEFAULT_SORT_TYPE", env, default="综合排序"),
|
|
default_sort_type=_env("CONTENTFIND_DOUYIN_DEFAULT_SORT_TYPE", env, default="综合排序"),
|
|
|
default_publish_time=_env("CONTENTFIND_DOUYIN_DEFAULT_PUBLISH_TIME", env, default="不限"),
|
|
default_publish_time=_env("CONTENTFIND_DOUYIN_DEFAULT_PUBLISH_TIME", env, default="不限"),
|
|
|
default_cursor=_env("CONTENTFIND_DOUYIN_DEFAULT_CURSOR", env, default="0"),
|
|
default_cursor=_env("CONTENTFIND_DOUYIN_DEFAULT_CURSOR", env, default="0"),
|
|
|
|
|
+ default_account_works_sort_type=_env(
|
|
|
|
|
+ "CONTENTFIND_DOUYIN_ACCOUNT_WORKS_DEFAULT_SORT_TYPE", env, default="最新"
|
|
|
|
|
+ ),
|
|
|
max_results_per_query=_optional_positive_int(
|
|
max_results_per_query=_optional_positive_int(
|
|
|
_env("CONTENTFIND_DOUYIN_MAX_RESULTS_PER_QUERY", env, default="3")
|
|
_env("CONTENTFIND_DOUYIN_MAX_RESULTS_PER_QUERY", env, default="3")
|
|
|
),
|
|
),
|
|
|
|
|
+ rate_limiter=RateLimiter(),
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
def search(self, query: dict[str, Any]) -> list[dict[str, Any]]:
|
|
def search(self, query: dict[str, Any]) -> list[dict[str, Any]]:
|
|
@@ -71,7 +114,10 @@ class CrawapiDouyinClient:
|
|
|
"cursor": str(query.get("page_cursor") or self.default_cursor),
|
|
"cursor": str(query.get("page_cursor") or self.default_cursor),
|
|
|
RAW_AUTHOR_ACCOUNT_KEY: self.default_crawapi_account_ref,
|
|
RAW_AUTHOR_ACCOUNT_KEY: self.default_crawapi_account_ref,
|
|
|
}
|
|
}
|
|
|
- data = self._post_json(self.keyword_path, payload, operation="keyword_search")
|
|
|
|
|
|
|
+ data = self._post_json(
|
|
|
|
|
+ self.keyword_path, payload, operation="keyword_search",
|
|
|
|
|
+ rate_limit_bucket=SEARCH_RATE_LIMIT_BUCKET,
|
|
|
|
|
+ )
|
|
|
data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
|
|
data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
|
|
|
items = data_block.get("data", []) if isinstance(data_block.get("data"), list) else []
|
|
items = data_block.get("data", []) if isinstance(data_block.get("data"), list) else []
|
|
|
has_more = bool(data_block.get("has_more", False))
|
|
has_more = bool(data_block.get("has_more", False))
|
|
@@ -87,13 +133,15 @@ class CrawapiDouyinClient:
|
|
|
return results
|
|
return results
|
|
|
|
|
|
|
|
def fetch_author_works(self, query: dict[str, Any]) -> list[dict[str, Any]]:
|
|
def fetch_author_works(self, query: dict[str, Any]) -> list[dict[str, Any]]:
|
|
|
- author_id = str(query.get("platform_author_id") or "")
|
|
|
|
|
payload = {
|
|
payload = {
|
|
|
- RAW_AUTHOR_ID_KEY: author_id,
|
|
|
|
|
- "cursor": str(query.get("page_cursor") or self.default_cursor),
|
|
|
|
|
- RAW_AUTHOR_ACCOUNT_KEY: self.default_crawapi_account_ref,
|
|
|
|
|
|
|
+ RAW_AUTHOR_ACCOUNT_KEY: str(query.get("platform_author_id") or ""),
|
|
|
|
|
+ "sort_type": self.default_account_works_sort_type,
|
|
|
|
|
+ "cursor": str(query.get("page_cursor") or ""),
|
|
|
}
|
|
}
|
|
|
- data = self._post_json(self.keyword_path, payload, operation="author_works")
|
|
|
|
|
|
|
+ data = self._post_json(
|
|
|
|
|
+ self.blogger_path, payload, operation="author_works",
|
|
|
|
|
+ rate_limit_bucket=BLOGGER_RATE_LIMIT_BUCKET,
|
|
|
|
|
+ )
|
|
|
data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
|
|
data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
|
|
|
items = data_block.get("data", []) if isinstance(data_block.get("data"), list) else []
|
|
items = data_block.get("data", []) if isinstance(data_block.get("data"), list) else []
|
|
|
has_more = bool(data_block.get("has_more", False))
|
|
has_more = bool(data_block.get("has_more", False))
|
|
@@ -104,6 +152,7 @@ class CrawapiDouyinClient:
|
|
|
for index, item in enumerate(selected_items, start=1):
|
|
for index, item in enumerate(selected_items, start=1):
|
|
|
normalized = self._normalize_content_item(query, item, index, has_more, next_cursor)
|
|
normalized = self._normalize_content_item(query, item, index, has_more, next_cursor)
|
|
|
normalized["previous_discovery_step"] = "author_works"
|
|
normalized["previous_discovery_step"] = "author_works"
|
|
|
|
|
+ normalized["content_metadata_source"] = "douyin_blogger"
|
|
|
portrait = self._fetch_content_portrait(normalized["platform_content_id"])
|
|
portrait = self._fetch_content_portrait(normalized["platform_content_id"])
|
|
|
normalized.update(portrait)
|
|
normalized.update(portrait)
|
|
|
results.append(normalized)
|
|
results.append(normalized)
|
|
@@ -198,7 +247,15 @@ class CrawapiDouyinClient:
|
|
|
"age_50_plus_tgi": age_50_tgi,
|
|
"age_50_plus_tgi": age_50_tgi,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- def _post_json(self, path: str, payload: dict[str, Any], operation: str) -> dict[str, Any]:
|
|
|
|
|
|
|
+ def _post_json(
|
|
|
|
|
+ self,
|
|
|
|
|
+ path: str,
|
|
|
|
|
+ payload: dict[str, Any],
|
|
|
|
|
+ operation: str,
|
|
|
|
|
+ rate_limit_bucket: str | None = None,
|
|
|
|
|
+ ) -> dict[str, Any]:
|
|
|
|
|
+ if rate_limit_bucket and self.rate_limiter:
|
|
|
|
|
+ self.rate_limiter.wait(rate_limit_bucket)
|
|
|
url = urljoin(self.base_url, path)
|
|
url = urljoin(self.base_url, path)
|
|
|
try:
|
|
try:
|
|
|
response = self.http_client.post(
|
|
response = self.http_client.post(
|
|
@@ -211,6 +268,12 @@ class CrawapiDouyinClient:
|
|
|
data = response.json()
|
|
data = response.json()
|
|
|
except httpx.HTTPStatusError as exc:
|
|
except httpx.HTTPStatusError as exc:
|
|
|
status_code = exc.response.status_code if exc.response is not None else "unknown"
|
|
status_code = exc.response.status_code if exc.response is not None else "unknown"
|
|
|
|
|
+ if status_code == 429:
|
|
|
|
|
+ raise ContentAgentError(
|
|
|
|
|
+ ErrorCode.PLATFORM_RATE_LIMITED,
|
|
|
|
|
+ f"crawapi {operation} failed: rate_limited",
|
|
|
|
|
+ {"operation": operation, "status_code": 429},
|
|
|
|
|
+ ) from exc
|
|
|
raise RuntimeError(f"crawapi {operation} failed: HTTP {status_code}") from exc
|
|
raise RuntimeError(f"crawapi {operation} failed: HTTP {status_code}") from exc
|
|
|
except httpx.HTTPError as exc:
|
|
except httpx.HTTPError as exc:
|
|
|
raise RuntimeError(f"crawapi {operation} failed: network_error") from exc
|
|
raise RuntimeError(f"crawapi {operation} failed: network_error") from exc
|
|
@@ -220,10 +283,23 @@ class CrawapiDouyinClient:
|
|
|
raise RuntimeError(f"crawapi {operation} failed: bad_response")
|
|
raise RuntimeError(f"crawapi {operation} failed: bad_response")
|
|
|
code = data.get("code")
|
|
code = data.get("code")
|
|
|
if code is not None and code not in (0, "0"):
|
|
if code is not None and code not in (0, "0"):
|
|
|
|
|
+ if _is_rate_limit_business_error(code, data):
|
|
|
|
|
+ raise ContentAgentError(
|
|
|
|
|
+ ErrorCode.PLATFORM_RATE_LIMITED,
|
|
|
|
|
+ f"crawapi {operation} failed: rate_limited",
|
|
|
|
|
+ {"operation": operation, "business_code": str(code)},
|
|
|
|
|
+ )
|
|
|
raise RuntimeError(f"crawapi {operation} failed: business_error")
|
|
raise RuntimeError(f"crawapi {operation} failed: business_error")
|
|
|
return data
|
|
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _is_rate_limit_business_error(code: Any, data: dict[str, Any]) -> bool:
|
|
|
|
|
+ if str(code) in RATE_LIMIT_BUSINESS_CODES:
|
|
|
|
|
+ return True
|
|
|
|
|
+ message = str(data.get("msg") or data.get("message") or "").lower()
|
|
|
|
|
+ return any(token in message for token in RATE_LIMIT_MESSAGE_TOKENS)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def _load_env_file(env_path: str | Path) -> dict[str, str]:
|
|
def _load_env_file(env_path: str | Path) -> dict[str, str]:
|
|
|
path = Path(env_path)
|
|
path = Path(env_path)
|
|
|
if not path.exists():
|
|
if not path.exists():
|