test_douyin_client.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. import httpx
  2. import pytest
  3. from content_agent.errors import ContentAgentError, ErrorCode
  4. from content_agent.integrations.douyin import (
  5. RAW_AUTHOR_ACCOUNT_KEY,
  6. RAW_AUTHOR_ID_KEY,
  7. RAW_CONTENT_ID_KEY,
  8. CrawapiDouyinClient,
  9. RateLimiter,
  10. )
  11. class FakeHttpClient:
  12. def __init__(self, responses):
  13. self.responses = list(responses)
  14. self.requests = []
  15. def post(self, url, json, headers, timeout):
  16. self.requests.append({"url": url, "json": json, "headers": headers, "timeout": timeout})
  17. response = self.responses.pop(0)
  18. if isinstance(response, Exception):
  19. raise response
  20. return response
  21. def _response(status_code, data):
  22. return httpx.Response(
  23. status_code,
  24. json=data,
  25. request=httpx.Request("POST", "http://crawapi.test/endpoint"),
  26. )
  27. def _client(responses, rate_limiter=None):
  28. return CrawapiDouyinClient(
  29. base_url="http://crawapi.test",
  30. keyword_path="/crawler/dou_yin/keyword",
  31. blogger_path="/crawler/dou_yin/blogger",
  32. default_crawapi_account_ref="771431222",
  33. http_client=FakeHttpClient(responses),
  34. rate_limiter=rate_limiter,
  35. )
  36. def _search_query(text="早上好祝福视频"):
  37. return {
  38. "search_query_id": "q_001",
  39. "search_query": text,
  40. "discovery_start_source": "pattern_itemset",
  41. }
  42. def test_douyin_keyword_search_maps_content_fields():
  43. client = _client(
  44. [
  45. _response(
  46. 200,
  47. {
  48. "data": {
  49. "data": [
  50. {
  51. RAW_CONTENT_ID_KEY: "7615247738577423622",
  52. "desc": "早睡早起早上好",
  53. "author": {
  54. "nickname": "让你心情变浅的兔子",
  55. RAW_AUTHOR_ID_KEY: "MS4wLjABAAAAdYc",
  56. },
  57. "statistics": {
  58. "digg_count": 1931,
  59. "comment_count": 45,
  60. "share_count": 1968,
  61. "collect_count": 462,
  62. },
  63. "text_extra": [{"hashtag_name": "早上好"}],
  64. }
  65. ],
  66. "has_more": True,
  67. "next_cursor": "10",
  68. }
  69. },
  70. ),
  71. ]
  72. )
  73. results = client.search(_search_query())
  74. assert len(results) == 1
  75. result = results[0]
  76. assert result["content_discovery_id"] == "q_001_content_001"
  77. assert result["platform_content_id"] == "7615247738577423622"
  78. assert result["platform_author_id"] == "MS4wLjABAAAAdYc"
  79. assert result["tags"] == ["#早上好"]
  80. assert result["has_more"] is True
  81. assert result["next_cursor"] == "10"
  82. assert result["discovery_relation"] == "derived_from_pattern_demand"
  83. assert result["platform_auth_mode"] == "no_bearer"
  84. assert result["platform_raw_payload"][RAW_CONTENT_ID_KEY] == "7615247738577423622"
  85. assert client.http_client.requests[0]["json"][RAW_AUTHOR_ACCOUNT_KEY] == "771431222"
  86. # V3 清理: 画像调用链已砍,搜索一条内容只发 1 次 keyword 请求,不再追加画像请求。
  87. assert len(client.http_client.requests) == 1
  88. assert client.http_client.requests[0]["url"].endswith("/crawler/dou_yin/keyword")
  89. def test_douyin_keyword_search_returns_empty_list():
  90. client = _client([_response(200, {"data": {"data": [], "has_more": False}})])
  91. results = client.search(_search_query("无结果"))
  92. assert results == []
  93. def test_douyin_keyword_search_uses_explicit_page_cursor():
  94. client = _client([_response(200, {"data": {"data": [], "has_more": False}})])
  95. client.search({**_search_query("下一页"), "page_cursor": "20"})
  96. assert client.http_client.requests[0]["json"]["cursor"] == "20"
  97. def test_douyin_fetch_author_works_maps_fake_response():
  98. client = _client(
  99. [
  100. _response(
  101. 200,
  102. {
  103. "data": {
  104. "data": [
  105. {
  106. RAW_CONTENT_ID_KEY: "7615247738577423001",
  107. "desc": "作者作品",
  108. "author": {
  109. "nickname": "作者",
  110. RAW_AUTHOR_ID_KEY: "MS4wLjABAAAA001",
  111. },
  112. "statistics": {"digg_count": 100},
  113. }
  114. ],
  115. "has_more": False,
  116. }
  117. },
  118. ),
  119. ]
  120. )
  121. results = client.fetch_author_works(
  122. {
  123. "search_query_id": "author_001",
  124. "search_query": "作者作品",
  125. "platform_author_id": "MS4wLjABAAAA001",
  126. "discovery_start_source": "pattern_itemset",
  127. }
  128. )
  129. # M5A 受控变化: 作者作品改打 blogger 接口,payload 用 account_id 三字段合同。
  130. assert results[0]["search_query_id"] == "author_001"
  131. assert results[0]["previous_discovery_step"] == "author_works"
  132. assert client.http_client.requests[0]["json"][RAW_AUTHOR_ACCOUNT_KEY] == "MS4wLjABAAAA001"
  133. assert len(client.http_client.requests) == 1
  134. def test_douyin_keyword_search_http_error_is_sanitized():
  135. client = _client([_response(500, {"error": "server failed"})])
  136. with pytest.raises(RuntimeError, match="keyword_search failed: HTTP 500"):
  137. client.search(_search_query("接口失败"))
  138. def test_douyin_keyword_search_business_error_is_failed():
  139. client = _client([_response(200, {"code": 22001, "msg": "强制登录", "data": None})])
  140. with pytest.raises(RuntimeError, match="keyword_search failed: business_error"):
  141. client.search(_search_query("账号态失败"))
  142. def test_douyin_keyword_search_network_error_is_sanitized():
  143. client = _client([httpx.ConnectError("network failed")])
  144. with pytest.raises(RuntimeError, match="keyword_search failed: network_error"):
  145. client.search(_search_query("网络失败"))
  146. def test_douyin_keyword_search_bad_json_is_sanitized():
  147. client = CrawapiDouyinClient(
  148. base_url="http://crawapi.test",
  149. keyword_path="/crawler/dou_yin/keyword",
  150. http_client=FakeHttpClient(
  151. [
  152. httpx.Response(
  153. 200,
  154. content=b"not json",
  155. request=httpx.Request("POST", "http://crawapi.test/endpoint"),
  156. )
  157. ]
  158. ),
  159. )
  160. with pytest.raises(RuntimeError, match="keyword_search failed: bad_json"):
  161. client.search(_search_query("坏 JSON"))
  162. def test_douyin_keyword_search_can_limit_results_per_query():
  163. client = CrawapiDouyinClient(
  164. base_url="http://crawapi.test",
  165. keyword_path="/crawler/dou_yin/keyword",
  166. max_results_per_query=1,
  167. http_client=FakeHttpClient(
  168. [
  169. _response(
  170. 200,
  171. {
  172. "data": {
  173. "data": [
  174. {RAW_CONTENT_ID_KEY: "1", "author": {}, "statistics": {}},
  175. {RAW_CONTENT_ID_KEY: "2", "author": {}, "statistics": {}},
  176. ]
  177. }
  178. },
  179. ),
  180. ]
  181. ),
  182. )
  183. results = client.search(_search_query("限量"))
  184. assert [result["platform_content_id"] for result in results] == ["1"]
  185. assert len(client.http_client.requests) == 1
  186. def _author_query(author_id="MS4wLjABAAAA001", **extra):
  187. return {
  188. "search_query_id": "author_001",
  189. "search_query": "作者作品",
  190. "platform_author_id": author_id,
  191. "discovery_start_source": "pattern_itemset",
  192. **extra,
  193. }
  194. def _blogger_response(items=None, has_more=True, next_cursor="20"):
  195. return _response(
  196. 200,
  197. {
  198. "code": 0,
  199. "data": {"data": items or [], "has_more": has_more, "next_cursor": next_cursor},
  200. },
  201. )
  202. class FakeRateLimiter:
  203. def __init__(self):
  204. self.buckets = []
  205. def wait(self, bucket):
  206. self.buckets.append(bucket)
  207. def test_fetch_author_works_posts_to_blogger_path():
  208. client = _client([_blogger_response()])
  209. client.fetch_author_works(_author_query())
  210. assert client.http_client.requests[0]["url"].endswith("/crawler/dou_yin/blogger")
  211. def test_fetch_author_works_payload_uses_account_id_from_platform_author_id():
  212. client = _client([_blogger_response()])
  213. client.fetch_author_works(_author_query("MS4wLjABAAAA999"))
  214. payload = client.http_client.requests[0]["json"]
  215. assert payload == {
  216. RAW_AUTHOR_ACCOUNT_KEY: "MS4wLjABAAAA999",
  217. "sort_type": "最新",
  218. "cursor": "",
  219. }
  220. def test_fetch_author_works_uses_page_cursor():
  221. client = _client([_blogger_response()])
  222. client.fetch_author_works(_author_query(page_cursor="20"))
  223. assert client.http_client.requests[0]["json"]["cursor"] == "20"
  224. def test_fetch_author_works_normalizes_author_work_fields():
  225. client = _client(
  226. [
  227. _blogger_response(
  228. items=[
  229. {
  230. RAW_CONTENT_ID_KEY: "7615247738577423001",
  231. "desc": "作者作品",
  232. "author": {"nickname": "作者", RAW_AUTHOR_ID_KEY: "MS4wLjABAAAA001"},
  233. "statistics": {"digg_count": 100},
  234. "create_time": 1733000000,
  235. }
  236. ]
  237. ),
  238. ]
  239. )
  240. results = client.fetch_author_works(_author_query())
  241. assert results[0]["platform_content_id"] == "7615247738577423001"
  242. assert results[0]["platform_author_id"] == "MS4wLjABAAAA001"
  243. assert results[0]["statistics"]["digg_count"] == 100
  244. assert results[0]["create_time"] == 1733000000
  245. assert results[0]["previous_discovery_step"] == "author_works"
  246. assert results[0]["content_metadata_source"] == "douyin_blogger"
  247. assert len(client.http_client.requests) == 1
  248. def test_from_env_reads_blogger_path_and_sort_type(monkeypatch, tmp_path):
  249. monkeypatch.setenv("CONTENTFIND_API_CRAWAPI_BASE_URL", "http://crawapi.test")
  250. monkeypatch.setenv("CONTENTFIND_DOUYIN_KEYWORD_PATH", "/crawler/dou_yin/keyword")
  251. monkeypatch.setenv("CONTENTFIND_DOUYIN_BLOGGER_PATH", "/crawler/dou_yin/blogger")
  252. monkeypatch.setenv("CONTENTFIND_DOUYIN_ACCOUNT_WORKS_DEFAULT_SORT_TYPE", "最热")
  253. client = CrawapiDouyinClient.from_env(env_path=tmp_path / "missing.env")
  254. assert client.blogger_path == "crawler/dou_yin/blogger"
  255. assert client.default_account_works_sort_type == "最热"
  256. assert isinstance(client.rate_limiter, RateLimiter)
  257. def test_rate_limiter_waits_between_keyword_calls():
  258. clock = {"now": 0.0}
  259. sleeps = []
  260. def fake_sleep(seconds):
  261. sleeps.append(seconds)
  262. clock["now"] += seconds
  263. limiter = RateLimiter(min_interval_seconds=12.0, now_fn=lambda: clock["now"], sleep_fn=fake_sleep)
  264. limiter.wait("douyin_search")
  265. limiter.wait("douyin_search")
  266. assert sleeps == [12.0]
  267. def test_search_chain_uses_shared_search_bucket():
  268. limiter = FakeRateLimiter()
  269. client = _client(
  270. [
  271. _response(200, {"code": 0, "data": {"data": [], "has_more": False}}),
  272. _response(200, {"code": 0, "data": {"data": [], "has_more": False}}),
  273. ],
  274. rate_limiter=limiter,
  275. )
  276. client.search(_search_query("关键词"))
  277. client.search({**_search_query("关键词"), "page_cursor": "10"})
  278. assert limiter.buckets == ["douyin_search", "douyin_search"]
  279. def test_blogger_uses_separate_bucket_from_search_chain():
  280. limiter = FakeRateLimiter()
  281. client = _client(
  282. [
  283. _response(200, {"code": 0, "data": {"data": [], "has_more": False}}),
  284. _blogger_response(),
  285. ],
  286. rate_limiter=limiter,
  287. )
  288. client.search(_search_query("关键词"))
  289. client.fetch_author_works(_author_query())
  290. assert limiter.buckets == ["douyin_search", "douyin_blogger"]
  291. def test_http_429_maps_to_platform_rate_limited():
  292. client = _client([_response(429, {"error": "too many"})])
  293. with pytest.raises(ContentAgentError) as exc_info:
  294. client.search(_search_query("被限流"))
  295. assert exc_info.value.error_code == ErrorCode.PLATFORM_RATE_LIMITED
  296. assert exc_info.value.detail["status_code"] == 429
  297. def test_business_rate_limit_code_maps_to_platform_rate_limited(monkeypatch):
  298. from content_agent.integrations import douyin
  299. monkeypatch.setattr(douyin, "RATE_LIMIT_BUSINESS_CODES", {"30005"})
  300. client = _client([_response(200, {"code": 30005, "msg": "ok", "data": None})])
  301. with pytest.raises(ContentAgentError) as exc_info:
  302. client.search(_search_query("业务限流"))
  303. assert exc_info.value.error_code == ErrorCode.PLATFORM_RATE_LIMITED
  304. assert exc_info.value.detail["business_code"] == "30005"
  305. def test_rate_limit_message_token_maps_to_platform_rate_limited():
  306. client = _client([_response(200, {"code": 1, "msg": "请求频繁,请稍后再试", "data": None})])
  307. with pytest.raises(ContentAgentError) as exc_info:
  308. client.search(_search_query("消息限流"))
  309. assert exc_info.value.error_code == ErrorCode.PLATFORM_RATE_LIMITED
  310. def test_force_login_without_rate_limit_code_is_not_rate_limited():
  311. client = _client([_response(200, {"code": 22001, "msg": "强制登录", "data": None})])
  312. with pytest.raises(RuntimeError, match="business_error"):
  313. client.search(_search_query("强制登录"))
  314. def test_bad_json_is_not_rate_limited():
  315. client = _client(
  316. [
  317. httpx.Response(
  318. 200, content=b"not json",
  319. request=httpx.Request("POST", "http://crawapi.test/endpoint"),
  320. )
  321. ]
  322. )
  323. with pytest.raises(RuntimeError, match="bad_json"):
  324. client.search(_search_query("坏响应"))
  325. def test_plain_500_is_not_rate_limited():
  326. client = _client([_response(500, {"error": "server failed"})])
  327. with pytest.raises(RuntimeError, match="HTTP 500"):
  328. client.search(_search_query("普通失败"))