hotspot_profile.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. """
  2. 热点宝画像数据工具(示例)
  3. 调用内部爬虫服务获取账号/内容的粉丝画像。
  4. """
  5. import asyncio
  6. import logging
  7. import time
  8. from typing import Optional, Dict, Any, List, Tuple
  9. import requests
  10. from agent.tools import tool, ToolResult
  11. from utils.tool_logging import format_tool_result_for_log, log_tool_call
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Labels handed to _log_return (and from there to log_tool_call) so that a log
# entry identifies which of the two tools in this module produced it.
_LABEL_ACCOUNT = "工具调用:get_account_fans_portrait -> 抖音账号粉丝画像(热点宝)"
_LABEL_CONTENT = "工具调用:get_content_fans_portrait -> 内容点赞用户画像(热点宝)"
  15. def _log_return(label: str, params: Dict[str, Any], r: ToolResult) -> ToolResult:
  16. log_tool_call(label, params, format_tool_result_for_log(r))
  17. return r
# Crawler-service endpoints that the two tools in this module POST to.
# NOTE(review): both URLs use plain http:// — confirm that is intended.
ACCOUNT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/account_fans_portrait"
CONTENT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/video_like_portrait"

# Request timeout in seconds, used when a caller does not pass ``timeout``.
DEFAULT_TIMEOUT = 60.0
  21. @tool(description="获取抖音账号粉丝画像(热点宝),支持选择画像维度")
  22. async def get_account_fans_portrait(
  23. account_id: str,
  24. need_province: bool = False,
  25. need_city: bool = False,
  26. need_city_level: bool = False,
  27. need_gender: bool = False,
  28. need_age: bool = True,
  29. need_phone_brand: bool = False,
  30. need_phone_price: bool = False,
  31. timeout: Optional[float] = None,
  32. ) -> ToolResult:
  33. """
  34. 获取抖音账号粉丝画像(热点宝数据)
  35. 获取指定账号的粉丝画像数据,包括年龄、性别、地域等多个维度。
  36. Args:
  37. account_id: 抖音账号ID(使用 author.sec_uid)
  38. need_province: 是否获取省份分布,默认 False
  39. need_city: 是否获取城市分布,默认 False
  40. need_city_level: 是否获取城市等级分布(一线/新一线/二线等),默认 False
  41. need_gender: 是否获取性别分布,默认 False
  42. need_age: 是否获取年龄分布,默认 True
  43. need_phone_brand: 是否获取手机品牌分布,默认 False
  44. need_phone_price: 是否获取手机价格分布,默认 False
  45. timeout: 超时时间(秒),默认 60
  46. Returns:
  47. ToolResult: 包含以下内容:
  48. - output: 文本格式的画像摘要
  49. - metadata.has_portrait: 布尔值,表示是否有有效画像数据
  50. - True: 有有效画像数据
  51. - False: 无画像数据
  52. - metadata.portrait_data: 结构化的画像数据(字典格式)
  53. - 键: 维度名称(如 "年龄"、"性别")
  54. - 值: 该维度的分布数据(字典)
  55. - percentage: 占比(如 "48.35%")
  56. - preference: 偏好度/TGI(如 "210.05")
  57. - metadata.raw_data: 原始 API 返回数据
  58. Note:
  59. - account_id 参数使用 author.sec_uid(约80字符)
  60. - 默认只返回年龄分布,需要其他维度时设置对应参数为 True
  61. - 省份数据只显示 TOP5
  62. - 偏好度(TGI)说明:
  63. - > 100: 该人群偏好高于平均水平
  64. - = 100: 平均水平
  65. - < 100: 低于平均水平
  66. - 使用 metadata.has_portrait 判断画像是否有效,不要解析 output 文本
  67. - 从 metadata.portrait_data 获取结构化画像数据
  68. """
  69. start_time = time.time()
  70. call_params = {
  71. "account_id": account_id,
  72. "need_province": need_province,
  73. "need_city": need_city,
  74. "need_city_level": need_city_level,
  75. "need_gender": need_gender,
  76. "need_age": need_age,
  77. "need_phone_brand": need_phone_brand,
  78. "need_phone_price": need_phone_price,
  79. "timeout": timeout,
  80. }
  81. # 验证 account_id 格式
  82. if not account_id or not isinstance(account_id, str):
  83. logger.error("get_account_fans_portrait invalid account_id", extra={"account_id": account_id})
  84. return _log_return(
  85. _LABEL_ACCOUNT,
  86. call_params,
  87. ToolResult(
  88. title="账号粉丝画像获取失败",
  89. output="",
  90. error="account_id 参数无效:必须是非空字符串",
  91. ),
  92. )
  93. if not account_id.startswith("MS4wLjABAAAA"):
  94. logger.error("get_account_fans_portrait invalid sec_uid format", extra={"account_id": account_id})
  95. return _log_return(
  96. _LABEL_ACCOUNT,
  97. call_params,
  98. ToolResult(
  99. title="账号粉丝画像获取失败",
  100. output="",
  101. error=f"account_id 格式错误:必须以 MS4wLjABAAAA 开头,当前值: {account_id[:min(20, len(account_id))]}...",
  102. ),
  103. )
  104. # if len(account_id) < 70 or len(account_id) > 90:
  105. # logger.error("get_account_fans_portrait invalid sec_uid length", extra={"account_id": account_id, "length": len(account_id)})
  106. # return ToolResult(
  107. # title="账号粉丝画像获取失败",
  108. # output="",
  109. # error=f"account_id 长度异常:期望 70-90 字符,实际 {len(account_id)} 字符。这可能是编造或截断的数据。",
  110. # )
  111. try:
  112. payload = {
  113. "account_id": account_id,
  114. "need_province": need_province,
  115. "need_city": need_city,
  116. "need_city_level": need_city_level,
  117. "need_gender": need_gender,
  118. "need_age": need_age,
  119. "need_phone_brand": need_phone_brand,
  120. "need_phone_price": need_phone_price,
  121. }
  122. request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
  123. response = requests.post(
  124. ACCOUNT_FANS_PORTRAIT_API,
  125. json=payload,
  126. headers={"Content-Type": "application/json"},
  127. timeout=request_timeout
  128. )
  129. response.raise_for_status()
  130. data = response.json()
  131. data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
  132. portrait = data_block.get("data", {}) if isinstance(data_block.get("data"), dict) else {}
  133. # 格式化输出摘要
  134. summary_lines = [f"账号 {account_id} 的粉丝画像"]
  135. summary_lines.append(f"画像链接:https://douhot.douyin.com/creator/detail?active_tab=creator_fans_portrait&creator_id={account_id}")
  136. summary_lines.append("")
  137. for k, v in portrait.items():
  138. if not isinstance(v, dict):
  139. continue
  140. if k in ("省份", "城市"):
  141. summary_lines.append(f"【{k} TOP5】分布")
  142. items = _top_k(v, 5)
  143. else:
  144. summary_lines.append(f"【{k}】分布")
  145. items = v.items()
  146. for name, metrics in items:
  147. ratio = metrics.get("percentage")
  148. tgi = metrics.get("preference")
  149. summary_lines.append(f" {name}: {ratio} (偏好度: {tgi})")
  150. summary_lines.append("")
  151. duration_ms = int((time.time() - start_time) * 1000)
  152. has_valid_portrait = bool(portrait and any(
  153. isinstance(v, dict) and v for v in portrait.values()
  154. ))
  155. logger.info(
  156. "get_account_fans_portrait completed",
  157. extra={
  158. "account_id": account_id,
  159. "has_portrait": has_valid_portrait,
  160. "portrait_dimensions": list(portrait.keys()) if portrait else [],
  161. "duration_ms": duration_ms
  162. }
  163. )
  164. return _log_return(
  165. _LABEL_ACCOUNT,
  166. call_params,
  167. ToolResult(
  168. title=f"账号粉丝画像: {account_id}",
  169. output="\n".join(summary_lines),
  170. long_term_memory=f"Fetched fans portrait for account '{account_id}'",
  171. metadata={
  172. "raw_data": data,
  173. "has_portrait": has_valid_portrait,
  174. "portrait_data": portrait,
  175. },
  176. ),
  177. )
  178. except requests.exceptions.HTTPError as e:
  179. logger.error(
  180. "get_account_fans_portrait HTTP error",
  181. extra={
  182. "account_id": account_id,
  183. "status_code": e.response.status_code,
  184. "error": str(e)
  185. }
  186. )
  187. return _log_return(
  188. _LABEL_ACCOUNT,
  189. call_params,
  190. ToolResult(
  191. title="账号粉丝画像获取失败",
  192. output="",
  193. error=f"HTTP {e.response.status_code}: {e.response.text}",
  194. ),
  195. )
  196. except requests.exceptions.Timeout:
  197. logger.error("get_account_fans_portrait timeout", extra={"account_id": account_id, "timeout": request_timeout})
  198. return _log_return(
  199. _LABEL_ACCOUNT,
  200. call_params,
  201. ToolResult(
  202. title="账号粉丝画像获取失败",
  203. output="",
  204. error=f"请求超时({request_timeout}秒)",
  205. ),
  206. )
  207. except requests.exceptions.RequestException as e:
  208. logger.error("get_account_fans_portrait network error", extra={"account_id": account_id, "error": str(e)})
  209. return _log_return(
  210. _LABEL_ACCOUNT,
  211. call_params,
  212. ToolResult(
  213. title="账号粉丝画像获取失败",
  214. output="",
  215. error=f"网络错误: {str(e)}",
  216. ),
  217. )
  218. except Exception as e:
  219. logger.error("get_account_fans_portrait unexpected error", extra={"account_id": account_id, "error": str(e)}, exc_info=True)
  220. return _log_return(
  221. _LABEL_ACCOUNT,
  222. call_params,
  223. ToolResult(
  224. title="账号粉丝画像获取失败",
  225. output="",
  226. error=f"未知错误: {str(e)}",
  227. ),
  228. )
  229. @tool(description="获取抖音内容点赞用户画像(热点宝),支持选择画像维度")
  230. async def get_content_fans_portrait(
  231. content_id: str,
  232. need_province: bool = False,
  233. need_city: bool = False,
  234. need_city_level: bool = False,
  235. need_gender: bool = False,
  236. need_age: bool = True,
  237. need_phone_brand: bool = False,
  238. need_phone_price: bool = False,
  239. timeout: Optional[float] = None,
  240. ) -> ToolResult:
  241. """
  242. 获取抖音内容点赞用户画像(热点宝数据)
  243. 获取指定视频内容的点赞用户画像数据,包括年龄、性别、地域等多个维度。
  244. Args:
  245. content_id: 抖音内容ID(使用 aweme_id)
  246. need_province: 是否获取省份分布,默认 False
  247. need_city: 是否获取城市分布,默认 False
  248. need_city_level: 是否获取城市等级分布(一线/新一线/二线等),默认 False
  249. need_gender: 是否获取性别分布,默认 False
  250. need_age: 是否获取年龄分布,默认 True
  251. need_phone_brand: 是否获取手机品牌分布,默认 False
  252. need_phone_price: 是否获取手机价格分布,默认 False
  253. timeout: 超时时间(秒),默认 60
  254. Returns:
  255. ToolResult: 包含以下内容:
  256. - output: 文本格式的画像摘要
  257. - metadata.has_portrait: 布尔值,表示是否有有效画像数据
  258. - True: 有有效画像数据
  259. - False: 无画像数据(需要使用账号画像兜底)
  260. - metadata.portrait_data: 结构化的画像数据(字典格式)
  261. - 键: 维度名称(如 "年龄"、"性别")
  262. - 值: 该维度的分布数据(字典)
  263. - percentage: 占比(如 "48.35%")
  264. - preference: 偏好度/TGI(如 "210.05")
  265. - metadata.raw_data: 原始 API 返回数据
  266. Note:
  267. - content_id 参数使用 aweme_id
  268. - 默认只返回年龄分布,需要其他维度时设置对应参数为 True
  269. - 省份数据只显示 TOP5
  270. - 偏好度(TGI)说明:
  271. - > 100: 该人群偏好高于平均水平
  272. - = 100: 平均水平
  273. - < 100: 低于平均水平
  274. - 使用 metadata.has_portrait 判断画像是否有效,不要解析 output 文本
  275. - 如果 has_portrait 为 False,应使用 get_account_fans_portrait 作为兜底
  276. - 从 metadata.portrait_data 获取结构化画像数据
  277. """
  278. start_time = time.time()
  279. call_params = {
  280. "content_id": content_id,
  281. "need_province": need_province,
  282. "need_city": need_city,
  283. "need_city_level": need_city_level,
  284. "need_gender": need_gender,
  285. "need_age": need_age,
  286. "need_phone_brand": need_phone_brand,
  287. "need_phone_price": need_phone_price,
  288. "timeout": timeout,
  289. }
  290. # 验证 content_id 格式
  291. if not content_id or not isinstance(content_id, str):
  292. logger.error("get_content_fans_portrait invalid content_id", extra={"content_id": content_id})
  293. return _log_return(
  294. _LABEL_CONTENT,
  295. call_params,
  296. ToolResult(
  297. title="内容点赞用户画像获取失败",
  298. output="",
  299. error="content_id 参数无效:必须是非空字符串",
  300. ),
  301. )
  302. # aweme_id 应该是纯数字字符串,长度约 19 位
  303. if not content_id.isdigit():
  304. logger.error("get_content_fans_portrait invalid aweme_id format", extra={"content_id": content_id})
  305. return _log_return(
  306. _LABEL_CONTENT,
  307. call_params,
  308. ToolResult(
  309. title="内容点赞用户画像获取失败",
  310. output="",
  311. error=f"content_id 格式错误:aweme_id 应该是纯数字,当前值: {content_id[:20]}...",
  312. ),
  313. )
  314. if len(content_id) < 15 or len(content_id) > 25:
  315. logger.error("get_content_fans_portrait invalid aweme_id length", extra={"content_id": content_id, "length": len(content_id)})
  316. return _log_return(
  317. _LABEL_CONTENT,
  318. call_params,
  319. ToolResult(
  320. title="内容点赞用户画像获取失败",
  321. output="",
  322. error=f"content_id 长度异常:期望 15-25 位数字,实际 {len(content_id)} 位",
  323. ),
  324. )
  325. try:
  326. payload = {
  327. "content_id": content_id,
  328. "need_province": need_province,
  329. "need_city": need_city,
  330. "need_city_level": need_city_level,
  331. "need_gender": need_gender,
  332. "need_age": need_age,
  333. "need_phone_brand": need_phone_brand,
  334. "need_phone_price": need_phone_price,
  335. }
  336. request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
  337. response = requests.post(
  338. CONTENT_FANS_PORTRAIT_API,
  339. json=payload,
  340. headers={"Content-Type": "application/json"},
  341. timeout=request_timeout
  342. )
  343. response.raise_for_status()
  344. data = response.json()
  345. data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
  346. portrait = data_block.get("data", {}) if isinstance(data_block.get("data"), dict) else {}
  347. # 格式化输出摘要
  348. summary_lines = [f"内容 {content_id} 的点赞用户画像"]
  349. summary_lines.append(f"画像链接:https://douhot.douyin.com/video/detail?active_tab=video_fans&video_id={content_id}")
  350. summary_lines.append("")
  351. for k, v in portrait.items():
  352. if not isinstance(v, dict):
  353. continue
  354. if k in ("省份", "城市"):
  355. summary_lines.append(f"【{k} TOP5】分布")
  356. items = _top_k(v, 5)
  357. else:
  358. summary_lines.append(f"【{k}】分布")
  359. items = v.items()
  360. for name, metrics in items:
  361. ratio = metrics.get("percentage")
  362. tgi = metrics.get("preference")
  363. summary_lines.append(f" {name}: {ratio} (偏好度: {tgi})")
  364. summary_lines.append("")
  365. duration_ms = int((time.time() - start_time) * 1000)
  366. has_valid_portrait = bool(portrait and any(
  367. isinstance(v, dict) and v for v in portrait.values()
  368. ))
  369. logger.info(
  370. "get_content_fans_portrait completed",
  371. extra={
  372. "content_id": content_id,
  373. "has_portrait": has_valid_portrait,
  374. "portrait_dimensions": list(portrait.keys()) if portrait else [],
  375. "duration_ms": duration_ms
  376. }
  377. )
  378. return _log_return(
  379. _LABEL_CONTENT,
  380. call_params,
  381. ToolResult(
  382. title=f"内容点赞用户画像: {content_id}",
  383. output="\n".join(summary_lines),
  384. long_term_memory=f"Fetched fans portrait for content '{content_id}'",
  385. metadata={
  386. "raw_data": data,
  387. "has_portrait": has_valid_portrait,
  388. "portrait_data": portrait,
  389. },
  390. ),
  391. )
  392. except requests.exceptions.HTTPError as e:
  393. logger.error(
  394. "get_content_fans_portrait HTTP error",
  395. extra={
  396. "content_id": content_id,
  397. "status_code": e.response.status_code,
  398. "error": str(e)
  399. }
  400. )
  401. return _log_return(
  402. _LABEL_CONTENT,
  403. call_params,
  404. ToolResult(
  405. title="内容点赞用户画像获取失败",
  406. output="",
  407. error=f"HTTP {e.response.status_code}: {e.response.text}",
  408. ),
  409. )
  410. except requests.exceptions.Timeout:
  411. logger.error("get_content_fans_portrait timeout", extra={"content_id": content_id, "timeout": request_timeout})
  412. return _log_return(
  413. _LABEL_CONTENT,
  414. call_params,
  415. ToolResult(
  416. title="内容点赞用户画像获取失败",
  417. output="",
  418. error=f"请求超时({request_timeout}秒)",
  419. ),
  420. )
  421. except requests.exceptions.RequestException as e:
  422. logger.error("get_content_fans_portrait network error", extra={"content_id": content_id, "error": str(e)})
  423. return _log_return(
  424. _LABEL_CONTENT,
  425. call_params,
  426. ToolResult(
  427. title="内容点赞用户画像获取失败",
  428. output="",
  429. error=f"网络错误: {str(e)}",
  430. ),
  431. )
  432. except Exception as e:
  433. logger.error("get_content_fans_portrait unexpected error", extra={"content_id": content_id, "error": str(e)}, exc_info=True)
  434. return _log_return(
  435. _LABEL_CONTENT,
  436. call_params,
  437. ToolResult(
  438. title="内容点赞用户画像获取失败",
  439. output="",
  440. error=f"未知错误: {str(e)}",
  441. ),
  442. )
  443. def _top_k(items: Dict[str, Any], k: int) -> List[Tuple[str, Any]]:
  444. def percent_value(entry: Tuple[str, Any]) -> float:
  445. metrics = entry[1] if isinstance(entry[1], dict) else {}
  446. return metrics.get("percentage")
  447. return sorted(items.items(), key=percent_value, reverse=True)[:k]