hotspot_profile.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. """
  2. 热点宝画像数据工具(示例)
  3. 调用内部爬虫服务获取账号/内容的粉丝画像。
  4. """
  5. import asyncio
  6. import logging
  7. import time
  8. from typing import Optional, Dict, Any, List, Tuple
  9. import requests
  10. from agent.tools import tool, ToolResult
  11. logger = logging.getLogger(__name__)
  12. ACCOUNT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/account_fans_portrait"
  13. CONTENT_FANS_PORTRAIT_API = "http://crawapi.piaoquantv.com/crawler/dou_yin/re_dian_bao/video_like_portrait"
  14. DEFAULT_TIMEOUT = 60.0
  15. @tool(description="获取抖音账号粉丝画像(热点宝),支持选择画像维度")
  16. async def get_account_fans_portrait(
  17. account_id: str,
  18. need_province: bool = False,
  19. need_city: bool = False,
  20. need_city_level: bool = False,
  21. need_gender: bool = False,
  22. need_age: bool = True,
  23. need_phone_brand: bool = False,
  24. need_phone_price: bool = False,
  25. timeout: Optional[float] = None,
  26. ) -> ToolResult:
  27. """
  28. 获取抖音账号粉丝画像(热点宝数据)
  29. 获取指定账号的粉丝画像数据,包括年龄、性别、地域等多个维度。
  30. Args:
  31. account_id: 抖音账号ID(使用 author.sec_uid)
  32. need_province: 是否获取省份分布,默认 False
  33. need_city: 是否获取城市分布,默认 False
  34. need_city_level: 是否获取城市等级分布(一线/新一线/二线等),默认 False
  35. need_gender: 是否获取性别分布,默认 False
  36. need_age: 是否获取年龄分布,默认 True
  37. need_phone_brand: 是否获取手机品牌分布,默认 False
  38. need_phone_price: 是否获取手机价格分布,默认 False
  39. timeout: 超时时间(秒),默认 60
  40. Returns:
  41. ToolResult: 包含以下内容:
  42. - output: 文本格式的画像摘要
  43. - metadata.has_portrait: 布尔值,表示是否有有效画像数据
  44. - True: 有有效画像数据
  45. - False: 无画像数据
  46. - metadata.portrait_data: 结构化的画像数据(字典格式)
  47. - 键: 维度名称(如 "年龄"、"性别")
  48. - 值: 该维度的分布数据(字典)
  49. - percentage: 占比(如 "48.35%")
  50. - preference: 偏好度/TGI(如 "210.05")
  51. - metadata.raw_data: 原始 API 返回数据
  52. Note:
  53. - account_id 参数使用 author.sec_uid(约80字符)
  54. - 默认只返回年龄分布,需要其他维度时设置对应参数为 True
  55. - 省份数据只显示 TOP5
  56. - 偏好度(TGI)说明:
  57. - > 100: 该人群偏好高于平均水平
  58. - = 100: 平均水平
  59. - < 100: 低于平均水平
  60. - 使用 metadata.has_portrait 判断画像是否有效,不要解析 output 文本
  61. - 从 metadata.portrait_data 获取结构化画像数据
  62. """
  63. start_time = time.time()
  64. # 验证 account_id 格式
  65. if not account_id or not isinstance(account_id, str):
  66. logger.error("get_account_fans_portrait invalid account_id", extra={"account_id": account_id})
  67. return ToolResult(
  68. title="账号粉丝画像获取失败",
  69. output="",
  70. error="account_id 参数无效:必须是非空字符串",
  71. )
  72. if not account_id.startswith("MS4wLjABAAAA"):
  73. logger.error("get_account_fans_portrait invalid sec_uid format", extra={"account_id": account_id})
  74. return ToolResult(
  75. title="账号粉丝画像获取失败",
  76. output="",
  77. error=f"account_id 格式错误:必须以 MS4wLjABAAAA 开头,当前值: {account_id[:min(20, len(account_id))]}...",
  78. )
  79. if len(account_id) < 70 or len(account_id) > 90:
  80. logger.error("get_account_fans_portrait invalid sec_uid length", extra={"account_id": account_id, "length": len(account_id)})
  81. return ToolResult(
  82. title="账号粉丝画像获取失败",
  83. output="",
  84. error=f"account_id 长度异常:期望 70-90 字符,实际 {len(account_id)} 字符。这可能是编造或截断的数据。",
  85. )
  86. try:
  87. payload = {
  88. "account_id": account_id,
  89. "need_province": need_province,
  90. "need_city": need_city,
  91. "need_city_level": need_city_level,
  92. "need_gender": need_gender,
  93. "need_age": need_age,
  94. "need_phone_brand": need_phone_brand,
  95. "need_phone_price": need_phone_price,
  96. }
  97. request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
  98. response = requests.post(
  99. ACCOUNT_FANS_PORTRAIT_API,
  100. json=payload,
  101. headers={"Content-Type": "application/json"},
  102. timeout=request_timeout
  103. )
  104. response.raise_for_status()
  105. data = response.json()
  106. data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
  107. portrait = data_block.get("data", {}) if isinstance(data_block.get("data"), dict) else {}
  108. # 格式化输出摘要
  109. summary_lines = [f"账号 {account_id} 的粉丝画像"]
  110. summary_lines.append(f"画像链接:https://douhot.douyin.com/creator/detail?active_tab=creator_fans_portrait&creator_id={account_id}")
  111. summary_lines.append("")
  112. for k, v in portrait.items():
  113. if not isinstance(v, dict):
  114. continue
  115. if k in ("省份", "城市"):
  116. summary_lines.append(f"【{k} TOP5】分布")
  117. items = _top_k(v, 5)
  118. else:
  119. summary_lines.append(f"【{k}】分布")
  120. items = v.items()
  121. for name, metrics in items:
  122. ratio = metrics.get("percentage")
  123. tgi = metrics.get("preference")
  124. summary_lines.append(f" {name}: {ratio} (偏好度: {tgi})")
  125. summary_lines.append("")
  126. duration_ms = int((time.time() - start_time) * 1000)
  127. has_valid_portrait = bool(portrait and any(
  128. isinstance(v, dict) and v for v in portrait.values()
  129. ))
  130. logger.info(
  131. "get_account_fans_portrait completed",
  132. extra={
  133. "account_id": account_id,
  134. "has_portrait": has_valid_portrait,
  135. "portrait_dimensions": list(portrait.keys()) if portrait else [],
  136. "duration_ms": duration_ms
  137. }
  138. )
  139. return ToolResult(
  140. title=f"账号粉丝画像: {account_id}",
  141. output="\n".join(summary_lines),
  142. long_term_memory=f"Fetched fans portrait for account '{account_id}'",
  143. metadata={
  144. "raw_data": data,
  145. "has_portrait": has_valid_portrait,
  146. "portrait_data": portrait
  147. }
  148. )
  149. except requests.exceptions.HTTPError as e:
  150. logger.error(
  151. "get_account_fans_portrait HTTP error",
  152. extra={
  153. "account_id": account_id,
  154. "status_code": e.response.status_code,
  155. "error": str(e)
  156. }
  157. )
  158. return ToolResult(
  159. title="账号粉丝画像获取失败",
  160. output="",
  161. error=f"HTTP {e.response.status_code}: {e.response.text}",
  162. )
  163. except requests.exceptions.Timeout:
  164. logger.error("get_account_fans_portrait timeout", extra={"account_id": account_id, "timeout": request_timeout})
  165. return ToolResult(
  166. title="账号粉丝画像获取失败",
  167. output="",
  168. error=f"请求超时({request_timeout}秒)",
  169. )
  170. except requests.exceptions.RequestException as e:
  171. logger.error("get_account_fans_portrait network error", extra={"account_id": account_id, "error": str(e)})
  172. return ToolResult(
  173. title="账号粉丝画像获取失败",
  174. output="",
  175. error=f"网络错误: {str(e)}",
  176. )
  177. except Exception as e:
  178. logger.error("get_account_fans_portrait unexpected error", extra={"account_id": account_id, "error": str(e)}, exc_info=True)
  179. return ToolResult(
  180. title="账号粉丝画像获取失败",
  181. output="",
  182. error=f"未知错误: {str(e)}",
  183. )
  184. @tool(description="获取抖音内容点赞用户画像(热点宝),支持选择画像维度")
  185. async def get_content_fans_portrait(
  186. content_id: str,
  187. need_province: bool = False,
  188. need_city: bool = False,
  189. need_city_level: bool = False,
  190. need_gender: bool = False,
  191. need_age: bool = True,
  192. need_phone_brand: bool = False,
  193. need_phone_price: bool = False,
  194. timeout: Optional[float] = None,
  195. ) -> ToolResult:
  196. """
  197. 获取抖音内容点赞用户画像(热点宝数据)
  198. 获取指定视频内容的点赞用户画像数据,包括年龄、性别、地域等多个维度。
  199. Args:
  200. content_id: 抖音内容ID(使用 aweme_id)
  201. need_province: 是否获取省份分布,默认 False
  202. need_city: 是否获取城市分布,默认 False
  203. need_city_level: 是否获取城市等级分布(一线/新一线/二线等),默认 False
  204. need_gender: 是否获取性别分布,默认 False
  205. need_age: 是否获取年龄分布,默认 True
  206. need_phone_brand: 是否获取手机品牌分布,默认 False
  207. need_phone_price: 是否获取手机价格分布,默认 False
  208. timeout: 超时时间(秒),默认 60
  209. Returns:
  210. ToolResult: 包含以下内容:
  211. - output: 文本格式的画像摘要
  212. - metadata.has_portrait: 布尔值,表示是否有有效画像数据
  213. - True: 有有效画像数据
  214. - False: 无画像数据(需要使用账号画像兜底)
  215. - metadata.portrait_data: 结构化的画像数据(字典格式)
  216. - 键: 维度名称(如 "年龄"、"性别")
  217. - 值: 该维度的分布数据(字典)
  218. - percentage: 占比(如 "48.35%")
  219. - preference: 偏好度/TGI(如 "210.05")
  220. - metadata.raw_data: 原始 API 返回数据
  221. Note:
  222. - content_id 参数使用 aweme_id
  223. - 默认只返回年龄分布,需要其他维度时设置对应参数为 True
  224. - 省份数据只显示 TOP5
  225. - 偏好度(TGI)说明:
  226. - > 100: 该人群偏好高于平均水平
  227. - = 100: 平均水平
  228. - < 100: 低于平均水平
  229. - 使用 metadata.has_portrait 判断画像是否有效,不要解析 output 文本
  230. - 如果 has_portrait 为 False,应使用 get_account_fans_portrait 作为兜底
  231. - 从 metadata.portrait_data 获取结构化画像数据
  232. """
  233. start_time = time.time()
  234. # 验证 content_id 格式
  235. if not content_id or not isinstance(content_id, str):
  236. logger.error("get_content_fans_portrait invalid content_id", extra={"content_id": content_id})
  237. return ToolResult(
  238. title="内容点赞用户画像获取失败",
  239. output="",
  240. error="content_id 参数无效:必须是非空字符串",
  241. )
  242. # aweme_id 应该是纯数字字符串,长度约 19 位
  243. if not content_id.isdigit():
  244. logger.error("get_content_fans_portrait invalid aweme_id format", extra={"content_id": content_id})
  245. return ToolResult(
  246. title="内容点赞用户画像获取失败",
  247. output="",
  248. error=f"content_id 格式错误:aweme_id 应该是纯数字,当前值: {content_id[:20]}...",
  249. )
  250. if len(content_id) < 15 or len(content_id) > 25:
  251. logger.error("get_content_fans_portrait invalid aweme_id length", extra={"content_id": content_id, "length": len(content_id)})
  252. return ToolResult(
  253. title="内容点赞用户画像获取失败",
  254. output="",
  255. error=f"content_id 长度异常:期望 15-25 位数字,实际 {len(content_id)} 位",
  256. )
  257. try:
  258. payload = {
  259. "content_id": content_id,
  260. "need_province": need_province,
  261. "need_city": need_city,
  262. "need_city_level": need_city_level,
  263. "need_gender": need_gender,
  264. "need_age": need_age,
  265. "need_phone_brand": need_phone_brand,
  266. "need_phone_price": need_phone_price,
  267. }
  268. request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
  269. response = requests.post(
  270. CONTENT_FANS_PORTRAIT_API,
  271. json=payload,
  272. headers={"Content-Type": "application/json"},
  273. timeout=request_timeout
  274. )
  275. response.raise_for_status()
  276. data = response.json()
  277. data_block = data.get("data", {}) if isinstance(data.get("data"), dict) else {}
  278. portrait = data_block.get("data", {}) if isinstance(data_block.get("data"), dict) else {}
  279. # 格式化输出摘要
  280. summary_lines = [f"内容 {content_id} 的点赞用户画像"]
  281. summary_lines.append(f"画像链接:https://douhot.douyin.com/video/detail?active_tab=video_fans&video_id={content_id}")
  282. summary_lines.append("")
  283. for k, v in portrait.items():
  284. if not isinstance(v, dict):
  285. continue
  286. if k in ("省份", "城市"):
  287. summary_lines.append(f"【{k} TOP5】分布")
  288. items = _top_k(v, 5)
  289. else:
  290. summary_lines.append(f"【{k}】分布")
  291. items = v.items()
  292. for name, metrics in items:
  293. ratio = metrics.get("percentage")
  294. tgi = metrics.get("preference")
  295. summary_lines.append(f" {name}: {ratio} (偏好度: {tgi})")
  296. summary_lines.append("")
  297. duration_ms = int((time.time() - start_time) * 1000)
  298. has_valid_portrait = bool(portrait and any(
  299. isinstance(v, dict) and v for v in portrait.values()
  300. ))
  301. logger.info(
  302. "get_content_fans_portrait completed",
  303. extra={
  304. "content_id": content_id,
  305. "has_portrait": has_valid_portrait,
  306. "portrait_dimensions": list(portrait.keys()) if portrait else [],
  307. "duration_ms": duration_ms
  308. }
  309. )
  310. return ToolResult(
  311. title=f"内容点赞用户画像: {content_id}",
  312. output="\n".join(summary_lines),
  313. long_term_memory=f"Fetched fans portrait for content '{content_id}'",
  314. metadata={
  315. "raw_data": data,
  316. "has_portrait": has_valid_portrait,
  317. "portrait_data": portrait
  318. }
  319. )
  320. except requests.exceptions.HTTPError as e:
  321. logger.error(
  322. "get_content_fans_portrait HTTP error",
  323. extra={
  324. "content_id": content_id,
  325. "status_code": e.response.status_code,
  326. "error": str(e)
  327. }
  328. )
  329. return ToolResult(
  330. title="内容点赞用户画像获取失败",
  331. output="",
  332. error=f"HTTP {e.response.status_code}: {e.response.text}",
  333. )
  334. except requests.exceptions.Timeout:
  335. logger.error("get_content_fans_portrait timeout", extra={"content_id": content_id, "timeout": request_timeout})
  336. return ToolResult(
  337. title="内容点赞用户画像获取失败",
  338. output="",
  339. error=f"请求超时({request_timeout}秒)",
  340. )
  341. except requests.exceptions.RequestException as e:
  342. logger.error("get_content_fans_portrait network error", extra={"content_id": content_id, "error": str(e)})
  343. return ToolResult(
  344. title="内容点赞用户画像获取失败",
  345. output="",
  346. error=f"网络错误: {str(e)}",
  347. )
  348. except Exception as e:
  349. logger.error("get_content_fans_portrait unexpected error", extra={"content_id": content_id, "error": str(e)}, exc_info=True)
  350. return ToolResult(
  351. title="内容点赞用户画像获取失败",
  352. output="",
  353. error=f"未知错误: {str(e)}",
  354. )
  355. def _top_k(items: Dict[str, Any], k: int) -> List[Tuple[str, Any]]:
  356. def percent_value(entry: Tuple[str, Any]) -> float:
  357. metrics = entry[1] if isinstance(entry[1], dict) else {}
  358. return metrics.get("percentage")
  359. return sorted(items.items(), key=percent_value, reverse=True)[:k]