hot_topic_search.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. """
  2. 每日热点话题检索工具(示例)
  3. 调用内部爬虫服务获取“今日热榜”类榜单数据,并按业务规则筛选需要的平台来源。
  4. """
  5. import asyncio
  6. import json
  7. import logging
  8. import sys
  9. import time
  10. from pathlib import Path
  11. from typing import Any, Dict, List, Optional, TypedDict
  12. import requests
  13. def _ensure_import_paths() -> None:
  14. """
  15. 允许直接执行本文件时,也能导入仓库根目录下的 `agent`,
  16. 以及 content_finder 目录下的 `utils` 等模块。
  17. """
  18. content_finder_root = Path(__file__).resolve().parents[1] # .../examples/content_finder
  19. repo_root = Path(__file__).resolve().parents[3] # .../Agent
  20. for p in (repo_root, content_finder_root):
  21. p_str = str(p)
  22. if p_str not in sys.path:
  23. sys.path.insert(0, p_str)
  24. _ensure_import_paths()
# Project-local imports; resolvable thanks to _ensure_import_paths() above.
from agent.tools import ToolResult, tool
from utils.tool_logging import format_tool_result_for_log, log_tool_call

logger = logging.getLogger(__name__)

# Label prefixed to every tool-call log line for this tool.
_LOG_LABEL = "工具调用:hot_topic_search -> 每日热点话题检索(今日热榜)"
# Internal crawler endpoint serving the "今日热榜" (daily hot list) rankings.
HOT_TOPIC_API = "http://crawapi.piaoquantv.com/crawler/jin_ri_re_bang/content_rank"
# Default HTTP request timeout, in seconds.
DEFAULT_TIMEOUT = 60.0
# At most this many matched topics are returned (to save tokens).
MAX_MATCHED_TOPICS = 3
class HotTopicItem(TypedDict):
    """A single hot-list entry: a title plus its heat indicator."""

    title: str  # topic title
    heat: str  # heat/popularity value as reported by the source (free-form string)
class HotTopicSourceBlock(TypedDict, total=False):
    """Topics grouped under one source/platform (all keys optional)."""

    source: str  # platform/source name
    jump_url: str  # URL of the source's ranking page
    type: str  # source-provided list/category type
    topics: List[HotTopicItem]  # entries belonging to this source
class MatchedHotTopic(TypedDict, total=False):
    """A scored topic produced by keyword matching (all keys optional)."""

    title: str
    heat: str
    source: str
    jump_url: str
    type: str
    score: int  # ordering score: 1000 * matched-word count + total matched-word length
    matched_keywords: List[str]  # the input words found in the title
  48. def _normalize_text(text: str) -> str:
  49. return text.strip().lower()
  50. def _prepare_feature_keywords(feature_keywords: Optional[List[str]]) -> List[str]:
  51. if not feature_keywords:
  52. return []
  53. cleaned: List[str] = []
  54. for kw in feature_keywords:
  55. if not isinstance(kw, str):
  56. continue
  57. kw_norm = kw.strip()
  58. if not kw_norm:
  59. continue
  60. cleaned.append(kw_norm)
  61. # 保持顺序去重
  62. seen: set[str] = set()
  63. deduped: List[str] = []
  64. for kw in cleaned:
  65. key = _normalize_text(kw)
  66. if key in seen:
  67. continue
  68. seen.add(key)
  69. deduped.append(kw)
  70. return deduped
  71. def _match_title_by_words(title: str, words: List[str]) -> MatchedHotTopic:
  72. """
  73. 对输入词语列表逐一做包含匹配(规范化后:标题含该词即命中该词)。
  74. 无单字/模糊匹配;score 仅用于 Top 排序:命中词数优先,其次命中词总字数。
  75. """
  76. title_norm = _normalize_text(title)
  77. if not words:
  78. return MatchedHotTopic(title=title, score=0, matched_keywords=[])
  79. matched: List[str] = []
  80. for kw in words:
  81. kw_norm = _normalize_text(kw)
  82. if kw_norm and kw_norm in title_norm:
  83. matched.append(kw)
  84. if not matched:
  85. return MatchedHotTopic(title=title, score=0, matched_keywords=[])
  86. length_bonus = sum(len(k.strip()) for k in matched)
  87. score = 1000 * len(matched) + length_bonus
  88. return MatchedHotTopic(title=title, score=int(score), matched_keywords=matched)
  89. def _build_summary(
  90. *,
  91. blocks: List[HotTopicSourceBlock],
  92. has_more: bool,
  93. next_cursor: Any,
  94. feature_keywords: List[str],
  95. ) -> str:
  96. lines: List[str] = []
  97. total = sum(len(b.get("topics", [])) for b in blocks)
  98. if feature_keywords:
  99. lines.append(f"标题匹配特征词:{', '.join(feature_keywords)}")
  100. else:
  101. lines.append("标题匹配特征词:未提供(不过滤,返回全部话题)")
  102. lines.append(f"共筛出 {len(blocks)} 个来源块,话题 {total} 条")
  103. if has_more:
  104. lines.append(f"还有更多,可用 cursor={next_cursor} 继续拉取")
  105. lines.append("")
  106. for b in blocks:
  107. source = b.get("source") or "未知来源"
  108. jump_url = b.get("jump_url") or ""
  109. b_type = b.get("type") or ""
  110. topics = b.get("topics", [])
  111. header = f"【{source}】{b_type}".strip()
  112. lines.append(header)
  113. if jump_url:
  114. lines.append(f"榜单页: {jump_url}")
  115. for i, t in enumerate(topics[:20], 1):
  116. title = t.get("title", "").strip() or "无标题"
  117. heat = t.get("heat", "").strip() or "-"
  118. lines.append(f"{i}. {title}({heat})")
  119. if len(topics) > 20:
  120. lines.append(f"... 其余 {len(topics) - 20} 条已省略(完整见 metadata)")
  121. lines.append("")
  122. return "\n".join(lines).rstrip()
  123. def _parse_filtered_topics(raw: Dict[str, Any], *, feature_keywords: List[str]) -> Dict[str, Any]:
  124. data_block = raw.get("data", {}) if isinstance(raw.get("data"), dict) else {}
  125. has_more = bool(data_block.get("has_more", False))
  126. next_cursor = data_block.get("next_cursor")
  127. items = data_block.get("data", []) if isinstance(data_block.get("data"), list) else []
  128. candidates: List[MatchedHotTopic] = []
  129. for item in items:
  130. if not isinstance(item, dict):
  131. continue
  132. source = (item.get("source") or "").strip()
  133. rank_list = item.get("rankList", []) if isinstance(item.get("rankList"), list) else []
  134. for r in rank_list:
  135. if not isinstance(r, dict):
  136. continue
  137. title = (r.get("title") or "").strip()
  138. heat = (r.get("heat") or "").strip()
  139. if not title:
  140. continue
  141. scored = _match_title_by_words(title, feature_keywords)
  142. score = int(scored.get("score") or 0)
  143. matched_kw = list(scored.get("matched_keywords") or [])
  144. if feature_keywords and not matched_kw:
  145. continue
  146. candidates.append(
  147. MatchedHotTopic(
  148. title=title,
  149. heat=heat,
  150. source=source,
  151. jump_url=item.get("jump_url") or "",
  152. type=item.get("type") or "",
  153. score=score,
  154. matched_keywords=matched_kw,
  155. )
  156. )
  157. # 全局排序取 Top3
  158. top_topics = sorted(candidates, key=lambda x: int(x.get("score") or 0), reverse=True)[:MAX_MATCHED_TOPICS]
  159. blocks_by_source: Dict[str, HotTopicSourceBlock] = {}
  160. topics_by_source: Dict[str, List[HotTopicItem]] = {}
  161. for t in top_topics:
  162. source = (t.get("source") or "").strip()
  163. if source not in blocks_by_source:
  164. blocks_by_source[source] = HotTopicSourceBlock(
  165. source=source,
  166. jump_url=t.get("jump_url") or "",
  167. type=t.get("type") or "",
  168. topics=[],
  169. )
  170. topic_item: HotTopicItem = {"title": t.get("title") or "", "heat": t.get("heat") or ""}
  171. blocks_by_source[source].setdefault("topics", []).append(topic_item)
  172. topics_by_source.setdefault(source, []).append(topic_item)
  173. blocks: List[HotTopicSourceBlock] = list(blocks_by_source.values())
  174. matched_total = len(top_topics)
  175. return {
  176. "has_more": has_more,
  177. "next_cursor": next_cursor,
  178. "blocks": blocks,
  179. "topics_by_source": topics_by_source,
  180. "top_topics": top_topics,
  181. "matched_total": matched_total,
  182. }
  183. @tool(
  184. description='检索“今日热榜”热点话题;feature_keywords 为词语 list,对榜单标题逐词做包含匹配(命中至少一词即保留)。不传则不过滤,返回全部话题'
  185. )
  186. async def hot_topic_search(
  187. sort_type: str = "最热",
  188. cursor: int = 1,
  189. feature_keywords: Optional[List[str]] = None,
  190. timeout: Optional[float] = None,
  191. ) -> ToolResult:
  192. """
  193. 检索每日热点话题(今日热榜)
  194. Args:
  195. sort_type: 榜单排序方式(如 "最热"),默认 "最热"
  196. cursor: 分页游标(从 1 开始),默认 1
  197. feature_keywords: 词语列表(list[str])。传入时对每条话题标题逐词判断规范化后的包含关系,
  198. 至少命中一词则保留;不传入则不做过滤(返回全部话题)。
  199. timeout: 超时时间(秒),默认 60
  200. Returns:
  201. ToolResult:
  202. - output: 人类可读摘要(为节省 token:最多返回命中的前 3 条话题)
  203. - metadata.has_more: 是否还有下一页
  204. - metadata.next_cursor: 下一页 cursor
  205. - metadata.blocks: 按来源块输出的结构化结果(每块 topics 仅含 title/heat)
  206. - metadata.topics_by_source: 按来源聚合的话题列表(仅含 title/heat)
  207. - metadata.top_topics: Top3 话题明细(含 score、matched_keywords)
  208. - metadata.matched_total: 实际返回的命中话题总数(<=3)
  209. - metadata.feature_keywords: 本次参与匹配的词语(清洗/去重后)
  210. - metadata.raw_data: 原始 API 返回
  211. """
  212. start_time = time.time()
  213. request_timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
  214. cleaned_keywords = _prepare_feature_keywords(feature_keywords)
  215. call_params: Dict[str, Any] = {
  216. "sort_type": sort_type,
  217. "cursor": cursor,
  218. "feature_keywords": cleaned_keywords,
  219. "timeout": request_timeout,
  220. }
  221. if not isinstance(sort_type, str) or not sort_type.strip():
  222. err = ToolResult(title="热点话题检索失败", output="", error="sort_type 参数无效:必须是非空字符串")
  223. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  224. return err
  225. if not isinstance(cursor, int) or cursor <= 0:
  226. err = ToolResult(title="热点话题检索失败", output="", error="cursor 参数无效:必须是正整数")
  227. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  228. return err
  229. if feature_keywords is not None and not isinstance(feature_keywords, list):
  230. err = ToolResult(title="热点话题检索失败", output="", error="feature_keywords 参数无效:必须是字符串列表或不传")
  231. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  232. return err
  233. try:
  234. payload = {"sort_type": sort_type.strip(), "cursor": cursor}
  235. response = requests.post(
  236. HOT_TOPIC_API,
  237. json=payload,
  238. headers={"Content-Type": "application/json"},
  239. timeout=request_timeout,
  240. )
  241. response.raise_for_status()
  242. raw = response.json()
  243. except requests.exceptions.HTTPError as e:
  244. logger.error(
  245. "hot_topic_search HTTP error",
  246. extra={"sort_type": sort_type, "cursor": cursor, "status_code": e.response.status_code},
  247. )
  248. err = ToolResult(title="热点话题检索失败", output="", error=f"HTTP {e.response.status_code}: {e.response.text}")
  249. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  250. return err
  251. except requests.exceptions.Timeout:
  252. logger.error("hot_topic_search timeout", extra={"sort_type": sort_type, "cursor": cursor, "timeout": request_timeout})
  253. err = ToolResult(title="热点话题检索失败", output="", error=f"请求超时({request_timeout}秒)")
  254. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  255. return err
  256. except requests.exceptions.RequestException as e:
  257. logger.error("hot_topic_search network error", extra={"sort_type": sort_type, "cursor": cursor, "error": str(e)})
  258. err = ToolResult(title="热点话题检索失败", output="", error=f"网络错误: {str(e)}")
  259. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  260. return err
  261. except Exception as e:
  262. logger.error(
  263. "hot_topic_search unexpected error",
  264. extra={"sort_type": sort_type, "cursor": cursor, "error": str(e)},
  265. exc_info=True,
  266. )
  267. err = ToolResult(title="热点话题检索失败", output="", error=f"未知错误: {str(e)}")
  268. log_tool_call(_LOG_LABEL, call_params, format_tool_result_for_log(err))
  269. return err
  270. parsed = _parse_filtered_topics(raw if isinstance(raw, dict) else {}, feature_keywords=cleaned_keywords)
  271. has_more = bool(parsed.get("has_more", False))
  272. next_cursor = parsed.get("next_cursor")
  273. blocks = parsed.get("blocks", [])
  274. matched_total = int(parsed.get("matched_total") or 0)
  275. summary = _build_summary(blocks=blocks, has_more=has_more, next_cursor=next_cursor, feature_keywords=cleaned_keywords)
  276. duration_ms = int((time.time() - start_time) * 1000)
  277. logger.info(
  278. "hot_topic_search completed",
  279. extra={
  280. "sort_type": sort_type,
  281. "cursor": cursor,
  282. "blocks_count": len(blocks),
  283. "has_more": has_more,
  284. "next_cursor": next_cursor,
  285. "duration_ms": duration_ms,
  286. },
  287. )
  288. out = ToolResult(
  289. title=f"今日热榜热点话题({sort_type},cursor={cursor})",
  290. output=summary,
  291. long_term_memory=f"Fetched hot topics sort_type='{sort_type}' cursor={cursor}",
  292. metadata={
  293. "raw_data": raw,
  294. "has_more": has_more,
  295. "next_cursor": next_cursor,
  296. "blocks": blocks,
  297. "topics_by_source": parsed.get("topics_by_source", {}),
  298. "top_topics": parsed.get("top_topics", []),
  299. "matched_total": matched_total,
  300. "feature_keywords": cleaned_keywords,
  301. },
  302. include_metadata_in_llm=True,
  303. )
  304. log_tool_call(_LOG_LABEL, call_params, json.dumps(out.metadata.get("topics_by_source", {}), ensure_ascii=False))
  305. return out
  306. async def main() -> None:
  307. result = await hot_topic_search(sort_type="最热", cursor=1)
  308. print(result.output)
  309. if __name__ == "__main__":
  310. asyncio.run(main())