agent_tools.py

import os
import sys
import json
from typing import Any, Dict, List, Optional

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils.logging_config import get_logger
from utils.mysql_db import MysqlHelper
from indentify.indentify import ContentIdentifier
from structure import StructureProcessor

logger = get_logger('AgentTools')


class QueryDataTool:
    """Query knowledge_crawl_content and extract the crawl_data field from each entry in the data column."""

    @staticmethod
    def fetch_crawl_data_list(request_id: str) -> List[Dict[str, Any]]:
        # Select the data column explicitly so that row[0] below is the JSON payload.
        sql = "SELECT data FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
        rows = MysqlHelper.get_values(sql, (request_id,))
        if not rows:
            logger.info(f"request_id={request_id} returned no rows; using default data")
            # Return default data
            default_data = [{
                "request_id": request_id,
                "content_id": "1",
                "id": 1,
                "task_id": 1,
                "crawl_data": {
                    "channel": 1,
                    "channel_content_id": "684a789b000000002202a61b",
                    "content_link": "https://www.xiaohongshu.com/explore/684a789b000000002202a61b",
                    "wx_sn": None,
                    "title": "一个视频学会,5个剪辑工具,超详细教程",
                    "content_type": "video",
                    "body_text": "#剪辑教程[话题]# #剪辑[话题]# #手机剪辑[话题]# #视频制作[话题]# #视频剪辑[话题]# #自学剪辑[话题]# #原创视频[话题]# #新手小白学剪辑[话题]#",
                    "location": "未知",
                    "source_url": None,
                    "mini_program": None,
                    "topic_list": [],
                    "image_url_list": [
                        {
                            "image_type": 2,
                            "image_url": "http://rescdn.yishihui.com/pipeline/image/5be8f08a-4691-41b6-8dda-0b63cc2c1056.jpg"
                        }
                    ],
                    "video_url_list": [
                        # {
                        #     "video_url": "http://rescdn.yishihui.com/pipeline/video/9e38400e-21dc-4063-bab5-47c1667bb59d.mp4",
                        #     "video_duration": 615
                        # }
                    ],
                    "bgm_data": None,
                    "ad_info": None,
                    "is_original": False,
                    "voice_data": None,
                    "channel_account_id": "670a10ac000000001d0216ec",
                    "channel_account_name": "小伍剪辑视频",
                    "channel_account_avatar": "https://sns-avatar-qc.xhscdn.com/avatar/1040g2jo31e469dkq0e005poa22m7c5ncbtuk1g0?imageView2/2/w/80/format/jpg",
                    "item_index": None,
                    "view_count": None,
                    "play_count": None,
                    "like_count": 692,
                    "collect_count": 996,
                    "comment_count": 37,
                    "share_count": None,
                    "looking_count": None,
                    "publish_timestamp": 1749711589000,
                    "modify_timestamp": 1749711589000,
                    "update_timestamp": 1755239186502
                }
            }, {
                "request_id": request_id,
                "content_id": "2",
                "id": 2,
                "task_id": 2,
                "crawl_data": {
                    "channel": 1,
                    "channel_content_id": "684a789b000000002202a61b",
                    "content_link": "https://www.xiaohongshu.com/explore/684a789b000000002202a61b",
                    "wx_sn": None,
                    "title": "一个视频学会,5个剪辑工具,超详细教程",
                    "content_type": "video",
                    "body_text": "#剪辑教程[话题]# #剪辑[话题]# #手机剪辑[话题]# #视频制作[话题]# #视频剪辑[话题]# #自学剪辑[话题]# #原创视频[话题]# #新手小白学剪辑[话题]#",
                    "location": "未知",
                    "source_url": None,
                    "mini_program": None,
                    "topic_list": [],
                    "image_url_list": [
                        {
                            "image_type": 2,
                            "image_url": "http://rescdn.yishihui.com/pipeline/image/5be8f08a-4691-41b6-8dda-0b63cc2c1056.jpg"
                        }
                    ],
                    "video_url_list": [
                        {
                            "video_url": "http://rescdn.yishihui.com/pipeline/video/9e38400e-21dc-4063-bab5-47c1667bb59d.mp4",
                            "video_duration": 615
                        }
                    ],
                    "bgm_data": None,
                    "ad_info": None,
                    "is_original": False,
                    "voice_data": None,
                    "channel_account_id": "670a10ac000000001d0216ec",
                    "channel_account_name": "小伍剪辑视频",
                    "channel_account_avatar": "https://sns-avatar-qc.xhscdn.com/avatar/1040g2jo31e469dkq0e005poa22m7c5ncbtuk1g0?imageView2/2/w/80/format/jpg",
                    "item_index": None,
                    "view_count": None,
                    "play_count": None,
                    "like_count": 692,
                    "collect_count": 996,
                    "comment_count": 37,
                    "share_count": None,
                    "looking_count": None,
                    "publish_timestamp": 1749711589000,
                    "modify_timestamp": 1749711589000,
                    "update_timestamp": 1755239186502
                }
            }]
            return default_data
        results: List[Dict[str, Any]] = []
        for row in rows:
            data_cell = row[0]
            if not data_cell:
                continue
            try:
                parsed = json.loads(data_cell) if isinstance(data_cell, (str, bytes)) else data_cell
                if isinstance(parsed, list):
                    for item in parsed:
                        if isinstance(item, dict):
                            crawl_data = item.get('crawl_data')
                            if isinstance(crawl_data, (dict, list)):
                                results.append({"crawl_data": crawl_data, "raw": item})
                            else:
                                results.append({"crawl_data": item, "raw": item})
                elif isinstance(parsed, dict):
                    crawl_data = parsed.get('crawl_data')
                    if isinstance(crawl_data, (dict, list)):
                        results.append({"crawl_data": crawl_data, "raw": parsed})
                    else:
                        results.append({"crawl_data": parsed, "raw": parsed})
                else:
                    logger.warning("data column is not the expected JSON structure; row skipped")
            except Exception as e:
                logger.error(f"failed to parse data JSON: {e}")
        logger.info(f"request_id={request_id}: extracted {len(results)} crawl_data entries")
        return results
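
# Minimal usage sketch (an assumption, not part of the original module): requires
# a reachable MySQL instance configured for MysqlHelper; "req-123" is a
# hypothetical placeholder request id.
#
#     for entry in QueryDataTool.fetch_crawl_data_list("req-123"):
#         print(entry["crawl_data"].get("title"))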


class IdentifyTool:
    """Invoke the internal indentify capabilities for image/video recognition."""

    def __init__(self) -> None:
        self.identifier = ContentIdentifier()

    def run(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        try:
            formatted_content = self.identifier.parse_formatted_content(crawl_data)
            recognition_result = self.identifier.process_content_recognition(formatted_content)
            title = formatted_content.get('title') or ''
            content = formatted_content.get('body_text') or ''
            channel = formatted_content.get('channel') or ''
            author = formatted_content.get('channel_account_name') or ''
            like_count = formatted_content.get('like_count') or 0
            collect_count = formatted_content.get('collect_count') or 0
            comment_count = formatted_content.get('comment_count') or 0
            view_count = formatted_content.get('view_count') or 0
            publish_time = formatted_content.get('publish_time') or ''
            update_timestamp = formatted_content.get('update_timestamp') or ''
            content_link = formatted_content.get('content_link') or ''
            content_id = formatted_content.get('channel_content_id') or ''
            complete_result = {
                'channel': channel,
                'title': title,
                'content': content,
                'images': recognition_result.get('image_analysis', {}).get('images_comprehension', []),
                'videos': recognition_result.get('video_analysis', {}),
                'meta': {
                    'author': author,
                    'like_count': like_count,
                    'collect_count': collect_count,
                    'comment_count': comment_count,
                    'view_count': view_count,
                    'publish_time': publish_time,
                    'update_timestamp': update_timestamp,
                    'content_link': content_link,
                    'content_id': content_id,
                }
            }
            return complete_result
        except Exception as e:
            logger.error(f"recognition failed: {e}")
            return {
                'channel': '',
                'title': '',
                'content': '',
                'images': [],
                'videos': {},  # same type as the success path (video_analysis dict)
                'meta': {},
                'error': str(e)
            }
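
# Sketch of how IdentifyTool composes with QueryDataTool (an assumption, not
# part of the original module): requires the indentify backends to be
# configured; the result keys follow the complete_result structure built above.
#
#     identify_tool = IdentifyTool()
#     for entry in QueryDataTool.fetch_crawl_data_list("req-123"):
#         result = identify_tool.run(entry["crawl_data"])
#         print(result["title"], len(result["images"]))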


class UpdateDataTool:
    """
    Persistence tool: combines the recognition result with the original crawl_data
    according to the agreed structure and stores it in knowledge_parsing_content.
    """

    @staticmethod
    def store_indentify_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
        """
        Store the recognition result in the knowledge_parsing_content table.

        Args:
            request_id: request ID
            crawl_raw: raw crawl data
            identify_result: recognition result

        Returns:
            ID of the inserted row, or None on failure
        """
        try:
            # Extract the required fields from the raw data
            content_id = crawl_raw.get('content_id') or ''
            task_id = crawl_raw.get('task_id') or ''  # default task ID; adjust as needed
            # Build the INSERT statement
            sql = (
                "INSERT INTO knowledge_parsing_content "
                "(content_id, request_id, task_id, indentify_data, create_time, status) "
                "VALUES (%s, %s, %s, %s, NOW(), %s)"
            )
            # Status 2 = recognition finished
            status = 2
            params = (
                content_id,
                request_id,
                task_id,
                json.dumps(identify_result, ensure_ascii=False),
                status
            )
            result = MysqlHelper.insert_and_get_id(sql, params)
            if result:
                logger.info(f"stored recognition result: request_id={request_id}, content_id={content_id}, insert_id={result}")
            return result
        except Exception as e:
            logger.error(f"failed to store recognition result: request_id={request_id}, error={e}")
            return None

    @staticmethod
    def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], parsing_result: Dict[str, Any]) -> Optional[int]:
        """
        Store the structured result in the knowledge_parsing_content table.

        Args:
            request_id: request ID
            crawl_raw: raw crawl data
            parsing_result: structured result (ideally the return value of
                StructureTool.process_content_structure)

        Returns:
            number of affected rows, or None on failure
        """
        try:
            # Extract the required fields from the raw data
            content_id = crawl_raw.get('content_id') or ''
            # Prefer the 'result' field; fall back to the whole object if it is absent
            structured_content = parsing_result.get('structured_content', {})
            if isinstance(structured_content, dict) and 'result' in structured_content:
                # structured_content is a dict with a 'result' field: store only that field
                parsing_payload = structured_content['result']
            else:
                # otherwise store the whole structured_content
                parsing_payload = structured_content
            # Serialize non-string payloads so the query parameter is always a string
            if not isinstance(parsing_payload, str):
                parsing_payload = json.dumps(parsing_payload, ensure_ascii=False)
            # Update the row created by store_indentify_result
            sql = (
                "UPDATE knowledge_parsing_content "
                "SET parsing_data = %s, status = %s "
                "WHERE content_id = %s"
            )
            # Status 5 = structuring finished
            status = 5
            params = (
                parsing_payload,
                status,
                content_id
            )
            result = MysqlHelper.update_values(sql, params)
            if result:
                logger.info(f"stored structured result: request_id={request_id}, content_id={content_id}")
            return result
        except Exception as e:
            logger.error(f"failed to store structured result: request_id={request_id}, error={e}")
            return None
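
# Sketch of the two-step persistence flow (an assumption, not part of the
# original module): insert the recognition result first, then update the same
# row with the structured result. entry/identified/structured are hypothetical
# names from the surrounding sketches; entry.get("raw", entry) covers the
# default-data path, whose items carry no "raw" key.
#
#     UpdateDataTool.store_indentify_result(request_id, entry.get("raw", entry), identified)
#     UpdateDataTool.store_parsing_result(request_id, entry.get("raw", entry), structured)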


class StructureTool:
    """
    Content structuring tool: delegates to the internal methods of tools/structure.
    """

    def __init__(self):
        """Initialize the structuring tool."""
        self.structure_processor = StructureProcessor()

    def process_content_structure(self, content_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run content structuring.

        Args:
            content_data: content data carrying the recognition result, shaped as:
                {
                    'channel': str,
                    'title': str,
                    'content': str,
                    'images': List[str],
                    'videos': Dict,
                    'meta': Dict
                }

        Returns:
            Dict[str, Any]: the structured result
        """
        try:
            # Normalize the input for the structure processor
            structure_input = {
                "title": content_data.get('title', ''),
                "body_text": content_data.get('content', ''),
                "images_comprehension": content_data.get('images', [])
            }
            # Invoke the structure processor
            structured_content = self.structure_processor.process_content(structure_input)
            # The return value may be a string or a dict; wrap it as-is without accessing .result
            result = {
                'original_data': content_data,
                'structured_content': structured_content,
                'structure_status': 'success',
                'process_time': self._get_current_timestamp()
            }
            logger.info(f"content structuring succeeded: title={content_data.get('title', '')}")
            return result
        except Exception as e:
            logger.error(f"content structuring failed: {e}")
            return {
                'original_data': content_data,
                'structured_content': '',
                'structure_status': 'failed',
                'error': str(e),
                'process_time': self._get_current_timestamp()
            }

    def _get_current_timestamp(self) -> str:
        """Return the current time as a formatted string."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
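

if __name__ == '__main__':
    # End-to-end sketch of the intended tool chain, added for illustration and
    # not part of the original module. It assumes the MySQL tables and the
    # recognition/structuring backends are configured; the request id is a
    # hypothetical placeholder.
    demo_request_id = 'demo-request-id'
    identify_tool = IdentifyTool()
    structure_tool = StructureTool()
    for entry in QueryDataTool.fetch_crawl_data_list(demo_request_id):
        # Default-data items have no "raw" key, so fall back to the item itself.
        raw = entry.get('raw', entry)
        identified = identify_tool.run(entry['crawl_data'])
        UpdateDataTool.store_indentify_result(demo_request_id, raw, identified)
        structured = structure_tool.process_content_structure(identified)
        UpdateDataTool.store_parsing_result(demo_request_id, raw, structured)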