agent_tools.py

import os
import sys
import json
from typing import Any, Dict, List, Optional

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils.logging_config import get_logger
from utils.mysql_db import MysqlHelper
from indentify.indentify import ContentIdentifier

logger = get_logger('AgentTools')


class QueryDataTool:
    """Query knowledge_crawl_content and extract the crawl_data field from each item in the data column."""

    @staticmethod
    def fetch_crawl_data_list(request_id: str) -> List[Dict[str, Any]]:
        sql = "SELECT data FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
        rows = MysqlHelper.get_values(sql, (request_id,))
        if not rows:
            logger.info(f"request_id={request_id} returned no rows, falling back to default data")
            # Return default data
            default_data = {
                "request_id": request_id,
                "content_id": "684a789b000000002202a61b",
                "id": 1,
                "task_id": 1,
                "crawl_data": {
                    "channel": 1,
                    "channel_content_id": "684a789b000000002202a61b",
                    "content_link": "https://www.xiaohongshu.com/explore/684a789b000000002202a61b",
                    "wx_sn": None,
                    "title": "一个视频学会,5个剪辑工具,超详细教程",
                    "content_type": "video",
                    "body_text": "#剪辑教程[话题]# #剪辑[话题]# #手机剪辑[话题]# #视频制作[话题]# #视频剪辑[话题]# #自学剪辑[话题]# #原创视频[话题]# #新手小白学剪辑[话题]#",
                    "location": "未知",
                    "source_url": None,
                    "mini_program": None,
                    "topic_list": [],
                    "image_url_list": [
                        {
                            "image_type": 2,
                            "image_url": "http://rescdn.yishihui.com/pipeline/image/5be8f08a-4691-41b6-8dda-0b63cc2c1056.jpg"
                        }
                    ],
                    "video_url_list": [
                        # {
                        #     "video_url": "http://rescdn.yishihui.com/pipeline/video/9e38400e-21dc-4063-bab5-47c1667bb59d.mp4",
                        #     "video_duration": 615
                        # }
                    ],
                    "bgm_data": None,
                    "ad_info": None,
                    "is_original": False,
                    "voice_data": None,
                    "channel_account_id": "670a10ac000000001d0216ec",
                    "channel_account_name": "小伍剪辑视频",
                    "channel_account_avatar": "https://sns-avatar-qc.xhscdn.com/avatar/1040g2jo31e469dkq0e005poa22m7c5ncbtuk1g0?imageView2/2/w/80/format/jpg",
                    "item_index": None,
                    "view_count": None,
                    "play_count": None,
                    "like_count": 692,
                    "collect_count": 996,
                    "comment_count": 37,
                    "share_count": None,
                    "looking_count": None,
                    "publish_timestamp": 1749711589000,
                    "modify_timestamp": 1749711589000,
                    "update_timestamp": 1755239186502
                }
            }
            return [default_data]

        results: List[Dict[str, Any]] = []
        for row in rows:
            data_cell = row[0]
            if not data_cell:
                continue
            try:
                parsed = json.loads(data_cell) if isinstance(data_cell, (str, bytes)) else data_cell
                if isinstance(parsed, list):
                    for item in parsed:
                        if isinstance(item, dict):
                            crawl_data = item.get('crawl_data')
                            if isinstance(crawl_data, (dict, list)):
                                results.append({"crawl_data": crawl_data, "raw": item})
                            else:
                                results.append({"crawl_data": item, "raw": item})
                elif isinstance(parsed, dict):
                    crawl_data = parsed.get('crawl_data')
                    if isinstance(crawl_data, (dict, list)):
                        results.append({"crawl_data": crawl_data, "raw": parsed})
                    else:
                        results.append({"crawl_data": parsed, "raw": parsed})
                else:
                    logger.warning("data column is not the expected JSON structure, skipping this row")
            except Exception as e:
                logger.error(f"Failed to parse data JSON: {e}")
        logger.info(f"request_id={request_id} extracted crawl_data count: {len(results)}")
        return results
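

# Usage sketch (illustrative assumption, not part of the original module): each item returned by
# QueryDataTool.fetch_crawl_data_list from the database carries the parsed payload under
# "crawl_data" and the untouched row object under "raw":
#
#     items = QueryDataTool.fetch_crawl_data_list("req-123")  # "req-123" is a hypothetical request_id
#     for item in items:
#         crawl_data = item["crawl_data"]  # dict or list parsed from the data column
#         raw = item["raw"]                # original JSON object, kept for bookkeeping
#
# The default fallback instead returns the default_data dict itself (with crawl_data nested and
# no "raw" key), so callers should tolerate both shapes.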


class IdentifyTool:
    """Invoke the internal capabilities of the indentify module to perform image/video recognition."""

    def __init__(self) -> None:
        self.identifier = ContentIdentifier()

    def run(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        try:
            formatted_content = self.identifier.parse_formatted_content(crawl_data)
            recognition_result = self.identifier.process_content_recognition(formatted_content)
            title = formatted_content.get('title') or ''
            content = formatted_content.get('body_text') or ''
            channel = formatted_content.get('channel') or ''
            author = formatted_content.get('channel_account_name') or ''
            like_count = formatted_content.get('like_count') or 0
            collect_count = formatted_content.get('collect_count') or 0
            comment_count = formatted_content.get('comment_count') or 0
            view_count = formatted_content.get('view_count') or 0
            publish_time = formatted_content.get('publish_time') or ''
            update_timestamp = formatted_content.get('update_timestamp') or ''
            content_link = formatted_content.get('content_link') or ''
            content_id = formatted_content.get('channel_content_id') or ''
            complete_result = {
                'channel': channel,
                'title': title,
                'content': content,
                'images': recognition_result.get('image_analysis', {}).get('images_comprehension', []),
                'videos': recognition_result.get('video_analysis', {}),
                'meta': {
                    'author': author,
                    'like_count': like_count,
                    'collect_count': collect_count,
                    'comment_count': comment_count,
                    'view_count': view_count,
                    'publish_time': publish_time,
                    'update_timestamp': update_timestamp,
                    'content_link': content_link,
                    'content_id': content_id,
                }
            }
            return complete_result
        except Exception as e:
            logger.error(f"Recognition failed: {e}")
            return {
                'channel': '',
                'title': '',
                'content': '',
                'images': [],
                'videos': [],
                'meta': {},
                'error': str(e)
            }
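

# Usage sketch (illustrative assumption): IdentifyTool.run always returns a dict with the keys
# channel/title/content/images/videos/meta, plus an extra 'error' key on failure:
#
#     identify_tool = IdentifyTool()
#     result = identify_tool.run(crawl_data)       # crawl_data typically comes from QueryDataTool
#     captions = result.get('images', [])          # per-image comprehension results
#     video_analysis = result.get('videos', {})    # dict on success, empty list in the error branch
#
# Because 'videos' is a dict on success but an empty list in the error fallback, callers that
# iterate over it should check its type first.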


class StructureTool:
    """
    Structuring tool: combine the recognition result with the original crawl_data
    according to the agreed structure and store it in the knowledge_parsing_content table.
    """

    @staticmethod
    def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
        """
        Store the parsing result in the knowledge_parsing_content table.

        Args:
            request_id: request ID
            crawl_raw: original crawled data
            identify_result: recognition result

        Returns:
            ID of the inserted row, or None on failure
        """
        try:
            # Extract the required fields from the raw data
            content_id = crawl_raw.get('content_id') or ''
            task_id = crawl_raw.get('task_id') or ''  # default task ID, adjust as needed

            # Build the insert statement
            sql = (
                "INSERT INTO knowledge_parsing_content "
                "(content_id, request_id, task_id, parsing_data, create_time, status) "
                "VALUES (%s, %s, %s, %s, NOW(), %s)"
            )
            # Status: 2 means processing is complete
            status = 2
            params = (
                content_id,
                request_id,
                task_id,
                json.dumps(identify_result, ensure_ascii=False),
                status
            )
            result = MysqlHelper.insert_and_get_id(sql, params)
            if result:
                logger.info(f"Stored parsing result: request_id={request_id}, content_id={content_id}, insert_id={result}")
            return result
        except Exception as e:
            logger.error(f"Failed to store parsing result: request_id={request_id}, error={e}")
            return None
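

# Minimal end-to-end sketch (illustrative assumption, not a production entry point): wires the
# three tools together for a single request. It assumes a reachable MySQL instance configured for
# MysqlHelper and a working ContentIdentifier; "demo-request-id" is a hypothetical value.
if __name__ == "__main__":
    request_id = "demo-request-id"
    items = QueryDataTool.fetch_crawl_data_list(request_id)
    identify_tool = IdentifyTool()
    for item in items:
        crawl_data = item.get("crawl_data") or {}
        crawl_raw = item.get("raw") or item  # fallback items carry content_id/task_id at the top level
        identify_result = identify_tool.run(crawl_data)
        insert_id = StructureTool.store_parsing_result(request_id, crawl_raw, identify_result)
        logger.info(f"request_id={request_id} stored parsing result, insert_id={insert_id}")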