agent_tools.py

import os
import sys
import json
from typing import Any, Dict, List, Optional

# Make sibling packages importable when the file is run directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils.logging_config import get_logger
from utils.mysql_db import MysqlHelper
from indentify.indentify import ContentIdentifier

logger = get_logger('AgentTools')


class QueryDataTool:
    """Query knowledge_crawl_content and extract the crawl_data field from each item in the data column."""

    @staticmethod
    def fetch_crawl_data_list(request_id: str) -> List[Dict[str, Any]]:
        sql = "SELECT data FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
        rows = MysqlHelper.get_values(sql, (request_id,))
        if not rows:
            logger.info(f"request_id={request_id}: no rows found")
            return []

        results: List[Dict[str, Any]] = []
        for row in rows:
            data_cell = row[0]
            if not data_cell:
                continue
            try:
                # The data column may hold a JSON string or an already-decoded object.
                parsed = json.loads(data_cell) if isinstance(data_cell, (str, bytes)) else data_cell
                if isinstance(parsed, list):
                    for item in parsed:
                        if isinstance(item, dict):
                            crawl_data = item.get('crawl_data')
                            if isinstance(crawl_data, (dict, list)):
                                results.append({"crawl_data": crawl_data, "raw": item})
                            else:
                                # No usable crawl_data field; fall back to the whole item.
                                results.append({"crawl_data": item, "raw": item})
                elif isinstance(parsed, dict):
                    crawl_data = parsed.get('crawl_data')
                    if isinstance(crawl_data, (dict, list)):
                        results.append({"crawl_data": crawl_data, "raw": parsed})
                    else:
                        results.append({"crawl_data": parsed, "raw": parsed})
                else:
                    logger.warning("data column is not the expected JSON structure; row skipped")
            except Exception as e:
                logger.error(f"failed to parse data column as JSON: {e}")

        logger.info(f"request_id={request_id}: extracted {len(results)} crawl_data entries")
        return results
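
# Usage sketch: a minimal call, assuming the MySQL connection behind
# MysqlHelper is configured and the request_id below (hypothetical) has rows:
#
#   entries = QueryDataTool.fetch_crawl_data_list("req-demo-001")
#   for entry in entries:
#       print(entry["crawl_data"])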


class IdentifyTool:
    """Invoke the internal indentify module to perform image/video recognition."""

    def __init__(self) -> None:
        self.identifier = ContentIdentifier()

    def run(self, crawl_data: Dict[str, Any]) -> Dict[str, Any]:
        try:
            formatted_content = self.identifier.parse_formatted_content(crawl_data)
            recognition_result = self.identifier.process_content_recognition(formatted_content)

            title = formatted_content.get('title') or ''
            content = formatted_content.get('body_text') or ''
            channel = formatted_content.get('channel') or ''
            author = formatted_content.get('channel_account_name') or ''
            like_count = formatted_content.get('like_count') or 0
            collect_count = formatted_content.get('collect_count') or 0
            comment_count = formatted_content.get('comment_count') or 0
            view_count = formatted_content.get('view_count') or 0
            publish_time = formatted_content.get('publish_time') or ''
            update_timestamp = formatted_content.get('update_timestamp') or ''
            content_link = formatted_content.get('content_link') or ''
            content_id = formatted_content.get('channel_content_id') or ''

            complete_result = {
                'channel': channel,
                'title': title,
                'content': content,
                'images': recognition_result.get('image_analysis', {}).get('images_comprehension', []),
                'videos': recognition_result.get('video_analysis', {}),
                'meta': {
                    'author': author,
                    'like_count': like_count,
                    'collect_count': collect_count,
                    'comment_count': comment_count,
                    'view_count': view_count,
                    'publish_time': publish_time,
                    'update_timestamp': update_timestamp,
                    'content_link': content_link,
                    'content_id': content_id,
                },
            }
            return complete_result
        except Exception as e:
            logger.error(f"recognition failed: {e}")
            return {
                'channel': '',
                'title': '',
                'content': '',
                'images': [],
                'videos': {},  # match the dict shape returned on success
                'meta': {},
                'error': str(e),
            }
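
# Usage sketch: assumes the indentify dependencies are available and that the
# input dict matches what ContentIdentifier.parse_formatted_content expects
# (the field names below are illustrative, not a documented schema):
#
#   tool = IdentifyTool()
#   result = tool.run({"title": "demo", "body_text": "hello"})
#   print(result["title"], result["meta"])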


class StructureTool:
    """
    Structuring tool: combine the recognition result with the original crawl_data
    into a fixed structure and store it in the knowledge_parsing_content table.
    """

    @staticmethod
    def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
        payload = {
            'request_id': request_id,
            'crawl_raw': crawl_raw,
            'identify_result': identify_result,
        }
        sql = (
            "INSERT INTO knowledge_parsing_content (request_id, parsing_result, created_at) "
            "VALUES (%s, %s, NOW())"
        )
        params = (request_id, json.dumps(payload, ensure_ascii=False))
        return MysqlHelper.update_values(sql, params)
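

# End-to-end sketch tying the three tools together: fetch crawl data, run
# recognition, persist the structured result. A minimal pipeline, assuming DB
# connectivity and the indentify dependencies; the request_id is hypothetical.
if __name__ == '__main__':
    request_id = 'req-demo-001'
    identify_tool = IdentifyTool()
    for entry in QueryDataTool.fetch_crawl_data_list(request_id):
        identify_result = identify_tool.run(entry['crawl_data'])
        row_id = StructureTool.store_parsing_result(request_id, entry['raw'], identify_result)
        logger.info(f"stored parsing result, row id: {row_id}")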