|
@@ -8,6 +8,7 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
from utils.logging_config import get_logger
|
|
|
from utils.mysql_db import MysqlHelper
|
|
|
from indentify.indentify import ContentIdentifier
|
|
|
+from structure import StructureProcessor
|
|
|
|
|
|
logger = get_logger('AgentTools')
|
|
|
|
|
@@ -17,14 +18,14 @@ class QueryDataTool:
|
|
|
|
|
|
@staticmethod
|
|
|
def fetch_crawl_data_list(request_id: str) -> List[Dict[str, Any]]:
|
|
|
- sql = "SELECT data FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
|
|
|
+ sql = "SELECT * FROM knowledge_crawl_content WHERE request_id = %s ORDER BY id ASC"
|
|
|
rows = MysqlHelper.get_values(sql, (request_id,))
|
|
|
if not rows:
|
|
|
logger.info(f"request_id={request_id} 未查询到数据,使用默认值")
|
|
|
# 返回默认数据
|
|
|
- default_data = {
|
|
|
+ default_data = [{
|
|
|
"request_id": request_id,
|
|
|
- "content_id": "684a789b000000002202a61b",
|
|
|
+ "content_id": "1",
|
|
|
"id": 1,
|
|
|
"task_id": 1,
|
|
|
"crawl_data": {
|
|
@@ -70,8 +71,56 @@ class QueryDataTool:
|
|
|
"modify_timestamp": 1749711589000,
|
|
|
"update_timestamp": 1755239186502
|
|
|
}
|
|
|
- }
|
|
|
- return [default_data]
|
|
|
+ },{
|
|
|
+ "request_id": request_id,
|
|
|
+ "content_id": "2",
|
|
|
+ "id": 2,
|
|
|
+ "task_id": 2,
|
|
|
+ "crawl_data": {
|
|
|
+ "channel": 1,
|
|
|
+ "channel_content_id": "684a789b000000002202a61b",
|
|
|
+ "content_link": "https://www.xiaohongshu.com/explore/684a789b000000002202a61b",
|
|
|
+ "wx_sn": None,
|
|
|
+ "title": "一个视频学会,5个剪辑工具,超详细教程",
|
|
|
+ "content_type": "video",
|
|
|
+ "body_text": "#剪辑教程[话题]# #剪辑[话题]# #手机剪辑[话题]# #视频制作[话题]# #视频剪辑[话题]# #自学剪辑[话题]# #原创视频[话题]# #新手小白学剪辑[话题]#",
|
|
|
+ "location": "未知",
|
|
|
+ "source_url": None,
|
|
|
+ "mini_program": None,
|
|
|
+ "topic_list": [],
|
|
|
+ "image_url_list": [
|
|
|
+ {
|
|
|
+ "image_type": 2,
|
|
|
+ "image_url": "http://rescdn.yishihui.com/pipeline/image/5be8f08a-4691-41b6-8dda-0b63cc2c1056.jpg"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "video_url_list": [
|
|
|
+ {
|
|
|
+ "video_url": "http://rescdn.yishihui.com/pipeline/video/9e38400e-21dc-4063-bab5-47c1667bb59d.mp4",
|
|
|
+ "video_duration": 615
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "bgm_data": None,
|
|
|
+ "ad_info": None,
|
|
|
+ "is_original": False,
|
|
|
+ "voice_data": None,
|
|
|
+ "channel_account_id": "670a10ac000000001d0216ec",
|
|
|
+ "channel_account_name": "小伍剪辑视频",
|
|
|
+ "channel_account_avatar": "https://sns-avatar-qc.xhscdn.com/avatar/1040g2jo31e469dkq0e005poa22m7c5ncbtuk1g0?imageView2/2/w/80/format/jpg",
|
|
|
+ "item_index": None,
|
|
|
+ "view_count": None,
|
|
|
+ "play_count": None,
|
|
|
+ "like_count": 692,
|
|
|
+ "collect_count": 996,
|
|
|
+ "comment_count": 37,
|
|
|
+ "share_count": None,
|
|
|
+ "looking_count": None,
|
|
|
+ "publish_timestamp": 1749711589000,
|
|
|
+ "modify_timestamp": 1749711589000,
|
|
|
+ "update_timestamp": 1755239186502
|
|
|
+ }
|
|
|
+ }]
|
|
|
+ return default_data
|
|
|
|
|
|
results: List[Dict[str, Any]] = []
|
|
|
for row in rows:
|
|
@@ -158,14 +207,14 @@ class IdentifyTool:
|
|
|
}
|
|
|
|
|
|
|
|
|
-class StructureTool:
|
|
|
+class UpdateDataTool:
|
|
|
"""
|
|
|
结构化工具:按照既定的结构将识别结果与原始 crawl_data 组合,
|
|
|
并存入 knowledge_parsing_content 表。
|
|
|
"""
|
|
|
-
|
|
|
+
|
|
|
@staticmethod
|
|
|
- def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
|
|
|
+ def store_indentify_result(request_id: str, crawl_raw: Dict[str, Any], identify_result: Dict[str, Any]) -> Optional[int]:
|
|
|
"""
|
|
|
存储解析结果到 knowledge_parsing_content 表
|
|
|
|
|
@@ -186,7 +235,7 @@ class StructureTool:
|
|
|
|
|
|
sql = (
|
|
|
"INSERT INTO knowledge_parsing_content "
|
|
|
- "(content_id, request_id, task_id, parsing_data, create_time, status) "
|
|
|
+ "(content_id, request_id, task_id, indentify_data, create_time, status) "
|
|
|
"VALUES (%s, %s, %s, %s, NOW(), %s)"
|
|
|
)
|
|
|
|
|
@@ -207,4 +256,117 @@ class StructureTool:
|
|
|
|
|
|
except Exception as e:
|
|
|
logger.error(f"存储解析结果失败: request_id={request_id}, error={e}")
|
|
|
- return None
|
|
|
+ return None
|
|
|
@staticmethod
def store_parsing_result(request_id: str, crawl_raw: Dict[str, Any], parsing_result: Dict[str, Any]) -> Optional[int]:
    """
    Write the structured payload onto an existing knowledge_parsing_content row.

    The row is selected by the ``content_id`` read from ``crawl_raw``. Its
    ``parsing_data`` column receives the JSON-encoded payload and its
    ``status`` column is set to 5 (structuring finished).

    Args:
        request_id: Request identifier. Only used in log messages here.
        crawl_raw: Raw crawl record; only its ``content_id`` key is read.
        parsing_result: Structuring output; only its ``structured_content``
            key is read. When that value is a dict holding a ``result``
            key, only ``result`` is stored; otherwise the whole value is.

    Returns:
        The value returned by ``MysqlHelper.update_values`` (the affected
        row count, per the original contract), or None when ``content_id``
        is missing or any exception is raised.
    """
    try:
        # Pull the row key out of the raw crawl record.
        content_id = crawl_raw.get('content_id') or ''
        if not content_id:
            # An empty key would make the WHERE clause match every row that
            # was stored with an empty content_id, so refuse to update.
            logger.warning(f"存储解析结果跳过: request_id={request_id}, content_id 为空")
            return None

        # Store only the 'result' entry when present, else the whole value.
        structured_content = parsing_result.get('structured_content', {})
        if isinstance(structured_content, dict) and 'result' in structured_content:
            parsing_payload = structured_content['result']
        else:
            parsing_payload = structured_content

        # NOTE(review): the update is keyed on content_id alone; rows from
        # other requests sharing that content_id are updated too. Confirm
        # whether request_id should also be part of the WHERE clause.
        sql = (
            "UPDATE knowledge_parsing_content "
            "SET parsing_data = %s, status = %s "
            "WHERE content_id = %s"
        )

        # Status 5 marks the structuring step as finished.
        status = 5
        params = (
            json.dumps(parsing_payload, ensure_ascii=False),
            status,
            content_id
        )

        result = MysqlHelper.update_values(sql, params)
        if result:
            logger.info(f"存储解析结果成功: request_id={request_id}, content_id={content_id}")
        else:
            # A falsy result means no row was reported as changed; surface
            # it instead of returning silently.
            logger.warning(f"存储解析结果未生效: request_id={request_id}, content_id={content_id}, result={result}")
        return result

    except Exception as e:
        logger.error(f"存储解析结果失败: request_id={request_id}, error={e}")
        return None
|
|
|
+
|
|
|
class StructureTool:
    """
    Content structuring tool: hands recognised content to a
    StructureProcessor and wraps whatever it returns in a result record.
    """

    def __init__(self):
        """Create the StructureProcessor this tool delegates to."""
        self.structure_processor = StructureProcessor()

    def process_content_structure(self, content_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run the structuring step for one piece of content.

        Args:
            content_data: Recognition output. Only the ``title``,
                ``content`` and ``images`` keys are read here; they default
                to ``''``, ``''`` and ``[]`` when absent.

        Returns:
            On success, a dict with ``original_data``,
            ``structured_content`` (the processor's return value, stored
            as-is), ``structure_status`` set to ``'success'`` and
            ``process_time``. On any exception, the same keys with an empty
            ``structured_content``, ``structure_status`` set to
            ``'failed'`` and an extra ``error`` message.
        """
        try:
            # Map the recognition fields onto the processor's input names.
            title = content_data.get('title', '')
            processor_input = {
                "title": title,
                "body_text": content_data.get('content', ''),
                "images_comprehension": content_data.get('images', []),
            }

            structured = self.structure_processor.process_content(processor_input)

            # The processor output is wrapped untouched, whatever its type.
            outcome = dict(
                original_data=content_data,
                structured_content=structured,
                structure_status='success',
                process_time=self._get_current_timestamp(),
            )

            logger.info(f"内容结构化处理成功: title={title}")
            return outcome

        except Exception as exc:
            logger.error(f"内容结构化处理失败: {exc}")
            failure: Dict[str, Any] = {
                'original_data': content_data,
                'structured_content': '',
                'structure_status': 'failed',
            }
            failure['error'] = str(exc)
            failure['process_time'] = self._get_current_timestamp()
            return failure

    def _get_current_timestamp(self) -> str:
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        from datetime import datetime
        now = datetime.now()
        return f"{now:%Y-%m-%d %H:%M:%S}"
|