@@ -135,15 +135,16 @@ class QueryDataTool:
             if isinstance(parsed, tuple) and len(parsed) > 4:
                 # Assume the element at index 4 is the JSON string
                 json_str = parsed[4]
+                content_id = parsed[1]
                 if isinstance(json_str, str):
                     try:
                         json_data = json.loads(json_str)
                         if isinstance(json_data, dict):
-                            results.append({"crawl_data": json_data, "raw": parsed})
+                            results.append({"crawl_data": json_data, "content_id": content_id, "raw": parsed})
                         elif isinstance(json_data, list):
                             for item in json_data:
                                 if isinstance(item, dict):
-                                    results.append({"crawl_data": item, "raw": parsed})
+                                    results.append({"crawl_data": item, "content_id": content_id, "raw": parsed})
                     except json.JSONDecodeError:
                         logger.warning(f"Element at index 4 of the tuple is not valid JSON: {json_str}")
                 else:
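The tuple branch reads the row positionally: the element at index 1 is taken as the content ID and the element at index 4 as the serialized crawl payload. A minimal sketch of that assumption follows; the column order and sample values are hypothetical, not taken from the actual table schema.

```python
import json

# Hypothetical column order: (id, content_id, task_id, created_at, data_json)
parsed = (101, "cid-001", "task-9", "2024-01-01", json.dumps({"title": "demo"}))

if isinstance(parsed, tuple) and len(parsed) > 4:
    content_id = parsed[1]      # -> "cid-001"
    json_str = parsed[4]        # -> '{"title": "demo"}'
    crawl_data = json.loads(json_str)
    entry = {"crawl_data": crawl_data, "content_id": content_id, "raw": parsed}
```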
@@ -154,18 +155,20 @@ class QueryDataTool:
                 for item in parsed:
                     if isinstance(item, dict):
                         crawl_data = item.get('crawl_data')
+                        content_id = item.get('content_id')
                         if isinstance(crawl_data, (dict, list)):
-                            results.append({"crawl_data": crawl_data, "raw": item})
+                            results.append({"crawl_data": crawl_data, "content_id": content_id, "raw": item})
                         else:
-                            results.append({"crawl_data": item, "raw": item})
+                            results.append({"crawl_data": item, "content_id": content_id, "raw": item})

             # Handle dict type
             elif isinstance(parsed, dict):
                 crawl_data = parsed.get('crawl_data')
+                content_id = parsed.get('content_id')
                 if isinstance(crawl_data, (dict, list)):
-                    results.append({"crawl_data": crawl_data, "raw": parsed})
+                    results.append({"crawl_data": crawl_data, "content_id": content_id, "raw": parsed})
                 else:
-                    results.append({"crawl_data": parsed, "raw": parsed})
+                    results.append({"crawl_data": parsed, "content_id": content_id, "raw": parsed})

             else:
                 logger.warning(f"Unexpected data structure in 'data' field: {type(parsed)}, skipping this row")
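With the list and dict branches updated as well, every path now emits the same normalized entry shape; in these branches content_id comes from `.get()`, so it is simply None when the source row does not carry that key. A small, self-contained illustration with made-up payloads:

```python
rows = [
    {"crawl_data": {"title": "a"}, "content_id": "cid-1"},  # carries an explicit content_id
    {"title": "b"},                                         # no crawl_data / content_id keys
]

results = []
for item in rows:
    crawl_data = item.get('crawl_data')
    content_id = item.get('content_id')  # None for the second row
    if isinstance(crawl_data, (dict, list)):
        results.append({"crawl_data": crawl_data, "content_id": content_id, "raw": item})
    else:
        results.append({"crawl_data": item, "content_id": content_id, "raw": item})
# results[0]["content_id"] == "cid-1"; results[1]["content_id"] is None
```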
@@ -251,6 +254,7 @@ class UpdateDataTool:
            Row ID of the inserted record, or None on failure
        """
        try:
+            logger.info(f"Storing identification result: request_id={request_id}, crawl_raw={crawl_raw}, identify_result={identify_result}")
            # Extract the required fields from the raw data
            content_id = crawl_raw.get('content_id') or ''
            task_id = crawl_raw.get('task_id') or ''  # Default task ID; adjust as needed
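The call site is not part of this diff, but the apparent intent is that each normalized entry from QueryDataTool is handed to UpdateDataTool as crawl_raw, so the newly attached content_id survives the hand-off and the `or ''` guards coerce a missing or None ID to an empty string. A hedged sketch of that flow; the variable names and the assumption that the whole entry (rather than just crawl_data) is passed through are illustrative only:

```python
# Illustrative only: the real call site is not shown in this diff.
entry = {"crawl_data": {"title": "demo"}, "content_id": "cid-001", "raw": None}

crawl_raw = entry                                 # assumed: the whole normalized entry is passed on
content_id = crawl_raw.get('content_id') or ''    # "cid-001"; '' if the key is absent or None
task_id = crawl_raw.get('task_id') or ''          # '' here, since the entry carries no task_id
```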