@@ -99,7 +99,7 @@ def call_coze_api(summary, timeline):
         "Authorization": "Bearer pat_ddPm5K5tCKXU2zH1NChGHoOKGOSECyzxmXPEcrtJg52UcCIndRAfiWRRxCH95pdB"
     }
     payload = {
-        "workflow_id": "7507245138873450535",
+        "workflow_id": "7506810742659579904",
         "parameters": {
             "summary": summary,
             "timeline": timeline
@@ -110,165 +110,167 @@ def call_coze_api(summary, timeline):
     return response.json()


 def extract_fields_from_response(resp):
+    """
+    Extract the field data from the API response.
+    Args:
+        resp: API response dict, possibly containing multiple layers of nested JSON strings.
+    Returns:
+        tuple: (time_str, theme_str, trigger_str), three strings containing the times, themes and trigger lines respectively.
+    """
+    import json
     import re
+    import logging

-    # Define patterns at the function level
-    JSON_PATTERNS = [
-        r"```json\\n(.*?)```",  # escaped newline
-        r"```json\n(.*?)```",   # plain newline
-        r"```(.*?)```",         # no language marker
-        r"\{.*\}"               # bare JSON object
-    ]
+    logger = logging.getLogger(__name__)

-    def try_unescape_json_string(s):
-        # Recursively deserialize escaped JSON strings at every level
-        for _ in range(3):  # try at most 3 levels
-            if isinstance(s, str):
-                try:
-                    s2 = json.loads(s)
-                    # Keep recursing if deserialization changed the type
-                    if type(s2) != str:
-                        s = s2
-                    else:
-                        break
-                except Exception as e:
-                    print(f"JSON反序列化失败: {str(e)}")
-                    break
-            else:
-                break
-        return s
-
-    def extract_json_from_string(s):
-        """Helper function to extract and parse JSON from a string"""
-        if not isinstance(s, str):
-            return s
-
-        # First try direct JSON parsing
+    def extract_json_from_markdown(markdown_str):
+        """Extract the JSON string from a markdown code block."""
+        # Match the content between ```json and ```
+        match = re.search(r'```(?:json)?\s*([\s\S]*?)```', markdown_str)
+        if match:
+            return match.group(1).strip()
+        return markdown_str.strip()
+
+    def parse_nested_json(json_str):
+        """Recursively parse nested JSON strings."""
         try:
-            return json.loads(s)
+            # Try to parse directly
+            return json.loads(json_str)
         except json.JSONDecodeError:
-            pass
-
-        # Then try each pattern
-        for pattern in JSON_PATTERNS:
-            json_str = re.search(pattern, s, re.DOTALL)
-            if json_str:
-                try:
-                    content = json_str.group(1)
-                    return json.loads(content)
-                except Exception as e:
-                    print(f"使用模式 {pattern} 解析失败: {str(e)}")
-                    continue
-        return s
-
+            try:
+                # On failure, try to parse the escaped JSON string
+                unescaped = json_str.encode().decode('unicode_escape')
+                return json.loads(unescaped)
+            except Exception:
+                # If that still fails, return the original string
+                return json_str
+
     try:
-        data = resp.get("data")
-        if not data:
-            print("响应中没有data字段")
-            return ("", "", "")
-
+        # Unwrap the multi-layer nested JSON
+        current_data = resp
+        while isinstance(current_data, (str, dict)):
+            if isinstance(current_data, dict):
+                # For dicts, try the data or output field
+                if 'data' in current_data:
+                    current_data = current_data['data']
+                elif 'output' in current_data:
+                    current_data = current_data['output']
+                else:
+                    break
+            elif isinstance(current_data, str):
+                # For strings, try to parse the JSON
+                try:
+                    parsed = parse_nested_json(current_data)
+                    if parsed == current_data:  # an unchanged result means it was not JSON
+                        break
+                    current_data = parsed
+                except Exception:
+                    break

-        # First parse the outer JSON structure
-        try:
-            data = json.loads(data)
-        except json.JSONDecodeError as e:
-            print(f"解析外层data失败: {str(e)}")
-            return ("", "", "")
-
-        # Then handle the output field
-        output = data.get("output")
-        if not output:
-            print("data中没有output字段")
-            return ("", "", "")
-
-        print(f"\n原始output字段: {output}")
-        output = extract_json_from_string(output)
+        # If the final result is still a string, try extracting it from markdown
+        if isinstance(current_data, str):
+            current_data = extract_json_from_markdown(current_data)
+            current_data = parse_nested_json(current_data)

-        if isinstance(output, str):
-            print("output解析后仍为字符串")
-            return ("", "", "")
-
-
-        if isinstance(output, dict):
-            # Check the candidate field names in priority order
-            if "需求列表" in output:
-                demand_list = output["需求列表"]
-            elif "questions" in output:
-                demand_list = output["questions"]
-            elif "interactive_questions" in output:
-                demand_list = output["interactive_questions"]
-            else:
-                print("output中没有找到需求列表、questions或interactive_questions字段")
-                return ("", "", "")
-        else:
-            demand_list = output
-
-        if not demand_list or not isinstance(demand_list, list):
-            print(f"需求列表无效: {demand_list}")
-            return ("", "", "")
-
-        times = []
-        queries = []
-        hooks = []
+        logger.info(f"Final parsed data type: {type(current_data)}")
+        logger.info(f"Final data preview: {str(current_data)[:200]}...")

-        for item in demand_list:
+        # Make sure the data is a list
+        if not isinstance(current_data, list):
+            logger.warning(f"Converting non-list data to list: {type(current_data)}")
+            current_data = [current_data]
+
+        # Extract and validate each item
+        time_list = []
+        theme_list = []
+        trigger_list = []
+
+        for item in current_data:
             if not isinstance(item, dict):
-                print(f"跳过非字典项: {item}")
+                logger.warning(f"Skipping invalid item: {item}")
                 continue
-
-            time = item.get("需求钩子出现时间", "")
-            query = item.get("需求详细query", "")
-            hook = item.get("需求钩子话术", "")

-            if not all([time, query, hook]):
-                print(f"跳过不完整项: {item}")
-                continue
-
-            # Handle time format
-            if time == "end":
-                time = "视频结束"
-            elif "-" in time:
-                time = time.split("-")[0]  # Take the start time
+            try:
+                # Use get() to read values safely, with defaults
+                time = str(item.get('time', '')).strip()
+                theme = str(item.get('theme', '')).strip()
+                trigger = str(item.get('trigger', '')).strip()

-            times.append(time)
-            queries.append(query)
-            hooks.append(hook)
-
-        if not times:
-            print("没有提取到有效的需求项")
-            return ("", "", "")
-
-        return ("\n".join(times), "\n".join(queries), "\n".join(hooks))
-
+                if time or theme or trigger:  # only add non-empty values
+                    time_list.append(time)
+                    theme_list.append(theme)
+                    trigger_list.append(trigger)
+            except Exception as e:
+                logger.warning(f"Error extracting fields from item: {e}")
+                continue
+
+        # Join the lists into newline-separated strings
+        time_str = '\n'.join(time_list) if time_list else ''
+        theme_str = '\n'.join(theme_list) if theme_list else ''
+        trigger_str = '\n'.join(trigger_list) if trigger_list else ''
+
+        logger.info(f"Extracted - Time: {time_str[:50]}..., Theme: {theme_str[:50]}..., Trigger: {trigger_str[:50]}...")
+        return time_str, theme_str, trigger_str
+
+    except json.JSONDecodeError as je:
+        logger.error(f"JSON decode error: {str(je)}")
+        logger.error(f"Error position: {je.pos}")
+        logger.error(f"Character at error: {repr(str(current_data)[je.pos:je.pos+10])}")
+        return '', '', ''
     except Exception as e:
-        print(f"解析返回数据出错: {str(e)}")
-        print(f"原始响应: {json.dumps(resp, ensure_ascii=False, indent=2)}")
-        return ("", "", "")
+        logger.error(f"Error processing response: {str(e)}")
+        logger.error(f"Response data: {str(resp)[:200]}...")
+        return '', '', ''
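
For reference, a minimal sketch of how the reworked parser can be exercised. The nested response shape below is an assumption about what the Coze workflow returns (a `data` field holding a JSON string whose `output` field is itself escaped JSON); it is not taken from the patch. If `output` arrives wrapped in a markdown code fence instead, `extract_json_from_markdown` strips the fence before the final parse.

```python
# Hypothetical nested response: "data" is a JSON string whose "output" field
# holds an escaped JSON array; the while loop above unwraps it layer by layer.
sample_resp = {
    "data": '{"output": "[{\\"time\\": \\"00:15\\", \\"theme\\": \\"开场\\", \\"trigger\\": \\"点击左下角\\"}]"}'
}

time_str, theme_str, trigger_str = extract_fields_from_response(sample_resp)
print(time_str)     # 00:15
print(theme_str)    # 开场
print(trigger_str)  # 点击左下角
```
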
 def process_excel(input_excel, output_excel=None):
-    df = pd.read_excel(input_excel)
-    for idx, row in df.iterrows():
-        summary = row.iloc[5] if pd.notna(row.iloc[5]) else "{}"
-        timeline = row.iloc[7] if pd.notna(row.iloc[7]) else "{}"
-        try:
-            print(f"开始处理第{idx+1}行")
-            resp = call_coze_api(summary, timeline)
-            time, query, hook = extract_fields_from_response(resp)
-            df.iat[idx, 9] = time    # column 8
-            df.iat[idx, 10] = query  # column 9
-            df.iat[idx, 11] = hook   # column 10
-            print(f"第{idx+1}行处理完成")
-            print(hook)
-        except Exception as e:
-            print(f"第{idx+1}行处理失败: {e}")
-            df.iat[idx, 9] = "error"
-            df.iat[idx, 10] = "error"
-            df.iat[idx, 11] = "error"
-        # Save after every row so an interruption does not lose progress
-        df.to_excel(input_excel, index=False)
-        print(f"已保存到第{idx+1}行")
-    df.to_excel(input_excel, index=False)
-    print("全部处理完成,已保存。")
+    """
+    Process the Excel file: call the API and update the data.
+    """
+    try:
+        df = pd.read_excel(input_excel)
+
+        # Make sure the target columns exist and hold strings
+        for col in [8, 9, 10]:  # columns 9, 10 and 11
+            if col >= len(df.columns):
+                df.insert(col, f'Column_{col}', '')
+            df.iloc[:, col] = df.iloc[:, col].fillna('').astype(str)
+
+        for idx, row in df.iterrows():
+            try:
+                print(f"\n开始处理第{idx+1}行")
+                summary = str(row.iloc[5]) if pd.notna(row.iloc[5]) else "{}"
+                timeline = str(row.iloc[6]) if pd.notna(row.iloc[6]) else "{}"
+
+                print(f"Summary: {summary[:100]}...")  # print only the first 100 characters
+                print(f"Timeline: {timeline[:100]}...")
+
+                resp = call_coze_api(summary, timeline)
+                print(f"API Response: {json.dumps(resp, ensure_ascii=False)[:200]}...")  # print the API response
+
+                time, theme, hook = extract_fields_from_response(resp)
+
+                # Update the data: assign single values instead of bulk assignment
+                df.at[idx, df.columns[7]] = time
+                df.at[idx, df.columns[8]] = hook
+
+                print(f"第{idx+1}行处理完成")
+                print(f"Time: {time[:100]}...")
+                print(f"Hook: {hook[:100]}...")
+
+            except Exception as e:
+                print(f"第{idx+1}行处理失败: {str(e)}")
+                df.at[idx, df.columns[7]] = "error"
+                df.at[idx, df.columns[8]] = "error"
+
+            # Save after every row so an interruption does not lose progress
+            df.to_excel(input_excel, index=False)
+            print(f"已保存到第{idx+1}行")
+
+        print("全部处理完成,已保存。")
+
+    except Exception as e:
+        print(f"处理Excel文件时发生错误: {str(e)}")
+        raise
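
For context, a minimal driver for the updated functions might look like the sketch below. The workbook name comes from the comment that follows, and the `__main__` guard is an assumption rather than part of the patch.

```python
# Hypothetical entry point: first process every row, then retry the rows that
# are still empty via process_empty_rows (defined below).
if __name__ == "__main__":
    process_excel("视频分析报告1.xlsx")        # fills the time/hook columns row by row
    process_empty_rows("视频分析报告1.xlsx")   # re-runs the Coze workflow for rows left empty
```
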

 # Read 视频分析报告1.xlsx, find the rows whose column 10 is empty, and re-run the Coze workflow analysis
 def process_empty_rows(input_excel, output_excel=None):