# data_utils.py

  1. import json
  2. import requests
  3. from openpyxl.styles.builtins import title
  4. from core.config import logger
  5. from core.database_data import DatabaseHelper
  6. def add_data(text, dataset_id, title=None):
  7. try:
  8. response = requests.post(
  9. url='http://61.48.133.26:8001/api/chunk',
  10. json={
  11. "text": text,
  12. "dataset_id": dataset_id},
  13. headers={"Content-Type": "application/json"},
  14. )
  15. return response.json()['doc_id']
  16. except Exception as e:
  17. logger.error(e)
  18. def is_empty(value):
  19. """辅助函数:判断值是否为空(None 或空字符串)"""
  20. return value is None or value == ""
  21. def parse_json(file_path):
  22. text_list = []
  23. try:
  24. # 读取文件内容
  25. with open(file_path, 'r', encoding='utf-8') as file:
  26. try:
  27. # 解析JSON内容
  28. json_data = json.load(file)
  29. # 检查是否为JSON数组
  30. if isinstance(json_data, list):
  31. # 遍历每个JSON对象
  32. for index, json_obj in enumerate(json_data, 1):
  33. body_text = json_obj.get("body_text", "")
  34. title = json_obj.get("title", "")
  35. if not is_empty(body_text):
  36. text_list.append({'body_text': body_text, 'title': title})
  37. else:
  38. print("错误: 文件内容不是一个JSON数组")
  39. except json.JSONDecodeError as e:
  40. print(f"JSON解析错误: {e}")
  41. except FileNotFoundError:
  42. print(f"错误: 找不到文件 '{file_path}'")
  43. except Exception as e:
  44. print(f"发生错误: {e}")
  45. return text_list
  46. def select_data():
  47. db_helper = DatabaseHelper()
  48. # 执行查询
  49. query = """
  50. SELECT c.crawl_data as json_text
  51. FROM knowledge_extraction_content a
  52. LEFT JOIN knowledge_parsing_content b ON a.parsing_id = b.id AND b.request_id = a.request_id
  53. LEFT JOIN knowledge_crawl_content c ON c.content_id = b.content_id AND c.request_id = a.request_id
  54. LEFT JOIN knowledge_request d ON d.request_id = a.request_id
  55. LEFT JOIN knowledge_query e ON e.id = d.query_id
  56. WHERE a.request_id > '20250905022700393495252' AND e.knowledge_type = '整体' AND a.score >= 0 AND e.category_id = 0
  57. ORDER BY a.id DESC
  58. """
  59. result = db_helper.execute_query(query)
  60. for row in result:
  61. add_data(json.loads(row['json_text'])['body_text'])
  62. if __name__ == '__main__':
  63. json_path = '../data/test_data1.json'
  64. text_list = parse_json(json_path)
  65. re = []
  66. for text in text_list:
  67. res = add_data(text['body_text'])
  68. if res is None:
  69. re.append(text)
  70. re1 = []
  71. for text in re:
  72. res = add_data(text['body_text'])
  73. if res is None:
  74. re.append(text)
  75. print(json.dumps(re1, ensure_ascii=False))