import_search_results.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 搜索结果导入工具
  5. 将 search_progress.json 中已完成的搜索结果导入到
  6. associated_tags_results_with_search.json 对应的特征节点中。
  7. 匹配规则:根据特征的 search_word 字段匹配
  8. """
  9. import json
  10. import copy
  11. from datetime import datetime
  12. from typing import Dict, Any
  13. import argparse
  14. def load_json(file_path: str) -> Any:
  15. """加载JSON文件"""
  16. try:
  17. with open(file_path, 'r', encoding='utf-8') as f:
  18. return json.load(f)
  19. except Exception as e:
  20. print(f"错误: 加载文件失败 {file_path}: {e}")
  21. raise
  22. def save_json(data: Any, file_path: str):
  23. """保存JSON文件"""
  24. try:
  25. with open(file_path, 'w', encoding='utf-8') as f:
  26. json.dump(data, f, ensure_ascii=False, indent=2)
  27. print(f"✓ 已保存: {file_path}")
  28. except Exception as e:
  29. print(f"错误: 保存文件失败 {file_path}: {e}")
  30. raise
  31. def import_search_results(
  32. source_data_path: str,
  33. progress_path: str,
  34. output_path: str,
  35. content_type: str = '图文',
  36. sort_type: str = '综合'
  37. ):
  38. """
  39. 导入搜索结果
  40. Args:
  41. source_data_path: 源数据文件路径(包含特征和search_word)
  42. progress_path: 进度文件路径(包含已完成的搜索结果)
  43. output_path: 输出文件路径
  44. content_type: 内容类型
  45. sort_type: 排序方式
  46. """
  47. print("=" * 60)
  48. print("搜索结果导入工具")
  49. print("=" * 60)
  50. print()
  51. # 1. 加载源数据
  52. print(f"步骤1: 加载源数据文件")
  53. print(f" {source_data_path}")
  54. source_data = load_json(source_data_path)
  55. print(f" ✓ 已加载 {len(source_data)} 个结果项")
  56. print()
  57. # 2. 加载进度文件
  58. print(f"步骤2: 加载搜索进度文件")
  59. print(f" {progress_path}")
  60. progress = load_json(progress_path)
  61. completed_searches = progress.get('completed_searches', {})
  62. print(f" ✓ 已加载 {len(completed_searches)} 个搜索结果")
  63. print()
  64. # 3. 统计特征信息
  65. print("步骤3: 统计特征信息")
  66. total_features = 0
  67. features_with_search_word = 0
  68. unique_search_words = set()
  69. for result in source_data:
  70. for assoc in result.get('找到的关联', []):
  71. for feature in assoc.get('特征列表', []):
  72. total_features += 1
  73. search_word = feature.get('search_word')
  74. if search_word:
  75. features_with_search_word += 1
  76. unique_search_words.add(search_word)
  77. print(f" 总特征数: {total_features}")
  78. print(f" 有search_word的特征: {features_with_search_word}")
  79. print(f" 唯一search_word数: {len(unique_search_words)}")
  80. print()
  81. # 4. 导入搜索结果
  82. print("步骤4: 导入搜索结果")
  83. matched_count = 0
  84. not_found_count = 0
  85. success_count = 0
  86. failed_count = 0
  87. for result_idx, result in enumerate(source_data):
  88. for assoc_idx, assoc in enumerate(result.get('找到的关联', [])):
  89. for feature_idx, feature in enumerate(assoc.get('特征列表', [])):
  90. search_word = feature.get('search_word')
  91. # 跳过空的 search_word
  92. if not search_word:
  93. continue
  94. # 查找对应的搜索结果
  95. if search_word in completed_searches:
  96. matched_count += 1
  97. search_result = completed_searches[search_word]
  98. # 深拷贝搜索结果,避免共享引用
  99. feature['search_result'] = copy.deepcopy(search_result)
  100. # 添加元数据
  101. if search_result and search_result.get('data'):
  102. note_count = len(search_result.get('data', {}).get('data', []))
  103. feature['search_metadata'] = {
  104. 'searched_at': datetime.now().isoformat(),
  105. 'status': 'success',
  106. 'note_count': note_count,
  107. 'search_params': {
  108. 'keyword': search_word,
  109. 'content_type': content_type,
  110. 'sort_type': sort_type
  111. }
  112. }
  113. success_count += 1
  114. else:
  115. # 搜索结果为空或失败
  116. feature['search_metadata'] = {
  117. 'searched_at': datetime.now().isoformat(),
  118. 'status': 'failed',
  119. 'note_count': 0,
  120. 'search_params': {
  121. 'keyword': search_word,
  122. 'content_type': content_type,
  123. 'sort_type': sort_type
  124. }
  125. }
  126. failed_count += 1
  127. else:
  128. not_found_count += 1
  129. print(f" 匹配成功: {matched_count} 个特征")
  130. print(f" 搜索成功: {success_count} 个")
  131. print(f" 搜索失败: {failed_count} 个")
  132. print(f" 未找到搜索结果: {not_found_count} 个")
  133. print()
  134. # 5. 保存结果
  135. print("步骤5: 保存输出文件")
  136. print(f" {output_path}")
  137. save_json(source_data, output_path)
  138. print()
  139. # 6. 输出统计信息
  140. print("=" * 60)
  141. print("导入完成")
  142. print("=" * 60)
  143. print()
  144. print(f"总特征数: {total_features}")
  145. print(f"有search_word的特征: {features_with_search_word}")
  146. print(f"已导入搜索结果: {matched_count} ({matched_count/features_with_search_word*100:.1f}%)")
  147. print(f" - 成功: {success_count}")
  148. print(f" - 失败: {failed_count}")
  149. print(f"待搜索: {not_found_count} ({not_found_count/features_with_search_word*100:.1f}%)")
  150. print()
  151. def main():
  152. """主函数"""
  153. parser = argparse.ArgumentParser(description='搜索结果导入工具')
  154. parser.add_argument(
  155. '--source',
  156. default='associated_tags_results_with_search.json',
  157. help='源数据文件路径(默认: associated_tags_results_with_search.json)'
  158. )
  159. parser.add_argument(
  160. '--progress',
  161. default='search_progress.json',
  162. help='进度文件路径(默认: search_progress.json)'
  163. )
  164. parser.add_argument(
  165. '--output',
  166. default='associated_tags_results_with_search_data.json',
  167. help='输出文件路径(默认: associated_tags_results_with_search_data.json)'
  168. )
  169. parser.add_argument(
  170. '--content-type',
  171. default='图文',
  172. help='内容类型(默认: 图文)'
  173. )
  174. parser.add_argument(
  175. '--sort-type',
  176. default='综合',
  177. help='排序方式(默认: 综合)'
  178. )
  179. args = parser.parse_args()
  180. # 执行导入
  181. import_search_results(
  182. source_data_path=args.source,
  183. progress_path=args.progress,
  184. output_path=args.output,
  185. content_type=args.content_type,
  186. sort_type=args.sort_type
  187. )
  188. if __name__ == '__main__':
  189. main()