# add_search_words.py
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 为关联特征生成检索词并去重
  5. 读取 associated_tags_results.json,为每个特征生成组合检索词,
  6. 并在同一结果项内去重。
  7. """
  8. import json
  9. import logging
  10. from pathlib import Path
  11. from typing import Dict, List, Any, Set
  12. import argparse
# Logging configuration: timestamped INFO-level messages to stderr by default.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger shared by the whole script.
logger = logging.getLogger(__name__)
  20. class SearchWordGenerator:
  21. """检索词生成器"""
  22. def __init__(self, input_path: str):
  23. """
  24. 初始化生成器
  25. Args:
  26. input_path: 输入JSON文件路径
  27. """
  28. self.input_path = input_path
  29. self.data = self._load_json(input_path)
  30. self.stats = {
  31. '处理的结果项数': 0,
  32. '生成的总组合词数': 0,
  33. '唯一组合词数': 0,
  34. '重复过滤的词数': 0,
  35. '每项详情': []
  36. }
  37. def _load_json(self, file_path: str) -> List[Dict]:
  38. """加载JSON文件"""
  39. try:
  40. with open(file_path, 'r', encoding='utf-8') as f:
  41. return json.load(f)
  42. except Exception as e:
  43. logger.error(f"加载文件 {file_path} 失败: {e}")
  44. raise
  45. def generate_search_words(self) -> List[Dict[str, Any]]:
  46. """
  47. 为所有结果项生成检索词
  48. Returns:
  49. 增强后的数据列表
  50. """
  51. logger.info("=" * 60)
  52. logger.info("开始生成检索词")
  53. logger.info("=" * 60)
  54. enhanced_data = []
  55. for idx, result in enumerate(self.data, 1):
  56. logger.info(f"\n处理第 {idx}/{len(self.data)} 个结果项")
  57. # 获取基础词(人设特征名称)
  58. base_word = result.get('最高匹配信息', {}).get('人设特征名称', '')
  59. original_feature = result.get('原始特征名称', '')
  60. logger.info(f" 原始特征: {original_feature}")
  61. logger.info(f" 人设特征名称(基础词): {base_word}")
  62. if not base_word:
  63. logger.warning(f" 警告:未找到人设特征名称,跳过")
  64. enhanced_data.append(result)
  65. continue
  66. # 用于去重的集合(在当前结果项范围内)
  67. seen_words: Set[str] = set()
  68. item_stats = {
  69. '原始特征': original_feature,
  70. '人设特征名称': base_word,
  71. '总特征数': 0,
  72. '唯一组合词数': 0,
  73. '重复词数': 0,
  74. '组合词列表': []
  75. }
  76. # 遍历所有关联
  77. associations = result.get('找到的关联', [])
  78. for assoc_idx, assoc in enumerate(associations):
  79. target_path = assoc.get('目标分类路径', '')
  80. features = assoc.get('特征列表', [])
  81. logger.info(f" 处理关联 {assoc_idx + 1}/{len(associations)}: {target_path}")
  82. logger.info(f" 特征数: {len(features)}")
  83. # 遍历特征列表
  84. for feature in features:
  85. feature_name = feature.get('特征名称', '')
  86. item_stats['总特征数'] += 1
  87. if not feature_name:
  88. feature['search_word'] = None
  89. continue
  90. # 生成组合词
  91. search_word = f"{base_word} {feature_name}"
  92. # 检查是否重复
  93. if search_word not in seen_words:
  94. # 首次出现,填充
  95. feature['search_word'] = search_word
  96. seen_words.add(search_word)
  97. item_stats['唯一组合词数'] += 1
  98. item_stats['组合词列表'].append(search_word)
  99. logger.debug(f" + 新增: {search_word}")
  100. else:
  101. # 重复,留空
  102. feature['search_word'] = None
  103. item_stats['重复词数'] += 1
  104. logger.debug(f" - 重复(留空): {search_word}")
  105. # 记录统计
  106. logger.info(f" 完成:总特征 {item_stats['总特征数']} 个,"
  107. f"唯一组合词 {item_stats['唯一组合词数']} 个,"
  108. f"重复 {item_stats['重复词数']} 个")
  109. self.stats['处理的结果项数'] += 1
  110. self.stats['生成的总组合词数'] += item_stats['总特征数']
  111. self.stats['唯一组合词数'] += item_stats['唯一组合词数']
  112. self.stats['重复过滤的词数'] += item_stats['重复词数']
  113. self.stats['每项详情'].append(item_stats)
  114. enhanced_data.append(result)
  115. logger.info("\n" + "=" * 60)
  116. logger.info("生成完成")
  117. logger.info("=" * 60)
  118. logger.info(f"处理的结果项数: {self.stats['处理的结果项数']}")
  119. logger.info(f"生成的总组合词数: {self.stats['生成的总组合词数']}")
  120. logger.info(f"唯一组合词数: {self.stats['唯一组合词数']}")
  121. logger.info(f"重复过滤的词数: {self.stats['重复过滤的词数']}")
  122. return enhanced_data
  123. def save_results(self, enhanced_data: List[Dict[str, Any]], output_path: str):
  124. """保存增强后的数据"""
  125. try:
  126. with open(output_path, 'w', encoding='utf-8') as f:
  127. json.dump(enhanced_data, f, ensure_ascii=False, indent=2)
  128. logger.info(f"增强数据已保存到: {output_path}")
  129. except Exception as e:
  130. logger.error(f"保存结果失败: {e}")
  131. raise
  132. def save_stats(self, stats_path: str):
  133. """保存统计信息"""
  134. try:
  135. with open(stats_path, 'w', encoding='utf-8') as f:
  136. json.dump(self.stats, f, ensure_ascii=False, indent=2)
  137. logger.info(f"统计信息已保存到: {stats_path}")
  138. except Exception as e:
  139. logger.error(f"保存统计信息失败: {e}")
  140. raise
  141. def main():
  142. """主函数"""
  143. parser = argparse.ArgumentParser(description='为关联特征生成检索词并去重')
  144. parser.add_argument(
  145. '--input',
  146. default='associated_tags_results.json',
  147. help='输入JSON文件路径(默认: associated_tags_results.json)'
  148. )
  149. parser.add_argument(
  150. '--output',
  151. default='associated_tags_results_with_search.json',
  152. help='输出JSON文件路径(默认: associated_tags_results_with_search.json)'
  153. )
  154. parser.add_argument(
  155. '--stats',
  156. default='search_words_stats.json',
  157. help='统计信息输出路径(默认: search_words_stats.json)'
  158. )
  159. parser.add_argument(
  160. '--debug',
  161. action='store_true',
  162. help='启用调试日志'
  163. )
  164. args = parser.parse_args()
  165. # 设置日志级别
  166. if args.debug:
  167. logger.setLevel(logging.DEBUG)
  168. # 创建生成器
  169. generator = SearchWordGenerator(input_path=args.input)
  170. # 生成检索词
  171. enhanced_data = generator.generate_search_words()
  172. # 保存结果
  173. generator.save_results(enhanced_data, args.output)
  174. generator.save_stats(args.stats)
  175. # 输出汇总
  176. logger.info("\n" + "=" * 60)
  177. logger.info("处理完成汇总")
  178. logger.info("=" * 60)
  179. logger.info(f"输入文件: {args.input}")
  180. logger.info(f"输出文件: {args.output}")
  181. logger.info(f"统计文件: {args.stats}")
  182. logger.info(f"")
  183. logger.info(f"处理结果:")
  184. logger.info(f" - 结果项数: {generator.stats['处理的结果项数']}")
  185. logger.info(f" - 总特征数: {generator.stats['生成的总组合词数']}")
  186. logger.info(f" - 唯一组合词: {generator.stats['唯一组合词数']}")
  187. logger.info(f" - 重复过滤: {generator.stats['重复过滤的词数']}")
  188. logger.info(f" - 去重率: {generator.stats['重复过滤的词数'] / max(generator.stats['生成的总组合词数'], 1) * 100:.1f}%")
  189. if __name__ == '__main__':
  190. main()