#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Search result import tool.

Imports the completed search results stored in search_progress.json into the
matching feature nodes of associated_tags_results_with_search.json.

Matching rule: a feature is matched on its `search_word` field.
"""

import argparse
import copy
import json
from datetime import datetime
from typing import Any


def load_json(file_path: str) -> Any:
    """Load a JSON file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error: failed to load {file_path}: {e}")
        raise


def save_json(data: Any, file_path: str):
    """Save data as a JSON file."""
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"✓ Saved: {file_path}")
    except Exception as e:
        print(f"Error: failed to save {file_path}: {e}")
        raise


def import_search_results(
    source_data_path: str,
    progress_path: str,
    output_path: str,
    content_type: str = '图文',
    sort_type: str = '综合'
):
    """
    Import search results.

    Args:
        source_data_path: Path to the source data file (contains the features and their search_word).
        progress_path: Path to the progress file (contains the completed search results).
        output_path: Path of the output file.
        content_type: Content type used for the search.
        sort_type: Sort order used for the search.
    """
    # Note: '找到的关联' (associations found) and '特征列表' (feature list) are the
    # Chinese keys used in the source data and are kept verbatim below.
    print("=" * 60)
    print("Search result import tool")
    print("=" * 60)
    print()

    # 1. Load the source data
    print("Step 1: loading the source data file")
    print(f"  {source_data_path}")
    source_data = load_json(source_data_path)
    print(f"  ✓ Loaded {len(source_data)} result items")
    print()

    # 2. Load the progress file
    print("Step 2: loading the search progress file")
    print(f"  {progress_path}")
    progress = load_json(progress_path)
    completed_searches = progress.get('completed_searches', {})
    print(f"  ✓ Loaded {len(completed_searches)} search results")
    print()

    # 3. Collect feature statistics
    print("Step 3: collecting feature statistics")
    total_features = 0
    features_with_search_word = 0
    unique_search_words = set()

    for result in source_data:
        for assoc in result.get('找到的关联', []):
            for feature in assoc.get('特征列表', []):
                total_features += 1
                search_word = feature.get('search_word')
                if search_word:
                    features_with_search_word += 1
                    unique_search_words.add(search_word)

    print(f"  Total features: {total_features}")
    print(f"  Features with a search_word: {features_with_search_word}")
    print(f"  Unique search_words: {len(unique_search_words)}")
    print()

    # 4. Import the search results
    print("Step 4: importing search results")
    matched_count = 0
    not_found_count = 0
    success_count = 0
    failed_count = 0

    for result in source_data:
        for assoc in result.get('找到的关联', []):
            for feature in assoc.get('特征列表', []):
                search_word = feature.get('search_word')

                # Skip features without a search_word
                if not search_word:
                    continue

                # Look up the corresponding search result
                if search_word in completed_searches:
                    matched_count += 1
                    search_result = completed_searches[search_word]

                    # Deep-copy the search result to avoid shared references
                    feature['search_result'] = copy.deepcopy(search_result)

                    # Attach metadata
                    if search_result and search_result.get('data'):
                        note_count = len(search_result.get('data', {}).get('data', []))
                        feature['search_metadata'] = {
                            'searched_at': datetime.now().isoformat(),
                            'status': 'success',
                            'note_count': note_count,
                            'search_params': {
                                'keyword': search_word,
                                'content_type': content_type,
                                'sort_type': sort_type
                            }
                        }
                        success_count += 1
                    else:
                        # The search result is empty or the search failed
                        feature['search_metadata'] = {
                            'searched_at': datetime.now().isoformat(),
                            'status': 'failed',
                            'note_count': 0,
                            'search_params': {
                                'keyword': search_word,
                                'content_type': content_type,
                                'sort_type': sort_type
                            }
                        }
                        failed_count += 1
                else:
                    not_found_count += 1

    print(f"  Matched: {matched_count} features")
    print(f"  Searches succeeded: {success_count}")
    print(f"  Searches failed: {failed_count}")
    print(f"  No search result found: {not_found_count}")
    print()

    # 5. Save the result
    print("Step 5: saving the output file")
    print(f"  {output_path}")
    save_json(source_data, output_path)
    print()

    # 6. Print summary statistics
    print("=" * 60)
    print("Import finished")
    print("=" * 60)
    print()
    print(f"Total features: {total_features}")
    print(f"Features with a search_word: {features_with_search_word}")
    # Guard against division by zero when no feature carries a search_word
    if features_with_search_word:
        imported_pct = matched_count / features_with_search_word * 100
        pending_pct = not_found_count / features_with_search_word * 100
    else:
        imported_pct = pending_pct = 0.0
    print(f"Search results imported: {matched_count} ({imported_pct:.1f}%)")
    print(f"  - succeeded: {success_count}")
    print(f"  - failed: {failed_count}")
    print(f"Still to be searched: {not_found_count} ({pending_pct:.1f}%)")
    print()


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description='Search result import tool')
    parser.add_argument(
        '--source',
        default='associated_tags_results_with_search.json',
        help='Source data file path (default: associated_tags_results_with_search.json)'
    )
    parser.add_argument(
        '--progress',
        default='search_progress.json',
        help='Progress file path (default: search_progress.json)'
    )
    parser.add_argument(
        '--output',
        default='associated_tags_results_with_search_data.json',
        help='Output file path (default: associated_tags_results_with_search_data.json)'
    )
    parser.add_argument(
        '--content-type',
        default='图文',
        help='Content type (default: 图文, i.e. image-and-text posts)'
    )
    parser.add_argument(
        '--sort-type',
        default='综合',
        help='Sort order (default: 综合, i.e. comprehensive ranking)'
    )

    args = parser.parse_args()

    # Run the import
    import_search_results(
        source_data_path=args.source,
        progress_path=args.progress,
        output_path=args.output,
        content_type=args.content_type,
        sort_type=args.sort_type
    )


if __name__ == '__main__':
    main()
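
# Expected input shapes, inferred from the access patterns in this script; the
# exact field contents are illustrative only, and any fields the script does
# not read are omitted here.
#
#   search_progress.json
#   {
#     "completed_searches": {
#       "<search_word>": {
#         "data": { "data": [ { "...": "one note per entry" } ] }
#       }
#     }
#   }
#
#   associated_tags_results_with_search.json  (a list of result items)
#   [
#     {
#       "找到的关联": [
#         { "特征列表": [ { "search_word": "<keyword>" } ] }
#       ]
#     }
#   ]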
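
# Example invocation (illustrative; the script filename is assumed, and the
# flags shown simply restate their defaults):
#
#   python import_search_results.py \
#       --source associated_tags_results_with_search.json \
#       --progress search_progress.json \
#       --output associated_tags_results_with_search_data.json \
#       --content-type 图文 \
#       --sort-type 综合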