#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Origin (from scraped page header): liulidong/knowledge_search
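"""
Search result import tool.

Imports the completed search results stored in search_progress.json into the
matching feature nodes of associated_tags_results_with_search.json.

Matching rule: a feature is matched by its ``search_word`` field.
"""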

import json
import copy
from datetime import datetime
from typing import Dict, Any
import argparse


def load_json(file_path: str) -> Any:
    """Read ``file_path`` as UTF-8 text and return the decoded JSON value.

    Any exception raised while opening or parsing the file is reported on
    stdout and then re-raised unchanged.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            parsed = json.load(source)
    except Exception as err:
        print(f"错误: 加载文件失败 {file_path}: {err}")
        raise
    else:
        return parsed


def save_json(data: Any, file_path: str):
    """Serialize ``data`` as indented, non-ASCII-escaped JSON into ``file_path``.

    A confirmation line is printed after the file has been written. Any
    exception raised along the way is reported on stdout and re-raised.
    """
    try:
        with open(file_path, mode='w', encoding='utf-8') as sink:
            json.dump(data, sink, indent=2, ensure_ascii=False)
        print(f"✓ 已保存: {file_path}")
    except Exception as err:
        print(f"错误: 保存文件失败 {file_path}: {err}")
        raise


def _build_search_metadata(
    search_word: str,
    status: str,
    note_count: int,
    content_type: str,
    sort_type: str
) -> Dict[str, Any]:
    """Return the ``search_metadata`` record stored on a matched feature."""
    return {
        # Timestamp of the import itself; the progress file's own timing is not read.
        'searched_at': datetime.now().isoformat(),
        'status': status,
        'note_count': note_count,
        'search_params': {
            'keyword': search_word,
            'content_type': content_type,
            'sort_type': sort_type
        }
    }


def _format_share(count: int, total: int) -> str:
    """Format ``count / total`` as a one-decimal percentage; 0.0% if total is 0."""
    share = count / total * 100 if total else 0.0
    return f"{share:.1f}%"


def import_search_results(
    source_data_path: str,
    progress_path: str,
    output_path: str,
    content_type: str = '图文',
    sort_type: str = '综合'
) -> None:
    """
    Import completed search results into the features of the source data.

    Every feature (found under ``找到的关联`` -> ``特征列表`` of each result item)
    that has a non-empty ``search_word`` is looked up in the progress file's
    ``completed_searches`` mapping. On a hit the feature receives a deep copy
    of the stored result as ``search_result`` plus a ``search_metadata``
    record; features without a hit are only counted. The updated data is
    written to ``output_path`` and progress/statistics are printed to stdout.

    Args:
        source_data_path: Path of the source data file (features with search_word).
        progress_path: Path of the progress file (completed search results).
        output_path: Path of the output file.
        content_type: Content type recorded in each feature's search_params.
        sort_type: Sort order recorded in each feature's search_params.

    Raises:
        Exception: Whatever ``load_json`` / ``save_json`` re-raise on I/O or
            JSON errors.
    """
    print("=" * 60)
    print("搜索结果导入工具")
    print("=" * 60)
    print()

    # 1. Load the source data.
    print("步骤1: 加载源数据文件")
    print(f"  {source_data_path}")
    source_data = load_json(source_data_path)
    print(f"  ✓ 已加载 {len(source_data)} 个结果项")
    print()

    # 2. Load the progress file; completed_searches is keyed by search_word.
    print("步骤2: 加载搜索进度文件")
    print(f"  {progress_path}")
    progress = load_json(progress_path)
    completed_searches = progress.get('completed_searches', {})
    print(f"  ✓ 已加载 {len(completed_searches)} 个搜索结果")
    print()

    # 3. Gather feature statistics.
    print("步骤3: 统计特征信息")
    total_features = 0
    features_with_search_word = 0
    unique_search_words = set()

    for result in source_data:
        for assoc in result.get('找到的关联', []):
            for feature in assoc.get('特征列表', []):
                total_features += 1
                search_word = feature.get('search_word')
                if search_word:
                    features_with_search_word += 1
                    unique_search_words.add(search_word)

    print(f"  总特征数: {total_features}")
    print(f"  有search_word的特征: {features_with_search_word}")
    print(f"  唯一search_word数: {len(unique_search_words)}")
    print()

    # 4. Attach the search results to the matching features.
    print("步骤4: 导入搜索结果")
    matched_count = 0
    not_found_count = 0
    success_count = 0
    failed_count = 0

    for result in source_data:
        for assoc in result.get('找到的关联', []):
            for feature in assoc.get('特征列表', []):
                search_word = feature.get('search_word')

                # Features without a search_word were never searched.
                if not search_word:
                    continue

                if search_word not in completed_searches:
                    not_found_count += 1
                    continue

                matched_count += 1
                search_result = completed_searches[search_word]

                # Deep copy so features sharing a search_word do not share objects.
                feature['search_result'] = copy.deepcopy(search_result)

                # NOTE(review): a successful result is assumed to look like
                # {'data': {'data': [...notes...]}} — confirm against the
                # producer of the progress file.
                payload = search_result.get('data') if search_result else None
                if payload:
                    # `or []` tolerates an inner 'data' that is present but null.
                    note_count = len(payload.get('data') or [])
                    status = 'success'
                    success_count += 1
                else:
                    # Empty or failed search result.
                    note_count = 0
                    status = 'failed'
                    failed_count += 1

                feature['search_metadata'] = _build_search_metadata(
                    search_word, status, note_count, content_type, sort_type
                )

    print(f"  匹配成功: {matched_count} 个特征")
    print(f"  搜索成功: {success_count} 个")
    print(f"  搜索失败: {failed_count} 个")
    print(f"  未找到搜索结果: {not_found_count} 个")
    print()

    # 5. Save the result.
    print("步骤5: 保存输出文件")
    print(f"  {output_path}")
    save_json(source_data, output_path)
    print()

    # 6. Print the summary. Percentages go through _format_share so that input
    #    without any search_word no longer raises ZeroDivisionError here.
    print("=" * 60)
    print("导入完成")
    print("=" * 60)
    print()
    print(f"总特征数: {total_features}")
    print(f"有search_word的特征: {features_with_search_word}")
    print(f"已导入搜索结果: {matched_count} ({_format_share(matched_count, features_with_search_word)})")
    print(f"  - 成功: {success_count}")
    print(f"  - 失败: {failed_count}")
    print(f"待搜索: {not_found_count} ({_format_share(not_found_count, features_with_search_word)})")
    print()


def main():
    """Parse the command-line options and run the import with them."""
    # (flag, default value, help text) for every supported option, in the
    # order they are registered with the parser.
    option_specs = (
        (
            '--source',
            'associated_tags_results_with_search.json',
            '源数据文件路径（默认: associated_tags_results_with_search.json）',
        ),
        (
            '--progress',
            'search_progress.json',
            '进度文件路径（默认: search_progress.json）',
        ),
        (
            '--output',
            'associated_tags_results_with_search_data.json',
            '输出文件路径（默认: associated_tags_results_with_search_data.json）',
        ),
        (
            '--content-type',
            '图文',
            '内容类型（默认: 图文）',
        ),
        (
            '--sort-type',
            '综合',
            '排序方式（默认: 综合）',
        ),
    )

    cli = argparse.ArgumentParser(description='搜索结果导入工具')
    for flag, fallback, help_text in option_specs:
        cli.add_argument(flag, default=fallback, help=help_text)

    options = cli.parse_args()

    # Hand the parsed options straight to the import routine.
    import_search_results(
        source_data_path=options.source,
        progress_path=options.progress,
        output_path=options.output,
        content_type=options.content_type,
        sort_type=options.sort_type
    )


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()