Przeglądaj źródła

feat: 添加key_points空值检查脚本

添加用于检查JSON文件中三点解构.关键点.key_points是否为空的脚本。

功能特性:
- 自动扫描所有账号目录
- 检查"用于pattern聚类"和"what单独解构"两个子目录
- 显示详细的文件列表(相对路径)
- 提供每个账号的统计小结
- 提供总体统计和各账号汇总

使用方法:
python script/data_processing/check_empty_key_points.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
yangxiaohui 1 tydzień temu
rodzic
commit
8b13ebe897
1 zmienionych plików z 227 dodań i 0 usunięć
  1. 227 0
      script/data_processing/check_empty_key_points.py

+ 227 - 0
script/data_processing/check_empty_key_points.py

@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+"""
+检查指定目录下JSON文件中的 三点解构.关键点.key_points 是否为空数组
+"""
+
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple
+
+
def check_key_points_empty(file_path: str) -> Tuple[Optional[bool], str]:
    """Check whether ``三点解构.关键点.key_points`` in a JSON file is an empty list.

    Args:
        file_path: Path of the JSON file to inspect.

    Returns:
        A ``(is_empty, error_msg)`` tuple. ``is_empty`` is ``True`` when
        ``key_points`` is an empty list and ``False`` when it is a non-empty
        list; ``error_msg`` is then ``""``. When the file cannot be read or
        parsed, or a step of the path is missing or has an unexpected type,
        ``is_empty`` is ``None`` and ``error_msg`` describes the problem.
    """
    # Keep the try body limited to the read/parse so that structural problems
    # in the data are not misreported as file-reading errors.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return None, f"JSON解析错误: {e}"
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers UnicodeDecodeError for files that are not UTF-8;
        # RecursionError covers pathologically nested JSON.
        return None, f"读取文件错误: {e}"

    # Walk 三点解构 -> 关键点 -> key_points, validating each level on the way.
    node = data
    for key in ('三点解构', '关键点', 'key_points'):
        if not isinstance(node, dict):
            return None, f"无法读取'{key}'字段: 上级不是对象类型,而是{type(node).__name__}"
        node = node.get(key)
        if node is None:
            # A key that is present with a null value is reported as missing too.
            return None, f"缺少'{key}'字段"

    if not isinstance(node, list):
        return None, f"key_points不是数组类型,而是{type(node).__name__}"

    return not node, ""
+
+
def scan_directory(directory: str) -> Dict[str, List[Tuple[str, Optional[bool], str]]]:
    """Check every ``*.json`` file directly inside ``directory``.

    Only the directory itself is searched; sub-directories are not descended
    into.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A dict with the keys ``'empty'``, ``'not_empty'`` and ``'error'``.
        Each value is a list of ``(file_path, is_empty, error_msg)`` tuples as
        produced by ``check_key_points_empty``; ``is_empty`` is ``None`` for
        the entries under ``'error'``.
    """
    results = {
        'empty': [],
        'not_empty': [],
        'error': [],
    }

    # glob() yields files in filesystem order, which differs between machines
    # and runs; sorting keeps the report stable.
    for json_file in sorted(Path(directory).glob('*.json')):
        path_str = str(json_file)
        is_empty, error_msg = check_key_points_empty(path_str)

        if is_empty is None:
            bucket = 'error'
        elif is_empty:
            bucket = 'empty'
        else:
            bucket = 'not_empty'
        results[bucket].append((path_str, is_empty, error_msg))

    return results
+
+
def print_results(dir_name: str, results: Dict[str, List[Tuple[str, Optional[bool], str]]], base_dir: str) -> None:
    """Print the check report for one scanned directory.

    Args:
        dir_name: Label printed in the report header.
        results: Mapping with the keys ``'empty'``, ``'not_empty'`` and
            ``'error'``; each value is a list of
            ``(file_path, is_empty, error_msg)`` tuples.
        base_dir: Directory that the printed file paths are made relative to.
    """
    empty_entries = results['empty']
    filled_entries = results['not_empty']
    error_entries = results['error']
    total = len(empty_entries) + len(filled_entries) + len(error_entries)

    print(f"\n{'='*80}")
    print(f"目录: {dir_name}")
    print(f"{'='*80}")
    print(f"总文件数: {total}")
    print(f"  - key_points为空的文件: {len(empty_entries)}")
    print(f"  - key_points不为空的文件: {len(filled_entries)}")
    print(f"  - 错误/缺失字段的文件: {len(error_entries)}")

    # Files whose key_points list is empty.
    if empty_entries:
        print(f"\n【key_points为空的文件】({len(empty_entries)}个):")
        for file_path, _, _ in empty_entries:
            rel_path = os.path.relpath(file_path, base_dir)
            print(f"  - {rel_path}")

    # Files that could not be checked, together with the reason.
    if error_entries:
        print(f"\n【错误/缺失字段的文件】({len(error_entries)}个):")
        for file_path, _, error_msg in error_entries:
            rel_path = os.path.relpath(file_path, base_dir)
            print(f"  - {rel_path}: {error_msg}")

    # Files with a non-empty key_points list, with the number of entries.
    if filled_entries:
        print(f"\n【key_points不为空的文件】({len(filled_entries)}个)")
        for file_path, _, _ in filled_entries:
            rel_path = os.path.relpath(file_path, base_dir)
            # The result tuples do not carry the count, so the file is read
            # again. This is best effort: if the file changed or vanished
            # since the scan, only the path is printed.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                count = len(data.get('三点解构', {}).get('关键点', {}).get('key_points', []))
            except (OSError, ValueError, AttributeError, TypeError):
                print(f"  - {rel_path}")
            else:
                print(f"  - {rel_path} (key_points数量: {count})")
+
+
def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def _summarize(result_sets) -> Dict[str, int]:
    """Add up the per-category file counts over several ``scan_directory`` results."""
    counts = {
        key: sum(len(r[key]) for r in result_sets)
        for key in ('empty', 'not_empty', 'error')
    }
    counts['total'] = counts['empty'] + counts['not_empty'] + counts['error']
    return counts


def main(base_path: str = "/Users/semsevens/Desktop/workspace/daily/1113/how_1121_v2/data/账号") -> None:
    """Scan every account directory under ``base_path`` and print a report.

    For each non-hidden sub-directory of ``base_path`` (one per account), the
    two sub-directories ``用于pattern聚类`` and ``what单独解构`` are scanned.
    A per-directory report, a per-account summary, an overall summary and a
    per-account recap are printed to stdout.

    Args:
        base_path: Directory that contains one sub-directory per account.
            Defaults to the location the script was originally written for.
    """
    # Printed file paths are shown relative to the current working directory.
    base_dir = os.getcwd()

    print("开始检查 三点解构.关键点.key_points 是否为空...")

    if not os.path.isdir(base_path):
        print(f"基础路径不存在: {base_path}")
        return

    subdirs = ("用于pattern聚类", "what单独解构")
    all_results = []
    account_stats = {}  # account name -> counts for that account

    account_dirs = sorted(
        d for d in os.listdir(base_path)
        if not d.startswith('.') and os.path.isdir(os.path.join(base_path, d))
    )

    for account_name in account_dirs:
        account_path = os.path.join(base_path, account_name)

        print(f"\n{'#'*80}")
        print(f"账号: {account_name}")
        print(f"{'#'*80}")

        account_results = []
        for subdir_name in subdirs:
            subdir_path = os.path.join(account_path, subdir_name)
            if not os.path.exists(subdir_path):
                print(f"\n目录不存在: {subdir_name}")
                continue

            results = scan_directory(subdir_path)
            print_results(subdir_name, results, base_dir)
            all_results.append(results)
            account_results.append(results)

        # An account with neither sub-directory gets no summary.
        if not account_results:
            continue

        stats = _summarize(account_results)
        account_stats[account_name] = stats
        account_total = stats['total']

        # The sub-directories may exist but hold no JSON files, so the total
        # can be 0; _percent() guards the division in that case.
        print(f"\n{'-'*80}")
        print(f"【{account_name} 账号小结】")
        print(f"{'-'*80}")
        print(f"文件总数: {account_total}")
        print(f"  - key_points为空: {stats['empty']} ({_percent(stats['empty'], account_total):.1f}%)")
        print(f"  - key_points不为空: {stats['not_empty']} ({_percent(stats['not_empty'], account_total):.1f}%)")
        if stats['error'] > 0:
            print(f"  - 错误/缺失字段: {stats['error']} ({_percent(stats['error'], account_total):.1f}%)")

    # Overall statistics across all accounts.
    print(f"\n{'='*80}")
    print("【总体统计】")
    print(f"{'='*80}")

    totals = _summarize(all_results)
    total_all = totals['total']

    if total_all == 0:
        print("未找到任何文件")
        return

    print(f"所有账号共计文件数: {total_all}")
    print(f"  - key_points为空: {totals['empty']} ({_percent(totals['empty'], total_all):.1f}%)")
    print(f"  - key_points不为空: {totals['not_empty']} ({_percent(totals['not_empty'], total_all):.1f}%)")
    if totals['error'] > 0:
        print(f"  - 错误/缺失字段: {totals['error']} ({_percent(totals['error'], total_all):.1f}%)")

    # Per-account recap.
    if account_stats:
        print(f"\n{'='*80}")
        print("【各账号汇总】")
        print(f"{'='*80}")
        for account_name, stats in sorted(account_stats.items()):
            print(f"\n{account_name}:")
            print(f"  文件总数: {stats['total']}")
            print(f"  key_points为空: {stats['empty']} ({_percent(stats['empty'], stats['total']):.1f}%)")
            print(f"  key_points不为空: {stats['not_empty']} ({_percent(stats['not_empty'], stats['total']):.1f}%)")
+
+
+if __name__ == "__main__":
+    main()