- """
- 批量测试脚本:处理作者历史帖子目录下的所有帖子(并发版本,带历史帖子)
- 功能:
- 1. 加载最新的20个帖子(按照publish_timestamp从新到旧排序)
- 2. 为每个帖子加载历史帖子(比当前帖子早的最近15篇)
- 3. 并发处理所有帖子
- """
import json
import sys
import os
import argparse
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import traceback

# Add the project root to the import path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Manually load the .env file
def load_env_file(env_path):
    """Manually load a .env file into os.environ."""
    if not env_path.exists():
        return False
    with open(env_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip comments and blank lines
            if not line or line.startswith('#'):
                continue
            # Parse KEY=VALUE pairs
            if '=' in line:
                key, value = line.split('=', 1)
                os.environ[key.strip()] = value.strip()
    return True
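
# Example .env file accepted by the parser above (a minimal sketch; only
# GEMINI_API_KEY is actually checked by this script, the other entry is
# hypothetical):
#
#     # API credentials
#     GEMINI_API_KEY=your-key-here
#     SOME_OTHER_SETTING=value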

env_path = project_root / ".env"
if load_env_file(env_path):
    print(f"✅ Loaded environment variables from: {env_path}")
    # Verify the API key
    api_key = os.environ.get("GEMINI_API_KEY", "")
    if api_key:
        print(f"   GEMINI_API_KEY: {api_key[:10]}...")
else:
    print(f"⚠️ .env file not found: {env_path}")

from src.workflows.what_deconstruction_workflow import WhatDeconstructionWorkflow
from src.utils.logger import get_logger

logger = get_logger(__name__)

# Lock for thread-safe console output
print_lock = threading.Lock()

# Thread-safe success/failure counter
class ThreadSafeCounter:
    def __init__(self):
        self._lock = threading.Lock()
        self._success = 0
        self._fail = 0

    def increment_success(self):
        with self._lock:
            self._success += 1

    def increment_fail(self):
        with self._lock:
            self._fail += 1

    @property
    def success(self):
        with self._lock:
            return self._success

    @property
    def fail(self):
        with self._lock:
            return self._fail


def safe_print(*args, **kwargs):
    """Thread-safe print."""
    with print_lock:
        print(*args, **kwargs)

def load_historical_posts(history_dir, target_timestamp=None, target_post_id=None, max_count=15):
    """
    Load historical posts, sorted by publish_timestamp from newest to oldest.

    Selects the most recently published posts that predate the target post,
    excluding the target post itself.

    Args:
        history_dir: directory containing historical posts
        target_timestamp: publish timestamp of the target post (optional)
        target_post_id: ID of the target post, used to filter it out (optional)
        max_count: maximum number of posts to load

    Returns:
        list: historical posts, newest first
    """
    history_path = Path(history_dir)
    if not history_path.exists():
        safe_print(f"⚠️ Historical post directory does not exist: {history_path}")
        return []

    # Collect all JSON files
    json_files = list(history_path.glob("*.json"))
    if not json_files:
        safe_print("⚠️ No historical post files found")
        return []

    # Read every post and extract its publish_timestamp
    posts_with_timestamp = []
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                post_data = json.load(f)
            # Fall back to 0 if the publish timestamp is missing
            timestamp = post_data.get("publish_timestamp", 0)
            post_id = post_data.get("channel_content_id", "")
            posts_with_timestamp.append({
                "file_path": file_path,
                "post_data": post_data,
                "timestamp": timestamp,
                "post_id": post_id
            })
        except Exception as e:
            safe_print(f"   ⚠️ Failed to read file {file_path.name}: {e}")
            continue

    if not posts_with_timestamp:
        safe_print("⚠️ No posts could be read")
        return []

    # Exclude the target post itself
    if target_post_id is not None:
        posts_with_timestamp = [
            post for post in posts_with_timestamp
            if post["post_id"] != target_post_id
        ]

    # If a target timestamp is given, keep only posts published earlier
    if target_timestamp is not None:
        posts_with_timestamp = [
            post for post in posts_with_timestamp
            if post["timestamp"] < target_timestamp
        ]

    if not posts_with_timestamp:
        return []

    # Sort by publish_timestamp, newest first
    posts_with_timestamp.sort(key=lambda x: x["timestamp"], reverse=True)

    # Keep the most recent N posts (slicing handles lists shorter than max_count)
    selected_posts = posts_with_timestamp[:max_count]

    historical_posts = []
    for post_info in selected_posts:
        post_data = post_info["post_data"]
        # Convert to the format expected by the workflow
        historical_post = {
            "text": {
                "title": post_data.get("title", ""),
                "body": post_data.get("body_text", ""),
                "hashtags": ""
            },
            "images": post_data.get("images", [])
        }
        historical_posts.append(historical_post)

    return historical_posts

def load_post_files(directory, max_count=20):
    """
    Load all JSON files in the author's post-history directory, sort them by
    publish_timestamp from newest to oldest, and return the latest max_count.

    Args:
        directory: post directory
        max_count: maximum number of posts to load (default 20)

    Returns:
        list: post file paths, newest first
    """
    post_dir = Path(directory)
    if not post_dir.exists():
        raise FileNotFoundError(f"Directory does not exist: {post_dir}")

    # Collect all JSON files
    json_files = list(post_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in directory: {post_dir}")

    # Read every post and extract its publish_timestamp
    posts_with_timestamp = []
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                post_data = json.load(f)
            # Fall back to 0 if the publish timestamp is missing
            timestamp = post_data.get("publish_timestamp", 0)
            posts_with_timestamp.append({
                "file_path": file_path,
                "timestamp": timestamp,
                "post_data": post_data
            })
        except Exception as e:
            print(f"⚠️ Failed to read file {file_path.name}: {e}")
            continue

    if not posts_with_timestamp:
        raise FileNotFoundError("No posts could be read")

    # Sort by publish_timestamp, newest first
    posts_with_timestamp.sort(key=lambda x: x["timestamp"], reverse=True)

    # Keep the latest max_count posts
    selected_posts = posts_with_timestamp[:max_count]

    print(f"📊 Sorted by time; selected the latest {len(selected_posts)} posts:")
    for idx, post_info in enumerate(selected_posts, 1):
        post_data = post_info["post_data"]
        publish_time = post_data.get("publish_time", "unknown")
        title = post_data.get("title", "untitled")
        print(f"   {idx}. {post_info['file_path'].name}")
        print(f"      Title: {title}")
        print(f"      Published: {publish_time}")

    return [post_info["file_path"] for post_info in selected_posts]

def convert_to_workflow_input(raw_data, historical_posts=None):
    """
    Convert raw post data into the workflow input format.

    Args:
        raw_data: raw post data
        historical_posts: list of historical posts (optional)
    """
    images = raw_data.get("images", [])
    input_data = {
        "multimedia_content": {
            "images": images,
            "video": raw_data.get("video", {}),
            "text": {
                "title": raw_data.get("title", ""),
                "body": raw_data.get("body_text", ""),
                "hashtags": ""
            }
        },
        "comments": raw_data.get("comments", []),  # include comment data
        "creator_info": {
            "nickname": raw_data.get("channel_account_name", ""),
            "account_id": raw_data.get("channel_account_id", "")
        }
    }

    # Attach historical posts to the input data when available
    if historical_posts:
        input_data["historical_posts"] = historical_posts

    return input_data

def process_single_post(post_file, posts_dir, output_dir, counter, total_count, current_index):
    """Process a single post file (thread-safe version, with historical posts)."""
    post_name = post_file.stem  # file name without extension
    thread_id = threading.current_thread().name

    safe_print(f"\n{'='*80}")
    safe_print(f"[Thread:{thread_id}] Processing post: {post_name}")
    safe_print(f"Progress: [{current_index}/{total_count}]")
    safe_print(f"{'='*80}")

    # 1. Load the post data
    safe_print(f"\n[Thread:{thread_id}][1] Loading post data...")
    try:
        with open(post_file, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
        target_timestamp = raw_data.get('publish_timestamp')
        target_post_id = raw_data.get('channel_content_id')
        safe_print(f"✅ [{thread_id}] Post data loaded")
        safe_print(f"   - Title: {raw_data.get('title', 'N/A')}")
        safe_print(f"   - Post ID: {target_post_id}")
        safe_print(f"   - Published: {raw_data.get('publish_time', 'unknown')}")
        safe_print(f"   - Images: {len(raw_data.get('images', []))}")
        safe_print(f"   - Likes: {raw_data.get('like_count', 0)}")
        safe_print(f"   - Comments: {raw_data.get('comment_count', 0)}")
    except Exception as e:
        safe_print(f"❌ [{thread_id}] Failed to load post data: {e}")
        counter.increment_fail()
        return False

    # 2. Load historical posts
    safe_print(f"\n[Thread:{thread_id}][2] Loading historical posts...")
    historical_posts = load_historical_posts(
        posts_dir,
        target_timestamp=target_timestamp,
        target_post_id=target_post_id,
        max_count=15
    )
    if historical_posts:
        safe_print(f"✅ [{thread_id}] Loaded {len(historical_posts)} historical posts")
    else:
        safe_print(f"⚠️ [{thread_id}] No historical posts loaded; falling back to regular analysis mode")

    # 3. Convert the data format
    safe_print(f"\n[Thread:{thread_id}][3] Converting data format...")
    try:
        input_data = convert_to_workflow_input(raw_data, historical_posts)
        safe_print(f"✅ [{thread_id}] Data format converted")
        safe_print(f"   - Historical posts: {len(input_data.get('historical_posts', []))}")
    except Exception as e:
        safe_print(f"❌ [{thread_id}] Data format conversion failed: {e}")
        counter.increment_fail()
        return False

    # 4. Initialize the workflow (each thread creates its own instance for thread safety)
    safe_print(f"\n[Thread:{thread_id}][4] Initializing workflow...")
    try:
        workflow = WhatDeconstructionWorkflow(
            model_provider="google_genai",
            max_depth=10
        )
        safe_print(f"✅ [{thread_id}] Workflow initialized")
    except Exception as e:
        safe_print(f"❌ [{thread_id}] Workflow initialization failed: {e}")
        counter.increment_fail()
        return False

    # 5. Run the workflow
    safe_print(f"\n[Thread:{thread_id}][5] Running workflow...")
    safe_print("   Note: this may take several minutes...")
    try:
        result = workflow.invoke(input_data)
        safe_print(f"✅ [{thread_id}] Workflow finished")
    except Exception as e:
        safe_print(f"❌ [{thread_id}] Workflow failed: {e}")
        safe_print(traceback.format_exc())
        counter.increment_fail()
        return False

    # 6. Save the result
    safe_print(f"\n[Thread:{thread_id}][6] Saving result...")
    try:
        # Prefix the output file with the post file name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"{post_name}_with_history_{timestamp}.json"
        output_path = output_dir / output_filename
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        safe_print(f"✅ [{thread_id}] Result saved to: {output_path}")
    except Exception as e:
        safe_print(f"❌ [{thread_id}] Failed to save result: {e}")
        counter.increment_fail()
        return False

    # 7. Print a result summary
    safe_print(f"\n{'='*80}")
    safe_print(f"Result summary - {post_name} [Thread:{thread_id}]")
    safe_print(f"{'='*80}")
    if result:
        # The workflow returns Chinese keys; they are kept verbatim here.
        three_points = result.get("三点解构", {})
        inspiration_data = three_points.get("灵感点", {})
        keypoints_data = three_points.get("关键点", {})
        comments = result.get("评论分析", {}).get("解构维度", [])
        safe_print("\nThree-point deconstruction:")
        safe_print(f"   - Inspiration points: {inspiration_data.get('total_count', 0)}")
        safe_print(f"   - Inspiration analysis mode: {inspiration_data.get('analysis_mode', 'unknown')}")
        safe_print("   - Purpose points: 1")
        safe_print(f"   - Key points: {keypoints_data.get('total_count', 0)}")
        safe_print("\nComment analysis:")
        safe_print(f"   - Deconstruction dimensions: {len(comments)}")
        topic_understanding = result.get("选题理解", {})
        if topic_understanding:
            topic_theme = topic_understanding.get("topic_theme", "")
            safe_print("\nTopic understanding:")
            safe_print(f"   - Topic theme: {topic_theme}")

    counter.increment_success()
    return True

def main():
    """Main entry point (concurrent version, with historical posts)."""
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Batch What-deconstruction workflow over posts')
    parser.add_argument('directory', type=str, help='Post directory name (e.g. "阿里多多酱" or "G88818")')
    args = parser.parse_args()
    directory = args.directory

    print("=" * 80)
    print(f"Starting batch processing of author post history (concurrent mode, with historical posts) - directory: {directory}")
    print("=" * 80)

    # Configuration
    posts_dir = Path(__file__).parent / directory / "作者历史帖子"
    output_dir = Path(__file__).parent / directory / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Concurrency: maximum number of worker threads
    # (tune to CPU core count and API rate limits)
    MAX_WORKERS = 4  # adjust as needed; 5 or fewer is recommended

    # Limit on the number of posts to process
    MAX_POSTS = 20  # only the latest 20 posts

    # 1. Load all post files (newest first, latest 20)
    print("\n[1] Scanning post files...")
    try:
        post_files = load_post_files(posts_dir, max_count=MAX_POSTS)
        print(f"✅ Selected {len(post_files)} most recent posts for processing")
    except Exception as e:
        print(f"❌ Failed to scan post files: {e}")
        return

    # 2. Initialize the thread-safe counter
    print("\n[2] Initializing concurrent processing...")
    print(f"   - Maximum worker threads: {MAX_WORKERS}")
    counter = ThreadSafeCounter()

    # 3. Process all posts concurrently with a thread pool
    print("\n[3] Starting concurrent processing...")
    print("   Note: multiple threads process different posts at the same time; each post loads its own historical posts")
    print("=" * 80)
    start_time = datetime.now()

    with ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="Worker") as executor:
        # Submit all tasks
        future_to_post = {
            executor.submit(
                process_single_post,
                post_file,
                posts_dir,  # post directory, used to load historical posts
                output_dir,
                counter,
                len(post_files),
                i
            ): (post_file, i)
            for i, post_file in enumerate(post_files, 1)
        }

        # Wait for all tasks to finish
        for future in as_completed(future_to_post):
            post_file, index = future_to_post[future]
            try:
                result = future.result()
                if not result:
                    safe_print(f"⚠️ Failed to process post: {post_file.name}")
            except Exception as e:
                safe_print(f"❌ Exception while processing post: {post_file.name}")
                safe_print(f"   Error: {e}")
                safe_print(traceback.format_exc())
                counter.increment_fail()

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    # 4. Summary
    print("\n" + "=" * 80)
    print("Batch processing finished")
    print("=" * 80)
    print(f"\nTotal: {len(post_files)} posts")
    print(f"Succeeded: {counter.success}")
    print(f"Failed: {counter.fail}")
    print(f"Elapsed: {duration:.2f} s")
    print(f"Average per post: {duration/len(post_files):.2f} s")
    print(f"\nResults directory: {output_dir}")
    print("=" * 80)

- if __name__ == "__main__":
- main()