weapp
/
video_decode


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
							"""
知识需求生成Agent的完整测试脚本

测试KnowledgeRequirementGenerationAgent的新输出格式（PRD 1.4）

新格式包含：
- 整体项目目标
- 本次任务目标
- 上下文
- 待解构帖子信息
- 需求（内容知识需求 + 工具知识需求）
"""

import sys
import json
import os
from pathlib import Path
from dotenv import load_dotenv

# Put the directory three levels above this file (treated as the project
# root) at the front of sys.path; this runs before the `src.` imports below.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Load environment variables from the .env file in the project root
load_dotenv(project_root / ".env")

from langchain.chat_models import init_chat_model
from src.components.agents.knowledge_requirement_agent import generate_knowledge_requirements
from src.utils.logger import get_logger

logger = get_logger(__name__)


def load_test_data(file_path: str) -> dict:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The data decoded from the JSON document.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def read_prd_content(pdf_path: str) -> str:
    """Extract the text of every page of a PRD PDF file.

    PyMuPDF (``pymupdf``) is tried first; if importing it fails, PyPDF2 is
    used as a fallback. The per-page texts are joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, joined with ``"\\n"``.

    Raises:
        ImportError: If neither ``pymupdf`` nor ``PyPDF2`` can be imported.
        Exception: Any other error raised on the PyMuPDF path is logged and
            re-raised unchanged.
    """
    try:
        import pymupdf  # PyMuPDF

        # Open the PDF file
        doc = pymupdf.open(pdf_path)
        try:
            # Extract the text of every page
            text_content = [page.get_text() for page in doc]
        finally:
            # Always release the document, even if extraction fails part-way
            doc.close()

        # Merge the text of all pages
        full_text = "\n".join(text_content)

        logger.info(f"成功读取PDF文件: {pdf_path}, 内容长度: {len(full_text)} 字符")
        return full_text

    except ImportError:
        logger.warning("未安装 pymupdf 库，使用 PyPDF2 作为备选方案")
        try:
            from PyPDF2 import PdfReader

            reader = PdfReader(pdf_path)
            text_content = [page.extract_text() for page in reader.pages]

            full_text = "\n".join(text_content)
            logger.info(f"成功读取PDF文件 (PyPDF2): {pdf_path}, 内容长度: {len(full_text)} 字符")
            return full_text

        except ImportError as err:
            logger.error("未安装 PDF 读取库 (pymupdf 或 PyPDF2)，无法读取PDF文件")
            raise ImportError(
                "请安装 pymupdf (推荐) 或 PyPDF2: pip install pymupdf 或 pip install PyPDF2"
            ) from err

    except Exception as e:
        logger.error(f"读取PDF文件失败: {e}", exc_info=True)
        raise


def format_post_content(raw_data: dict) -> dict:
    """Reshape a raw post record into the structure passed to the agent.

    Missing keys fall back to an empty string, an empty list, or 0 for the
    count fields.

    Args:
        raw_data: Raw post data.

    Returns:
        A dict with ``text``, ``images`` and ``metadata`` sections.
    """
    text_section = {
        "title": raw_data.get("title", ""),
        "body": raw_data.get("body_text", ""),
        # Always empty here; hashtags could be extracted from body_text.
        "hashtags": [],
    }
    image_list = raw_data.get("images", [])

    # (output key, source key, default when the source key is absent)
    metadata_fields = (
        ("link", "link", ""),
        ("content_id", "channel_content_id", ""),
        ("account_name", "channel_account_name", ""),
        ("content_type", "content_type", ""),
        ("comment_count", "comment_count", 0),
        ("like_count", "like_count", 0),
        ("collect_count", "collect_count", 0),
    )
    metadata_section = {
        out_key: raw_data.get(source_key, fallback)
        for out_key, source_key, fallback in metadata_fields
    }

    return {
        "text": text_section,
        "images": image_list,
        "metadata": metadata_section,
    }


def test_complete_knowledge_requirement_generation():
    """End-to-end test of knowledge requirement generation (new format, PRD 1.4).

    Scenarios:
    1. Whole-post deconstruction.
    2. The generated Markdown contains every required section heading,
       including both the content-knowledge and the tool-knowledge headings.

    Steps:
    - Step 1: initialize the LLM
    - Step 2: load the PRD content
    - Step 3: load the test post data
    - Step 4: define the task stage
    - Step 5: generate the knowledge requirements
    - Step 6: display the result
    - Step 7: validate the format
    - Step 8: save the Markdown document
    - Step 9: print the test summary

    Returns:
        The object returned by ``generate_knowledge_requirements``; its
        ``summary`` and ``markdown_document`` attributes are read here.

    Raises:
        AssertionError: If any required section heading is missing from the
            generated Markdown. Raised only after the document has been saved
            and the summary printed, so the output stays available for
            inspection.
    """
    print("\n" + "=" * 100)
    print("📝 完整测试：KnowledgeRequirementGenerationAgent（新格式 - PRD 1.4）")
    print("   - 输入：PRD文档 + 小红书帖子 + 解构上下文 + 任务阶段")
    print("   - 输出：结构化知识需求文档（包含项目目标、任务目标、上下文、需求等）")
    print("=" * 100)

    # Step 1: initialize the LLM
    print("\n🤖 Step 1: 初始化LLM (Gemini)...")
    # Mirror GEMINI_API_KEY into GOOGLE_API_KEY when it is set
    # (presumably the google_genai provider reads GOOGLE_API_KEY; confirm).
    google_api_key = os.getenv("GEMINI_API_KEY")
    if google_api_key:
        os.environ["GOOGLE_API_KEY"] = google_api_key
    llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
    print("   ✓ LLM初始化成功")

    # Step 2: load the PRD content
    print("\n📄 Step 2: 加载PRD内容...")
    prd_path = project_root / "prd1.4.pdf"
    prd_content = read_prd_content(str(prd_path))
    print(f"   ✓ PRD加载成功，内容长度: {len(prd_content)} 字符")
    # Only show the first 200 characters, as the label promises.
    print(f"   PRD摘要（前200字）: {prd_content[:200]}...")

    # Step 3: load the test post data
    print("\n📁 Step 3: 加载测试帖子数据...")
    test_data_path = project_root / "examples/测试数据/阿里多多酱/待解构帖子.json"
    raw_data = load_test_data(str(test_data_path))
    post_content = format_post_content(raw_data)
    print("   ✓ 帖子数据加载成功")
    print(f"   - 标题: {raw_data.get('title', 'N/A')}")
    print(f"   - 图片数量: {len(raw_data.get('images', []))}张")
    print(f"   - 点赞数: {raw_data.get('like_count', 0)}")

    # Step 4: define the task stage
    print("\n🎯 Step 4: 定义任务阶段...")
    task_stage = "帖子整体解构"
    print(f"   ✓ 任务阶段: {task_stage}")
    print("   - 说明: 确定帖子的描述维度（品类、主题、脚本、内容亮点、情绪共鸣点等）")

    # Step 5: generate the knowledge requirements
    print("\n⚙️  Step 5: 创建Agent并生成知识需求...")
    print("   选项：启用知识检索 = False（加快测试速度）")

    result = generate_knowledge_requirements(
        llm=llm,
        prd_content=prd_content,
        post_content=post_content,
        task_stage=task_stage,
        enable_retrieval=False,  # set to True to enable knowledge retrieval (slower)
    )

    # Step 6: display the result
    print("\n" + "=" * 100)
    print("📊 Step 6: 生成结果展示")
    print("=" * 100)

    print("\n📝 总结:")
    print(f"   {result.summary}")

    print("\n📄 Markdown文档:")
    print("-" * 100)
    print(result.markdown_document)
    print("-" * 100)

    # Step 7: validate the format
    print("\n" + "=" * 100)
    print("🔍 Step 7: 格式验证")
    print("=" * 100)

    required_sections = [
        "# 整体项目目标",
        "# 本次任务目标",
        "# 上下文",
        "# 待解构帖子信息",
        "# 需求",
        "## 内容知识需求",
        "### 需求约束",
        "### 需求描述",
        "## 工具知识需求"
    ]

    # Collect the missing headings so the failure message can name them.
    missing_sections = []
    for section in required_sections:
        if section in result.markdown_document:
            print(f"   ✓ 包含章节: {section}")
        else:
            print(f"   ✗ 缺少章节: {section}")
            missing_sections.append(section)
    all_passed = not missing_sections

    if all_passed:
        print("\n   🎉 格式验证通过！所有必需章节均存在。")
    else:
        print("\n   ⚠️  格式验证未通过，缺少部分章节。")

    # Step 8: save the Markdown document
    print("\n" + "=" * 100)
    print("💾 Step 8: 保存Markdown文档")
    print("=" * 100)

    output_dir = project_root / "test/outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "knowledge_requirement_complete.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result.markdown_document)

    print(f"\n   ✓ Markdown文档已保存到: {output_path}")
    print(f"   - 文件大小: {len(result.markdown_document)} 字符")

    # Step 9: test summary
    print("\n" + "=" * 100)
    print("✅ 测试完成总结")
    print("=" * 100)
    print("   ✓ 成功读取PRD文档 (prd1.4.pdf)")
    print("   ✓ 成功加载小红书帖子数据")
    print(f"   ✓ 成功定义任务阶段: {task_stage}")
    print("   ✓ 成功生成知识需求文档（新格式 - PRD 1.4）")
    print(f"   ✓ 格式验证: {'通过' if all_passed else '未通过'}")
    print(f"   ✓ 总结: {result.summary}")
    print(f"   ✓ Markdown文档大小: {len(result.markdown_document)} 字符")

    # Fail the test instead of printing the success banner when the format
    # check did not pass; an explicit raise also survives `python -O`.
    if not all_passed:
        raise AssertionError(f"格式验证未通过，缺少章节: {missing_sections}")

    print("\n" + "🎉" * 50)
    print("测试成功完成! 新输出格式符合 PRD 1.4 要求")
    print("🎉" * 50 + "\n")

    return result


def main():
    """Run the complete test and return a process exit code.

    Returns:
        0 when the test finishes without raising, 1 when any ``Exception``
        is raised (it is logged, printed, and its traceback dumped).
    """
    rocket_banner = "🚀" * 50
    print("\n" + rocket_banner)
    print("KnowledgeRequirementGenerationAgent 完整测试套件（PRD 1.4 新格式）")
    print(rocket_banner)

    exit_code = 1
    try:
        # The returned result object is not needed here.
        test_complete_knowledge_requirement_generation()

        print("\n✅ 所有测试通过!")
        exit_code = 0
    except Exception as exc:
        logger.error(f"测试失败: {exc}", exc_info=True)
        print(f"\n❌ 测试失败: {exc}\n")
        import traceback
        traceback.print_exc()

    return exit_code


if __name__ == "__main__":
    # sys.exit is the programmatic way to set the exit status; the bare
    # exit() helper is injected by the site module for interactive use and
    # is unavailable when Python runs with -S.
    sys.exit(main())