"""Two-agent closed-loop workflow that iteratively refines an image prompt.

Agent 1 (prompt engineer) looks at a target reference image, optionally
researches prompt-writing tips via ``search_tool`` and submits a prompt through
``call_banana_tool``.  Agent 2 (visual evaluator) compares the generated image
with the target and produces feedback, which is fed back to Agent 1 on the
next round.  After the last round a final retrospective report is requested.
"""

import argparse
import asyncio
import base64
import json
import os
import re
import shutil
import sys
from typing import Optional

# Put the project root on sys.path so the internal ``agent`` package resolves
# when this file is run directly as a script.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from agent.llm.qwen import qwen_llm_call
from agent.tools.builtin.search import search_posts

# Model used for both agents; both requests carry images, so it has to be a
# model that accepts image input.
QWEN_MODEL = "qwen3.5-plus"

# How many empty replies (no text, no tool call) Agent 1 may give in a row
# within one round before the round is abandoned.
MAX_CONSECUTIVE_EMPTY = 3

# How many rounds in a row may end without an image being generated before the
# whole workflow is aborted.  Without this bound the outer loop never ends,
# because the round counter only advances when an image is generated.
MAX_IDLE_ROUNDS = 3


# -----------------
# Tools definitions
# -----------------

async def call_banana_tool(prompt: str) -> str:
    """Run ``call_banana.py`` with *prompt* and return the saved image path.

    The script is executed as a subprocess with the current interpreter and its
    combined stdout/stderr is scanned for the "saved locally" marker line.

    Returns:
        The path printed by the script on success; otherwise a string starting
        with ``"Tool Execution Failed."`` followed by the captured output.
    """
    print(f"\n[Tool] ✨ 正在调用 call_banana 生成图片, Prompt: {prompt[:50]}...")

    script_path = os.path.join(os.path.dirname(__file__), "call_banana.py")

    # Force UTF-8 I/O in the child so emoji in its output do not crash it on
    # Windows consoles.
    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"

    process = await asyncio.create_subprocess_exec(
        sys.executable, script_path, "-p", prompt,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=env,
    )
    stdout, stderr = await process.communicate()

    output = stdout.decode('utf-8', errors='replace')
    err_output = stderr.decode('utf-8', errors='replace')
    if err_output:
        output += "\n" + err_output

    # Expected marker line: "💾 已保存到本地 -> banana_output_0.jpeg"
    match = re.search(r"已保存到本地 -> (.+)", output)
    if match:
        path = match.group(1).strip()
        print(f"[Tool] ✅ call_banana 返回图片路径: {path}")
        return path

    print(f"[Tool] ❌ call_banana 似乎未成功生成文件, 控制台输出:\n{output}")
    return f"Tool Execution Failed. output:\n{output}"


async def search_tool(keyword: str) -> str:
    """Search the "xhs" channel for *keyword* and return the result output.

    Any exception raised by the search backend is converted into an error
    string so that it can be handed back to the model as a tool result.
    """
    print(f"\n[Tool] 🔍 启动小红书调研, 关键词: {keyword}")
    try:
        result = await search_posts(keyword=keyword, channel="xhs", max_count=3)
        return result.output
    except Exception as e:
        return f"查询失败: {e}"


def get_agent_tools():
    """Return the function-calling schema of the tools offered to Agent 1."""
    return [
        {
            "type": "function",
            "function": {
                "name": "search_tool",
                "description": "如果需要了解某个风格如何写 Prompt(例如“写实风格提示词”),调用此工具进行小红书全网搜索,返回总结经验以更新你的参数。",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "keyword": {
                            "type": "string",
                            "description": "搜索关键词"
                        }
                    },
                    "required": ["keyword"]
                }
            }
        },
        {
            "type": "function",
            "function": {
                "name": "call_banana_tool",
                "description": "使用此工具通过给定的详细提示词生成图片。工具将返回生成图片的本地保存路径。",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "prompt": {
                            "type": "string",
                            "description": "英语或中文详细的生图提示词"
                        }
                    },
                    "required": ["prompt"]
                }
            }
        }
    ]


# -----------------
# Image helpers
# -----------------

def get_base64_url(image_path: str) -> str:
    """Read *image_path* and return it as a ``data:image/<ext>;base64,...`` URL.

    The image subtype is taken from the text after the last ``.`` in the path
    (lower-cased, with ``jpg`` mapped to ``jpeg``).

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        b64_data = base64.b64encode(image_file.read()).decode('utf-8')
    ext = image_path.split('.')[-1].lower()
    if ext == 'jpg':
        ext = 'jpeg'
    return f"data:image/{ext};base64,{b64_data}"


# -----------------
# Agent 2: image evaluator
# -----------------

async def evaluate_images(target_image_path: str, generated_image_path: str,
                          previous_feedback: Optional[str] = None) -> str:
    """Ask the vision model to compare the generated image with the target.

    Args:
        target_image_path: Local path of the reference image.
        generated_image_path: Local path of the image produced this round.
        previous_feedback: Feedback text from the previous round, if any; when
            given, the model is asked to report progress relative to it.

    Returns:
        The model's analysis text, or an error description string when the
        images cannot be read or the model call fails (this never raises).
    """
    print(f"\n[Agent 2] 👁️ Qwen-VL 开始视觉评估...")
    print(f" - 目标图: {target_image_path}")
    print(f" - 生成图: {generated_image_path}")

    try:
        target_url = get_base64_url(target_image_path)
        generated_url = get_base64_url(generated_image_path)
    except Exception as e:
        return f"无法读取图片以进行评估: {e}"

    system_content = "你是专业的AI生图评审师。你的工作是对比【目标参考图】和当前【生成图】,找出具体的差异,并给出针对性的修改意见给生图Prompt工程师。"
    text_prompt = "请做详细的差异点分析:从构图、色彩、人物或物体细节、整体质感等方面指出当前生成图与目标图的差距。"
    if previous_feedback:
        system_content += "\n你还会收到你【上一轮的评估反馈】。请结合你的旧反馈,检查这轮新图片是否修正了你上次提出的问题,避免重复说一样的话,而是要有动态进展意识!"
        text_prompt += f"\n\n你对上一版旧图的评估反馈曾经是:\n{previous_feedback}\n\n请比对这张【新生成图】,告诉我:上一版的问题被解决了吗?画面的进步点和退步点在哪里?请给出更新的针对性修改意见!"
    else:
        text_prompt += "结束时,请给出具体的 Prompt 修改建议。"

    messages = [
        {
            "role": "system",
            "content": system_content
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "【目标参考图(理想状态)】:"},
                {"type": "image_url", "image_url": {"url": target_url}},
                {"type": "text", "text": "【本次生成的图片】:"},
                {"type": "image_url", "image_url": {"url": generated_url}},
                {"type": "text", "text": text_prompt}
            ]
        }
    ]

    try:
        response = await qwen_llm_call(messages=messages, model=QWEN_MODEL)
        analysis = response["content"]
        print(f"\n[Agent 2] 📃 评估反馈:\n{analysis}\n")
        return analysis
    except Exception as e:
        print(f"\n[Agent 2] ⚠️ 评估发生错误: {e}")
        return f"VL模型调用失败: {e}"


# -----------------
# Main workflow helpers
# -----------------

def _parse_tool_arguments(raw_arguments) -> dict:
    """Decode the ``arguments`` field of a tool call into a dict.

    Raises:
        ValueError: if the payload is not valid JSON or not a JSON object.
        TypeError: if the payload is neither a string nor a dict.
    """
    if isinstance(raw_arguments, dict):
        return raw_arguments
    parsed = json.loads(raw_arguments or "{}")
    if not isinstance(parsed, dict):
        raise ValueError("tool arguments must be a JSON object")
    return parsed


def _tool_message(tool_call_id, content: str) -> dict:
    """Build the ``tool`` role message that answers one tool call."""
    return {"role": "tool", "tool_call_id": tool_call_id, "content": content}


def _build_initial_message(target_image: str) -> dict:
    """Build the first-round user message that shows Agent 1 the target image."""
    try:
        target_b64_url = get_base64_url(target_image)
    except (OSError, ValueError) as e:
        return {
            "role": "user",
            "content": f"目标图片凭据读取失败({e}),请盲猜一个初始 Prompt 并使用 call_banana_tool 生成。"
        }
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": "这是你需要逼近的【目标参考图】。现在请你仔细观察它,并提炼出一份详尽的初步生图 Prompt。你可以酌情使用 search_tool 调研,最后必须使用 call_banana_tool 提交你的 Prompt 生成最初的原型。"},
            {"type": "image_url", "image_url": {"url": target_b64_url}}
        ]
    }


def _build_followup_message(last_gen_info: dict, prompt_history: list) -> dict:
    """Build the user message for rounds after the first.

    It contains the full prompt/feedback history (so the agent does not repeat
    earlier attempts), the previous prompt, the previous image and the
    evaluator's feedback on it.
    """
    try:
        gen_image_url = get_base64_url(last_gen_info["image_path"])
    except (OSError, ValueError) as e:
        return {
            "role": "user",
            "content": f"上一轮信息读取失败 ({e})。请重新尝试凭感觉用 call_banana_tool 再次生成。"
        }

    history_parts = ["【你的历史迭代轨迹 (包含往期Prompt与评估专家对其的批评,用于防复读和总结改进)】:\n"]
    for i, record in enumerate(prompt_history):
        history_parts.append(f"==== 第 {i+1} 轮 ====\n")
        history_parts.append(f"[使用的 Prompt]:\n{record['prompt']}\n")
        history_parts.append(f"[收到的反馈批评]:\n{record['feedback']}\n\n")
    history_text = "".join(history_parts)

    return {
        "role": "user",
        "content": [
            {"type": "text", "text": f"{history_text}\n这可以帮你回顾你之前走过的路径。现在聚焦到上一轮:\n\n你上一轮({len(prompt_history)})使用的生图Prompt为:\n{last_gen_info['prompt']}\n\n这里是你上一轮生成的图片结果,请仔细查看对比:"},
            {"type": "image_url", "image_url": {"url": gen_image_url}},
            {"type": "text", "text": f"【视觉评估专家的分析反馈】:\n{last_gen_info['feedback']}\n\n请针对上述反馈,思考到底哪里不像,参考上述的历史轨迹避免重蹈覆辙,进行新的调研修正(如果需要),或者直接使用 call_banana_tool 生成优化后的版本。"}
        ]
    }


def _archive_generated_image(gen_path: str, round_number: int) -> str:
    """Rename the generated file to ``gen_loop_<round>.<ext>`` and return the new path.

    The generator's output file name may be reused on the next run, so each
    round's result is moved aside to keep the intermediate images.  If
    *gen_path* does not exist (generation failed) it is returned unchanged.
    """
    if not os.path.exists(gen_path):
        return gen_path
    ext = gen_path.split('.')[-1]
    new_gen_path = f"gen_loop_{round_number}.{ext}"
    shutil.move(gen_path, new_gen_path)
    print(f"[文件管理] 中间图片已重命名并保存为: {new_gen_path}")
    return new_gen_path


async def _evaluate_round(target_image: str, gen_path: str, prompt_used: str,
                          last_gen_info: Optional[dict]) -> dict:
    """Evaluate one generated image and return its history record.

    The record has the keys ``prompt``, ``image_path`` and ``feedback``.  When
    either image is missing, evaluation is skipped and the feedback explains
    why.
    """
    if os.path.exists(gen_path) and os.path.exists(target_image):
        prev_feedback = last_gen_info["feedback"] if last_gen_info else None
        feedback = await evaluate_images(target_image, gen_path, prev_feedback)
    else:
        feedback = f"系统提示:由于目标图 {target_image} 或生成图 {gen_path} 不存在,评估被跳过。"
    return {"prompt": prompt_used, "image_path": gen_path, "feedback": feedback}


async def _generate_final_report(target_image: str, prompt_history: list) -> None:
    """Ask the model for a retrospective over the whole run and print it.

    Sends the target image, the first and the last generated image, plus the
    full prompt/feedback history.  Does nothing if there is no history or a
    required image file is missing; failures are reported, not raised.
    """
    if not prompt_history or not os.path.exists(target_image):
        return

    print("\n" + "="*50)
    print("🏆 正在生成【专家最终多维度反馈报告】...")
    print("="*50)

    first_gen = prompt_history[0]["image_path"]
    last_gen = prompt_history[-1]["image_path"]
    if not (os.path.exists(first_gen) and os.path.exists(last_gen)):
        return

    try:
        # get_base64_url is the module-level encoder; the original code called
        # a helper that was only defined inside evaluate_images and therefore
        # always failed here with a NameError.
        target_url = get_base64_url(target_image)
        first_url = get_base64_url(first_gen)
        last_url = get_base64_url(last_gen)

        full_history_text = "【历次 Prompt 与专家反馈的演进轨迹】\n" + "".join(
            f"-- 第 {i+1} 轮 --\n[Prompt]: {record['prompt']}\n[反馈]: {record['feedback']}\n\n"
            for i, record in enumerate(prompt_history)
        )

        final_messages = [
            {
                "role": "system",
                "content": "你是首席AI打样架构师。目前的生图迭代优化工作流已拉下帷幕。你不需要拘泥于打分,而是要通过回顾整个演进历程,总结出‘最好用的 Prompt 模板’和‘最精准的评估反馈维度模板’。"
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "【目标参考图(原图)】:"},
                    {"type": "image_url", "image_url": {"url": target_url}},
                    {"type": "text", "text": "这是最初第1轮盲试的生成图:"},
                    {"type": "image_url", "image_url": {"url": first_url}},
                    {"type": "text", "text": f"这是经过迭代后的【最终生成图】:"},
                    {"type": "image_url", "image_url": {"url": last_url}},
                    {"type": "text", "text": f"下面是 {len(prompt_history)} 轮迭代中,Prompt 和专家反馈的完整变迁记录:\n\n{full_history_text}\n\n请结合首尾图片的巨大差异以及中间的踩坑过程,深度复盘:\n1. 在构建生图 Prompt 时,哪些描述方式、句型或结构最能有效命中模型?请提炼出一个【最终版高转化率 Prompt 语法模板】。\n2. 在进行视觉反馈时,哪些维度的批评和建议对 Prompt 师是最具指导意义的?请提炼出一个【最终版高维度视觉评估反馈模板】。\n这两个模版需要具备极强的通用性和实战复用价值!"}
                ]
            }
        ]

        response = await qwen_llm_call(messages=final_messages, model=QWEN_MODEL)
        print(f"\n[Agent 2] 📋 【最终多维度评估报告】:\n{response['content']}\n")
    except Exception as e:
        print(f"最终报告生成失败: {e}")


# -----------------
# Main workflow loop
# -----------------

async def main():
    """Parse CLI arguments and run the generate/evaluate optimisation loop.

    Each outer round rebuilds Agent 1's context from scratch (system message
    plus one user message summarising the previous result), then lets Agent 1
    call tools until it submits a generation request.  A round counts towards
    ``--max_loops`` only when an image generation was requested; rounds that
    end without one are bounded by ``MAX_IDLE_ROUNDS``.
    """
    default_target = os.path.join(os.path.dirname(os.path.abspath(__file__)), "input", "img_1.png")

    parser = argparse.ArgumentParser(description="多智能体画图自动优化 Workflow")
    parser.add_argument("-t", "--target", default=default_target, help="你想逼近的目标参考图本地路径")
    parser.add_argument("-m", "--max_loops", type=int, default=10, help="优化的最大迭代论调")
    args = parser.parse_args()

    target_image = args.target

    print("\n" + "="*50)
    print("🤖 启动双 Agent 生图闭环工作流 (纯 Vision-Language 架构)")
    print("="*50)

    if not os.path.exists(target_image):
        print(f"⚠️ 找不到目标图片: {target_image}")
        print("提示: 系统依然会运行寻找文件,但 Agent 2 将无法给出评估。可随便放一个图片来模拟。")

    system_msg = {
        "role": "system",
        "content": "你是一个超级提示词工程师(Prompt Engineer)。目标:生成一张无限接近【目标参考图】的图片。\n作为多模态大模型,每一轮我都会给你看你上次生成的图片结果和评估专家的犀利分析反馈。你需要利用这些反馈进行修改。\n流程要求:\n1. (可选)如果你对风格不确定,可以请求 search_tool 调研别人怎么写相关提示词。\n2. 使用 call_banana_tool 来实际提交你的提示词并生成图片。\n3. 调用生成工具后,你本轮的工作就结束了,系统会把成果拿去评估并在下一轮找你。"
    }

    max_loops = args.max_loops
    current_generation_loop_count = 0
    idle_rounds = 0
    last_gen_info = None
    # Full prompt/feedback trajectory, shown to Agent 1 so it does not keep
    # re-rolling the same ideas.
    prompt_history = []
    tools = get_agent_tools()

    while current_generation_loop_count < max_loops:
        print(f"\n" + "="*40)
        print(f"🔄 优化循环: 第 {current_generation_loop_count + 1}/{max_loops} 轮")
        print("="*40)

        # Context is reset every round: only the system message and one user
        # message describing the previous result are kept.
        if last_gen_info is None:
            user_msg = _build_initial_message(target_image)
        else:
            user_msg = _build_followup_message(last_gen_info, prompt_history)
        messages = [system_msg, user_msg]

        # Agent 1 inner tool loop: keep talking to the model until it submits
        # a generation request or stops using tools.
        generated = False
        consecutive_empty = 0
        while not generated:
            print(f"---\n💬 正在请求 Agent 1 (Prompt 师)...")
            response = await qwen_llm_call(
                messages=messages,
                model=QWEN_MODEL,
                tools=tools
            )

            content = response.get("content", "")
            tool_calls = response.get("tool_calls")

            if content:
                print(f"\n[Agent 1 思考]:\n{content}")

            if not tool_calls and not content:
                # Empty reply: retry the same request a few times instead of
                # appending an empty assistant message to the context.
                consecutive_empty += 1
                if consecutive_empty >= MAX_CONSECUTIVE_EMPTY:
                    print("Agent 连续多次无有意义输出,强制跳出本轮。")
                    break
                continue
            consecutive_empty = 0

            # Keep the conversation context consistent.
            assistant_reply = {"role": "assistant"}
            if content:
                assistant_reply["content"] = content
            if tool_calls:
                assistant_reply["tool_calls"] = tool_calls
            messages.append(assistant_reply)

            if not tool_calls:
                print("\n[控制中心] Agent 1 没有继续使用任何工具。结束其周期。")
                break

            for tc in tool_calls:
                func_name = tc["function"]["name"]
                tc_id = tc["id"]

                try:
                    args_dict = _parse_tool_arguments(tc["function"]["arguments"])
                except (ValueError, TypeError) as e:
                    # Hand the problem back to the model rather than crashing.
                    messages.append(_tool_message(tc_id, f"工具参数解析失败: {e}"))
                    continue

                if func_name == "search_tool":
                    keyword = args_dict.get("keyword")
                    if not isinstance(keyword, str) or not keyword:
                        messages.append(_tool_message(tc_id, "工具参数解析失败: 缺少 keyword"))
                        continue
                    res = await search_tool(keyword)
                    messages.append(_tool_message(tc_id, str(res)))

                elif func_name == "call_banana_tool":
                    prompt_used = args_dict.get("prompt")
                    if not isinstance(prompt_used, str) or not prompt_used:
                        messages.append(_tool_message(tc_id, "工具参数解析失败: 缺少 prompt"))
                        continue

                    print(f"\n⚙️ Agent 1 决定提交生图请求!")
                    gen_path = await call_banana_tool(prompt_used)
                    gen_path = _archive_generated_image(
                        gen_path, current_generation_loop_count + 1
                    )

                    # Complete the exchange even though this context is
                    # discarded at the start of the next round.
                    messages.append(_tool_message(tc_id, f"已生成,路径: {gen_path}"))

                    generated = True
                    current_generation_loop_count += 1

                    # Evaluate and record the result for the next round.
                    last_gen_info = await _evaluate_round(
                        target_image, gen_path, prompt_used, last_gen_info
                    )
                    prompt_history.append(last_gen_info)
                    break  # remaining tool calls belong to a discarded context

                else:
                    # Every tool call must get an answer, otherwise the next
                    # request carries an unanswered call.
                    messages.append(_tool_message(tc_id, f"未知工具: {func_name}"))

        if generated:
            idle_rounds = 0
        else:
            idle_rounds += 1
            if idle_rounds >= MAX_IDLE_ROUNDS:
                print(f"\n[控制中心] Agent 1 连续 {idle_rounds} 轮未提交生图请求,终止工作流以避免死循环。")
                break

    print("\n🎉 工作流闭环成功完成或达到了最大迭代次数。")

    # Finally, let the evaluator produce a retrospective report.
    await _generate_final_report(target_image, prompt_history)


if __name__ == "__main__":
    asyncio.run(main())