罗俊辉 1 year ago
Parent
Commit fc407f34a5
8 changed files with 233 additions and 38 deletions
  1. +1 -1    applications/ai.py
  2. +0 -21   applications/functions.py
  3. +112 -0  applications/pipeline.py
  4. +19 -10  deal/matchArticle_deal.py
  5. +11 -5   deal/single_video_deal.py
  6. +2 -0    deal/videos_deal.py
  7. +87 -0   task.py
  8. +1 -1    test/videos_dev.py

+ 1 - 1
applications/ai.py

@@ -30,7 +30,7 @@ def kimi_ai(prompt):
                 "content": prompt,
             }
         ],
-        model="moonshot-v1-8k",
+        model="moonshot-v1-32k",
     )
     response = chat_completion.choices[0].message.content.replace('```json', '').replace('```', '')
     return response

+ 0 - 21
applications/functions.py

@@ -84,27 +84,6 @@ def whisper(video_id):
     return response.json()
 
 
-def get_text(video_id):
-    """
-    input video_id, output video_text
-    :param video_id:
-    :return:
-    """
-    url = "http://localhost:8888/get_text"
-    body = {
-        "vid": video_id
-    }
-    header = {
-        "Content-Type": "application/json",
-    }
-    response = requests.post(
-        url=url,
-        json=body,
-        headers=header
-    )
-    return response.json()
-
-
 def hash_title(title):
     """
     hash map

+ 112 - 0
applications/pipeline.py

@@ -0,0 +1,112 @@
+"""
+@author: luojunhui
+prompt pipeline
+"""
+import json
+
+from applications.ai import kimi_ai, tly_ai
+
+
+# Step 1: fan the original query out into multiple questions
+def question_fission(query):
+    """
+    Question fission: expand one search query into several related questions
+    :param query:
+    :return:
+    """
+    prompt = f"""
+        # 角色
+        - 你是一个提问裂变器,能够将用户提供的搜索query裂变成5个不同方向的提问,以便获取更加全面的搜索结果。
+        
+        ## 任务
+        - 根据用户提供的搜索query:{query},生成5个在主题一致但方向有所差别的提问。
+        - 确保生成的提问简洁明了,直接输出5个问句。
+        - 以JSON格式输出5个问句,每个问句可以分别引用。
+        
+        ## 技能
+        - 技能1:语义理解与转换
+          1. 理解用户提供的搜索query的核心主题和意图。
+          2. 在保持主题一致的前提下,生成不同方向的提问。
+        
+        - 技能2:提问生成
+          1. 根据原始query,生成5个不同方向的提问。
+          2. 确保每个提问都能引导用户获得不同的搜索结果。
+        
+        - 技能3:多领域适应
+          1. 能够处理任何主题的搜索query。
+          2. 生成的提问适用于各种搜索引擎和信息检索场景。
+        
+        ## 限制
+        - 每次生成的提问数量固定为5个。
+        - 不添加额外的说明或解释,直接输出5个问句。
+        - 不考虑特定的关键词或排除某些关键词。
+        - 以JSON格式输出问句,每个问句可以分别引用。
+        
+        ## 输出格式模板
+        ```json
+        {{
+            "question1": "问句1",
+            "question2": "问句2",
+            "question3": "问句3",
+            "question4": "问句4",
+            "question5": "问句5"
+        }}
+    """
+    question_dict = json.loads(kimi_ai(prompt=prompt))  # kimi_ai returns a JSON string
+    return question_dict
+
+
+# Step 2: search for source material
+def search_materials(question):
+    """
+    Information search for one question
+    :param question:
+    :return:
+    """
+    response = tly_ai(prompt=question)
+    return response
+
+
+# Step 3: clean, filter, and summarize the materials into an article
+def summary_articles(materials):
+    """
+    Clean and summarize the search materials, then generate an article
+    :param materials:
+    :return:
+    """
+    img_list = []
+    materials = json.loads(materials)
+    prompt = f"""
+    # Role:信息萃取师
+    - 介绍:作为信息萃取师,我拥有从海量信息源中进行细致分析的能力,能找出最核心的信息点,并对其真实性进行评估。我对复杂问题的处理方式是逻辑思考者的方式,依据事实证据而非容易出错的直觉来形成结论。此外,我擅长以专业的写作技巧,有条理地组织思想和观点,确保所写内容引人入胜,并且绝不枯燥。
+ 
+    ## Task:
+    - 背景:用户有各种问题想通过搜索引擎获取答案,但网络世界大量信息往往含有噪音,比如虚假、夸大、不准确等情况。
+    - 目标:筛选出可信信息源,并对用户的问题进行准确、专业、有效的结构化回复,且不会忽略查询的任何细节。
+     
+    ## Skills:
+    - 信息分析:根据用户的问题,从大量信息源中筛选出最关键的信息,并对其真实性进行评估。
+    - 逻辑思考:以事实证据为依据,而非直觉,对复杂问题进行推理和得出结论。
+    - 专业写作:有条理地组织思想和观点,确保所写内容引人入胜,语言流畅而不乏味。
+ 
+    ## Rules:
+    - 操作指南:根据用户的问题,使用中文编写清晰、简洁且准确的回答。
+    - 限制要求:不要忽略问题的任何细节,从给定的参考资料中引述的信息需要经过论证并且不能照搬原话。
+    - 工作流程:
+     1. 根据给定的参考资料(以数字索引表示)进行阅读和分析:{materials[0]['raw_content']}、{materials[1]['raw_content']}、{materials[2]['raw_content']}、{materials[3]['raw_content']}、{materials[4]['raw_content']}
+     2. 创作回答:依照专业的写作技巧,使用汉语有条理地组织思想,写出高质量的文章
+     
+    ## OutputFormat:
+    返回json格式,如下
+    {{
+        "title": 总结上述材料的标题,
+        "text":  编写的文章
+    }}
+    """
+    response = json.loads(kimi_ai(prompt=prompt))  # kimi_ai returns a JSON string
+    return img_list, response['title'], response['text']
+
+
+
+
+
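Taken together, the three new functions in applications/pipeline.py form a query → search → summarize pipeline, mirroring how deal/matchArticle_deal.py drives them. Below is a minimal sketch of chaining them for a single query; the run_pipeline helper and the example query are illustrative only, and it assumes, as the summary prompt does, that each search result carries a raw_content field.

```python
# Illustrative sketch only: chain the three pipeline steps for one query.
# Assumes search_materials returns dicts with a 'raw_content' field, as the
# summary_articles prompt expects.
import json

from applications.pipeline import question_fission, search_materials, summary_articles


def run_pipeline(query):
    # Step 1: fan the query out into five related questions
    questions = question_fission(query)
    # Step 2: gather search material for every generated question
    materials = [search_materials(question=q) for q in questions.values()]
    # Step 3: summarize the material into a title and an article body
    img_list, title, text = summary_articles(json.dumps(materials, ensure_ascii=False))
    return title, text
```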

+ 19 - 10
deal/matchArticle_deal.py

@@ -10,6 +10,7 @@ from uuid import uuid4
 
 from applications.config import db_config
 from applications.functions import whisper
+from applications.pipeline import question_fission, search_materials, summary_articles
 
 
 class MatchArticlesTask(object):
@@ -35,7 +36,6 @@ class MatchArticlesTask(object):
         """
         select_sql = f"""SELECT video_id FROM {db_config} WHERE status_code = 0 ORDER BY id ASC limit 10;"""
         video_list = await self.mysql_client.select(select_sql)
-        print(video_list)
 
         async def whisper_and_update(video_id, mysql_client):
             """
@@ -68,7 +68,13 @@ class MatchArticlesTask(object):
 
         async def find_material(task_tuple, mysql_client):
             task_id, title, text = task_tuple
-            material_result = json.dumps({}, ensure_ascii=False)
+            # Use the video title as the query for now; this can be refined later
+            question_dict = question_fission(title)
+            material_list = []
+            for question_key in question_dict:
+                material = search_materials(question=question_dict[question_key])
+                material_list.append(material)
+            material_result = json.dumps(material_list, ensure_ascii=False)
             update_sql = f"""
             UPDATE {db_config}
             SET materials = '{material_result}', status_code = 2
@@ -89,12 +95,11 @@ class MatchArticlesTask(object):
 
         async def ai_generate_text(task_tuple, mysql_client):
             task_id, video_title, materials = task_tuple
-            ai_text = "ai_text"
-            ai_title = "ai_title"
+            imgs, ai_title, ai_text = summary_articles(materials)
             update_sql = f"""
             UPDATE {db_config}
-            SET ai_text = '{ai_text}', ai_title = '{ai_title}', status_code = 3
-            WHERE task_id = '{task_id}'
+            SET ai_text = '{ai_text}', ai_title = '{ai_title}', img_list = '{json.dumps(imgs, ensure_ascii=False)}', status_code = 3
+            WHERE task_id = '{task_id}';
             """
 
         for task in task_list:
@@ -216,19 +221,19 @@ class MatchArticlesV2(object):
         match status_code:
             case 0:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 0,
                     "msg": "未处理"
                 }
             case 1:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 1,
                     "msg": "处理中, 已经用whisper生成视频文本"
                 }
             case 2:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 2,
                     "msg": "处理中, 已经用AI搜索生成资料"
                 }
@@ -242,4 +247,8 @@ class MatchArticlesV2(object):
                         self.get_basic_video_info(video_id)
                     ]
                 }
-                return result
+                response = {
+                    "status": "success",
+                    "article": result
+                }
+                return response
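Note that find_material and ai_generate_text interpolate AI output directly into the UPDATE statement, so a stray single quote in materials or ai_text can break the SQL. A hedged sketch of the same update using parameter binding is shown below; the async_execute(sql, params) helper is hypothetical, since TaskMySQLClient in task.py currently exposes only raw-SQL methods, but it would map directly onto aiomysql's cursor.execute(sql, params).

```python
# Sketch only, not the committed implementation: a parameterized variant of the
# update in ai_generate_text. async_execute(sql, params) is a hypothetical
# helper that forwards to aiomysql's cursor.execute(sql, params).
import json

from applications.pipeline import summary_articles


async def ai_generate_text_safe(task_tuple, mysql_client, table_name):
    task_id, video_title, materials = task_tuple
    imgs, ai_title, ai_text = summary_articles(materials)
    update_sql = f"""
    UPDATE {table_name}
    SET ai_text = %s, ai_title = %s, img_list = %s, status_code = 3
    WHERE task_id = %s;
    """
    # Values travel separately from the SQL text, so quotes in the AI output
    # cannot terminate the string literal early.
    await mysql_client.async_execute(
        update_sql,
        (ai_text, ai_title, json.dumps(imgs, ensure_ascii=False), task_id),
    )
```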

+ 11 - 5
deal/single_video_deal.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 from applications.functions import request_for_info
+from applications.config import minigram_info
 
 
 class SingleVideo(object):
@@ -42,12 +43,17 @@ class SingleVideo(object):
             video_url = response['videoPath']
             publish_time = response['gmtCreateTimestamp']
             # user_id = response['user']['uid']
-            # cover = response['coverImg']['coverImgPath']
+            cover = response['coverImg']['coverImgPath']
             result = {
-
-                "videoId": self.video_id,
+                "return": "",
+                "rov": "",
+                "view": "",
+                "video_id": self.video_id,
                 "title": title,
-                "videoUrl": video_url,
-                "publishTime": publish_time
+                "video_url": video_url,
+                "publishTime": publish_time,
+                "cover": cover,
+                "video_text": "",
+                "minigram_info": minigram_info
             }
             return result

+ 2 - 0
deal/videos_deal.py

@@ -29,6 +29,7 @@ class VideoDeal(object):
             vid_list = [i[0] for i in temp_list]
             pq_response = get_info_lists(vid_list=vid_list)
             cover_list = [i['coverImg']['coverImgPath'] for i in pq_response['data']]
+            publish_time_list = [i['gmtCreateTimestamp'] for i in pq_response['data']]
             for index, obj in enumerate(temp_list):
                 temp = {
                     "video_id": obj[0],
@@ -38,6 +39,7 @@ class VideoDeal(object):
                     "video_url": obj[4],
                     "rov": obj[5],
                     "video_text": "",
+                    "publish_time": publish_time_list[index],
                     "cover": cover_list[index],
                     "minigram_info": minigram_info
                 }

+ 87 - 0
task.py

@@ -0,0 +1,87 @@
+"""
+@author: luojunhui
+"""
+import asyncio
+import time
+
+import aiomysql
+
+from deal import MatchArticlesTask
+
+
+class TaskMySQLClient(object):
+    """
+    Async MySQL
+    """
+
+    def __init__(self):
+        self.mysql_pool = None
+
+    async def init_pool(self):
+        """
+        Initialize the MySQL connection pool
+        :return:
+        """
+        self.mysql_pool = await aiomysql.create_pool(
+            host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
+            port=3306,
+            user='crawler',
+            password='crawler123456@',
+            db='piaoquan-crawler',
+            charset='utf8mb4',
+            connect_timeout=120,
+        )
+        print("mysql pool initialized successfully")
+
+    async def close_pool(self):
+        """
+        Close the MySQL connection pool
+        :return:
+        """
+        self.mysql_pool.close()
+        await self.mysql_pool.wait_closed()
+
+    async def async_select(self, sql):
+        """
+        select method
+        :param sql:
+        :return:
+        """
+        async with self.mysql_pool.acquire() as conn:
+            async with conn.cursor() as cursor:
+                await cursor.execute(sql)
+                result = await cursor.fetchall()
+                return result
+
+    async def async_insert(self, sql):
+        """
+        insert and update method
+        :param sql:
+        :return:
+        """
+        async with self.mysql_pool.acquire() as conn:
+            async with conn.cursor() as cursor:
+                await cursor.execute(sql)
+                await conn.commit()
+
+
+async def main():
+    """
+    main --version 01
+    :return:
+    """
+    TMC = TaskMySQLClient()
+    await TMC.init_pool()
+    MAT = MatchArticlesTask(mysql_client=TMC)
+    await MAT.whisper_task()
+    await asyncio.sleep(120)
+    await MAT.materials_task()
+    await asyncio.sleep(120)
+    await MAT.ai_task()
+    await asyncio.sleep(120)
+
+
+if __name__ == '__main__':
+    while True:
+        asyncio.run(main())
+        time.sleep(120)
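One thing to note in task.py: main() builds a fresh pool on every iteration of the while loop but never calls close_pool, so idle connections accumulate across runs. A minimal drop-in variant of main(), using only names already imported in task.py, would release the pool in a finally block:

```python
async def main():
    """
    main --version 01, closing the pool after each run
    :return:
    """
    TMC = TaskMySQLClient()
    await TMC.init_pool()
    try:
        MAT = MatchArticlesTask(mysql_client=TMC)
        await MAT.whisper_task()
        await asyncio.sleep(120)
        await MAT.materials_task()
        await asyncio.sleep(120)
        await MAT.ai_task()
    finally:
        # Release connections so they do not pile up across loop iterations
        await TMC.close_pool()
```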

+ 1 - 1
test/videos_dev.py

@@ -16,7 +16,7 @@ body = {
     "cate": "video_return",
     "start_date": "2024-06-05",
     "end_date": "2024-06-06",
-    "topN": 500
+    "topN": 50
 }
 a = time.time()
 header = {