罗俊辉 1 year ago
Parent
Commit fc407f34a5
8 changed files with 233 additions and 38 deletions
  1. +1 -1    applications/ai.py
  2. +0 -21   applications/functions.py
  3. +112 -0  applications/pipeline.py
  4. +19 -10  deal/matchArticle_deal.py
  5. +11 -5   deal/single_video_deal.py
  6. +2 -0    deal/videos_deal.py
  7. +87 -0   task.py
  8. +1 -1    test/videos_dev.py

+ 1 - 1
applications/ai.py

@@ -30,7 +30,7 @@ def kimi_ai(prompt):
                 "content": prompt,
             }
         ],
-        model="moonshot-v1-8k",
+        model="moonshot-v1-32k",
     )
     response = chat_completion.choices[0].message.content.replace('```json', '').replace('```', '')
     return response

+ 0 - 21
applications/functions.py

@@ -84,27 +84,6 @@ def whisper(video_id):
     return response.json()
 
 
-def get_text(video_id):
-    """
-    input video_id, output video_text
-    :param video_id:
-    :return:
-    """
-    url = "http://localhost:8888/get_text"
-    body = {
-        "vid": video_id
-    }
-    header = {
-        "Content-Type": "application/json",
-    }
-    response = requests.post(
-        url=url,
-        json=body,
-        headers=header
-    )
-    return response.json()
-
-
 def hash_title(title):
     """
     hash map

+ 112 - 0
applications/pipeline.py

@@ -0,0 +1,112 @@
+"""
+@author: luojunhui
+prompt pipeline
+"""
+import json
+
+from applications.ai import kimi_ai, tly_ai
+
+
+# Step 1: fan the original query out into multiple questions
+def question_fission(query):
+    """
+    Question fission: expand one search query into several related questions
+    :param query:
+    :return:
+    """
+    prompt = f"""
+        # 角色
+        - 你是一个提问裂变器,能够将用户提供的搜索query裂变成5个不同方向的提问,以便获取更加全面的搜索结果。
+        
+        ## 任务
+        - 根据用户提供的搜索query:{query},生成5个在主题一致但方向有所差别的提问。
+        - 确保生成的提问简洁明了,直接输出5个问句。
+        - 以JSON格式输出5个问句,每个问句可以分别引用。
+        
+        ## 技能
+        - 技能1:语义理解与转换
+          1. 理解用户提供的搜索query的核心主题和意图。
+          2. 在保持主题一致的前提下,生成不同方向的提问。
+        
+        - 技能2:提问生成
+          1. 根据原始query,生成5个不同方向的提问。
+          2. 确保每个提问都能引导用户获得不同的搜索结果。
+        
+        - 技能3:多领域适应
+          1. 能够处理任何主题的搜索query。
+          2. 生成的提问适用于各种搜索引擎和信息检索场景。
+        
+        ## 限制
+        - 每次生成的提问数量固定为5个。
+        - 不添加额外的说明或解释,直接输出5个问句。
+        - 不考虑特定的关键词或排除某些关键词。
+        - 以JSON格式输出问句,每个问句可以分别引用。
+        
+        ## 输出格式模板
+        ```json
+        {{
+            "question1": "问句1",
+            "question2": "问句2",
+            "question3": "问句3",
+            "question4": "问句4",
+            "question5": "问句5"
+        }}
+    """
+    question_dict = json.loads(kimi_ai(prompt=prompt))  # kimi_ai returns a JSON string
+    return question_dict
+
+
+# Step 2: search for source material
+def search_materials(question):
+    """
+    Information search for one question
+    :param question:
+    :return:
+    """
+    response = tly_ai(prompt=question)
+    return response
+
+
+# Step 3: clean, filter, and summarize the materials into an article
+def summary_articles(materials):
+    """
+    Clean and summarize the search materials, then generate an article
+    :param materials:
+    :return:
+    """
+    img_list = []
+    materials = json.loads(materials)
+    prompt = f"""
+    # Role:信息萃取师
+    - 介绍:作为信息萃取师,我拥有从海量信息源中进行细致分析的能力,能找出最核心的信息点,并对其真实性进行评估。我对复杂问题的处理方式是逻辑思考者的方式,依据事实证据而非容易出错的直觉来形成结论。此外,我擅长以专业的写作技巧,有条理地组织思想和观点,确保所写内容引人入胜,并且绝不枯燥。
+ 
+    ## Task:
+    - 背景:用户有各种问题想通过搜索引擎获取答案,但网络世界大量信息往往含有噪音,比如虚假、夸大、不准确等情况。
+    - 目标:筛选出可信信息源,并对用户的问题进行准确、专业、有效的结构化回复,且不会忽略查询的任何细节。
+     
+    ## Skills:
+    - 信息分析:根据用户的问题,从大量信息源中筛选出最关键的信息,并对其真实性进行评估。
+    - 逻辑思考:以事实证据为依据,而非直觉,对复杂问题进行推理和得出结论。
+    - 专业写作:有条理地组织思想和观点,确保所写内容引人入胜,语言流畅而不乏味。
+ 
+    ## Rules:
+    - 操作指南:根据用户的问题,使用中文编写清晰、简洁且准确的回答。
+    - 限制要求:不要忽略问题的任何细节,从给定的参考资料中引述的信息需要经过论证并且不能照搬原话。
+    - 工作流程:
+     1. 根据给定的参考资料(以数字索引表示)进行阅读和分析:{materials[0]['raw_content']}、{materials[1]['raw_content']}、{materials[2]['raw_content']}、{materials[3]['raw_content']}、{materials[4]['raw_content']}
+     2. 创作回答:依照专业的写作技巧,使用汉语有条理地组织思想,写出高质量的文章
+     
+    ## OutputFormat:
+    返回json格式,如下
+    {{
+        "title": 总结上述材料的标题,
+        "text":  编写的文章
+    }}
+    """
+    response = json.loads(kimi_ai(prompt=prompt))  # kimi_ai returns a JSON string
+    return img_list, response['title'], response['text']
+
+
+
+
+
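Taken together, the three new functions in applications/pipeline.py form a query → search → summarize pipeline, mirroring how deal/matchArticle_deal.py drives them. Below is a minimal sketch of chaining them for a single query; the run_pipeline helper and the example query are illustrative only, and it assumes, as the summary prompt does, that each search result carries a raw_content field.

```python
# Illustrative sketch only: chain the three pipeline steps for one query.
# Assumes search_materials returns dicts with a 'raw_content' field, as the
# summary_articles prompt expects.
import json

from applications.pipeline import question_fission, search_materials, summary_articles


def run_pipeline(query):
    # Step 1: fan the query out into five related questions
    questions = question_fission(query)
    # Step 2: gather search material for every generated question
    materials = [search_materials(question=q) for q in questions.values()]
    # Step 3: summarize the material into a title and an article body
    img_list, title, text = summary_articles(json.dumps(materials, ensure_ascii=False))
    return title, text
```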

+ 19 - 10
deal/matchArticle_deal.py

@@ -10,6 +10,7 @@ from uuid import uuid4
 
 from applications.config import db_config
 from applications.functions import whisper
+from applications.pipeline import question_fission, search_materials, summary_articles
 
 
 class MatchArticlesTask(object):
@@ -35,7 +36,6 @@ class MatchArticlesTask(object):
         """
         select_sql = f"""SELECT video_id FROM {db_config} WHERE status_code = 0 ORDER BY id ASC limit 10;"""
         video_list = await self.mysql_client.select(select_sql)
-        print(video_list)
 
         async def whisper_and_update(video_id, mysql_client):
             """
@@ -68,7 +68,13 @@ class MatchArticlesTask(object):
 
         async def find_material(task_tuple, mysql_client):
             task_id, title, text = task_tuple
-            material_result = json.dumps({}, ensure_ascii=False)
+            # Use the video title as the query for now; this can be refined later
+            question_dict = question_fission(title)
+            material_list = []
+            for question_key in question_dict:
+                material = search_materials(question=question_dict[question_key])
+                material_list.append(material)
+            material_result = json.dumps(material_list, ensure_ascii=False)
             update_sql = f"""
             UPDATE {db_config}
             SET materials = '{material_result}', status_code = 2
@@ -89,12 +95,11 @@ class MatchArticlesTask(object):
 
         async def ai_generate_text(task_tuple, mysql_client):
             task_id, video_title, materials = task_tuple
-            ai_text = "ai_text"
-            ai_title = "ai_title"
+            imgs, ai_title, ai_text = summary_articles(materials)
             update_sql = f"""
             UPDATE {db_config}
-            SET ai_text = '{ai_text}', ai_title = '{ai_title}', status_code = 3
-            WHERE task_id = '{task_id}'
+            SET ai_text = '{ai_text}', ai_title = '{ai_title}', img_list = '{json.dumps(imgs, ensure_ascii=False)}', status_code = 3
+            WHERE task_id = '{task_id}';
             """
 
         for task in task_list:
@@ -216,19 +221,19 @@ class MatchArticlesV2(object):
         match status_code:
             case 0:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 0,
                     "msg": "未处理"
                 }
             case 1:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 1,
                     "msg": "处理中, 已经用whisper生成视频文本"
                 }
             case 2:
                 return {
-                    "taskId": self.task_id,
+                    "task_id": self.task_id,
                     "code": 2,
                     "msg": "处理中, 已经用AI搜索生成资料"
                 }
@@ -242,4 +247,8 @@ class MatchArticlesV2(object):
                         self.get_basic_video_info(video_id)
                     ]
                 }
-                return result
+                response = {
+                    "status": "success",
+                    "article": result
+                }
+                return response
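Note that find_material and ai_generate_text interpolate AI output directly into the UPDATE statement, so a stray single quote in materials or ai_text can break the SQL. A hedged sketch of the same update using parameter binding is shown below; the async_execute(sql, params) helper is hypothetical, since TaskMySQLClient in task.py currently exposes only raw-SQL methods, but it would map directly onto aiomysql's cursor.execute(sql, params).

```python
# Sketch only, not the committed implementation: a parameterized variant of the
# update in ai_generate_text. async_execute(sql, params) is a hypothetical
# helper that forwards to aiomysql's cursor.execute(sql, params).
import json

from applications.pipeline import summary_articles


async def ai_generate_text_safe(task_tuple, mysql_client, table_name):
    task_id, video_title, materials = task_tuple
    imgs, ai_title, ai_text = summary_articles(materials)
    update_sql = f"""
    UPDATE {table_name}
    SET ai_text = %s, ai_title = %s, img_list = %s, status_code = 3
    WHERE task_id = %s;
    """
    # Values travel separately from the SQL text, so quotes in the AI output
    # cannot terminate the string literal early.
    await mysql_client.async_execute(
        update_sql,
        (ai_text, ai_title, json.dumps(imgs, ensure_ascii=False), task_id),
    )
```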

+ 11 - 5
deal/single_video_deal.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 from applications.functions import request_for_info
+from applications.config import minigram_info
 
 
 class SingleVideo(object):
@@ -42,12 +43,17 @@ class SingleVideo(object):
             video_url = response['videoPath']
             publish_time = response['gmtCreateTimestamp']
             # user_id = response['user']['uid']
-            # cover = response['coverImg']['coverImgPath']
+            cover = response['coverImg']['coverImgPath']
             result = {
-
-                "videoId": self.video_id,
+                "return": "",
+                "rov": "",
+                "view": "",
+                "video_id": self.video_id,
                 "title": title,
-                "videoUrl": video_url,
-                "publishTime": publish_time
+                "video_url": video_url,
+                "publishTime": publish_time,
+                "cover": cover,
+                "video_text": "",
+                "minigram_info": minigram_info
             }
             return result

+ 2 - 0
deal/videos_deal.py

@@ -29,6 +29,7 @@ class VideoDeal(object):
             vid_list = [i[0] for i in temp_list]
             pq_response = get_info_lists(vid_list=vid_list)
             cover_list = [i['coverImg']['coverImgPath'] for i in pq_response['data']]
+            publish_time_list = [i['gmtCreateTimestamp'] for i in pq_response['data']]
             for index, obj in enumerate(temp_list):
                 temp = {
                     "video_id": obj[0],
@@ -38,6 +39,7 @@ class VideoDeal(object):
                     "video_url": obj[4],
                     "rov": obj[5],
                     "video_text": "",
+                    "publish_time": publish_time_list[index],
                     "cover": cover_list[index],
                     "minigram_info": minigram_info
                 }

+ 87 - 0
task.py

@@ -0,0 +1,87 @@
+"""
+@author: luojunhui
+"""
+import asyncio
+import time
+
+import aiomysql
+
+from deal import MatchArticlesTask
+
+
+class TaskMySQLClient(object):
+    """
+    Async MySQL
+    """
+
+    def __init__(self):
+        self.mysql_pool = None
+
+    async def init_pool(self):
+        """
+        Initialize the MySQL connection pool
+        :return:
+        """
+        self.mysql_pool = await aiomysql.create_pool(
+            host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
+            port=3306,
+            user='crawler',
+            password='crawler123456@',
+            db='piaoquan-crawler',
+            charset='utf8mb4',
+            connect_timeout=120,
+        )
+        print("mysql pool initialized successfully")
+
+    async def close_pool(self):
+        """
+        Close the MySQL connection pool
+        :return:
+        """
+        self.mysql_pool.close()
+        await self.mysql_pool.wait_closed()
+
+    async def async_select(self, sql):
+        """
+        select method
+        :param sql:
+        :return:
+        """
+        async with self.mysql_pool.acquire() as conn:
+            async with conn.cursor() as cursor:
+                await cursor.execute(sql)
+                result = await cursor.fetchall()
+                return result
+
+    async def async_insert(self, sql):
+        """
+        insert and update method
+        :param sql:
+        :return:
+        """
+        async with self.mysql_pool.acquire() as conn:
+            async with conn.cursor() as cursor:
+                await cursor.execute(sql)
+                await conn.commit()
+
+
+async def main():
+    """
+    main --version 01
+    :return:
+    """
+    TMC = TaskMySQLClient()
+    await TMC.init_pool()
+    MAT = MatchArticlesTask(mysql_client=TMC)
+    await MAT.whisper_task()
+    await asyncio.sleep(120)
+    await MAT.materials_task()
+    await asyncio.sleep(120)
+    await MAT.ai_task()
+    await asyncio.sleep(120)
+
+
+if __name__ == '__main__':
+    while True:
+        asyncio.run(main())
+        time.sleep(120)
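One thing to note in task.py: main() builds a fresh pool on every iteration of the while loop but never calls close_pool, so idle connections accumulate across runs. A minimal drop-in variant of main(), using only names already imported in task.py, would release the pool in a finally block:

```python
async def main():
    """
    main --version 01, closing the pool after each run
    :return:
    """
    TMC = TaskMySQLClient()
    await TMC.init_pool()
    try:
        MAT = MatchArticlesTask(mysql_client=TMC)
        await MAT.whisper_task()
        await asyncio.sleep(120)
        await MAT.materials_task()
        await asyncio.sleep(120)
        await MAT.ai_task()
    finally:
        # Release connections so they do not pile up across loop iterations
        await TMC.close_pool()
```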

+ 1 - 1
test/videos_dev.py

@@ -16,7 +16,7 @@ body = {
     "cate": "video_return",
     "start_date": "2024-06-05",
     "end_date": "2024-06-06",
-    "topN": 500
+    "topN": 50
 }
 a = time.time()
 header = {